Python tensorflow.python.keras.preprocessing.sequence.pad_sequences() Examples
The following are 9 code examples of tensorflow.python.keras.preprocessing.sequence.pad_sequences(), drawn from open-source projects. The original project, source file, and license are listed above each example. You may also want to check out the other functions and classes available in the tensorflow.python.keras.preprocessing.sequence module.
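Before the project examples, here is a minimal standalone sketch of what pad_sequences() does; the input lists and the maxlen value are illustrative:

from tensorflow.python.keras.preprocessing.sequence import pad_sequences

sequences = [[1, 2, 3], [4, 5], [6, 7, 8, 9, 10]]

# Pad or truncate every sequence to length 4; with padding='post' the zeros go
# at the end, and with truncating='post' the extra values are cut from the end.
padded = pad_sequences(sequences, maxlen=4, padding='post', truncating='post')
print(padded)
# [[1 2 3 0]
#  [4 5 0 0]
#  [6 7 8 9]]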
Example #1
Source File: utils.py From cloudml-samples with Apache License 2.0 | 6 votes

def preprocess(train_data_file, word_index_file, num_words):
    """Loads the NumPy .npz file and processes its data.

    Pad the arrays so they all have the same length, then create an integer
    tensor of shape max_length * num_reviews. Then we use an embedding layer
    capable of handling this shape as the first layer in our network.

    Args:
      train_data_file: (str) Location of file.
      word_index_file: (str) Location of JSON file with index information.
      num_words: (int) Number of words to get from IMDB dataset.

    Returns:
      A tuple of training and test data.
    """
    (train_data, train_labels), (test_data, test_labels) = _load_data(
        path=train_data_file, num_words=num_words)
    word_index = _get_word_index(word_index_file)

    # Standardize the lengths for training.
    train_data = pad_sequences(train_data,
                               value=word_index['<PAD>'],
                               padding='post',
                               maxlen=SENTENCE_SIZE)

    # Standardize the lengths for test.
    test_data = pad_sequences(test_data,
                              value=word_index['<PAD>'],
                              padding='post',
                              maxlen=SENTENCE_SIZE)

    return (train_data, train_labels), (test_data, test_labels)
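The docstring above refers to an embedding layer that consumes the padded integer tensor as the first layer of the network. A minimal sketch of such a model follows; the layer sizes and optimizer are illustrative assumptions, not taken from the cloudml-samples code:

import tensorflow as tf

def build_model(num_words, sentence_size):
    # The Embedding layer consumes the padded integer tensor produced by preprocess().
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(num_words, 16, input_length=sentence_size),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),  # binary sentiment output
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model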
Example #2
Source File: base_processor.py From text2vec with Apache License 2.0 | 5 votes

def process_x_dataset(self,
                      data: List[List[str]],
                      max_len: Optional[int] = None,
                      subset: Optional[List[int]] = None) -> np.ndarray:
    from tensorflow.python.keras.preprocessing.sequence import pad_sequences
    if max_len is None:
        max_len = self.sequence_length
    if subset is not None:
        target = get_list_subset(data, subset)
    else:
        target = data
    numerized_samples = self.numerize_token_sequences(target)
    return pad_sequences(numerized_samples, max_len, padding='post', truncating='post')
Example #3
Source File: util.py From DiPS with Apache License 2.0 | 5 votes

def split_and_zero_padding(df, max_seq_length):
    # Split to dicts
    X = {'left': df['question1_n'], 'right': df['question2_n']}

    # Zero padding
    for dataset, side in itertools.product([X], ['left', 'right']):
        dataset[side] = pad_sequences(dataset[side], padding='pre', truncating='post',
                                      maxlen=max_seq_length)

    return dataset

# --
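A hypothetical call, assuming a pandas DataFrame whose question1_n and question2_n columns already contain lists of token ids (the toy ids below are made up):

import pandas as pd

df = pd.DataFrame({
    'question1_n': [[3, 14, 15], [9, 26]],
    'question2_n': [[5, 35], [8, 97, 93, 2]],
})

X = split_and_zero_padding(df, max_seq_length=5)
# X['left'] and X['right'] are now (2, 5) integer arrays, zero-padded at the
# front (padding='pre') and truncated at the end (truncating='post').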
Example #4
Source File: test_explanation_model.py From cxplain with MIT License | 5 votes

def test_nlp_padded_valid(self):
    num_words = 1024
    (x_train, y_train), (x_test, y_test) = TestUtil.get_random_variable_length_dataset(max_value=num_words)

    explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)
    counter = CountVectoriser(num_words)
    tfidf_transformer = TfidfTransformer()
    explained_model = Pipeline([('counts', counter),
                                ('tfidf', tfidf_transformer),
                                ('model', explained_model)])
    explained_model.fit(x_train, y_train)

    model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                    num_layers=2, num_units=32, activation="relu",
                                    p_dropout=0.2, verbose=0, batch_size=32,
                                    learning_rate=0.001, num_epochs=2,
                                    early_stopping_patience=128)
    masking_operation = WordDropMasking()
    loss = binary_crossentropy
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

    x_train = pad_sequences(x_train, padding="post", truncating="post", dtype=int)
    x_test = pad_sequences(x_test, padding="post", truncating="post", dtype=int,
                           maxlen=x_train.shape[1])

    explainer.fit(x_train, y_train)
    eval_score = explainer.score(x_test, y_test)
    train_score = explainer.get_last_fit_score()
    median = explainer.predict(x_test)
    self.assertTrue(median.shape == x_test.shape)
Example #5
Source File: test_explanation_model.py From cxplain with MIT License | 5 votes

def test_imdb_padded_valid(self):
    num_samples = 32
    num_words = 1024
    (x_train, y_train), (x_test, y_test) = TestUtil.get_imdb(word_dictionary_size=num_words,
                                                             num_subsamples=num_samples)

    explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)
    counter = CountVectoriser(num_words)
    tfidf_transformer = TfidfTransformer()
    explained_model = Pipeline([('counts', counter),
                                ('tfidf', tfidf_transformer),
                                ('model', explained_model)])
    explained_model.fit(x_train, y_train)

    model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                    num_layers=2, num_units=32, activation="relu",
                                    p_dropout=0.2, verbose=0, batch_size=32,
                                    learning_rate=0.001, num_epochs=2,
                                    early_stopping_patience=128)
    masking_operation = WordDropMasking()
    loss = binary_crossentropy
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

    x_train = pad_sequences(x_train, padding="post", truncating="post", dtype=int)
    x_test = pad_sequences(x_test, padding="post", truncating="post", dtype=int,
                           maxlen=x_train.shape[1])

    explainer.fit(x_train, y_train)
    eval_score = explainer.score(x_test, y_test)
    train_score = explainer.get_last_fit_score()
    median = explainer.predict(x_test)
    self.assertTrue(median.shape == x_test.shape)
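Both cxplain tests above pass maxlen=x_train.shape[1] when padding x_test so that the test tensor matches the training width; without it, each split would be padded to the length of its own longest sequence. A standalone sketch of that pattern with toy data:

from tensorflow.python.keras.preprocessing.sequence import pad_sequences

x_train = pad_sequences([[1, 2, 3, 4], [5, 6]], padding="post", dtype=int)
# Reuse the training width so train and test agree on shape[1].
x_test = pad_sequences([[7], [8, 9]], padding="post", dtype=int, maxlen=x_train.shape[1])
print(x_train.shape, x_test.shape)  # (2, 4) (2, 4)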
Example #6
Source File: test_validation.py From cxplain with MIT License | 5 votes

def test_is_variable_length_padded_false(self):
    (x, _), _ = TestUtil.get_random_variable_length_dataset(max_value=1024)
    x = pad_sequences(x, padding="post", truncating="post", dtype=int)
    return_value = Validation.is_variable_length(x)
    self.assertEqual(return_value, False)
Example #7
Source File: data_helper.py From attention_keras with MIT License | 5 votes

def sents2sequences(tokenizer, sentences, reverse=False, pad_length=None, padding_type='post'):
    encoded_text = tokenizer.texts_to_sequences(sentences)
    preproc_text = pad_sequences(encoded_text, padding=padding_type, maxlen=pad_length)
    if reverse:
        preproc_text = np.flip(preproc_text, axis=1)

    return preproc_text
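A hypothetical usage of sents2sequences() with a Tokenizer fitted on two toy sentences; the sentences and pad_length are illustrative:

from tensorflow.python.keras.preprocessing.text import Tokenizer

sentences = ['the cat sat on the mat', 'the dog barked']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Encode, pad/truncate to length 6, and reverse the token order.
seqs = sents2sequences(tokenizer, sentences, reverse=True, pad_length=6)
print(seqs.shape)  # (2, 6)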
Example #8
Source File: test_causal_loss.py From cxplain with MIT License | 4 votes

def test_causal_loss_padded_input(self):
    models = TestUtil.get_classification_models()

    batch_size = 32
    num_samples = 1024
    num_words = 1024
    (x_train, y_train), (x_test, y_test) = \
        TestUtil.get_random_variable_length_dataset(num_samples=num_samples, max_value=num_words)

    x, y = np.concatenate([x_train, x_test], axis=0), np.concatenate([y_train, y_test], axis=0)
    self.assertEqual(x.shape[0], num_samples)

    for explained_model in models:
        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()
        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        TestUtil.fit_proxy(explained_model, x, y)

        masking = WordDropMasking()
        x = pad_sequences(x, padding="post", truncating="post", dtype=int)
        _, y_pred, all_y_pred_imputed = masking.get_predictions_after_masking(explained_model, x, y,
                                                                              batch_size=batch_size,
                                                                              downsample_factors=(1,),
                                                                              flatten=False)
        auxiliary_outputs = y_pred
        all_but_one_auxiliary_outputs = all_y_pred_imputed
        all_but_one_auxiliary_outputs = TestUtil.split_auxiliary_outputs_on_feature_dim(
            all_but_one_auxiliary_outputs
        )

        delta_errors = calculate_delta_errors(y,
                                              auxiliary_outputs,
                                              all_but_one_auxiliary_outputs,
                                              NumpyInterface.binary_crossentropy,
                                              math_ops=NumpyInterface)

        # Ensure correct delta error dimensionality.
        self.assertEqual(delta_errors.shape, (num_samples, x.shape[1]))
Example #9
Source File: optimize_example.py From nlp-architect with Apache License 2.0 | 4 votes

def run_loss(args):
    data = args["data"]

    # For each run we want to get a new random balance
    data.process()  # split, train, test
    dense_out = len(data.labels[0])

    # split for all models
    X_train_, X_test_, Y_train, Y_test = train_test_split(
        data.text, data.labels, test_size=0.20, random_state=42
    )
    print(args)

    # Prep data for the LSTM model
    # This currently will train the tokenizer on all text (unbalanced and train/test)
    # It would be nice to replace this with a pretrained embedding on larger text
    tokenizer = Tokenizer(num_words=int(args["max_features"]), split=" ")
    tokenizer.fit_on_texts(data.all_text)
    X_train = tokenizer.texts_to_sequences(X_train_)
    X_train = pad_sequences(X_train, maxlen=max_len)
    X_test = tokenizer.texts_to_sequences(X_test_)
    X_test = pad_sequences(X_test, maxlen=max_len)

    # Train the LSTM model
    lstm_model = simple_lstm(
        int(args["max_features"]),
        dense_out,
        X_train.shape[1],
        int(args["embed_dim"]),
        int(args["lstm_out"]),
        args["dropout"],
    )

    if args["epochs"] == 0:
        args["epochs"] = 1

    es = EarlyStopping(monitor="val_acc", min_delta=0, patience=6, verbose=0, mode="max")
    model_hist = lstm_model.fit(
        X_train,
        Y_train,
        epochs=args["epochs"],
        batch_size=batch_size,
        verbose=1,
        validation_data=(X_test, Y_test),
        callbacks=[es],
    )
    lstm_acc = model_hist.history["val_acc"][-1]
    print("LSTM model accuracy ", lstm_acc)

    # This minimizes, so to maximize we have to take the inverse :)
    return 1 - lstm_acc
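simple_lstm, Tokenizer, EarlyStopping, batch_size, and max_len all come from the surrounding nlp-architect example code. Purely to illustrate the simple_lstm interface used above (max_features, dense_out, input_length, embed_dim, lstm_out, dropout), a hypothetical builder could look like the sketch below; it is an assumption, not the nlp-architect implementation:

from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential

def simple_lstm_sketch(max_features, dense_out, input_length, embed_dim, lstm_out, dropout):
    # Embedding over the padded sequences, a single LSTM, and a softmax over the label set.
    model = Sequential([
        Embedding(max_features, embed_dim, input_length=input_length),
        LSTM(lstm_out, dropout=dropout, recurrent_dropout=dropout),
        Dense(dense_out, activation="softmax"),
    ])
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model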