Python tensorflow.python.keras.preprocessing.sequence.pad_sequences() Examples
The following are 9 code examples of tensorflow.python.keras.preprocessing.sequence.pad_sequences(), drawn from open-source projects. The original project, source file, and license are listed above each example. You may also want to check out the other functions and classes available in the tensorflow.python.keras.preprocessing.sequence module.
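Before the project examples, here is a minimal standalone sketch of what pad_sequences() does; the input lists and the maxlen value are illustrative:

from tensorflow.python.keras.preprocessing.sequence import pad_sequences

sequences = [[1, 2, 3], [4, 5], [6, 7, 8, 9, 10]]

# Pad or truncate every sequence to length 4; with padding='post' the zeros go
# at the end, and with truncating='post' the extra values are cut from the end.
padded = pad_sequences(sequences, maxlen=4, padding='post', truncating='post')
print(padded)
# [[1 2 3 0]
#  [4 5 0 0]
#  [6 7 8 9]]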
Example #1
Source File: utils.py From cloudml-samples with Apache License 2.0 | 6 votes

def preprocess(train_data_file, word_index_file, num_words):
    """Loads the NumPy .npz file and processes its data.

    Pad the arrays so they all have the same length, then create an integer
    tensor of shape max_length * num_reviews. Then we use an embedding layer
    capable of handling this shape as the first layer in our network.

    Args:
      train_data_file: (str) Location of file.
      word_index_file: (str) Location of JSON file with index information.
      num_words: (int) Number of words to get from IMDB dataset.

    Returns:
      A tuple of training and test data.
    """
    (train_data, train_labels), (test_data, test_labels) = _load_data(
        path=train_data_file, num_words=num_words)
    word_index = _get_word_index(word_index_file)

    # Standardize the lengths for training.
    train_data = pad_sequences(train_data,
                               value=word_index['<PAD>'],
                               padding='post',
                               maxlen=SENTENCE_SIZE)

    # Standardize the lengths for test.
    test_data = pad_sequences(test_data,
                              value=word_index['<PAD>'],
                              padding='post',
                              maxlen=SENTENCE_SIZE)

    return (train_data, train_labels), (test_data, test_labels)
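The docstring above refers to an embedding layer that consumes the padded integer tensor as the first layer of the network. A minimal sketch of such a model follows; the layer sizes and optimizer are illustrative assumptions, not taken from the cloudml-samples code:

import tensorflow as tf

def build_model(num_words, sentence_size):
    # The Embedding layer consumes the padded integer tensor produced by preprocess().
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(num_words, 16, input_length=sentence_size),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),  # binary sentiment output
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model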
Example #2
Source File: base_processor.py From text2vec with Apache License 2.0 | 5 votes

def process_x_dataset(self,
                      data: List[List[str]],
                      max_len: Optional[int] = None,
                      subset: Optional[List[int]] = None) -> np.ndarray:
    from tensorflow.python.keras.preprocessing.sequence import pad_sequences
    if max_len is None:
        max_len = self.sequence_length
    if subset is not None:
        target = get_list_subset(data, subset)
    else:
        target = data
    numerized_samples = self.numerize_token_sequences(target)
    return pad_sequences(numerized_samples, max_len, padding='post', truncating='post')
Example #3
Source File: util.py From DiPS with Apache License 2.0 | 5 votes

def split_and_zero_padding(df, max_seq_length):
    # Split to dicts
    X = {'left': df['question1_n'], 'right': df['question2_n']}

    # Zero padding
    for dataset, side in itertools.product([X], ['left', 'right']):
        dataset[side] = pad_sequences(dataset[side], padding='pre', truncating='post',
                                      maxlen=max_seq_length)

    return dataset

# --
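A hypothetical call, assuming a pandas DataFrame whose question1_n and question2_n columns already contain lists of token ids (the toy ids below are made up):

import pandas as pd

df = pd.DataFrame({
    'question1_n': [[3, 14, 15], [9, 26]],
    'question2_n': [[5, 35], [8, 97, 93, 2]],
})

X = split_and_zero_padding(df, max_seq_length=5)
# X['left'] and X['right'] are now (2, 5) integer arrays, zero-padded at the
# front (padding='pre') and truncated at the end (truncating='post').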
Example #4
Source File: test_explanation_model.py From cxplain with MIT License | 5 votes

def test_nlp_padded_valid(self):
    num_words = 1024
    (x_train, y_train), (x_test, y_test) = TestUtil.get_random_variable_length_dataset(max_value=num_words)

    explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)
    counter = CountVectoriser(num_words)
    tfidf_transformer = TfidfTransformer()
    explained_model = Pipeline([('counts', counter),
                                ('tfidf', tfidf_transformer),
                                ('model', explained_model)])
    explained_model.fit(x_train, y_train)

    model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                    num_layers=2, num_units=32, activation="relu",
                                    p_dropout=0.2, verbose=0, batch_size=32,
                                    learning_rate=0.001, num_epochs=2,
                                    early_stopping_patience=128)
    masking_operation = WordDropMasking()
    loss = binary_crossentropy
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

    x_train = pad_sequences(x_train, padding="post", truncating="post", dtype=int)
    x_test = pad_sequences(x_test, padding="post", truncating="post", dtype=int,
                           maxlen=x_train.shape[1])

    explainer.fit(x_train, y_train)
    eval_score = explainer.score(x_test, y_test)
    train_score = explainer.get_last_fit_score()
    median = explainer.predict(x_test)
    self.assertTrue(median.shape == x_test.shape)
Example #5
Source File: test_explanation_model.py From cxplain with MIT License | 5 votes

def test_imdb_padded_valid(self):
    num_samples = 32
    num_words = 1024
    (x_train, y_train), (x_test, y_test) = TestUtil.get_imdb(word_dictionary_size=num_words,
                                                             num_subsamples=num_samples)

    explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)
    counter = CountVectoriser(num_words)
    tfidf_transformer = TfidfTransformer()
    explained_model = Pipeline([('counts', counter),
                                ('tfidf', tfidf_transformer),
                                ('model', explained_model)])
    explained_model.fit(x_train, y_train)

    model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                    num_layers=2, num_units=32, activation="relu",
                                    p_dropout=0.2, verbose=0, batch_size=32,
                                    learning_rate=0.001, num_epochs=2,
                                    early_stopping_patience=128)
    masking_operation = WordDropMasking()
    loss = binary_crossentropy
    explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

    x_train = pad_sequences(x_train, padding="post", truncating="post", dtype=int)
    x_test = pad_sequences(x_test, padding="post", truncating="post", dtype=int,
                           maxlen=x_train.shape[1])

    explainer.fit(x_train, y_train)
    eval_score = explainer.score(x_test, y_test)
    train_score = explainer.get_last_fit_score()
    median = explainer.predict(x_test)
    self.assertTrue(median.shape == x_test.shape)
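Both cxplain tests above pass maxlen=x_train.shape[1] when padding x_test so that the test tensor matches the training width; without it, each split would be padded to the length of its own longest sequence. A standalone sketch of that pattern with toy data:

from tensorflow.python.keras.preprocessing.sequence import pad_sequences

x_train = pad_sequences([[1, 2, 3, 4], [5, 6]], padding="post", dtype=int)
# Reuse the training width so train and test agree on shape[1].
x_test = pad_sequences([[7], [8, 9]], padding="post", dtype=int, maxlen=x_train.shape[1])
print(x_train.shape, x_test.shape)  # (2, 4) (2, 4)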
Example #6
Source File: test_validation.py From cxplain with MIT License | 5 votes

def test_is_variable_length_padded_false(self):
    (x, _), _ = TestUtil.get_random_variable_length_dataset(max_value=1024)
    x = pad_sequences(x, padding="post", truncating="post", dtype=int)
    return_value = Validation.is_variable_length(x)
    self.assertEqual(return_value, False)
Example #7
Source File: data_helper.py From attention_keras with MIT License | 5 votes

def sents2sequences(tokenizer, sentences, reverse=False, pad_length=None, padding_type='post'):
    encoded_text = tokenizer.texts_to_sequences(sentences)
    preproc_text = pad_sequences(encoded_text, padding=padding_type, maxlen=pad_length)
    if reverse:
        preproc_text = np.flip(preproc_text, axis=1)

    return preproc_text
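A hypothetical usage of sents2sequences() with a Tokenizer fitted on two toy sentences; the sentences and pad_length are illustrative:

from tensorflow.python.keras.preprocessing.text import Tokenizer

sentences = ['the cat sat on the mat', 'the dog barked']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# Encode, pad/truncate to length 6, and reverse the token order.
seqs = sents2sequences(tokenizer, sentences, reverse=True, pad_length=6)
print(seqs.shape)  # (2, 6)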
Example #8
Source File: test_causal_loss.py From cxplain with MIT License | 4 votes

def test_causal_loss_padded_input(self):
    models = TestUtil.get_classification_models()

    batch_size = 32
    num_samples = 1024
    num_words = 1024
    (x_train, y_train), (x_test, y_test) = \
        TestUtil.get_random_variable_length_dataset(num_samples=num_samples, max_value=num_words)

    x, y = np.concatenate([x_train, x_test], axis=0), np.concatenate([y_train, y_test], axis=0)
    self.assertEqual(x.shape[0], num_samples)

    for explained_model in models:
        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()
        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        TestUtil.fit_proxy(explained_model, x, y)

        masking = WordDropMasking()
        x = pad_sequences(x, padding="post", truncating="post", dtype=int)
        _, y_pred, all_y_pred_imputed = masking.get_predictions_after_masking(explained_model, x, y,
                                                                              batch_size=batch_size,
                                                                              downsample_factors=(1,),
                                                                              flatten=False)
        auxiliary_outputs = y_pred
        all_but_one_auxiliary_outputs = all_y_pred_imputed
        all_but_one_auxiliary_outputs = TestUtil.split_auxiliary_outputs_on_feature_dim(
            all_but_one_auxiliary_outputs
        )

        delta_errors = calculate_delta_errors(y,
                                              auxiliary_outputs,
                                              all_but_one_auxiliary_outputs,
                                              NumpyInterface.binary_crossentropy,
                                              math_ops=NumpyInterface)

        # Ensure correct delta error dimensionality.
        self.assertEqual(delta_errors.shape, (num_samples, x.shape[1]))
Example #9
Source File: optimize_example.py From nlp-architect with Apache License 2.0 | 4 votes

def run_loss(args):
    data = args["data"]

    # For each run we want to get a new random balance
    data.process()  # split, train, test
    dense_out = len(data.labels[0])

    # split for all models
    X_train_, X_test_, Y_train, Y_test = train_test_split(
        data.text, data.labels, test_size=0.20, random_state=42
    )
    print(args)

    # Prep data for the LSTM model
    # This currently will train the tokenizer on all text (unbalanced and train/test)
    # It would be nice to replace this with a pretrained embedding on larger text
    tokenizer = Tokenizer(num_words=int(args["max_features"]), split=" ")
    tokenizer.fit_on_texts(data.all_text)
    X_train = tokenizer.texts_to_sequences(X_train_)
    X_train = pad_sequences(X_train, maxlen=max_len)
    X_test = tokenizer.texts_to_sequences(X_test_)
    X_test = pad_sequences(X_test, maxlen=max_len)

    # Train the LSTM model
    lstm_model = simple_lstm(
        int(args["max_features"]),
        dense_out,
        X_train.shape[1],
        int(args["embed_dim"]),
        int(args["lstm_out"]),
        args["dropout"],
    )

    if args["epochs"] == 0:
        args["epochs"] = 1

    es = EarlyStopping(monitor="val_acc", min_delta=0, patience=6, verbose=0, mode="max")
    model_hist = lstm_model.fit(
        X_train,
        Y_train,
        epochs=args["epochs"],
        batch_size=batch_size,
        verbose=1,
        validation_data=(X_test, Y_test),
        callbacks=[es],
    )
    lstm_acc = model_hist.history["val_acc"][-1]
    print("LSTM model accuracy ", lstm_acc)

    # This minimizes, so to maximize we have to take the inverse :)
    return 1 - lstm_acc
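simple_lstm, Tokenizer, EarlyStopping, batch_size, and max_len all come from the surrounding nlp-architect example code. Purely to illustrate the simple_lstm interface used above (max_features, dense_out, input_length, embed_dim, lstm_out, dropout), a hypothetical builder could look like the sketch below; it is an assumption, not the nlp-architect implementation:

from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential

def simple_lstm_sketch(max_features, dense_out, input_length, embed_dim, lstm_out, dropout):
    # Embedding over the padded sequences, a single LSTM, and a softmax over the label set.
    model = Sequential([
        Embedding(max_features, embed_dim, input_length=input_length),
        LSTM(lstm_out, dropout=dropout, recurrent_dropout=dropout),
        Dense(dense_out, activation="softmax"),
    ])
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model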