Python tensorflow.keras.preprocessing.sequence.pad_sequences() Examples
The following are 17 code examples of tensorflow.keras.preprocessing.sequence.pad_sequences(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tensorflow.keras.preprocessing.sequence, or try the search function.
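Before the project examples, here is a minimal sketch (not taken from any of the projects below; the input values are purely illustrative) of what pad_sequences does with its most common arguments:

from tensorflow.keras.preprocessing.sequence import pad_sequences

sequences = [[1, 2, 3], [4, 5], [6]]

# By default, sequences are padded with 0 at the front ('pre') up to the
# length of the longest sequence.
print(pad_sequences(sequences))
# [[1 2 3]
#  [0 4 5]
#  [0 0 6]]

# maxlen, padding, truncating and value control the output length and
# where the padding value goes.
print(pad_sequences(sequences, maxlen=2, padding='post', truncating='post'))
# [[1 2]
#  [4 5]
#  [6 0]]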
Example #1
Source File: bilstm_crf.py From nlp-journey with Apache License 2.0 | 6 votes |
def get_acc_one_step(model, logits, text_lens, labels_batch):
    paths = []
    accuracy = 0
    for logit, text_len, labels in zip(logits, text_lens, labels_batch):
        viterbi_path, _ = ta.text.viterbi_decode(logit[:text_len], model.transition_params)
        paths.append(viterbi_path)
        correct_prediction = tf.equal(
            tf.convert_to_tensor(tf.keras.preprocessing.sequence.pad_sequences([viterbi_path], padding='post'),
                                 dtype=tf.int32),
            tf.convert_to_tensor(tf.keras.preprocessing.sequence.pad_sequences([labels[:text_len]], padding='post'),
                                 dtype=tf.int32)
        )
        accuracy = accuracy + tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    accuracy = accuracy / len(paths)
    return accuracy

# Recognize the entities in a sentence
Example #2
Source File: predict.py From urduhack with MIT License | 6 votes |
def predict_tags(text: str) -> list:
    """
    Predicts POS Tags

    Args:
        text (str): Input text string
    Returns:
        list: Containing (word, tag) pairs
    """
    global _POS_TAGGER_MODEL, _WORD2IDX, _IDX2TAG
    if _POS_TAGGER_MODEL is None:
        _POS_TAGGER_MODEL, _WORD2IDX, _IDX2TAG = _load_metadata(POS_TAGGER_WEIGHTS_PATH, POS_WORD2IDX_PATH, POS_TAG2IDX_PATH)

    tokens = text.split()
    encoded = [[_WORD2IDX[word] if word in _WORD2IDX else _WORD2IDX["UNK"] for word in tokens]]
    padded = pad_sequences(sequences=encoded, maxlen=50, value=_WORD2IDX['PAD'], padding='post')
    predictions = _POS_TAGGER_MODEL.predict(padded)
    pred_tags = np.argmax(predictions, axis=2).reshape(predictions.shape[1])
    word_tags = [(word, _IDX2TAG[idx]) for word, idx in zip(tokens, pred_tags)]
    return word_tags
Example #3
Source File: predict.py From urduhack with MIT License | 6 votes |
def predict_ner(text: str) -> list:
    """
    Predicts NER Tags

    Args:
        text (str): Input text string
    Returns:
        list: Containing (word, tag) pairs
    """
    global _NER_MODEL, _WORD2IDX, _IDX2TAG
    if _NER_MODEL is None:
        _NER_MODEL, _WORD2IDX, _IDX2TAG = _load_metadata(NER_WEIGHTS_PATH, NER_WORD2IDX_PATH, NER_TAG2IDX_PATH)

    tokens = text.split()
    encoded = [[_WORD2IDX[word] if word in _WORD2IDX else _WORD2IDX["UNK"] for word in tokens]]
    padded = pad_sequences(sequences=encoded, maxlen=55, value=_WORD2IDX['PAD'], padding='post')
    predictions = _NER_MODEL.predict(padded)
    pred_tags = np.argmax(predictions, axis=2).reshape(predictions.shape[1])
    word_tags = [(word, _IDX2TAG[idx]) for word, idx in zip(tokens, pred_tags)]
    return word_tags
Example #4
Source File: execute.py From tensorflow2.0-coding with MIT License | 5 votes |
def predict(sentences):
    state = ['pos', 'neg']
    model = create_model()
    indexes = text_to_vector(sentences)
    print(indexes)
    inp = pad_sequences([indexes])
    inp = tf.reshape(inp[0], (1, len(inp[0])))
    predictions = model.step(inp, inp, False)
    pred = tf.math.argmax(predictions[0])
    p = np.int32(pred.numpy())
    return state[p]
Example #5
Source File: bilstm_crf.py From nlp-journey with Apache License 2.0 | 5 votes |
def _preprocess_data(self, data, max_len=100):
    x = [self.word2idx.get(w[0].lower(), 1) for w in data]
    length = len(x)
    x = pad_sequences([x], max_len)
    return x, length

# Build the model
Example #6
Source File: bilstm_crf.py From nlp-journey with Apache License 2.0 | 5 votes |
def _process_data(data, word2idx, chunk_tags, max_len=None):
    if max_len is None:
        max_len = max(len(s) for s in data)
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]
    x = pad_sequences(x, max_len, padding='post')
    y_chunk = pad_sequences(y_chunk, max_len, padding='post')
    return x, y_chunk
Example #7
Source File: pbt_memnn_example.py From ray with Apache License 2.0 | 5 votes |
def vectorize_stories(word_idx, story_maxlen, query_maxlen, data):
    inputs, queries, answers = [], [], []
    for story, query, answer in data:
        inputs.append([word_idx[w] for w in story])
        queries.append([word_idx[w] for w in query])
        answers.append(word_idx[answer])
    return (pad_sequences(inputs, maxlen=story_maxlen),
            pad_sequences(queries, maxlen=query_maxlen),
            np.array(answers))
Example #8
Source File: siamese_similarity.py From nlp-journey with Apache License 2.0 | 5 votes |
def _process_data(self, text):
    t = [[self.word_index.get(word, 0) for word in clean_to_list(tex)] for tex in text]
    t = pad_sequences(t, maxlen=self.max_length)
    return t

# The save path is the same as the load path
Example #9
Source File: machine_translation.py From attention-mechanisms with MIT License | 5 votes |
def tokenize(language):
    """Function to tokenize language by mapping words to integer indices"""
    # Perform tokenization
    language_tokenizer = Tokenizer(filters='')
    language_tokenizer.fit_on_texts(language)
    tensor = language_tokenizer.texts_to_sequences(language)
    # Pad sequences to maximum found sequence length by appending 0s to end
    tensor = pad_sequences(sequences=tensor, padding='post')
    return tensor, language_tokenizer
Example #10
Source File: nn.py From bugbug with Mozilla Public License 2.0 | 5 votes |
def transform(self, data):
    sequences = self.tokenizer.texts_to_sequences(data)
    return pad_sequences(sequences, maxlen=self.maxlen)
Example #11
Source File: prepare_data.py From Text-Classification with Apache License 2.0 | 5 votes |
def data_preprocessing_v2(train, test, max_len, max_words=50000):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train)
    train_idx = tokenizer.texts_to_sequences(train)
    test_idx = tokenizer.texts_to_sequences(test)
    train_padded = pad_sequences(train_idx, maxlen=max_len, padding='post', truncating='post')
    test_padded = pad_sequences(test_idx, maxlen=max_len, padding='post', truncating='post')
    # vocab size = len(word_docs) + 2 (<UNK>, <PAD>)
    return train_padded, test_padded, max_words + 2
Example #12
Source File: prepare_data.py From Text-Classification with Apache License 2.0 | 5 votes |
def data_preprocessing_with_dict(train, test, max_len):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<UNK>')
    tokenizer.fit_on_texts(train)
    train_idx = tokenizer.texts_to_sequences(train)
    test_idx = tokenizer.texts_to_sequences(test)
    train_padded = pad_sequences(train_idx, maxlen=max_len, padding='post', truncating='post')
    test_padded = pad_sequences(test_idx, maxlen=max_len, padding='post', truncating='post')
    # vocab size = len(word_docs) + 2 (<UNK>, <PAD>)
    return train_padded, test_padded, tokenizer.word_docs, tokenizer.word_index, len(tokenizer.word_docs) + 2
Example #13
Source File: execute.py From tensorflow2.0-coding with MIT License | 5 votes |
def pad_sequences(inp):
    out_sequences = sequence.pad_sequences(inp, maxlen=gConfig['sentence_size'], padding='post', value=0)
    return out_sequences
Example #14
Source File: utils.py From deep-code-search with MIT License | 5 votes |
def pad(data, len=None):
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    return pad_sequences(data, maxlen=len, padding='post', truncating='post', value=0)
Example #15
Source File: imdb.py From keras-attention-mechanism with Apache License 2.0 | 4 votes |
def train_and_evaluate_model_on_imdb(add_attention=True):
    numpy.random.seed(7)
    # load the dataset but only keep the top n words, zero the rest
    top_words = 5000
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
    # truncate and pad input sequences
    max_review_length = 500
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
    # create the model
    embedding_vector_length = 32
    i = Input(shape=(max_review_length,))
    x = Embedding(top_words, embedding_vector_length, input_length=max_review_length)(i)
    x = Dropout(0.5)(x)
    if add_attention:
        x = LSTM(100, return_sequences=True)(x)
        x = attention_3d_block(x)
    else:
        x = LSTM(100, return_sequences=False)(x)
        x = Dense(350, activation='relu')(x)  # same number of parameters so fair comparison.
    x = Dropout(0.5)(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=[i], outputs=[x])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())

    class RecordBestTestAccuracy(Callback):

        def __init__(self):
            super().__init__()
            self.val_accuracies = []
            self.val_losses = []

        def on_epoch_end(self, epoch, logs=None):
            self.val_accuracies.append(logs['val_accuracy'])
            self.val_losses.append(logs['val_loss'])

    rbta = RecordBestTestAccuracy()

    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, callbacks=[rbta])

    print(f"Max Test Accuracy: {100 * np.max(rbta.val_accuracies):.2f} %")
    print(f"Mean Test Accuracy: {100 * np.mean(rbta.val_accuracies):.2f} %")
Example #16
Source File: embedding_lstm.py From asreview with Apache License 2.0 | 4 votes |
def text_to_features(sequences, loop_sequence=1, num_words=20000,
                     max_sequence_length=1000, padding='post', truncating='post'):
    """Convert text data into features.

    Arguments
    ---------
    sequences: list, numpy.ndarray, pandas.Series
        The sequences to convert into features.
    num_words: int
        See keras Tokenizer

    Returns
    -------
    np.ndarray, dict
        The array with features and the dictionary that maps words to values.
    """
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    # fit on texts
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(sequences)

    # tokenize sequences
    tokens = tokenizer.texts_to_sequences(sequences)

    # Pad sequences with zeros.
    x = pad_sequences(
        tokens,
        maxlen=max_sequence_length,
        padding=padding,
        truncating=truncating
    )

    if loop_sequence == 1:
        x = loop_sequences(x, max_sequence_length)

    # word index hack. see issue
    # https://github.com/keras-team/keras/issues/8092
    word_index = {e: i for e, i in tokenizer.word_index.items() if i <= num_words}

    return x, word_index
Example #17
Source File: siamese_similarity.py From nlp-journey with Apache License 2.0 | 4 votes |
def _load_data(self, test_size=0.2):
    log.info('Preprocessing data...')
    # word:index and index:word mappings
    word_index = dict()
    index_word = ['<unk>']
    questions_cols = ['question1', 'question2']

    log.info('Loading datasets...')
    train_data = os.path.join(self.data_path, 'train.csv')
    test_data = os.path.join(self.data_path, 'test.csv')
    train_df = pd.read_csv(train_data)
    test_df = pd.read_csv(test_data)

    # Find the maximum sentence length
    sentences = [df[col].str.split(' ') for df in [train_df, test_df] for col in questions_cols]
    max_length = max([len(s) for ss in sentences for s in ss if isinstance(s, list)])

    # Preprocess (build the vocabulary and convert strings to indices)
    for dataset in [train_df, test_df]:
        for index, row in dataset.iterrows():
            for question_col in questions_cols:
                question_indexes = []
                for word in clean_to_list(row[question_col]):
                    if word in self.stops:
                        continue
                    if word not in word_index:
                        word_index[word] = len(index_word)
                        question_indexes.append(len(index_word))
                        index_word.append(word)
                    else:
                        question_indexes.append(word_index[word])
                dataset._set_value(index, question_col, question_indexes)

    x = train_df[questions_cols]
    y = train_df['is_duplicate']
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=test_size)

    x_train = {'left': x_train.question1, 'right': x_train.question2}
    x_val = {'left': x_val.question1, 'right': x_val.question2}

    y_train = y_train.values
    y_val = y_val.values

    for dataset, side in itertools.product([x_train, x_val], ['left', 'right']):
        dataset[side] = pad_sequences(dataset[side], maxlen=max_length)

    # Verify that the left and right question sets have matching counts
    assert x_train['left'].shape == x_train['right'].shape
    assert len(x_train['left']) == len(y_train)

    return x_train, y_train, x_val, y_val, word_index, max_length