Python tensorflow.keras.preprocessing.sequence.pad_sequences() Examples
The following are 17 code examples of tensorflow.keras.preprocessing.sequence.pad_sequences(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tensorflow.keras.preprocessing.sequence, or try the search function.
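Before the project examples, here is a minimal sketch (not taken from any of the projects below; the input values are purely illustrative) of what pad_sequences does with its most common arguments:

from tensorflow.keras.preprocessing.sequence import pad_sequences

sequences = [[1, 2, 3], [4, 5], [6]]

# By default, sequences are padded with 0 at the front ('pre') up to the
# length of the longest sequence.
print(pad_sequences(sequences))
# [[1 2 3]
#  [0 4 5]
#  [0 0 6]]

# maxlen, padding, truncating and value control the output length and
# where the padding value goes.
print(pad_sequences(sequences, maxlen=2, padding='post', truncating='post'))
# [[1 2]
#  [4 5]
#  [6 0]]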
Example #1
Source File: bilstm_crf.py From nlp-journey with Apache License 2.0 | 6 votes |
def get_acc_one_step(model, logits, text_lens, labels_batch):
    paths = []
    accuracy = 0
    for logit, text_len, labels in zip(logits, text_lens, labels_batch):
        viterbi_path, _ = ta.text.viterbi_decode(logit[:text_len], model.transition_params)
        paths.append(viterbi_path)
        correct_prediction = tf.equal(
            tf.convert_to_tensor(tf.keras.preprocessing.sequence.pad_sequences([viterbi_path], padding='post'),
                                 dtype=tf.int32),
            tf.convert_to_tensor(tf.keras.preprocessing.sequence.pad_sequences([labels[:text_len]], padding='post'),
                                 dtype=tf.int32)
        )
        accuracy = accuracy + tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    accuracy = accuracy / len(paths)
    return accuracy

# Recognize the entities in a sentence
Example #2
Source File: predict.py From urduhack with MIT License | 6 votes |
def predict_tags(text: str) -> list:
    """
    Predicts POS Tags

    Args:
        text (str): Input text string
    Returns:
        list: Containing (word, tag) pairs
    """
    global _POS_TAGGER_MODEL, _WORD2IDX, _IDX2TAG
    if _POS_TAGGER_MODEL is None:
        _POS_TAGGER_MODEL, _WORD2IDX, _IDX2TAG = _load_metadata(POS_TAGGER_WEIGHTS_PATH, POS_WORD2IDX_PATH, POS_TAG2IDX_PATH)

    tokens = text.split()
    encoded = [[_WORD2IDX[word] if word in _WORD2IDX else _WORD2IDX["UNK"] for word in tokens]]
    padded = pad_sequences(sequences=encoded, maxlen=50, value=_WORD2IDX['PAD'], padding='post')
    predictions = _POS_TAGGER_MODEL.predict(padded)
    pred_tags = np.argmax(predictions, axis=2).reshape(predictions.shape[1])
    word_tags = [(word, _IDX2TAG[idx]) for word, idx in zip(tokens, pred_tags)]
    return word_tags
Example #3
Source File: predict.py From urduhack with MIT License | 6 votes |
def predict_ner(text: str) -> list:
    """
    Predicts NER Tags

    Args:
        text (str): Input text string
    Returns:
        list: Containing (word, tag) pairs
    """
    global _NER_MODEL, _WORD2IDX, _IDX2TAG
    if _NER_MODEL is None:
        _NER_MODEL, _WORD2IDX, _IDX2TAG = _load_metadata(NER_WEIGHTS_PATH, NER_WORD2IDX_PATH, NER_TAG2IDX_PATH)

    tokens = text.split()
    encoded = [[_WORD2IDX[word] if word in _WORD2IDX else _WORD2IDX["UNK"] for word in tokens]]
    padded = pad_sequences(sequences=encoded, maxlen=55, value=_WORD2IDX['PAD'], padding='post')
    predictions = _NER_MODEL.predict(padded)
    pred_tags = np.argmax(predictions, axis=2).reshape(predictions.shape[1])
    word_tags = [(word, _IDX2TAG[idx]) for word, idx in zip(tokens, pred_tags)]
    return word_tags
Example #4
Source File: execute.py From tensorflow2.0-coding with MIT License | 5 votes |
def predict(sentences):
    state = ['pos', 'neg']
    model = create_model()
    indexes = text_to_vector(sentences)
    print(indexes)
    inp = pad_sequences([indexes])
    inp = tf.reshape(inp[0], (1, len(inp[0])))
    predictions = model.step(inp, inp, False)
    pred = tf.math.argmax(predictions[0])
    p = np.int32(pred.numpy())
    return state[p]
Example #5
Source File: bilstm_crf.py From nlp-journey with Apache License 2.0 | 5 votes |
def _preprocess_data(self, data, max_len=100):
    x = [self.word2idx.get(w[0].lower(), 1) for w in data]
    length = len(x)
    x = pad_sequences([x], max_len)
    return x, length

# Build the model
Example #6
Source File: bilstm_crf.py From nlp-journey with Apache License 2.0 | 5 votes |
def _process_data(data, word2idx, chunk_tags, max_len=None):
    if max_len is None:
        max_len = max(len(s) for s in data)
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]
    x = pad_sequences(x, max_len, padding='post')
    y_chunk = pad_sequences(y_chunk, max_len, padding='post')
    return x, y_chunk
Example #7
Source File: pbt_memnn_example.py From ray with Apache License 2.0 | 5 votes |
def vectorize_stories(word_idx, story_maxlen, query_maxlen, data):
    inputs, queries, answers = [], [], []
    for story, query, answer in data:
        inputs.append([word_idx[w] for w in story])
        queries.append([word_idx[w] for w in query])
        answers.append(word_idx[answer])
    return (pad_sequences(inputs, maxlen=story_maxlen),
            pad_sequences(queries, maxlen=query_maxlen),
            np.array(answers))
Example #8
Source File: siamese_similarity.py From nlp-journey with Apache License 2.0 | 5 votes |
def _process_data(self, text):
    t = [[self.word_index.get(word, 0) for word in clean_to_list(tex)] for tex in text]
    t = pad_sequences(t, maxlen=self.max_length)
    return t

# The save path is the same as the load path
Example #9
Source File: machine_translation.py From attention-mechanisms with MIT License | 5 votes |
def tokenize(language):
    """Function to tokenize language by mapping words to integer indices"""
    # Perform tokenization
    language_tokenizer = Tokenizer(filters='')
    language_tokenizer.fit_on_texts(language)
    tensor = language_tokenizer.texts_to_sequences(language)
    # Pad sequences to maximum found sequence length by appending 0s to end
    tensor = pad_sequences(sequences=tensor, padding='post')
    return tensor, language_tokenizer
Example #10
Source File: nn.py From bugbug with Mozilla Public License 2.0 | 5 votes |
def transform(self, data):
    sequences = self.tokenizer.texts_to_sequences(data)
    return pad_sequences(sequences, maxlen=self.maxlen)
Example #11
Source File: prepare_data.py From Text-Classification with Apache License 2.0 | 5 votes |
def data_preprocessing_v2(train, test, max_len, max_words=50000):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(train)
    train_idx = tokenizer.texts_to_sequences(train)
    test_idx = tokenizer.texts_to_sequences(test)
    train_padded = pad_sequences(train_idx, maxlen=max_len, padding='post', truncating='post')
    test_padded = pad_sequences(test_idx, maxlen=max_len, padding='post', truncating='post')
    # vocab size = len(word_docs) + 2 (<UNK>, <PAD>)
    return train_padded, test_padded, max_words + 2
Example #12
Source File: prepare_data.py From Text-Classification with Apache License 2.0 | 5 votes |
def data_preprocessing_with_dict(train, test, max_len):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<UNK>')
    tokenizer.fit_on_texts(train)
    train_idx = tokenizer.texts_to_sequences(train)
    test_idx = tokenizer.texts_to_sequences(test)
    train_padded = pad_sequences(train_idx, maxlen=max_len, padding='post', truncating='post')
    test_padded = pad_sequences(test_idx, maxlen=max_len, padding='post', truncating='post')
    # vocab size = len(word_docs) + 2 (<UNK>, <PAD>)
    return train_padded, test_padded, tokenizer.word_docs, tokenizer.word_index, len(tokenizer.word_docs) + 2
Example #13
Source File: execute.py From tensorflow2.0-coding with MIT License | 5 votes |
def pad_sequences(inp):
    out_sequences = sequence.pad_sequences(inp, maxlen=gConfig['sentence_size'], padding='post', value=0)
    return out_sequences
Example #14
Source File: utils.py From deep-code-search with MIT License | 5 votes |
def pad(data, len=None):
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    return pad_sequences(data, maxlen=len, padding='post', truncating='post', value=0)
Example #15
Source File: imdb.py From keras-attention-mechanism with Apache License 2.0 | 4 votes |
def train_and_evaluate_model_on_imdb(add_attention=True):
    numpy.random.seed(7)
    # load the dataset but only keep the top n words, zero the rest
    top_words = 5000
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
    # truncate and pad input sequences
    max_review_length = 500
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
    # create the model
    embedding_vector_length = 32
    i = Input(shape=(max_review_length,))
    x = Embedding(top_words, embedding_vector_length, input_length=max_review_length)(i)
    x = Dropout(0.5)(x)
    if add_attention:
        x = LSTM(100, return_sequences=True)(x)
        x = attention_3d_block(x)
    else:
        x = LSTM(100, return_sequences=False)(x)
        x = Dense(350, activation='relu')(x)  # same number of parameters so fair comparison.
    x = Dropout(0.5)(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=[i], outputs=[x])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())

    class RecordBestTestAccuracy(Callback):

        def __init__(self):
            super().__init__()
            self.val_accuracies = []
            self.val_losses = []

        def on_epoch_end(self, epoch, logs=None):
            self.val_accuracies.append(logs['val_accuracy'])
            self.val_losses.append(logs['val_loss'])

    rbta = RecordBestTestAccuracy()

    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64, callbacks=[rbta])

    print(f"Max Test Accuracy: {100 * np.max(rbta.val_accuracies):.2f} %")
    print(f"Mean Test Accuracy: {100 * np.mean(rbta.val_accuracies):.2f} %")
Example #16
Source File: embedding_lstm.py From asreview with Apache License 2.0 | 4 votes |
def text_to_features(sequences, loop_sequence=1, num_words=20000,
                     max_sequence_length=1000, padding='post', truncating='post'):
    """Convert text data into features.

    Arguments
    ---------
    sequences: list, numpy.ndarray, pandas.Series
        The sequences to convert into features.
    num_words: int
        See keras Tokenizer

    Returns
    -------
    np.ndarray, dict
        The array with features and the dictionary that maps words to values.
    """
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    # fit on texts
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(sequences)

    # tokenize sequences
    tokens = tokenizer.texts_to_sequences(sequences)

    # Pad sequences with zeros.
    x = pad_sequences(
        tokens,
        maxlen=max_sequence_length,
        padding=padding,
        truncating=truncating
    )

    if loop_sequence == 1:
        x = loop_sequences(x, max_sequence_length)

    # word index hack. see issue
    # https://github.com/keras-team/keras/issues/8092
    word_index = {e: i for e, i in tokenizer.word_index.items() if i <= num_words}

    return x, word_index
Example #17
Source File: siamese_similarity.py From nlp-journey with Apache License 2.0 | 4 votes |
def _load_data(self, test_size=0.2):
    log.info('Preprocessing data...')
    # word:index and index:word mappings
    word_index = dict()
    index_word = ['<unk>']
    questions_cols = ['question1', 'question2']

    log.info('Loading datasets...')
    train_data = os.path.join(self.data_path, 'train.csv')
    test_data = os.path.join(self.data_path, 'test.csv')
    train_df = pd.read_csv(train_data)
    test_df = pd.read_csv(test_data)

    # Find the maximum sentence length
    sentences = [df[col].str.split(' ') for df in [train_df, test_df] for col in questions_cols]
    max_length = max([len(s) for ss in sentences for s in ss if isinstance(s, list)])

    # Preprocess (build the vocabulary and convert strings to indices)
    for dataset in [train_df, test_df]:
        for index, row in dataset.iterrows():
            for question_col in questions_cols:
                question_indexes = []
                for word in clean_to_list(row[question_col]):
                    if word in self.stops:
                        continue
                    if word not in word_index:
                        word_index[word] = len(index_word)
                        question_indexes.append(len(index_word))
                        index_word.append(word)
                    else:
                        question_indexes.append(word_index[word])
                dataset._set_value(index, question_col, question_indexes)

    x = train_df[questions_cols]
    y = train_df['is_duplicate']
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=test_size)

    x_train = {'left': x_train.question1, 'right': x_train.question2}
    x_val = {'left': x_val.question1, 'right': x_val.question2}

    y_train = y_train.values
    y_val = y_val.values

    for dataset, side in itertools.product([x_train, x_val], ['left', 'right']):
        dataset[side] = pad_sequences(dataset[side], maxlen=max_length)

    # Verify that the left and right question sets have matching counts
    assert x_train['left'].shape == x_train['right'].shape
    assert len(x_train['left']) == len(y_train)

    return x_train, y_train, x_val, y_val, word_index, max_length