Python keras.preprocessing.sequence.pad_sequences() Examples
The following are 30 code examples of keras.preprocessing.sequence.pad_sequences(), drawn from open-source projects. Each example notes its source file, the project it comes from, and that project's license. You may also want to check out the other available functions and classes of the keras.preprocessing.sequence module.
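Before the project examples, here is a minimal sketch of the function's core behavior (illustrative values only): given a list of variable-length integer sequences, pad_sequences returns a 2D NumPy array in which every row has been padded, and if necessary truncated, to a common length.

import numpy as np
from keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2, 3], [4, 5], [6]]

# Default: pad on the left with 0, up to the length of the longest sequence
print(pad_sequences(seqs))
# [[1 2 3]
#  [0 4 5]
#  [0 0 6]]

# Options used throughout the examples below: a fixed maxlen, post-padding,
# post-truncating, and a custom padding value
print(pad_sequences(seqs, maxlen=2, padding='post', truncating='post', value=-1))
# [[ 1  2]
#  [ 4  5]
#  [ 6 -1]]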
Example #1
Source File: inputHandler.py From lstm-siamese-text-similarity with MIT License | 6 votes |
def create_test_data(tokenizer, test_sentences_pair, max_sequence_length):
    """
    Create test dataset

    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        test_sentences_pair (list): list of tuple of sentences pairs
        max_sequence_length (int): max sequence length of sentences to apply padding

    Returns:
        test_data_1 (list): list of input features for test set from sentences1
        test_data_2 (list): list of input features for test set from sentences2
    """
    test_sentences1 = [x[0].lower() for x in test_sentences_pair]
    test_sentences2 = [x[1].lower() for x in test_sentences_pair]

    test_sequences_1 = tokenizer.texts_to_sequences(test_sentences1)
    test_sequences_2 = tokenizer.texts_to_sequences(test_sentences2)
    leaks_test = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
                  for x1, x2 in zip(test_sequences_1, test_sequences_2)]

    leaks_test = np.array(leaks_test)
    test_data_1 = pad_sequences(test_sequences_1, maxlen=max_sequence_length)
    test_data_2 = pad_sequences(test_sequences_2, maxlen=max_sequence_length)

    return test_data_1, test_data_2, leaks_test
Example #2
Source File: load_data.py From Image-Caption-Generator with MIT License | 6 votes |
def create_sequences(tokenizer, max_length, captions_list, image):
    # X1 : input for image features
    # X2 : input for text features
    # y  : output word
    X1, X2, y = list(), list(), list()
    vocab_size = len(tokenizer.word_index) + 1
    # Walk through each caption for the image
    for caption in captions_list:
        # Encode the sequence
        seq = tokenizer.texts_to_sequences([caption])[0]
        # Split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # Split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # Pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # Encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # Store
            X1.append(image)
            X2.append(in_seq)
            y.append(out_seq)
    return X1, X2, y

# Data generator, intended to be used in a call to model.fit_generator()
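The trailing comment refers to a data generator that is not included in this excerpt. A minimal sketch of how create_sequences might be wired into such a generator follows; the descriptions and features dicts and the yielded tuple layout are assumptions for illustration, not the project's exact code.

def data_generator(descriptions, features, tokenizer, max_length):
    # descriptions: dict mapping image id -> list of captions (assumed)
    # features: dict mapping image id -> precomputed image feature vector (assumed)
    while True:  # loop forever so model.fit_generator() can draw batches indefinitely
        for image_id, captions_list in descriptions.items():
            image = features[image_id]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length,
                                                        captions_list, image)
            yield [np.array(in_img), np.array(in_seq)], np.array(out_word)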
Example #3
Source File: preprocessors.py From keras-image-captioning with MIT License | 6 votes |
def preprocess_batch(self, captions_label_encoded):
    captions = keras_seq.pad_sequences(captions_label_encoded, padding='post')
    # Because the number of timesteps/words resulted by the model is
    # maxlen(captions) + 1 (because the first "word" is the image).
    captions_extended1 = keras_seq.pad_sequences(captions,
                                                 maxlen=captions.shape[-1] + 1,
                                                 padding='post')
    captions_one_hot = map(self._tokenizer.sequences_to_matrix,
                           np.expand_dims(captions_extended1, -1))
    captions_one_hot = np.array(captions_one_hot, dtype='int')

    # Decrease/shift word index by 1.
    # Shifting `captions_one_hot` makes the padding word
    # (index=0, encoded=[1, 0, ...]) encoded all zeros ([0, 0, ...]),
    # so its cross entropy loss will be zero.
    captions_decreased = captions.copy()
    captions_decreased[captions_decreased > 0] -= 1
    captions_one_hot_shifted = captions_one_hot[:, :, 1:]

    captions_input = captions_decreased
    captions_output = captions_one_hot_shifted
    return captions_input, captions_output
Example #4
Source File: test_model.py From caption_generator with MIT License | 6 votes |
def generate_captions(model, image, beam_size):
    start = [cg.word_index['<start>']]
    captions = [[start, 0.0]]
    while len(captions[0][0]) < cg.max_cap_len:
        temp_captions = []
        for caption in captions:
            partial_caption = sequence.pad_sequences([caption[0]],
                                                     maxlen=cg.max_cap_len,
                                                     padding='post')
            next_words_pred = model.predict([np.asarray([image]),
                                             np.asarray(partial_caption)])[0]
            next_words = np.argsort(next_words_pred)[-beam_size:]
            for word in next_words:
                new_partial_caption, new_partial_caption_prob = caption[0][:], caption[1]
                new_partial_caption.append(word)
                new_partial_caption_prob += next_words_pred[word]
                temp_captions.append([new_partial_caption, new_partial_caption_prob])
        captions = temp_captions
        captions.sort(key=lambda l: l[1])
        captions = captions[-beam_size:]
    return captions
Example #5
Source File: data.py From BERT with Apache License 2.0 | 6 votes |
def load_question(params):
    df = pd.read_csv(config.QUESTION_FILE)
    df["words"] = df.words.str.split(" ").apply(lambda x: [_to_ind(z) for z in x])
    df["chars"] = df.chars.str.split(" ").apply(lambda x: [_to_ind(z) for z in x])
    Q = {}
    Q["seq_len_word"] = sp.minimum(df["words"].apply(len).values, params["max_seq_len_word"])
    Q["seq_len_char"] = sp.minimum(df["chars"].apply(len).values, params["max_seq_len_char"])
    Q["words"] = pad_sequences(df["words"], maxlen=params["max_seq_len_word"],
                               padding=params["pad_sequences_padding"],
                               truncating=params["pad_sequences_truncating"],
                               value=config.PADDING_INDEX_WORD)
    Q["chars"] = pad_sequences(df["chars"], maxlen=params["max_seq_len_char"],
                               padding=params["pad_sequences_padding"],
                               truncating=params["pad_sequences_truncating"],
                               value=config.PADDING_INDEX_CHAR)
    return Q
Example #6
Source File: vectorizer.py From robotreviewer with GNU General Public License v3.0 | 6 votes |
def texts_to_sequences(self, texts, do_pad=True):
    """Vectorize texts as sequences of indices

    Parameters
    ----------
    texts : list of strings to vectorize into sequences of indices
    do_pad : pad the sequences to `self.maxlen` if true
    """
    self.X = self.tok.texts_to_sequences(texts)
    if do_pad:
        self.X = sequence.pad_sequences(self.X, maxlen=self.maxlen)
        self.word2idx['[0]'], self.idx2word[0] = 0, '[0]'  # add padding token
        self.vocab_size += 1
    return self.X
Example #7
Source File: generate.py From recipe-summarization with MIT License | 6 votes |
def conv_seq_labels(xds, xhs, nflips, model, debug, oov0, glove_idx2idx, vocab_size, nb_unknown_words, idx2word):
    """Convert descriptions and headlines to padded input vectors; headlines are one-hot encoded as labels."""
    batch_size = len(xhs)
    assert len(xds) == batch_size
    x = [vocab_fold(lpadd(xd) + xh, oov0, glove_idx2idx, vocab_size, nb_unknown_words)
         for xd, xh in zip(xds, xhs)]  # the input does not have 2nd eos
    x = sequence.pad_sequences(x, maxlen=maxlen, value=empty, padding='post', truncating='post')
    x = flip_headline(x, nflips=nflips, model=model, debug=debug, oov0=oov0, idx2word=idx2word)

    y = np.zeros((batch_size, maxlenh, vocab_size))
    for i, xh in enumerate(xhs):
        xh = vocab_fold(xh, oov0, glove_idx2idx, vocab_size, nb_unknown_words) + [eos] + [empty] * maxlenh  # output does have an eos at end
        xh = xh[:maxlenh]
        y[i, :, :] = np_utils.to_categorical(xh, vocab_size)

    return x, y
Example #8
Source File: ensemble_pred.py From semeval2019-hyperpartisan-bertha-von-suttner with Apache License 2.0 | 6 votes |
def load_data(data_path, max_len=200):
    data = []
    l = []
    ids = []
    i = 0
    l_encoder = LabelEncoder()
    with open(data_path, 'rb') as inf:
        for line in inf:
            gzip_fields = line.decode('utf-8').split('\t')
            gzip_id = gzip_fields[0]
            gzip_label = gzip_fields[1]
            elmo_embd_str = gzip_fields[4].strip()
            elmo_embd_list = ast.literal_eval(elmo_embd_str)
            elmo_embd_array = np.array(elmo_embd_list)
            padded_seq = sequence.pad_sequences([elmo_embd_array],
                                                maxlen=max_len, dtype='float32')[0]
            data.append(padded_seq)
            l.append(gzip_label)
            ids.append(gzip_id)
            i += 1
            print(i)
    label = l_encoder.fit_transform(l)
    return np.array(data), np.array(label), np.array(ids)
Example #9
Source File: data_utils.py From CCKS2019-Chinese-Clinical-NER with MIT License | 6 votes |
def load_tagged_data(tagged_data_filepath, vocab, tag2id):
    """
    Load the input data to the model
    :param tagged_data_filepath: the file path to the tagged data file
    :param vocab: the dictionary mapping from word to id
    :param tag2id: the dictionary mapping from tag to id
    :return: Numpy arrays: `train_x, train_y`
    """
    seg_samples_list = __get_seg_sample_list(tagged_data_filepath, mode="tagged")

    words_list = [[word2tag[0] for word2tag in sample] for sample in seg_samples_list]
    sample2id = [[vocab.get(word, 0) for word in sample] for sample in words_list]
    max_seq_len = max(len(sample) for sample in sample2id)
    train_x = pad_sequences(sample2id, max_seq_len, padding="post", value=0)

    tags_list = [[word2tag[1] for word2tag in sample] for sample in seg_samples_list]
    tag2id = [[tag2id.get(tag, 0) for tag in sample] for sample in tags_list]
    train_y = pad_sequences(tag2id, max_seq_len, padding="post", value=0)
    train_y = np.expand_dims(train_y, 2)

    return train_x, train_y
Example #10
Source File: preprocess.py From MalConv-keras with MIT License | 6 votes |
def preprocess(fn_list, max_len):
    '''
    Return processed data (ndarray) and original file length (list)
    '''
    corpus = []
    for fn in fn_list:
        if not os.path.isfile(fn):
            print(fn, 'not exist')
        else:
            with open(fn, 'rb') as f:
                corpus.append(f.read())

    corpus = [[byte for byte in doc] for doc in corpus]
    len_list = [len(doc) for doc in corpus]
    seq = pad_sequences(corpus, maxlen=max_len, padding='post', truncating='post')
    return seq, len_list
Example #11
Source File: batch_utils.py From Neural-Chatbot with GNU General Public License v3.0 | 6 votes |
def next_batch(self):
    inverse_vocabulary = self.inverse_vocabulary
    if self.stream:
        q = [[inverse_vocabulary[word] for word in next(self.questions).strip().split()]
             for i in range(self.batch_size)]
        a = [[inverse_vocabulary[word] for word in next(self.answers).strip().split()]
             for i in range(self.batch_size)]
    else:
        n_example = len(self.answers)
        indices = random.randint(0, n_example, size=(self.batch_size))
        q = [[inverse_vocabulary[word] for word in self.questions[i].split()] for i in indices]
        a = [[inverse_vocabulary[word] for word in self.answers[i].split()] for i in indices]

    X = pad_sequences(q, maxlen=self.sequence_length)
    y = pad_sequences(a, maxlen=self.sequence_length)

    if self.one_hot_target:
        return (X, self.to_one_hot(y))
    else:
        return (X, y)
Example #12
Source File: conll2000.py From keras-contrib with MIT License | 6 votes |
def _process_data(data, vocab, pos_tags, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    # set to <unk> (index 1) if not in vocab
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_pos = [[pos_tags.index(w[1]) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[2]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding
    # left padded with -1. Indeed, any integer works as it will be masked
    y_pos = pad_sequences(y_pos, maxlen, value=-1)
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        y_pos = numpy.eye(len(pos_tags), dtype='float32')[y_pos]
        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_pos = numpy.expand_dims(y_pos, 2)
        y_chunk = numpy.expand_dims(y_chunk, 2)
    return x, y_pos, y_chunk
Example #13
Source File: lstm_qa.py From keras-examples with MIT License | 6 votes |
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        xq = [word_idx[w] for w in query]
        # only the index of the correct answer word is set to 1
        y = np.zeros(len(word_idx) + 1)  # index 0 is reserved
        y[word_idx[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    # pad the sequence data
    # >>> pad_sequences([[1,2], [1,2,3], [1], [1,2,3,4,5]], 5)
    # array([[0, 0, 0, 1, 2],
    #        [0, 0, 1, 2, 3],
    #        [0, 0, 0, 0, 1],
    #        [1, 2, 3, 4, 5]], dtype=int32)
    return (pad_sequences(X, maxlen=story_maxlen),
            pad_sequences(Xq, maxlen=query_maxlen),
            np.array(Y))
Example #14
Source File: model.py From DeepSequenceClassification with GNU General Public License v2.0 | 5 votes |
def vectorize_data_old(filenames, maxlen=100, max_charlen=20, output_label_size=6,
                       output_label_dict=None, output_type="boundary", return_chars=False):
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp_old.get_documents(filename):
            for seq in pp_old.get_sequences(doc):
                x = []
                x_char = []
                y = []
                for token in seq:
                    x.append(1 + token.word_index)  # Add 1 to include token for padding
                    if return_chars:
                        x_char.append((1 + np.array(token.char_seq)).tolist())  # Add 1 to include token for padding
                    if output_type == "category":
                        y_idx = 1 + output_label_dict.get(token.c_label, -1)  # Add 1 to include token for padding
                    else:
                        y_idx = 1 + output_label_dict.get(token.b_label, -1)  # Add 1 to include token for padding
                    y.append(y_idx)
                X.append(x)
                if return_chars:
                    padded_sequence = pad_sequences([[] for k in xrange(maxlen - len(x_char))], maxlen=max_charlen).tolist() + \
                                      pad_sequences(x_char[:maxlen], maxlen=max_charlen).tolist()
                    X_char.append(padded_sequence)
                Y.append(y)
    X = pad_sequences(X, maxlen=maxlen)
    Y = pad_sequences(Y, maxlen=maxlen)
    X = np.array(X)
    Y = vtu.to_onehot(Y, output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y
Example #15
Source File: model.py From DeepSequenceClassification with GNU General Public License v2.0 | 5 votes |
def vectorize_data(filenames, maxlen=2000, max_charlen=20, output_label_size=6,
                   output_label_dict=None, output_type="boundary", return_chars=False):
    """ Based on a histogram of document lengths, 2000 is a reasonable maxlen to train on. """
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp.get_documents(filename):
            seq = pp.get_sequences(doc)
            x = []
            x_char = []
            y = []
            for token in seq:
                x.append(1 + token.word_index)  # Add 1 to include token for padding
                if return_chars:
                    x_char.append((1 + np.array(token.char_seq)).tolist())  # Add 1 to include token for padding
                if output_type == "category":
                    y_idx = 1 + output_label_dict.get(token.c_label, -1)  # Add 1 to include token for padding
                else:
                    y_idx = 1 + output_label_dict.get(token.b_label, -1)  # Add 1 to include token for padding
                y.append(y_idx)
            X.append(x)
            if return_chars:
                padded_sequence = pad_sequences([[] for k in xrange(maxlen - len(x_char))], maxlen=max_charlen).tolist() + \
                                  pad_sequences(x_char[:maxlen], maxlen=max_charlen).tolist()
                X_char.append(padded_sequence)
            Y.append(y)
    X = pad_sequences(X, maxlen=maxlen)
    Y = pad_sequences(Y, maxlen=maxlen)
    X = np.array(X)
    Y = vtu.to_onehot(Y, output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y
Example #16
Source File: generator.py From KerasDeepSpeech with GNU Affero General Public License v3.0 | 5 votes |
def make_mfcc_shape(filename, padlen=778):
    fs, audio = wav.read(filename)
    r = p.mfcc(audio, samplerate=fs, numcep=26)  # 2D array -> timesamples x mfcc_features
    t = np.transpose(r)  # 2D array -> mfcc_features x timesamples
    X = pad_sequences(t, maxlen=padlen, dtype='float', padding='post', truncating='post').T
    return X  # 2D array -> MAXtimesamples x mfcc_features {778 x 26}
Example #17
Source File: generator.py From KerasDeepSpeech with GNU Affero General Public License v3.0 | 5 votes |
def make_aubio_shape(filename, padlen=778):
    r = aubio(filename)
    t = np.transpose(r)  # 2D array -> mfcc_features x timesamples
    X = pad_sequences(t, maxlen=padlen, dtype='float', padding='post', truncating='post').T
    return X  # 2D array -> MAXtimesamples x mfcc_features {778 x 26}
Example #18
Source File: generator.py From KerasDeepSpeech with GNU Affero General Public License v3.0 | 5 votes |
def make_specto_shape(filename, padlen=778):
    r = spectrogram_from_file(filename)
    t = np.transpose(r)  # 2D array -> spec x timesamples
    X = pad_sequences(t, maxlen=padlen, dtype='float', padding='post', truncating='post').T
    return X  # MAXtimesamples x specto {max x 161}
Example #19
Source File: fasttext.py From sears with BSD 2-Clause "Simplified" License | 5 votes |
def predict_proba(self, X):
    x_test = self.tokenizer.texts_to_sequences(X)
    x_test = sequence.pad_sequences(x_test, maxlen=self.maxlen)
    a = self.model.predict(x_test, verbose=0).flatten()
    a = a.reshape(-1, 1)
    return np.hstack((1 - a, a))
Example #20
Source File: model.py From polyaxon-examples with Apache License 2.0 | 5 votes |
def transform_data(x_train, y_train, x_test, y_test, maxlen):
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    return x_train, y_train, x_test, y_test
Example #21
Source File: fasttext.py From sears with BSD 2-Clause "Simplified" License | 5 votes |
def predict_proba(self, X):
    x_test = self.tokenizer.texts_to_sequences(X)
    x_test = self.add_ngrams(x_test)
    x_test = sequence.pad_sequences(x_test, maxlen=self.maxlen)
    a = self.model.predict(x_test).flatten()
    a = a.reshape(-1, 1)
    return np.hstack((1 - a, a))
Example #22
Source File: fasttext.py From sears with BSD 2-Clause "Simplified" License | 5 votes |
def predict(self, X):
    x_test = self.tokenizer.texts_to_sequences(X)
    x_test = self.add_ngrams(x_test)
    x_test = sequence.pad_sequences(x_test, maxlen=self.maxlen)
    return self.model.predict_classes(x_test, verbose=0).flatten()
Example #23
Source File: pipeline.py From krnnt with GNU Lesser General Public License v3.0 | 5 votes |
def pad(batch: List[List[Sample]], unique_features_dict, feature_name: str):
    if not batch:
        return []
    result_batchX = []
    for sentence in batch:
        X_sentence = []
        for sample in sentence:
            X_sentence.append(np.array(k_hot(sample.features[feature_name],
                                             unique_features_dict[feature_name])))
        result_batchX.append(X_sentence)
    return sequence.pad_sequences(result_batchX)
Example #24
Source File: new.py From krnnt with GNU Lesser General Public License v3.0 | 5 votes |
def pad_generator(generator, sequence_length=20):
    for batch_X, batch_y, sentences, sentences_orig in generator:
        if not batch_X or not batch_y:
            continue
        # TODO pad multi inputs
        max_sentence_length = max([len(x) for x in batch_X])
        # print('max_sentence_length', max_sentence_length)
        yield (sequence.pad_sequences(batch_X, maxlen=max_sentence_length),
               sequence.pad_sequences(batch_y, maxlen=max_sentence_length),
               sentences, sentences_orig)
Example #25
Source File: preprocessing.py From toxic_comments with MIT License | 5 votes |
def convert_text2seq(train_texts, test_texts, max_words, max_seq_len, max_char_seq_len, embeds,
                     lower=True, oov_token='__NA__', uniq=False, use_only_exists_words=False):
    texts = train_texts + test_texts
    if uniq:
        texts = [uniq_words_in_text(text) for text in texts]
    if use_only_exists_words:
        texts = [delete_unknown_words(text, embeds) for text in texts]

    # WORD TOKENIZER
    word_tokenizer = Tokenizer(num_words=max_words, lower=lower, char_level=False)
    word_tokenizer.fit_on_texts(texts)
    word_seq_train = word_tokenizer.texts_to_sequences(train_texts)
    word_seq_test = word_tokenizer.texts_to_sequences(test_texts)
    word_index = word_tokenizer.word_index
    word_seq_train = list(sequence.pad_sequences(word_seq_train, maxlen=max_seq_len))
    word_seq_test = list(sequence.pad_sequences(word_seq_test, maxlen=max_seq_len))

    # CHAR TOKENIZER
    char_tokenizer = CountVectorizer(analyzer='char', ngram_range=(3, 3), stop_words=None,
                                     lowercase=True, max_df=0.9, min_df=0, max_features=max_words)
    char_tokenizer.fit(texts)
    char_sparse_train = char_tokenizer.transform(train_texts)
    char_sparse_test = char_tokenizer.transform(test_texts)
    char_seq_train = sparse_to_seq(char_sparse_train, maxlen=max_char_seq_len)
    char_seq_test = sparse_to_seq(char_sparse_test, maxlen=max_char_seq_len)
    char_index = {key: val + 1 for key, val in char_tokenizer.vocabulary_.items()}
    char_index[oov_token] = 0
    char_vocab_len = len(char_index)

    return word_seq_train, word_seq_test, word_index, char_seq_train, char_seq_test, char_index
Example #26
Source File: sampling.py From Neural-Chatbot with GNU General Public License v3.0 | 5 votes |
def respond(self, input, temperature=1.0, greedy=False):
    input = pad_sequences([self._encode(input)], maxlen=self.sequence_length)
    print(input)
    output = self.model.predict(input)[0]
    print(output.shape)
    output[:, 1] = 0
    indices = [probability.argmax(axis=-1) for probability in output] if greedy \
        else [self.sample(probability, temperature) for probability in output]
    return self._decode(indices)
Example #27
Source File: babi.py From dl-models-for-qa with Apache License 2.0 | 5 votes |
def vectorize(data, word2idx, story_maxlen, question_maxlen):
    """ Create the story and question vectors and the label """
    Xs, Xq, Y = [], [], []
    for story, question, answer in data:
        xs = [word2idx[word] for word in story]
        xq = [word2idx[word] for word in question]
        y = np.zeros(len(word2idx) + 1)
        y[word2idx[answer]] = 1
        Xs.append(xs)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(Xs, maxlen=story_maxlen),
            pad_sequences(Xq, maxlen=question_maxlen),
            np.array(Y))
Example #28
Source File: Data_process.py From Text_Generate with MIT License | 5 votes |
def creat_x_y(self, maxlen=40, one_hot=False):
    '''
    :param one_hot: whether to one-hot encode y
    :return:
    '''
    self.one_hot = one_hot
    # If the encoding step used mode='length', reuse the same maxlen here for
    # pad_sequences to avoid unnecessary padding
    if self.maxlen is not None:
        maxlen = self.maxlen
    texts_seq = self.texts_seq

    x = []
    y = []
    for i in texts_seq:
        x.append(i[:-1])
        y.append(i[1:])
    # self.x = x
    # self.y = y

    n = 0
    pad_seq = []
    # run pad_sequences in batches
    while n < len(texts_seq):
        pad_seq += list(pad_sequences(x[n:n + 5000], maxlen=maxlen,
                                      padding='post', value=0, dtype='int'))
        n += 5000
        # if n < len(texts_seq):
        #     print('finish pad_sequences %d samples(%f)' % (n, n / len(texts_seq)))
        # else:
        #     print('finish pad_sequences %d samples(1.0)' % len(texts_seq))

    pad_seq = pad_sequences(x, maxlen, padding='post', truncating='post')
    y_pad_seq = pad_sequences(y, maxlen - 1, padding='post', truncating='post')

    # build x and y
    self.x_pad_seq = np.array([i[:-1] for i in pad_seq])
    self.y_pad_seq = np.array([i[1:] for i in pad_seq])

    if one_hot:
        # one-hot encode y
        y_one_hot = [self.creat_one_hot(i, self.num_words) for i in y_pad_seq]
        self.y_one_hot = y_one_hot
Example #29
Source File: data_helper.py From conv-emotion with MIT License | 5 votes |
def prepare_history(self, data, mode, maxlen):
    data = pad_sequences(data, maxlen)             # (batch, maxlen)
    pads = np.zeros(data.shape, dtype=np.float32)  # (batch, maxlen)
    if mode == "own":
        data = np.stack((data, pads), axis=1)
    else:
        data = np.stack((pads, data), axis=1)
    return data  # (batch, 2, maxlen)
Example #30
Source File: model_simple.py From DeepSequenceClassification with GNU General Public License v2.0 | 5 votes |
def vectorize_data(filenames, maxlen=2000, max_charlen=20, output_label_size=6,
                   output_label_dict=None, output_type="hybrid", return_chars=False):
    """ Based on a histogram of document lengths, 2000 is a reasonable maxlen to train on. """
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp.get_documents(filename):
            seq = pp.get_sequences(doc)
            x = []
            x_char = []
            y = []
            for token in seq:
                x.append(1 + token.word_index)  # Add 1 to include token for padding
                if return_chars:
                    x_char.append((1 + np.array(token.char_seq)).tolist())  # Add 1 to include token for padding
                if output_type == "hybrid":
                    y_idx = 1 + output_label_dict.get("%s-%s" % (token.b_label, token.c_label), -1)  # Add 1 to include token for padding
                elif output_type == "category":
                    y_idx = 1 + output_label_dict.get(token.c_label, -1)  # Add 1 to include token for padding
                else:
                    y_idx = 1 + output_label_dict.get(token.b_label, -1)  # Add 1 to include token for padding
                y.append(y_idx)
            X.append(x)
            if return_chars:
                padded_sequence = pad_sequences([[] for k in xrange(maxlen - len(x_char))], maxlen=max_charlen).tolist() + \
                                  pad_sequences(x_char[:maxlen], maxlen=max_charlen).tolist()
                X_char.append(padded_sequence)
            Y.append(y)
    X = pad_sequences(X, maxlen=maxlen)
    Y = pad_sequences(Y, maxlen=maxlen)
    X = np.array(X)
    Y = vtu.to_onehot(Y, output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y