Python keras.preprocessing.sequence.pad_sequences() Examples
The following are 30 code examples of keras.preprocessing.sequence.pad_sequences(), drawn from open-source projects. Each example notes its source file, the project it comes from, and that project's license. You may also want to check out the other available functions and classes of the keras.preprocessing.sequence module.
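Before the project examples, here is a minimal sketch of the function's core behavior (illustrative values only): given a list of variable-length integer sequences, pad_sequences returns a 2D NumPy array in which every row has been padded, and if necessary truncated, to a common length.

import numpy as np
from keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2, 3], [4, 5], [6]]

# Default: pad on the left with 0, up to the length of the longest sequence
print(pad_sequences(seqs))
# [[1 2 3]
#  [0 4 5]
#  [0 0 6]]

# Options used throughout the examples below: a fixed maxlen, post-padding,
# post-truncating, and a custom padding value
print(pad_sequences(seqs, maxlen=2, padding='post', truncating='post', value=-1))
# [[ 1  2]
#  [ 4  5]
#  [ 6 -1]]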
Example #1
Source File: inputHandler.py From lstm-siamese-text-similarity with MIT License | 6 votes |
def create_test_data(tokenizer, test_sentences_pair, max_sequence_length):
    """
    Create test dataset

    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        test_sentences_pair (list): list of tuple of sentences pairs
        max_sequence_length (int): max sequence length of sentences to apply padding

    Returns:
        test_data_1 (list): list of input features for test set from sentences1
        test_data_2 (list): list of input features for test set from sentences2
    """
    test_sentences1 = [x[0].lower() for x in test_sentences_pair]
    test_sentences2 = [x[1].lower() for x in test_sentences_pair]

    test_sequences_1 = tokenizer.texts_to_sequences(test_sentences1)
    test_sequences_2 = tokenizer.texts_to_sequences(test_sentences2)
    leaks_test = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
                  for x1, x2 in zip(test_sequences_1, test_sequences_2)]

    leaks_test = np.array(leaks_test)
    test_data_1 = pad_sequences(test_sequences_1, maxlen=max_sequence_length)
    test_data_2 = pad_sequences(test_sequences_2, maxlen=max_sequence_length)

    return test_data_1, test_data_2, leaks_test
Example #2
Source File: load_data.py From Image-Caption-Generator with MIT License | 6 votes |
def create_sequences(tokenizer, max_length, captions_list, image):
    # X1 : input for image features
    # X2 : input for text features
    # y  : output word
    X1, X2, y = list(), list(), list()
    vocab_size = len(tokenizer.word_index) + 1
    # Walk through each caption for the image
    for caption in captions_list:
        # Encode the sequence
        seq = tokenizer.texts_to_sequences([caption])[0]
        # Split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # Split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # Pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # Encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # Store
            X1.append(image)
            X2.append(in_seq)
            y.append(out_seq)
    return X1, X2, y

# Data generator, intended to be used in a call to model.fit_generator()
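The trailing comment refers to a data generator that is not included in this excerpt. A minimal sketch of how create_sequences might be wired into such a generator follows; the descriptions and features dicts and the yielded tuple layout are assumptions for illustration, not the project's exact code.

def data_generator(descriptions, features, tokenizer, max_length):
    # descriptions: dict mapping image id -> list of captions (assumed)
    # features: dict mapping image id -> precomputed image feature vector (assumed)
    while True:  # loop forever so model.fit_generator() can draw batches indefinitely
        for image_id, captions_list in descriptions.items():
            image = features[image_id]
            in_img, in_seq, out_word = create_sequences(tokenizer, max_length,
                                                        captions_list, image)
            yield [np.array(in_img), np.array(in_seq)], np.array(out_word)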
Example #3
Source File: preprocessors.py From keras-image-captioning with MIT License | 6 votes |
def preprocess_batch(self, captions_label_encoded):
    captions = keras_seq.pad_sequences(captions_label_encoded, padding='post')
    # Because the number of timesteps/words resulted by the model is
    # maxlen(captions) + 1 (because the first "word" is the image).
    captions_extended1 = keras_seq.pad_sequences(captions,
                                                 maxlen=captions.shape[-1] + 1,
                                                 padding='post')
    captions_one_hot = map(self._tokenizer.sequences_to_matrix,
                           np.expand_dims(captions_extended1, -1))
    captions_one_hot = np.array(captions_one_hot, dtype='int')

    # Decrease/shift word index by 1.
    # Shifting `captions_one_hot` makes the padding word
    # (index=0, encoded=[1, 0, ...]) encoded all zeros ([0, 0, ...]),
    # so its cross entropy loss will be zero.
    captions_decreased = captions.copy()
    captions_decreased[captions_decreased > 0] -= 1
    captions_one_hot_shifted = captions_one_hot[:, :, 1:]

    captions_input = captions_decreased
    captions_output = captions_one_hot_shifted
    return captions_input, captions_output
Example #4
Source File: test_model.py From caption_generator with MIT License | 6 votes |
def generate_captions(model, image, beam_size):
    start = [cg.word_index['<start>']]
    captions = [[start, 0.0]]
    while len(captions[0][0]) < cg.max_cap_len:
        temp_captions = []
        for caption in captions:
            partial_caption = sequence.pad_sequences([caption[0]],
                                                     maxlen=cg.max_cap_len,
                                                     padding='post')
            next_words_pred = model.predict([np.asarray([image]),
                                             np.asarray(partial_caption)])[0]
            next_words = np.argsort(next_words_pred)[-beam_size:]
            for word in next_words:
                new_partial_caption, new_partial_caption_prob = caption[0][:], caption[1]
                new_partial_caption.append(word)
                new_partial_caption_prob += next_words_pred[word]
                temp_captions.append([new_partial_caption, new_partial_caption_prob])
        captions = temp_captions
        captions.sort(key=lambda l: l[1])
        captions = captions[-beam_size:]
    return captions
Example #5
Source File: data.py From BERT with Apache License 2.0 | 6 votes |
def load_question(params):
    df = pd.read_csv(config.QUESTION_FILE)
    df["words"] = df.words.str.split(" ").apply(lambda x: [_to_ind(z) for z in x])
    df["chars"] = df.chars.str.split(" ").apply(lambda x: [_to_ind(z) for z in x])
    Q = {}
    Q["seq_len_word"] = sp.minimum(df["words"].apply(len).values, params["max_seq_len_word"])
    Q["seq_len_char"] = sp.minimum(df["chars"].apply(len).values, params["max_seq_len_char"])
    Q["words"] = pad_sequences(df["words"], maxlen=params["max_seq_len_word"],
                               padding=params["pad_sequences_padding"],
                               truncating=params["pad_sequences_truncating"],
                               value=config.PADDING_INDEX_WORD)
    Q["chars"] = pad_sequences(df["chars"], maxlen=params["max_seq_len_char"],
                               padding=params["pad_sequences_padding"],
                               truncating=params["pad_sequences_truncating"],
                               value=config.PADDING_INDEX_CHAR)
    return Q
Example #6
Source File: vectorizer.py From robotreviewer with GNU General Public License v3.0 | 6 votes |
def texts_to_sequences(self, texts, do_pad=True):
    """Vectorize texts as sequences of indices

    Parameters
    ----------
    texts : list of strings to vectorize into sequences of indices
    do_pad : pad the sequences to `self.maxlen` if true
    """
    self.X = self.tok.texts_to_sequences(texts)
    if do_pad:
        self.X = sequence.pad_sequences(self.X, maxlen=self.maxlen)
        self.word2idx['[0]'], self.idx2word[0] = 0, '[0]'  # add padding token
        self.vocab_size += 1
    return self.X
Example #7
Source File: generate.py From recipe-summarization with MIT License | 6 votes |
def conv_seq_labels(xds, xhs, nflips, model, debug, oov0, glove_idx2idx, vocab_size, nb_unknown_words, idx2word):
    """Convert descriptions and headlines to padded input vectors; headlines are one-hot encoded as labels."""
    batch_size = len(xhs)
    assert len(xds) == batch_size
    x = [vocab_fold(lpadd(xd) + xh, oov0, glove_idx2idx, vocab_size, nb_unknown_words)
         for xd, xh in zip(xds, xhs)]  # the input does not have 2nd eos
    x = sequence.pad_sequences(x, maxlen=maxlen, value=empty, padding='post', truncating='post')
    x = flip_headline(x, nflips=nflips, model=model, debug=debug, oov0=oov0, idx2word=idx2word)

    y = np.zeros((batch_size, maxlenh, vocab_size))
    for i, xh in enumerate(xhs):
        xh = vocab_fold(xh, oov0, glove_idx2idx, vocab_size, nb_unknown_words) + [eos] + [empty] * maxlenh  # output does have an eos at end
        xh = xh[:maxlenh]
        y[i, :, :] = np_utils.to_categorical(xh, vocab_size)

    return x, y
Example #8
Source File: ensemble_pred.py From semeval2019-hyperpartisan-bertha-von-suttner with Apache License 2.0 | 6 votes |
def load_data(data_path, max_len=200):
    data = []
    l = []
    ids = []
    i = 0
    l_encoder = LabelEncoder()
    with open(data_path, 'rb') as inf:
        for line in inf:
            gzip_fields = line.decode('utf-8').split('\t')
            gzip_id = gzip_fields[0]
            gzip_label = gzip_fields[1]
            elmo_embd_str = gzip_fields[4].strip()
            elmo_embd_list = ast.literal_eval(elmo_embd_str)
            elmo_embd_array = np.array(elmo_embd_list)
            padded_seq = sequence.pad_sequences([elmo_embd_array],
                                                maxlen=max_len, dtype='float32')[0]
            data.append(padded_seq)
            l.append(gzip_label)
            ids.append(gzip_id)
            i += 1
            print(i)
    label = l_encoder.fit_transform(l)
    return np.array(data), np.array(label), np.array(ids)
Example #9
Source File: data_utils.py From CCKS2019-Chinese-Clinical-NER with MIT License | 6 votes |
def load_tagged_data(tagged_data_filepath, vocab, tag2id):
    """
    Load the input data to the model
    :param tagged_data_filepath: the file path to the tagged data file
    :param vocab: the dictionary mapping from word to id
    :param tag2id: the dictionary mapping from tag to id
    :return: Numpy arrays: `train_x, train_y`
    """
    seg_samples_list = __get_seg_sample_list(tagged_data_filepath, mode="tagged")

    words_list = [[word2tag[0] for word2tag in sample] for sample in seg_samples_list]
    sample2id = [[vocab.get(word, 0) for word in sample] for sample in words_list]
    max_seq_len = max(len(sample) for sample in sample2id)
    train_x = pad_sequences(sample2id, max_seq_len, padding="post", value=0)

    tags_list = [[word2tag[1] for word2tag in sample] for sample in seg_samples_list]
    tag2id = [[tag2id.get(tag, 0) for tag in sample] for sample in tags_list]
    train_y = pad_sequences(tag2id, max_seq_len, padding="post", value=0)
    train_y = np.expand_dims(train_y, 2)

    return train_x, train_y
Example #10
Source File: preprocess.py From MalConv-keras with MIT License | 6 votes |
def preprocess(fn_list, max_len):
    '''
    Return processed data (ndarray) and original file length (list)
    '''
    corpus = []
    for fn in fn_list:
        if not os.path.isfile(fn):
            print(fn, 'not exist')
        else:
            with open(fn, 'rb') as f:
                corpus.append(f.read())

    corpus = [[byte for byte in doc] for doc in corpus]
    len_list = [len(doc) for doc in corpus]
    seq = pad_sequences(corpus, maxlen=max_len, padding='post', truncating='post')
    return seq, len_list
Example #11
Source File: batch_utils.py From Neural-Chatbot with GNU General Public License v3.0 | 6 votes |
def next_batch(self):
    inverse_vocabulary = self.inverse_vocabulary
    if self.stream:
        q = [[inverse_vocabulary[word] for word in next(self.questions).strip().split()]
             for i in range(self.batch_size)]
        a = [[inverse_vocabulary[word] for word in next(self.answers).strip().split()]
             for i in range(self.batch_size)]
    else:
        n_example = len(self.answers)
        indices = random.randint(0, n_example, size=(self.batch_size))
        q = [[inverse_vocabulary[word] for word in self.questions[i].split()] for i in indices]
        a = [[inverse_vocabulary[word] for word in self.answers[i].split()] for i in indices]

    X = pad_sequences(q, maxlen=self.sequence_length)
    y = pad_sequences(a, maxlen=self.sequence_length)

    if self.one_hot_target:
        return (X, self.to_one_hot(y))
    else:
        return (X, y)
Example #12
Source File: conll2000.py From keras-contrib with MIT License | 6 votes |
def _process_data(data, vocab, pos_tags, chunk_tags, maxlen=None, onehot=False):
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    # set to <unk> (index 1) if not in vocab
    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
    y_pos = [[pos_tags.index(w[1]) for w in s] for s in data]
    y_chunk = [[chunk_tags.index(w[2]) for w in s] for s in data]

    x = pad_sequences(x, maxlen)  # left padding
    # left padded with -1. Indeed, any integer works as it will be masked
    y_pos = pad_sequences(y_pos, maxlen, value=-1)
    y_chunk = pad_sequences(y_chunk, maxlen, value=-1)

    if onehot:
        y_pos = numpy.eye(len(pos_tags), dtype='float32')[y_pos]
        y_chunk = numpy.eye(len(chunk_tags), dtype='float32')[y_chunk]
    else:
        y_pos = numpy.expand_dims(y_pos, 2)
        y_chunk = numpy.expand_dims(y_chunk, 2)
    return x, y_pos, y_chunk
Example #13
Source File: lstm_qa.py From keras-examples with MIT License | 6 votes |
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    X = []
    Xq = []
    Y = []
    for story, query, answer in data:
        x = [word_idx[w] for w in story]
        xq = [word_idx[w] for w in query]
        # only the index of the correct answer word is set to 1
        y = np.zeros(len(word_idx) + 1)  # index 0 is reserved
        y[word_idx[answer]] = 1
        X.append(x)
        Xq.append(xq)
        Y.append(y)
    # pad the sequence data
    # >>> pad_sequences([[1,2], [1,2,3], [1], [1,2,3,4,5]], 5)
    # array([[0, 0, 0, 1, 2],
    #        [0, 0, 1, 2, 3],
    #        [0, 0, 0, 0, 1],
    #        [1, 2, 3, 4, 5]], dtype=int32)
    return (pad_sequences(X, maxlen=story_maxlen),
            pad_sequences(Xq, maxlen=query_maxlen),
            np.array(Y))
Example #14
Source File: model.py From DeepSequenceClassification with GNU General Public License v2.0 | 5 votes |
def vectorize_data_old(filenames, maxlen=100, max_charlen=20, output_label_size=6,
                       output_label_dict=None, output_type="boundary", return_chars=False):
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp_old.get_documents(filename):
            for seq in pp_old.get_sequences(doc):
                x = []
                x_char = []
                y = []
                for token in seq:
                    x.append(1 + token.word_index)  # Add 1 to include token for padding
                    if return_chars:
                        x_char.append((1 + np.array(token.char_seq)).tolist())  # Add 1 to include token for padding
                    if output_type == "category":
                        y_idx = 1 + output_label_dict.get(token.c_label, -1)  # Add 1 to include token for padding
                    else:
                        y_idx = 1 + output_label_dict.get(token.b_label, -1)  # Add 1 to include token for padding
                    y.append(y_idx)
                X.append(x)
                if return_chars:
                    padded_sequence = pad_sequences([[] for k in xrange(maxlen - len(x_char))], maxlen=max_charlen).tolist() + \
                                      pad_sequences(x_char[:maxlen], maxlen=max_charlen).tolist()
                    X_char.append(padded_sequence)
                Y.append(y)
    X = pad_sequences(X, maxlen=maxlen)
    Y = pad_sequences(Y, maxlen=maxlen)
    X = np.array(X)
    Y = vtu.to_onehot(Y, output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y
Example #15
Source File: model.py From DeepSequenceClassification with GNU General Public License v2.0 | 5 votes |
def vectorize_data(filenames, maxlen=2000, max_charlen=20, output_label_size=6,
                   output_label_dict=None, output_type="boundary", return_chars=False):
    """ Based on a histogram of document lengths, 2000 is a reasonable maxlen to train on. """
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp.get_documents(filename):
            seq = pp.get_sequences(doc)
            x = []
            x_char = []
            y = []
            for token in seq:
                x.append(1 + token.word_index)  # Add 1 to include token for padding
                if return_chars:
                    x_char.append((1 + np.array(token.char_seq)).tolist())  # Add 1 to include token for padding
                if output_type == "category":
                    y_idx = 1 + output_label_dict.get(token.c_label, -1)  # Add 1 to include token for padding
                else:
                    y_idx = 1 + output_label_dict.get(token.b_label, -1)  # Add 1 to include token for padding
                y.append(y_idx)
            X.append(x)
            if return_chars:
                padded_sequence = pad_sequences([[] for k in xrange(maxlen - len(x_char))], maxlen=max_charlen).tolist() + \
                                  pad_sequences(x_char[:maxlen], maxlen=max_charlen).tolist()
                X_char.append(padded_sequence)
            Y.append(y)
    X = pad_sequences(X, maxlen=maxlen)
    Y = pad_sequences(Y, maxlen=maxlen)
    X = np.array(X)
    Y = vtu.to_onehot(Y, output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y
Example #16
Source File: generator.py From KerasDeepSpeech with GNU Affero General Public License v3.0 | 5 votes |
def make_mfcc_shape(filename, padlen=778):
    fs, audio = wav.read(filename)
    r = p.mfcc(audio, samplerate=fs, numcep=26)  # 2D array -> timesamples x mfcc_features
    t = np.transpose(r)  # 2D array -> mfcc_features x timesamples
    X = pad_sequences(t, maxlen=padlen, dtype='float', padding='post', truncating='post').T
    return X  # 2D array -> MAXtimesamples x mfcc_features {778 x 26}
Example #17
Source File: generator.py From KerasDeepSpeech with GNU Affero General Public License v3.0 | 5 votes |
def make_aubio_shape(filename, padlen=778):
    r = aubio(filename)
    t = np.transpose(r)  # 2D array -> mfcc_features x timesamples
    X = pad_sequences(t, maxlen=padlen, dtype='float', padding='post', truncating='post').T
    return X  # 2D array -> MAXtimesamples x mfcc_features {778 x 26}
Example #18
Source File: generator.py From KerasDeepSpeech with GNU Affero General Public License v3.0 | 5 votes |
def make_specto_shape(filename, padlen=778):
    r = spectrogram_from_file(filename)
    t = np.transpose(r)  # 2D array -> spec x timesamples
    X = pad_sequences(t, maxlen=padlen, dtype='float', padding='post', truncating='post').T
    return X  # MAXtimesamples x specto {max x 161}
Example #19
Source File: fasttext.py From sears with BSD 2-Clause "Simplified" License | 5 votes |
def predict_proba(self, X):
    x_test = self.tokenizer.texts_to_sequences(X)
    x_test = sequence.pad_sequences(x_test, maxlen=self.maxlen)
    a = self.model.predict(x_test, verbose=0).flatten()
    a = a.reshape(-1, 1)
    return np.hstack((1 - a, a))
Example #20
Source File: model.py From polyaxon-examples with Apache License 2.0 | 5 votes |
def transform_data(x_train, y_train, x_test, y_test, maxlen):
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    return x_train, y_train, x_test, y_test
Example #21
Source File: fasttext.py From sears with BSD 2-Clause "Simplified" License | 5 votes |
def predict_proba(self, X):
    x_test = self.tokenizer.texts_to_sequences(X)
    x_test = self.add_ngrams(x_test)
    x_test = sequence.pad_sequences(x_test, maxlen=self.maxlen)
    a = self.model.predict(x_test).flatten()
    a = a.reshape(-1, 1)
    return np.hstack((1 - a, a))
Example #22
Source File: fasttext.py From sears with BSD 2-Clause "Simplified" License | 5 votes |
def predict(self, X):
    x_test = self.tokenizer.texts_to_sequences(X)
    x_test = self.add_ngrams(x_test)
    x_test = sequence.pad_sequences(x_test, maxlen=self.maxlen)
    return self.model.predict_classes(x_test, verbose=0).flatten()
Example #23
Source File: pipeline.py From krnnt with GNU Lesser General Public License v3.0 | 5 votes |
def pad(batch: List[List[Sample]], unique_features_dict, feature_name: str):
    if not batch:
        return []
    result_batchX = []
    for sentence in batch:
        X_sentence = []
        for sample in sentence:
            X_sentence.append(np.array(k_hot(sample.features[feature_name],
                                             unique_features_dict[feature_name])))
        result_batchX.append(X_sentence)
    return sequence.pad_sequences(result_batchX)
Example #24
Source File: new.py From krnnt with GNU Lesser General Public License v3.0 | 5 votes |
def pad_generator(generator, sequence_length=20):
    for batch_X, batch_y, sentences, sentences_orig in generator:
        if not batch_X or not batch_y:
            continue
        # TODO pad multi inputs
        max_sentence_length = max([len(x) for x in batch_X])
        # print('max_sentence_length', max_sentence_length)
        yield (sequence.pad_sequences(batch_X, maxlen=max_sentence_length),
               sequence.pad_sequences(batch_y, maxlen=max_sentence_length),
               sentences, sentences_orig)
Example #25
Source File: preprocessing.py From toxic_comments with MIT License | 5 votes |
def convert_text2seq(train_texts, test_texts, max_words, max_seq_len, max_char_seq_len, embeds,
                     lower=True, oov_token='__NA__', uniq=False, use_only_exists_words=False):
    texts = train_texts + test_texts
    if uniq:
        texts = [uniq_words_in_text(text) for text in texts]
    if use_only_exists_words:
        texts = [delete_unknown_words(text, embeds) for text in texts]

    # WORD TOKENIZER
    word_tokenizer = Tokenizer(num_words=max_words, lower=lower, char_level=False)
    word_tokenizer.fit_on_texts(texts)
    word_seq_train = word_tokenizer.texts_to_sequences(train_texts)
    word_seq_test = word_tokenizer.texts_to_sequences(test_texts)
    word_index = word_tokenizer.word_index
    word_seq_train = list(sequence.pad_sequences(word_seq_train, maxlen=max_seq_len))
    word_seq_test = list(sequence.pad_sequences(word_seq_test, maxlen=max_seq_len))

    # CHAR TOKENIZER
    char_tokenizer = CountVectorizer(analyzer='char', ngram_range=(3, 3), stop_words=None,
                                     lowercase=True, max_df=0.9, min_df=0, max_features=max_words)
    char_tokenizer.fit(texts)
    char_sparse_train = char_tokenizer.transform(train_texts)
    char_sparse_test = char_tokenizer.transform(test_texts)
    char_seq_train = sparse_to_seq(char_sparse_train, maxlen=max_char_seq_len)
    char_seq_test = sparse_to_seq(char_sparse_test, maxlen=max_char_seq_len)
    char_index = {key: val + 1 for key, val in char_tokenizer.vocabulary_.items()}
    char_index[oov_token] = 0
    char_vocab_len = len(char_index)

    return word_seq_train, word_seq_test, word_index, char_seq_train, char_seq_test, char_index
Example #26
Source File: sampling.py From Neural-Chatbot with GNU General Public License v3.0 | 5 votes |
def respond(self, input, temperature=1.0, greedy=False):
    input = pad_sequences([self._encode(input)], maxlen=self.sequence_length)
    print(input)
    output = self.model.predict(input)[0]
    print(output.shape)
    output[:, 1] = 0
    indices = [probability.argmax(axis=-1) for probability in output] if greedy \
        else [self.sample(probability, temperature) for probability in output]
    return self._decode(indices)
Example #27
Source File: babi.py From dl-models-for-qa with Apache License 2.0 | 5 votes |
def vectorize(data, word2idx, story_maxlen, question_maxlen):
    """ Create the story and question vectors and the label """
    Xs, Xq, Y = [], [], []
    for story, question, answer in data:
        xs = [word2idx[word] for word in story]
        xq = [word2idx[word] for word in question]
        y = np.zeros(len(word2idx) + 1)
        y[word2idx[answer]] = 1
        Xs.append(xs)
        Xq.append(xq)
        Y.append(y)
    return (pad_sequences(Xs, maxlen=story_maxlen),
            pad_sequences(Xq, maxlen=question_maxlen),
            np.array(Y))
Example #28
Source File: Data_process.py From Text_Generate with MIT License | 5 votes |
def creat_x_y(self, maxlen=40, one_hot=False):
    '''
    :param one_hot: whether to one-hot encode y
    :return:
    '''
    self.one_hot = one_hot
    # If the encoding step used mode='length', reuse the same maxlen here for
    # pad_sequences to avoid unnecessary padding
    if self.maxlen is not None:
        maxlen = self.maxlen
    texts_seq = self.texts_seq

    x = []
    y = []
    for i in texts_seq:
        x.append(i[:-1])
        y.append(i[1:])
    # self.x = x
    # self.y = y

    n = 0
    pad_seq = []
    # run pad_sequences in batches
    while n < len(texts_seq):
        pad_seq += list(pad_sequences(x[n:n + 5000], maxlen=maxlen,
                                      padding='post', value=0, dtype='int'))
        n += 5000
        # if n < len(texts_seq):
        #     print('finish pad_sequences %d samples(%f)' % (n, n / len(texts_seq)))
        # else:
        #     print('finish pad_sequences %d samples(1.0)' % len(texts_seq))

    pad_seq = pad_sequences(x, maxlen, padding='post', truncating='post')
    y_pad_seq = pad_sequences(y, maxlen - 1, padding='post', truncating='post')

    # build x and y
    self.x_pad_seq = np.array([i[:-1] for i in pad_seq])
    self.y_pad_seq = np.array([i[1:] for i in pad_seq])

    if one_hot:
        # one-hot encode y
        y_one_hot = [self.creat_one_hot(i, self.num_words) for i in y_pad_seq]
        self.y_one_hot = y_one_hot
Example #29
Source File: data_helper.py From conv-emotion with MIT License | 5 votes |
def prepare_history(self, data, mode, maxlen):
    data = pad_sequences(data, maxlen)             # (batch, maxlen)
    pads = np.zeros(data.shape, dtype=np.float32)  # (batch, maxlen)
    if mode == "own":
        data = np.stack((data, pads), axis=1)
    else:
        data = np.stack((pads, data), axis=1)
    return data  # (batch, 2, maxlen)
Example #30
Source File: model_simple.py From DeepSequenceClassification with GNU General Public License v2.0 | 5 votes |
def vectorize_data(filenames, maxlen=2000, max_charlen=20, output_label_size=6,
                   output_label_dict=None, output_type="hybrid", return_chars=False):
    """ Based on a histogram of document lengths, 2000 is a reasonable maxlen to train on. """
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp.get_documents(filename):
            seq = pp.get_sequences(doc)
            x = []
            x_char = []
            y = []
            for token in seq:
                x.append(1 + token.word_index)  # Add 1 to include token for padding
                if return_chars:
                    x_char.append((1 + np.array(token.char_seq)).tolist())  # Add 1 to include token for padding
                if output_type == "hybrid":
                    y_idx = 1 + output_label_dict.get("%s-%s" % (token.b_label, token.c_label), -1)  # Add 1 to include token for padding
                elif output_type == "category":
                    y_idx = 1 + output_label_dict.get(token.c_label, -1)  # Add 1 to include token for padding
                else:
                    y_idx = 1 + output_label_dict.get(token.b_label, -1)  # Add 1 to include token for padding
                y.append(y_idx)
            X.append(x)
            if return_chars:
                padded_sequence = pad_sequences([[] for k in xrange(maxlen - len(x_char))], maxlen=max_charlen).tolist() + \
                                  pad_sequences(x_char[:maxlen], maxlen=max_charlen).tolist()
                X_char.append(padded_sequence)
            Y.append(y)
    X = pad_sequences(X, maxlen=maxlen)
    Y = pad_sequences(Y, maxlen=maxlen)
    X = np.array(X)
    Y = vtu.to_onehot(Y, output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y