Python keras.preprocessing.sequence() Examples
The following are 10 code examples of keras.preprocessing.sequence(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module keras.preprocessing.
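For orientation before the examples: the most commonly used function in this module is keras.preprocessing.sequence.pad_sequences, which pads (or truncates) a list of integer sequences to a uniform length. A minimal sketch with made-up sample data:

    from keras.preprocessing.sequence import pad_sequences

    # three variable-length integer sequences (made-up sample data)
    seqs = [[1, 2, 3], [4, 5], [6]]

    # Keras pads with zeros at the front ('pre') by default
    pad_sequences(seqs, maxlen=4)
    # -> [[0, 1, 2, 3],
    #     [0, 0, 4, 5],
    #     [0, 0, 0, 6]]

    # post-padding mirrors the custom pad_sequences in Example #3 below
    pad_sequences(seqs, maxlen=4, padding='post', truncating='post')
    # -> [[1, 2, 3, 0],
    #     [4, 5, 0, 0],
    #     [6, 0, 0, 0]]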
Example #1
Source File: model.py From KDD2018_MPCN with GNU General Public License v3.0
def learn_single_repr(self, q1_embed, q1_len, q1_max, rnn_type,
                      reuse=None, pool=False, name="", mask=None):
    """ This is the single sequence encoder function.
    rnn_type controls what type of encoder is used.
    Supports neural bag-of-words (NBOW) and CNN encoder
    """
    if('NBOW' in rnn_type):
        q1_output = tf.reduce_sum(q1_embed, 1)
        if(pool):
            return q1_embed, q1_output
    elif('CNN' in rnn_type):
        q1_output = build_raw_cnn(q1_embed, self.args.rnn_size,
                                  filter_sizes=3,
                                  initializer=self.initializer,
                                  dropout=self.rnn_dropout,
                                  reuse=reuse, name=name)
        if(pool):
            q1_output = tf.reduce_max(q1_output, 1)
            return q1_output, q1_output
    else:
        q1_output = q1_embed
    return q1_output
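The NBOW branch above is nothing more than a sum over the time axis, and the CNN branch finishes with max-over-time pooling. A standalone sketch of those two pooling operations (written in TensorFlow 2 eager style, unlike the graph-mode original; the shapes are made up):

    import tensorflow as tf

    # a made-up batch: 2 sequences, 3 timesteps, embedding size 4
    q1_embed = tf.random.normal((2, 3, 4))

    # neural bag-of-words: sum the word embeddings over time
    nbow = tf.reduce_sum(q1_embed, 1)    # shape (2, 4)

    # max-over-time pooling, as applied to the CNN branch output
    pooled = tf.reduce_max(q1_embed, 1)  # shape (2, 4)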
Example #2
Source File: model.py From supervised-oie with MIT License
def encode_outputs(self, sents):
    """
    Given a dataframe split to sentences, encode outputs for rnn classification.
    Should return a list sequence of sample of length maxlen.
    """
    output_encodings = []
    sents = self.get_fixed_size(sents)
    # Encode outputs
    for sent in sents:
        output_encodings.append(list(np_utils.to_categorical(list(self.transform_labels(sent.label.values)),
                                                             num_classes=self.num_of_classes())))
    # Pad / truncate to maximum length
    return np.ndarray(shape=(len(sents), self.sent_maxlen, self.num_of_classes()),
                      buffer=np.array(pad_sequences(output_encodings,
                                                    lambda: np.zeros(self.num_of_classes()),
                                                    maxlen=self.sent_maxlen)))
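The per-token labels are turned into one-hot rows by to_categorical before padding. A quick sketch with made-up labels (to_categorical is importable from keras.utils in Keras 2):

    from keras.utils import to_categorical

    # made-up label indices for a 4-token sentence, 3 output classes
    to_categorical([0, 2, 1, 1], num_classes=3)
    # -> [[1., 0., 0.],
    #     [0., 0., 1.],
    #     [0., 1., 0.],
    #     [0., 1., 0.]]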
Example #3
Source File: model.py From supervised-oie with MIT License
def pad_sequences(sequences, pad_func, maxlen=None):
    """
    Similar to keras.preprocessing.sequence.pad_sequences but using
    Sample as higher level abstraction.
    pad_func is a pad class generator.
    """
    ret = []

    # Determine the maxlen
    max_value = max(map(len, sequences))
    if maxlen is None:
        maxlen = max_value

    # Pad / truncate (done this way to deal with np.array)
    for sequence in sequences:
        cur_seq = list(sequence[:maxlen])
        cur_seq.extend([pad_func()] * (maxlen - len(sequence)))
        ret.append(cur_seq)
    return ret
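Unlike the Keras built-in, this variant pads at the end of the sequence and accepts arbitrary pad objects via pad_func. A quick usage sketch with made-up inputs:

    # pad two sequences with zeros up to length 4
    pad_sequences([[1, 2, 3], [4]], pad_func=lambda: 0, maxlen=4)
    # -> [[1, 2, 3, 0], [4, 0, 0, 0]]

    # sequences longer than maxlen are truncated
    pad_sequences([[1, 2, 3, 4, 5]], pad_func=lambda: 0, maxlen=3)
    # -> [[1, 2, 3]]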
Example #4
Source File: Embeddings.py From delft with Apache License 2.0
def get_ELMo_lmdb_vector(self, token_list, max_size_sentence):
    """
    Try to get the ELMo embeddings for a sequence cached in LMDB
    """
    if self.env_ELMo is None:
        # db cache not available, we don't cache ELMo stuff
        return None
    try:
        ELMo_vector = np.zeros((len(token_list), max_size_sentence-2, ELMo_embed_size), dtype='float32')
        with self.env_ELMo.begin() as txn:
            for i in range(0, len(token_list)):
                # get a hash for the token_list
                the_hash = list_digest(token_list[i])
                vector = txn.get(the_hash.encode(encoding='UTF-8'))
                if vector:
                    # adapt expected shape/padding
                    local_embeddings = _deserialize_pickle(vector)
                    if local_embeddings.shape[0] > max_size_sentence-2:
                        # squeeze the extra padding space
                        ELMo_vector[i] = local_embeddings[:max_size_sentence-2,]
                    elif local_embeddings.shape[0] == max_size_sentence-2:
                        # bingo~!
                        ELMo_vector[i] = local_embeddings
                    else:
                        # fill the missing space with padding
                        filler = np.zeros((max_size_sentence-(local_embeddings.shape[0]+2), ELMo_embed_size), dtype='float32')
                        ELMo_vector[i] = np.concatenate((local_embeddings, filler))
                    vector = None
                else:
                    return None
    except lmdb.Error:
        # no idea why, but we need to close and reopen the environment to avoid
        # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
        # when opening new transaction !
        self.env_ELMo.close()
        self.env_ELMo = lmdb.open(self.embedding_ELMo_cache, readonly=True,
                                  max_readers=2048, max_spare_txns=2, lock=False)
        return self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
    return ELMo_vector
Example #5
Source File: Embeddings.py From delft with Apache License 2.0
def cache_ELMo_lmdb_vector(self, token_list, ELMo_vector):
    """
    Cache in LMDB the ELMo embeddings for a given sequence
    """
    if self.env_ELMo is None:
        # db cache not available, we don't cache ELMo stuff
        return None
    txn = self.env_ELMo.begin(write=True)
    for i in range(0, len(token_list)):
        # get a hash for the token_list
        the_hash = list_digest(token_list[i])
        txn.put(the_hash.encode(encoding='UTF-8'), _serialize_pickle(ELMo_vector[i]))
    txn.commit()
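Examples #4 and #5 are the read and write halves of the same LMDB cache. A stripped-down sketch of that round trip, using hashlib and pickle as hypothetical stand-ins for the project's list_digest and _serialize_pickle/_deserialize_pickle helpers:

    import hashlib
    import pickle

    import lmdb
    import numpy as np

    def digest(tokens):
        # hypothetical stand-in for delft's list_digest: a stable key for a token list
        return hashlib.md5(" ".join(tokens).encode('UTF-8')).hexdigest()

    env = lmdb.open('/tmp/embedding-cache', map_size=1024 * 1024 * 1024)
    tokens = ["a", "made", "up", "sentence"]
    vec = np.zeros((len(tokens), 1024), dtype='float32')  # made-up embedding matrix

    # write: key = hash of the token list, value = pickled array
    with env.begin(write=True) as txn:
        txn.put(digest(tokens).encode('UTF-8'), pickle.dumps(vec))

    # read it back
    with env.begin() as txn:
        cached = txn.get(digest(tokens).encode('UTF-8'))
        if cached is not None:
            vec_back = pickle.loads(cached)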
Example #6
Source File: Embeddings.py From delft with Apache License 2.0
def cache_BERT_lmdb_vector(self, sentence, BERT_vector):
    """
    Cache in LMDB the BERT embeddings for a given sequence
    """
    if self.env_BERT is None:
        # db cache not available, we don't cache BERT stuff
        return None
    txn = self.env_BERT.begin(write=True)
    #for i in range(0, len(sentence)):
    # get a hash for the sentence
    the_hash = list_digest(sentence)
    txn.put(the_hash.encode(encoding='UTF-8'), _serialize_pickle(BERT_vector))
    txn.commit()
Example #7
Source File: confidence_model.py From supervised-oie with MIT License
def encode_inputs(self, sents):
    """
    Given a dataframe split to sentences, encode inputs for rnn classification.
    Should return a dictionary of sequences of sample of length maxlen.
    """
    word_inputs = []
    pred_inputs = []
    pos_inputs = []
    sents = self.get_fixed_size(sents)
    for sent in sents:
        # pd assigns NaN for very infreq. empty string (see wiki train)
        sent_words = [word if not (isinstance(word, float) and math.isnan(word)) else " "
                      for word in sent.word.values]
        pos_tags_encodings = [NLTK_POS_TAGS.index(tag)
                              for (_, tag) in nltk.pos_tag(sent_words)]
        word_encodings = [self.emb.get_word_index(w) for w in sent_words]
        pred_word_encodings = [self.emb.get_word_index(w) for w in sent_words]
        word_inputs.append([Sample(w) for w in word_encodings])
        pred_inputs.append([Sample(w) for w in pred_word_encodings])
        pos_inputs.append([Sample(pos) for pos in pos_tags_encodings])

    # Pad / truncate to desired maximum length
    ret = defaultdict(lambda: [])
    for name, sequence in zip(["word_inputs", "predicate_inputs", "postags_inputs"],
                              [word_inputs, pred_inputs, pos_inputs]):
        for samples in pad_sequences(sequence,
                                     pad_func=lambda: Pad_sample(),
                                     maxlen=self.sent_maxlen):
            ret[name].append([sample.encode() for sample in samples])
    return {k: np.array(v) for k, v in ret.items()}
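The Sample and Pad_sample wrappers come from the supervised-oie project and only need to expose encode(). A hypothetical minimal pair, consistent with how they are used above but not necessarily the project's actual definitions:

    class Sample(object):
        # hypothetical minimal version: wraps a single encoded value
        def __init__(self, value):
            self.value = value

        def encode(self):
            return self.value

    class Pad_sample(Sample):
        # padding element, encodes to zero
        def __init__(self):
            super(Pad_sample, self).__init__(0)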
Example #8
Source File: confidence_model.py From supervised-oie with MIT License
def encode_outputs(self, sents):
    """
    Given a dataframe split to sentences, encode outputs for rnn classification.
    Should return a list sequence of sample of length maxlen.
    """
    output_encodings = []
    sents = self.get_fixed_size(sents)
    # Encode outputs
    for sent in sents:
        output_encodings.append(list(np_utils.to_categorical(list(self.transform_labels(sent.label.values)),
                                                             num_classes=self.num_of_classes())))
    # Pad / truncate to maximum length
    return np.ndarray(shape=(len(sents), self.sent_maxlen, self.num_of_classes()),
                      buffer=np.array(pad_sequences(output_encodings,
                                                    lambda: np.zeros(self.num_of_classes()),
                                                    maxlen=self.sent_maxlen)))

# Functional Keras -- all of the following are currying functions expecting models as input
# https://keras.io/getting-started/functional-api-guide/
Example #9
Source File: conversation_discriminator.py From Seq2seq-Chatbot-for-Keras with Apache License 2.0
def run_discriminator(q, a):
    sa = (a != 0).sum()

    # *************************************************************************
    # running discriminator:
    # *************************************************************************

    p = 1
    m = 0
    model_discrim = init_model()
    count = 0

    for i, sent in enumerate(a):
        l = np.where(sent == 3)  # the position of the symbol EOS
        limit = l[0][0]
        count += limit + 1

    Q = np.zeros((count, maxlen_input))
    A = np.zeros((count, maxlen_input))
    Y = np.zeros((count, dictionary_size))

    # Loop over the training examples:
    count = 0
    for i, sent in enumerate(a):
        ans_partial = np.zeros((1, maxlen_input))

        # Loop over the positions of the current target output (the current output sequence):
        l = np.where(sent == 3)  # the position of the symbol EOS
        limit = l[0][0]

        for k in range(1, limit + 1):
            # Mapping the target output (the next output word) for one-hot coding:
            y = np.zeros((1, dictionary_size))
            y[0, int(sent[k])] = 1

            # preparing the partial answer to input:
            ans_partial[0, -k:] = sent[0:k]

            # training the model for one epoch using teacher forcing:
            Q[count, :] = q[i:i+1]
            A[count, :] = ans_partial
            Y[count, :] = y
            count += 1

    p = model_discrim.predict([Q, A, Y])
    p = p[-sa:-1]
    P = np.sum(np.log(p)) / sa
    return P
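The core trick is how ans_partial right-aligns the first k answer tokens for teacher forcing. A tiny numpy demo with a made-up sentence:

    import numpy as np

    maxlen_input = 6
    sent = np.array([7, 9, 4, 3])  # made-up token ids; 3 marks EOS

    k = 2
    ans_partial = np.zeros((1, maxlen_input))
    ans_partial[0, -k:] = sent[0:k]
    # -> [[0., 0., 0., 0., 7., 9.]]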
Example #10
Source File: Embeddings.py From delft with Apache License 2.0
def get_BERT_lmdb_vector(self, sentence):
    """
    Try to get the BERT extracted embeddings for a sequence cached in LMDB
    """
    if self.env_BERT is None:
        # db cache not available, we don't cache BERT stuff
        return None
    try:
        BERT_vector = np.zeros((BERT_sentence_size, BERT_embed_size), dtype='float32')
        with self.env_BERT.begin() as txn:
            # get a hash for the sentence
            the_hash = list_digest(sentence)
            vector = txn.get(the_hash.encode(encoding='UTF-8'))
            if vector:
                # adapt expected shape/padding
                BERT_vector = _deserialize_pickle(vector)
                '''
                if local_embeddings.shape[0] > max_size_sentence:
                    # squeeze the extra padding space
                    BERT_vector = local_embeddings[:max_size_sentence,]
                elif local_embeddings.shape[0] == max_size_sentence:
                    # bingo~!
                    BERT_vector = local_embeddings
                else:
                    # fill the missing space with padding
                    filler = np.zeros((max_size_sentence-(local_embeddings.shape[0]), BERT_embed_size), dtype='float32')
                    BERT_vector = np.concatenate((local_embeddings, filler))
                '''
                vector = None
            else:
                return None
    except lmdb.Error:
        # no idea why, but we need to close and reopen the environment to avoid
        # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
        # when opening new transaction !
        self.env_BERT.close()
        self.env_BERT = lmdb.open(self.embedding_BERT_cache, readonly=True,
                                  max_readers=2048, max_spare_txns=2, lock=False)
        return self.get_BERT_lmdb_vector(sentence)
    return BERT_vector