Python keras.preprocessing.sequence() Examples

The following are 10 code examples of the keras.preprocessing.sequence module. The original project and source file for each example are noted above it. You may also want to check out all available functions and classes of the keras.preprocessing module.
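Before the project examples, here is a minimal sketch of the module's most commonly used function, keras.preprocessing.sequence.pad_sequences (the values and maxlen below are illustrative):

from keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2, 3], [4, 5]]
# pads with zeros on the left by default; padding='post' pads on the right
print(pad_sequences(seqs, maxlen=4))
# [[0 1 2 3]
#  [0 0 4 5]]
print(pad_sequences(seqs, maxlen=4, padding='post', truncating='post'))
# [[1 2 3 0]
#  [4 5 0 0]]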
Example #1
Source File: model.py    From KDD2018_MPCN with GNU General Public License v3.0
def learn_single_repr(self, q1_embed, q1_len, q1_max, rnn_type,
                        reuse=None, pool=False, name="", mask=None):
        """ This is the single sequence encoder function.
        rnn_type controls what type of encoder is used.
        Supports neural bag-of-words (NBOW) and CNN encoders.
        """
        if 'NBOW' in rnn_type:
            # neural bag-of-words: sum the embeddings over the time axis
            q1_output = tf.reduce_sum(q1_embed, 1)
            if pool:
                return q1_embed, q1_output
        elif 'CNN' in rnn_type:
            q1_output = build_raw_cnn(q1_embed, self.args.rnn_size,
                filter_sizes=3,
                initializer=self.initializer,
                dropout=self.rnn_dropout, reuse=reuse, name=name)
            if pool:
                # max-pool the CNN features over the time axis
                q1_output = tf.reduce_max(q1_output, 1)
                return q1_output, q1_output
        else:
            # no encoder requested: pass the embeddings through unchanged
            q1_output = q1_embed

        return q1_output 
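The NBOW branch is plain sum pooling over the time axis; a tiny standalone sketch of the same reduction, with a made-up [batch, time, dim] array (not from the source project):

import numpy as np

q1_embed = np.random.rand(2, 5, 8)   # [batch, time, dim]
nbow = q1_embed.sum(axis=1)          # same reduction as tf.reduce_sum(q1_embed, 1)
print(nbow.shape)                    # (2, 8)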
Example #2
Source File: model.py    From supervised-oie with MIT License
def encode_outputs(self, sents):
        """
        Given a dataframe split to sentences, encode outputs for rnn classification.
        Should return a list of encoded label sequences, each of length maxlen.
        """
        output_encodings = []
        sents = self.get_fixed_size(sents)
        # Encode outputs
        for sent in sents:
            output_encodings.append(list(np_utils.to_categorical(list(self.transform_labels(sent.label.values)),
                                                                 num_classes = self.num_of_classes())))

        # Pad / truncate to maximum length
        return np.ndarray(shape = (len(sents),
                                  self.sent_maxlen,
                                  self.num_of_classes()),
                          buffer = np.array(pad_sequences(output_encodings,
                                                          lambda : \
                                                            np.zeros(self.num_of_classes()),
                                                          maxlen = self.sent_maxlen))) 
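Note that encode_outputs calls the project's own pad_sequences (Example #3 below) rather than keras.preprocessing.sequence.pad_sequences: the Keras function pads with scalar values, whereas here each padding element must be a whole zero vector of class scores. A small hedged illustration, assuming the pad_sequences from Example #3 is in scope and three classes:

import numpy as np

num_classes = 3
one_hot_seqs = [[np.eye(num_classes)[0], np.eye(num_classes)[2]]]  # one sentence, two labels
padded = pad_sequences(one_hot_seqs, lambda: np.zeros(num_classes), maxlen=4)
print(np.array(padded).shape)  # (1, 4, 3)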
Example #3
Source File: model.py    From supervised-oie with MIT License
def pad_sequences(sequences, pad_func, maxlen = None):
    """
    Similar to keras.preprocessing.sequence.pad_sequence but using Sample as higher level
    abstraction.
    pad_func is a pad class generator.
    """
    ret = []

    # Determine the maxlen
    if maxlen is None:
        maxlen = max(map(len, sequences))

    # Pad / truncate (done this way to deal with np.array)
    for sequence in sequences:
        cur_seq = list(sequence[:maxlen])
        cur_seq.extend([pad_func()] * (maxlen - len(sequence)))
        ret.append(cur_seq)
    return ret 
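A quick usage sketch with plain integers standing in for the repo's Sample objects (illustrative values); note that over-long sequences are truncated to maxlen:

seqs = [[1, 2, 3, 4, 5], [6, 7]]
print(pad_sequences(seqs, lambda: 0, maxlen=4))
# [[1, 2, 3, 4], [6, 7, 0, 0]]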
Example #4
Source File: Embeddings.py    From delft with Apache License 2.0
def get_ELMo_lmdb_vector(self, token_list, max_size_sentence):
        """
            Try to get the ELMo embeddings for a sequence cached in LMDB
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        try:
            ELMo_vector = np.zeros((len(token_list), max_size_sentence-2, ELMo_embed_size), dtype='float32')
            with self.env_ELMo.begin() as txn:
                for i in range(0, len(token_list)):
                    # get a hash for the token_list
                    the_hash = list_digest(token_list[i])
                    vector = txn.get(the_hash.encode(encoding='UTF-8'))
                    if vector:
                        # adapt expected shape/padding
                        local_embeddings = _deserialize_pickle(vector)
                        if local_embeddings.shape[0] > max_size_sentence-2:
                            # squeeze the extra padding space
                            ELMo_vector[i] = local_embeddings[:max_size_sentence-2,]
                        elif local_embeddings.shape[0] == max_size_sentence-2:
                            # bingo~!
                            ELMo_vector[i] = local_embeddings
                        else:
                            # fill the missing space with padding
                            filler = np.zeros((max_size_sentence-(local_embeddings.shape[0]+2), ELMo_embed_size), dtype='float32')
                            ELMo_vector[i] = np.concatenate((local_embeddings, filler))
                        vector = None
                    else:
                        return None
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening new transaction !
            self.env_ELMo.close()
            self.env_ELMo = lmdb.open(self.embedding_ELMo_cache, readonly=True, max_readers=2048, max_spare_txns=2, lock=False)
            return self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        return ELMo_vector 
Example #5
Source File: Embeddings.py    From delft with Apache License 2.0
def cache_ELMo_lmdb_vector(self, token_list, ELMo_vector):
        """
            Cache in LMDB the ELMo embeddings for a given sequence 
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        txn = self.env_ELMo.begin(write=True)
        for i in range(0, len(token_list)):
            # get a hash for the token_list
            the_hash = list_digest(token_list[i])
            txn.put(the_hash.encode(encoding='UTF-8'), _serialize_pickle(ELMo_vector[i]))  
        txn.commit() 
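Both caching helpers assume an LMDB environment already opened in self.env_ELMo. A minimal sketch of creating such an environment (the path and map_size here are illustrative, not taken from delft):

import lmdb

# LMDB requires the maximum database size to be declared up front
env_ELMo = lmdb.open("elmo-cache", map_size=1024 * 1024 * 1024)  # 1 GiB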
Example #6
Source File: Embeddings.py    From delft with Apache License 2.0
def cache_BERT_lmdb_vector(self, sentence, BERT_vector):
        """
            Cache in LMDB the BERT embeddings for a given sequence 
        """
        if self.env_BERT is None:
            # db cache not available, we don't cache BERT stuff
            return None
        txn = self.env_BERT.begin(write=True)
        # get a hash for the sentence
        the_hash = list_digest(sentence)
        txn.put(the_hash.encode(encoding='UTF-8'), _serialize_pickle(BERT_vector))  
        txn.commit() 
Example #7
Source File: confidence_model.py    From supervised-oie with MIT License
def encode_inputs(self, sents):
        """
        Given a dataframe split to sentences, encode inputs for rnn classification.
        Should return a dictionary of sample sequences, each of length maxlen.
        """
        word_inputs = []
        pred_inputs = []
        pos_inputs = []
        sents = self.get_fixed_size(sents)

        for sent in sents:
            # pandas assigns NaN for the very infrequent empty string (see wiki train)
            sent_words = [word
                         if not (isinstance(word, float) and math.isnan(word)) else " "
                         for word in sent.word.values]

            pos_tags_encodings = [NLTK_POS_TAGS.index(tag)
                                  for (_, tag)
                                  in nltk.pos_tag(sent_words)]
            word_encodings = [self.emb.get_word_index(w) for w in sent_words]
            pred_word_encodings = [self.emb.get_word_index(w) for w in sent_words]
            word_inputs.append([Sample(w) for w in word_encodings])
            pred_inputs.append([Sample(w) for w in pred_word_encodings])
            pos_inputs.append([Sample(pos) for pos in pos_tags_encodings])

        # Pad / truncate to desired maximum length
        ret = {"word_inputs" : [],
               "predicate_inputs": []}
        ret = defaultdict(lambda: [])

        for name, sequence in zip(["word_inputs", "predicate_inputs", "postags_inputs"],
                                  [word_inputs, pred_inputs, pos_inputs]):
            for samples in pad_sequences(sequence,
                                         pad_func = lambda : Pad_sample(),
                                         maxlen = self.sent_maxlen):
                ret[name].append([sample.encode() for sample in samples])

        return {k: np.array(v) for k, v in ret.items()}
Example #8
Source File: confidence_model.py    From supervised-oie with MIT License
def encode_outputs(self, sents):
        """
        Given a dataframe split to sentences, encode outputs for rnn classification.
        Should return a list of encoded label sequences, each of length maxlen.
        """
        output_encodings = []
        sents = self.get_fixed_size(sents)
        # Encode outputs
        for sent in sents:
            output_encodings.append(list(np_utils.to_categorical(
                                                list(self.transform_labels(sent.label.values)),
                                                num_classes = self.num_of_classes())))

        # Pad / truncate to maximum length
        return np.ndarray(shape = (len(sents),
                                  self.sent_maxlen,
                                  self.num_of_classes()),
                          buffer = np.array(pad_sequences(output_encodings,
                                                          lambda : \
                                                            np.zeros(self.num_of_classes()),
                                                          maxlen = self.sent_maxlen)))



    # Functional Keras -- all of the following are currying functions expecting models as input
    # https://keras.io/getting-started/functional-api-guide/ 
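For reference, the functional-API style that comment points to looks roughly like this (a generic sketch, not code from supervised-oie):

from keras.layers import Input, Dense
from keras.models import Model

inputs = Input(shape=(10,))
hidden = Dense(32, activation='relu')(inputs)
outputs = Dense(1, activation='sigmoid')(hidden)
model = Model(inputs=inputs, outputs=outputs)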
Example #9
Source File: conversation_discriminator.py    From Seq2seq-Chatbot-for-Keras with Apache License 2.0
def run_discriminator(q, a):

    sa = (a != 0).sum()  # number of non-zero (non-padding) entries in the answer batch

    # *************************************************************************
    # running discriminator:
    # *************************************************************************

    model_discrim = init_model()
    count = 0
 
    for i, sent in enumerate(a):
        l = np.where(sent==3)  # the position of the EOS symbol
        limit = l[0][0]
        count += limit + 1

    Q = np.zeros((count,maxlen_input))
    A = np.zeros((count,maxlen_input))
    Y = np.zeros((count,dictionary_size))

    # Loop over the training examples:
    count = 0
    for i, sent in enumerate(a):
        ans_partial = np.zeros((1,maxlen_input))
        
        # Loop over the positions of the current target output (the current output sequence):
        l = np.where(sent==3)  #  the position of the symbol EOS
        limit = l[0][0]

        for k in range(1,limit+1):
            # Mapping the target output (the next output word) to one-hot coding:
            y = np.zeros((1, dictionary_size))
            y[0, int(sent[k])] = 1

            # preparing the partial answer to input:
            ans_partial[0,-k:] = sent[0:k]

            # storing one teacher-forcing example (query, partial answer, next word):
            Q[count, :] = q[i:i+1] 
            A[count, :] = ans_partial 
            Y[count, :] = y
            count += 1

    p = model_discrim.predict([Q, A, Y])
    p = p[-sa:-1]
    P = np.sum(np.log(p))/sa
    
    return P 
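To see how the right-aligned partial answers are laid out, here is a tiny standalone trace (maxlen_input and the token ids are made up; 3 marks EOS as in the code above):

import numpy as np

maxlen_input = 5
sent = np.array([1, 7, 9, 3, 0])          # 3 = EOS
limit = np.where(sent == 3)[0][0]
for k in range(1, limit + 1):
    ans_partial = np.zeros((1, maxlen_input))
    ans_partial[0, -k:] = sent[0:k]       # partial answer, right-aligned
    print(ans_partial[0])
# [0. 0. 0. 0. 1.]
# [0. 0. 0. 1. 7.]
# [0. 0. 1. 7. 9.]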
Example #10
Source File: Embeddings.py    From delft with Apache License 2.0
def get_BERT_lmdb_vector(self, sentence):
        """
            Try to get the BERT extracted embeddings for a sequence cached in LMDB
        """
        if self.env_BERT is None:
            # db cache not available, we don't cache BERT stuff
            return None
        try:
            BERT_vector = np.zeros((BERT_sentence_size, BERT_embed_size), dtype='float32')
            with self.env_BERT.begin() as txn:
                # get a hash for the sentence
                the_hash = list_digest(sentence)
                vector = txn.get(the_hash.encode(encoding='UTF-8'))
                
                if vector:
                    # adapt expected shape/padding
                    BERT_vector = _deserialize_pickle(vector)
                    '''
                    if local_embeddings.shape[0] > max_size_sentence:
                        # squeeze the extra padding space
                        BERT_vector = local_embeddings[:max_size_sentence,]
                    elif local_embeddings.shape[0] == max_size_sentence:
                        # bingo~!
                        BERT_vector = local_embeddings
                    else:
                        # fill the missing space with padding
                        filler = np.zeros((max_size_sentence-(local_embeddings.shape[0]), BERT_embed_size), dtype='float32')
                        BERT_vector = np.concatenate((local_embeddings, filler))
                    '''
                    vector = None
                else:
                    return None
                
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening new transaction !
            self.env_BERT.close()
            self.env_BERT = lmdb.open(self.embedding_BERT_cache, readonly=True, max_readers=2048, max_spare_txns=2, lock=False)
            return self.get_BERT_lmdb_vector(sentence)
        return BERT_vector