Python keras.preprocessing.sequence() Examples

The following are 10 code examples of the keras.preprocessing.sequence module. The original project and source file for each example are noted above it. You may also want to check out all available functions and classes of the keras.preprocessing module.
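Before the project examples, here is a minimal sketch of the module's most commonly used function, keras.preprocessing.sequence.pad_sequences (the values and maxlen below are illustrative):

from keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2, 3], [4, 5]]
# pads with zeros on the left by default; padding='post' pads on the right
print(pad_sequences(seqs, maxlen=4))
# [[0 1 2 3]
#  [0 0 4 5]]
print(pad_sequences(seqs, maxlen=4, padding='post', truncating='post'))
# [[1 2 3 0]
#  [4 5 0 0]]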
Example #1
Source File: model.py    From KDD2018_MPCN with GNU General Public License v3.0
def learn_single_repr(self, q1_embed, q1_len, q1_max, rnn_type,
                        reuse=None, pool=False, name="", mask=None):
        """ This is the single sequence encoder function.
        rnn_type controls what type of encoder is used.
        Supports neural bag-of-words (NBOW) and CNN encoders.
        """
        if 'NBOW' in rnn_type:
            # neural bag-of-words: sum the embeddings over the time axis
            q1_output = tf.reduce_sum(q1_embed, 1)
            if pool:
                return q1_embed, q1_output
        elif 'CNN' in rnn_type:
            q1_output = build_raw_cnn(q1_embed, self.args.rnn_size,
                filter_sizes=3,
                initializer=self.initializer,
                dropout=self.rnn_dropout, reuse=reuse, name=name)
            if pool:
                # max-pool the CNN features over the time axis
                q1_output = tf.reduce_max(q1_output, 1)
                return q1_output, q1_output
        else:
            # no encoder requested: pass the embeddings through unchanged
            q1_output = q1_embed

        return q1_output 
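The NBOW branch is plain sum pooling over the time axis; a tiny standalone sketch of the same reduction, with a made-up [batch, time, dim] array (not from the source project):

import numpy as np

q1_embed = np.random.rand(2, 5, 8)   # [batch, time, dim]
nbow = q1_embed.sum(axis=1)          # same reduction as tf.reduce_sum(q1_embed, 1)
print(nbow.shape)                    # (2, 8)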
Example #2
Source File: model.py    From supervised-oie with MIT License
def encode_outputs(self, sents):
        """
        Given a dataframe split to sentences, encode outputs for rnn classification.
        Should return a list of encoded label sequences, each of length maxlen.
        """
        output_encodings = []
        sents = self.get_fixed_size(sents)
        # Encode outputs
        for sent in sents:
            output_encodings.append(list(np_utils.to_categorical(list(self.transform_labels(sent.label.values)),
                                                                 num_classes = self.num_of_classes())))

        # Pad / truncate to maximum length
        return np.ndarray(shape = (len(sents),
                                  self.sent_maxlen,
                                  self.num_of_classes()),
                          buffer = np.array(pad_sequences(output_encodings,
                                                          lambda : \
                                                            np.zeros(self.num_of_classes()),
                                                          maxlen = self.sent_maxlen))) 
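Note that encode_outputs calls the project's own pad_sequences (Example #3 below) rather than keras.preprocessing.sequence.pad_sequences: the Keras function pads with scalar values, whereas here each padding element must be a whole zero vector of class scores. A small hedged illustration, assuming the pad_sequences from Example #3 is in scope and three classes:

import numpy as np

num_classes = 3
one_hot_seqs = [[np.eye(num_classes)[0], np.eye(num_classes)[2]]]  # one sentence, two labels
padded = pad_sequences(one_hot_seqs, lambda: np.zeros(num_classes), maxlen=4)
print(np.array(padded).shape)  # (1, 4, 3)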
Example #3
Source File: model.py    From supervised-oie with MIT License
def pad_sequences(sequences, pad_func, maxlen = None):
    """
    Similar to keras.preprocessing.sequence.pad_sequence but using Sample as higher level
    abstraction.
    pad_func is a pad class generator.
    """
    ret = []

    # Determine the maxlen
    if maxlen is None:
        maxlen = max(map(len, sequences))

    # Pad / truncate (done this way to deal with np.array)
    for sequence in sequences:
        cur_seq = list(sequence[:maxlen])
        cur_seq.extend([pad_func()] * (maxlen - len(sequence)))
        ret.append(cur_seq)
    return ret 
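A quick usage sketch with plain integers standing in for the repo's Sample objects (illustrative values); note that over-long sequences are truncated to maxlen:

seqs = [[1, 2, 3, 4, 5], [6, 7]]
print(pad_sequences(seqs, lambda: 0, maxlen=4))
# [[1, 2, 3, 4], [6, 7, 0, 0]]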
Example #4
Source File: Embeddings.py    From delft with Apache License 2.0
def get_ELMo_lmdb_vector(self, token_list, max_size_sentence):
        """
            Try to get the ELMo embeddings for a sequence cached in LMDB
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        try:
            ELMo_vector = np.zeros((len(token_list), max_size_sentence-2, ELMo_embed_size), dtype='float32')
            with self.env_ELMo.begin() as txn:
                for i in range(0, len(token_list)):
                    # get a hash for the token_list
                    the_hash = list_digest(token_list[i])
                    vector = txn.get(the_hash.encode(encoding='UTF-8'))
                    if vector:
                        # adapt expected shape/padding
                        local_embeddings = _deserialize_pickle(vector)
                        if local_embeddings.shape[0] > max_size_sentence-2:
                            # squeeze the extra padding space
                            ELMo_vector[i] = local_embeddings[:max_size_sentence-2,]
                        elif local_embeddings.shape[0] == max_size_sentence-2:
                            # bingo~!
                            ELMo_vector[i] = local_embeddings
                        else:
                            # fill the missing space with padding
                            filler = np.zeros((max_size_sentence-(local_embeddings.shape[0]+2), ELMo_embed_size), dtype='float32')
                            ELMo_vector[i] = np.concatenate((local_embeddings, filler))
                        vector = None
                    else:
                        return None
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening new transaction !
            self.env_ELMo.close()
            self.env_ELMo = lmdb.open(self.embedding_ELMo_cache, readonly=True, max_readers=2048, max_spare_txns=2, lock=False)
            return self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        return ELMo_vector 
Example #5
Source File: Embeddings.py    From delft with Apache License 2.0
def cache_ELMo_lmdb_vector(self, token_list, ELMo_vector):
        """
            Cache in LMDB the ELMo embeddings for a given sequence 
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        txn = self.env_ELMo.begin(write=True)
        for i in range(0, len(token_list)):
            # get a hash for the token_list
            the_hash = list_digest(token_list[i])
            txn.put(the_hash.encode(encoding='UTF-8'), _serialize_pickle(ELMo_vector[i]))  
        txn.commit() 
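Both caching helpers assume an LMDB environment already opened in self.env_ELMo. A minimal sketch of creating such an environment (the path and map_size here are illustrative, not taken from delft):

import lmdb

# LMDB requires the maximum database size to be declared up front
env_ELMo = lmdb.open("elmo-cache", map_size=1024 * 1024 * 1024)  # 1 GiB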
Example #6
Source File: Embeddings.py    From delft with Apache License 2.0
def cache_BERT_lmdb_vector(self, sentence, BERT_vector):
        """
            Cache in LMDB the BERT embeddings for a given sequence 
        """
        if self.env_BERT is None:
            # db cache not available, we don't cache BERT stuff
            return None
        txn = self.env_BERT.begin(write=True)
        # get a hash for the sentence
        the_hash = list_digest(sentence)
        txn.put(the_hash.encode(encoding='UTF-8'), _serialize_pickle(BERT_vector))  
        txn.commit() 
Example #7
Source File: confidence_model.py    From supervised-oie with MIT License
def encode_inputs(self, sents):
        """
        Given a dataframe split to sentences, encode inputs for rnn classification.
        Should return a dictionary of sample sequences, each of length maxlen.
        """
        word_inputs = []
        pred_inputs = []
        pos_inputs = []
        sents = self.get_fixed_size(sents)

        for sent in sents:
            # pandas assigns NaN for the very infrequent empty string (see wiki train)
            sent_words = [word
                         if not (isinstance(word, float) and math.isnan(word)) else " "
                         for word in sent.word.values]

            pos_tags_encodings = [NLTK_POS_TAGS.index(tag)
                                  for (_, tag)
                                  in nltk.pos_tag(sent_words)]
            word_encodings = [self.emb.get_word_index(w) for w in sent_words]
            pred_word_encodings = [self.emb.get_word_index(w) for w in sent_words]
            word_inputs.append([Sample(w) for w in word_encodings])
            pred_inputs.append([Sample(w) for w in pred_word_encodings])
            pos_inputs.append([Sample(pos) for pos in pos_tags_encodings])

        # Pad / truncate to desired maximum length
        ret = {"word_inputs" : [],
               "predicate_inputs": []}
        ret = defaultdict(lambda: [])

        for name, sequence in zip(["word_inputs", "predicate_inputs", "postags_inputs"],
                                  [word_inputs, pred_inputs, pos_inputs]):
            for samples in pad_sequences(sequence,
                                         pad_func = lambda : Pad_sample(),
                                         maxlen = self.sent_maxlen):
                ret[name].append([sample.encode() for sample in samples])

        return {k: np.array(v) for k, v in ret.items()}
Example #8
Source File: confidence_model.py    From supervised-oie with MIT License
def encode_outputs(self, sents):
        """
        Given a dataframe split to sentences, encode outputs for rnn classification.
        Should return a list of encoded label sequences, each of length maxlen.
        """
        output_encodings = []
        sents = self.get_fixed_size(sents)
        # Encode outputs
        for sent in sents:
            output_encodings.append(list(np_utils.to_categorical(
                                                list(self.transform_labels(sent.label.values)),
                                                num_classes = self.num_of_classes())))

        # Pad / truncate to maximum length
        return np.ndarray(shape = (len(sents),
                                  self.sent_maxlen,
                                  self.num_of_classes()),
                          buffer = np.array(pad_sequences(output_encodings,
                                                          lambda : \
                                                            np.zeros(self.num_of_classes()),
                                                          maxlen = self.sent_maxlen)))



    # Functional Keras -- all of the following are currying functions expecting models as input
    # https://keras.io/getting-started/functional-api-guide/ 
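For reference, the functional-API style that comment points to looks roughly like this (a generic sketch, not code from supervised-oie):

from keras.layers import Input, Dense
from keras.models import Model

inputs = Input(shape=(10,))
hidden = Dense(32, activation='relu')(inputs)
outputs = Dense(1, activation='sigmoid')(hidden)
model = Model(inputs=inputs, outputs=outputs)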
Example #9
Source File: conversation_discriminator.py    From Seq2seq-Chatbot-for-Keras with Apache License 2.0
def run_discriminator(q, a):

    sa = (a != 0).sum()  # number of non-zero (non-padding) entries in the answer batch

    # *************************************************************************
    # running discriminator:
    # *************************************************************************

    model_discrim = init_model()
    count = 0
 
    for i, sent in enumerate(a):
        l = np.where(sent==3)  # the position of the EOS symbol
        limit = l[0][0]
        count += limit + 1

    Q = np.zeros((count,maxlen_input))
    A = np.zeros((count,maxlen_input))
    Y = np.zeros((count,dictionary_size))

    # Loop over the training examples:
    count = 0
    for i, sent in enumerate(a):
        ans_partial = np.zeros((1,maxlen_input))
        
        # Loop over the positions of the current target output (the current output sequence):
        l = np.where(sent==3)  #  the position of the symbol EOS
        limit = l[0][0]

        for k in range(1,limit+1):
            # Mapping the target output (the next output word) to one-hot coding:
            y = np.zeros((1, dictionary_size))
            y[0, int(sent[k])] = 1

            # preparing the partial answer to input:
            ans_partial[0,-k:] = sent[0:k]

            # storing one teacher-forcing example (query, partial answer, next word):
            Q[count, :] = q[i:i+1] 
            A[count, :] = ans_partial 
            Y[count, :] = y
            count += 1

    p = model_discrim.predict([Q, A, Y])
    p = p[-sa:-1]
    P = np.sum(np.log(p))/sa
    
    return P 
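To see how the right-aligned partial answers are laid out, here is a tiny standalone trace (maxlen_input and the token ids are made up; 3 marks EOS as in the code above):

import numpy as np

maxlen_input = 5
sent = np.array([1, 7, 9, 3, 0])          # 3 = EOS
limit = np.where(sent == 3)[0][0]
for k in range(1, limit + 1):
    ans_partial = np.zeros((1, maxlen_input))
    ans_partial[0, -k:] = sent[0:k]       # partial answer, right-aligned
    print(ans_partial[0])
# [0. 0. 0. 0. 1.]
# [0. 0. 0. 1. 7.]
# [0. 0. 1. 7. 9.]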
Example #10
Source File: Embeddings.py    From delft with Apache License 2.0
def get_BERT_lmdb_vector(self, sentence):
        """
            Try to get the BERT extracted embeddings for a sequence cached in LMDB
        """
        if self.env_BERT is None:
            # db cache not available, we don't cache BERT stuff
            return None
        try:
            BERT_vector = np.zeros((BERT_sentence_size, BERT_embed_size), dtype='float32')
            with self.env_BERT.begin() as txn:
                # get a hash for the sentence
                the_hash = list_digest(sentence)
                vector = txn.get(the_hash.encode(encoding='UTF-8'))
                
                if vector:
                    # adapt expected shape/padding
                    BERT_vector = _deserialize_pickle(vector)
                    '''
                    if local_embeddings.shape[0] > max_size_sentence:
                        # squeeze the extra padding space
                        BERT_vector = local_embeddings[:max_size_sentence,]
                    elif local_embeddings.shape[0] == max_size_sentence:
                        # bingo~!
                        BERT_vector = local_embeddings
                    else:
                        # fill the missing space with padding
                        filler = np.zeros((max_size_sentence-(local_embeddings.shape[0]), BERT_embed_size), dtype='float32')
                        BERT_vector = np.concatenate((local_embeddings, filler))
                    '''
                    vector = None
                else:
                    return None
                
        except lmdb.Error:
            # no idea why, but we need to close and reopen the environment to avoid
            # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot
            # when opening new transaction !
            self.env_BERT.close()
            self.env_BERT = lmdb.open(self.embedding_BERT_cache, readonly=True, max_readers=2048, max_spare_txns=2, lock=False)
            return self.get_BERT_lmdb_vector(sentence)
        return BERT_vector