Python keras.preprocessing.text.text_to_word_sequence() Examples

The following are 12 code examples of keras.preprocessing.text.text_to_word_sequence(), collected from open-source projects. Each example notes its source file, project, and license. You may also want to check out all available functions/classes of the module keras.preprocessing.text.
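As a quick orientation before the project examples, here is a minimal sketch of the function's default behaviour (assuming a standard Keras installation): punctuation in the default filters string is stripped, the text is lower-cased, and the result is split on whitespace.

from keras.preprocessing.text import text_to_word_sequence

# Default behaviour: strip common punctuation, lower-case, split on spaces.
print(text_to_word_sequence('Hello, World! How are you?'))
# ['hello', 'world', 'how', 'are', 'you']

# Both the filter set and the split token can be overridden.
print(text_to_word_sequence('hello-world', filters='', split='-'))
# ['hello', 'world']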
Example #1
Source File: feature.py    From text-classifier with Apache License 2.0
import logging
import re

import numpy as np

logger = logging.getLogger(__name__)


def doc_vec_feature(self, data_set, max_sentences=16):
    from keras.preprocessing.text import Tokenizer, text_to_word_sequence
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data_set)
    # (documents, sentences, tokens) tensor of vocabulary indices; 0 = padding.
    data_feature = np.zeros((len(data_set), max_sentences, self.max_len), dtype='int32')
    # Build a regex character class such as "[.!?]" from the sentence delimiters.
    sentence_symbols = "".join(self.sentence_symbol)
    split = "[" + sentence_symbols + "]"
    for i, sentence in enumerate(data_set):
        short_sents = re.split(split, sentence)
        for j, sent in enumerate(short_sents):
            if j < max_sentences and sent.strip():
                words = text_to_word_sequence(sent)
                k = 0
                for w in words:
                    if k < self.max_len:
                        if w in tokenizer.word_index:
                            data_feature[i, j, k] = tokenizer.word_index[w]
                        # Advance even for unknown words so positions stay aligned.
                        k += 1
    word_index = tokenizer.word_index
    logger.info('Number of Unique Tokens: %d' % len(word_index))
    print('Shape of Data Tensor:', data_feature.shape)
    return data_feature
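The same pattern can be exercised outside the class. The following self-contained sketch uses assumed values for max_sentences and max_len and plain English sentence delimiters:

import re
import numpy as np
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

docs = ['First sentence. Second one!', 'Only one sentence.']
max_sentences, max_len = 4, 10

tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)

# (documents, sentences, tokens) tensor of vocabulary indices; 0 = padding.
data = np.zeros((len(docs), max_sentences, max_len), dtype='int32')
for i, doc in enumerate(docs):
    for j, sent in enumerate(re.split(r'[.!?]', doc)[:max_sentences]):
        for k, w in enumerate(text_to_word_sequence(sent)[:max_len]):
            data[i, j, k] = tokenizer.word_index.get(w, 0)
print(data.shape)  # (2, 4, 10)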
Example #2
Source File: punctuator.py    From keras-punctuator with MIT License
from keras.preprocessing.text import text_to_word_sequence


def texts_to_sequences(wordIndex, texts, num_words):
    # The highest in-vocabulary index doubles as the rare/unknown-word bucket.
    lastWord = num_words - 1
    sequences = []
    for text in texts:
        seq = text_to_word_sequence(text)
        vect = []
        for w in seq:
            i = wordIndex.get(w)
            if i is not None:
                if num_words and i >= num_words:
                    vect.append(lastWord)
                else:
                    vect.append(i)
            else:
                vect.append(lastWord)
        sequences.append(vect)
    return sequences
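A hypothetical call to the function above, where word_index stands in for a fitted Tokenizer's word_index; out-of-range and unknown words both collapse to the last in-vocabulary index:

word_index = {'hello': 1, 'world': 2, 'rare': 9}
print(texts_to_sequences(word_index, ['Hello world!', 'rare unseen word'], num_words=5))
# [[1, 2], [4, 4, 4]]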
Example #3
Source File: preprocessors.py    From keras-image-captioning with MIT License
from keras.preprocessing.text import Tokenizer, text_to_word_sequence


def _handle_rare_words(self, captions):
    if self._rare_words_handling == 'nothing':
        return captions
    elif self._rare_words_handling == 'discard':
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(captions)
        new_captions = []
        for caption in captions:
            words = text_to_word_sequence(caption)
            # Keep only words that occur often enough in the whole corpus.
            new_words = [w for w in words
                         if tokenizer.word_counts.get(w, 0) >=
                         self._words_min_occur]
            new_captions.append(' '.join(new_words))
        return new_captions

    raise NotImplementedError('rare_words_handling={} is not implemented '
                              'yet!'.format(self._rare_words_handling))
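The 'discard' branch can be sketched in isolation; the toy corpus and the minimum-occurrence threshold below are assumptions for illustration:

from keras.preprocessing.text import Tokenizer, text_to_word_sequence

captions = ['a dog runs', 'a dog sleeps', 'a rare okapi']
min_occur = 2

tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
kept = [' '.join(w for w in text_to_word_sequence(c)
                 if tokenizer.word_counts.get(w, 0) >= min_occur)
        for c in captions]
print(kept)  # ['a dog', 'a dog', 'a']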
Example #4
Source File: text_test.py    From DeepLearning_Wavelet-LSTM with MIT License
def test_text_to_word_sequence_unicode():
    text = u'ali! veli? kırk dokuz elli'
    assert text_to_word_sequence(text) == [u'ali', u'veli', u'kırk', u'dokuz', u'elli'] 
Example #5
Source File: sample_size_NN.py    From robotreviewer with GNU General Public License v3.0
from keras.preprocessing.text import Tokenizer, text_to_word_sequence


def __init__(self, max_features, wvs, all_texts, unk=True, unk_symbol="unkunk"):
    '''
    max_features: the upper bound to be placed on the vocabulary size.
    wvs: pre-trained word vectors used for initialization; the embedding
         dimensionality is taken from them.
    all_texts: the corpus the tokenizer is fit on.
    unk: whether to rewrite out-of-vocabulary tokens as `unk_symbol`.
    '''
    self.unk = unk
    self.unk_symbol = unk_symbol
    self.max_features = max_features
    # `nb_words` is the Keras 1 spelling; Keras 2 renamed it to `num_words`.
    self.tokenizer = Tokenizer(nb_words=self.max_features)

    self.embedding_dims = wvs.vector_size
    self.word_embeddings = wvs

    self.raw_texts = all_texts
    self.unked_texts = []
    self.fit_tokenizer()

    if self.unk:
        # Rewrite the 'raw texts' with unked versions, where tokens not in
        # the top max_features are replaced by the UNK symbol.
        sorted_tokens = sorted(self.tokenizer.word_index, key=self.tokenizer.word_index.get)
        self.known_tokens = sorted_tokens[:self.max_features]
        self.tokens_to_unk = sorted_tokens[self.max_features:]

        for idx, text in enumerate(self.raw_texts):
            cur_text = text_to_word_sequence(text, split=self.tokenizer.split)
            t_or_unk = lambda t: t if t in self.known_tokens else self.unk_symbol
            unked_text = [t_or_unk(t) for t in cur_text]
            unked_text = self.tokenizer.split.join(unked_text)

            self.unked_texts.append(unked_text)

        self.raw_texts = self.unked_texts
        self.fit_tokenizer()

    self.init_word_vectors()
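The UNK rewrite at the heart of this constructor can be shown on its own; the toy corpus and vocabulary size are assumptions:

from keras.preprocessing.text import Tokenizer, text_to_word_sequence

texts = ['the cat sat', 'the dog sat', 'the cat ran']
max_features, unk_symbol = 3, 'unkunk'

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
# word_index is ordered by frequency, so the lowest indices are the
# most common tokens.
sorted_tokens = sorted(tokenizer.word_index, key=tokenizer.word_index.get)
known = set(sorted_tokens[:max_features])
print([' '.join(t if t in known else unk_symbol
                for t in text_to_word_sequence(s)) for s in texts])
# ['the cat sat', 'the unkunk sat', 'the cat unkunk']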
Example #6
Source File: text_test.py    From DeepLearning_Wavelet-LSTM with MIT License
def test_text_to_word_sequence_unicode_multichar_split():
    text = u'ali!stopveli?stopkırkstopdokuzstopelli'
    assert text_to_word_sequence(text, split='stop') == [u'ali', u'veli', u'kırk', u'dokuz', u'elli'] 
Example #7
Source File: text_test.py    From DeepLearning_Wavelet-LSTM with MIT License
def test_text_to_word_sequence():
    text = 'hello! ? world!'
    assert text_to_word_sequence(text) == ['hello', 'world'] 
Example #8
Source File: text_test.py    From DeepLearning_Wavelet-LSTM with MIT License
def test_text_to_word_sequence_multichar_split():
    text = 'hello!stop?world!'
    assert text_to_word_sequence(text, split='stop') == ['hello', 'world'] 
Example #9
Source File: HAN.py    From DeepResearch with MIT License
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from nltk import tokenize


def preprocessing(self):
    """Preprocess the raw text into a (documents, sentences, words)
    tensor for training.
    """
    paras = []
    labels = []
    texts = []
    for idx in range(self.text.shape[0]):
        text = self.clean_string(self.text[idx])
        texts.append(text)
        sentences = tokenize.sent_tokenize(text)
        paras.append(sentences)
    # Note: the original passes oov_token=True (a bool) rather than a string.
    tokenizer = Tokenizer(num_words=self.max_features, oov_token=True)
    tokenizer.fit_on_texts(texts)
    data = np.zeros((len(texts), self.max_senten_num,
                     self.max_senten_len), dtype='int32')
    for i, sentences in enumerate(paras):
        for j, sent in enumerate(sentences):
            if j < self.max_senten_num:
                wordTokens = text_to_word_sequence(sent)
                k = 0
                for _, word in enumerate(wordTokens):
                    if (k < self.max_senten_len
                            and word in tokenizer.word_index
                            and tokenizer.word_index[word] < self.max_features):
                        data[i, j, k] = tokenizer.word_index[word]
                        k = k + 1
    self.word_index = tokenizer.word_index
    if self.verbose == 1:
        print('Total %s unique tokens.' % len(self.word_index))
    labels = pd.get_dummies(self.categories)
    if self.verbose == 1:
        print('Shape of data tensor:', data.shape)
        print('Shape of labels tensor:', labels.shape)
    assert (len(self.classes) == labels.shape[1])
    assert (data.shape[0] == labels.shape[0])
    return data, labels
Example #10
Source File: preprocessors.py    From keras-image-captioning with MIT License
from keras.preprocessing.text import text_to_word_sequence


def normalize_captions(self, captions_txt):
    captions_txt = self._add_eos(captions_txt)
    # Tokenize each caption and re-join, so every caption ends up
    # lower-cased with punctuation stripped.
    word_sequences = map(text_to_word_sequence, captions_txt)
    result = map(' '.join, word_sequences)
    return result
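A quick stand-alone illustration of the same mapping (note that under Python 3 map returns a lazy iterator, so materialise it with list() if it will be consumed more than once):

from keras.preprocessing.text import text_to_word_sequence

caps = ['A dog runs.', 'Two dogs sleep!']
print(list(map(' '.join, map(text_to_word_sequence, caps))))
# ['a dog runs', 'two dogs sleep']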
Example #11
Source File: handler.py    From tensorflow-chatbot-chinese with MIT License
import re

import numpy as np

from keras.preprocessing.text import text_to_word_sequence


def prep(self, data):
    init = True
    for i in range(len(data)):
        reg = re.findall(r"[\w']+", data[i])
        if len(reg) == 0:  # dialogue separator line (+++$+++)
            init = True
            continue

        sent = text_to_word_sequence(data[i], lower=True, split=' ')
        if len(sent) > 15 or len(sent) < 2:  # too long or too short
            init = True
            continue
        idx_list = self.sentence_to_idx(sent)
        if len(idx_list) == 0:  # too many <UNK> tokens
            init = True
            continue

        if init:
            _in = idx_list
            init = False
        else:
            _out = idx_list
            # Pair each line with the one before it; the first <EOS> is part
            # of the loss. `special_tokens` is a module-level dict in the source.
            self.data.append([_in, _out + [special_tokens['<EOS>']]])
            _in = idx_list
        if i % 100000 == 0:
            print("building data list: " + str(i) + "/" + str(len(data)) + " done.")

    print('original line num:', len(data))
    print('prep data num: ', len(self.data))
    self.data = np.array(self.data)
    self.perm = np.arange(len(self.data), dtype=int)
    self.shuffle_perm()
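The consecutive-pairing idea can be shown on its own; the token lists below stand in for the index lists produced by sentence_to_idx():

# Each line becomes the target of the line before it, with <EOS> appended.
lines = [['hi', 'there'], ['how', 'are', 'you'], ['fine', 'thanks']]
pairs = [(a, b + ['<EOS>']) for a, b in zip(lines, lines[1:])]
print(pairs)
# [(['hi', 'there'], ['how', 'are', 'you', '<EOS>']),
#  (['how', 'are', 'you'], ['fine', 'thanks', '<EOS>'])]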
Example #12
Source File: handler.py    From tensorflow-chatbot-chinese with MIT License
import re

from keras.preprocessing.text import text_to_word_sequence


def prep(self, data):
    for i in range(len(data)):
        line = data[i]
        reg = re.findall(r"[\w']+", line)
        if len(reg) == 0:  # skip separator/empty lines
            continue
        sent = text_to_word_sequence(line, lower=True, split=" ")
        _in = self.sentence_to_idx(sent, is_test=True)
        self.test_data.append(_in)

    print('test data num: ', len(self.test_data))