Python keras.preprocessing.text.text_to_word_sequence() Examples

The following are 12 code examples of keras.preprocessing.text.text_to_word_sequence(), collected from open-source projects. Each example notes its source file, project, and license. You may also want to check out all available functions/classes of the module keras.preprocessing.text.
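As a quick orientation before the project examples, here is a minimal sketch of the function's default behaviour (assuming a standard Keras installation): punctuation in the default filters string is stripped, the text is lower-cased, and the result is split on whitespace.

from keras.preprocessing.text import text_to_word_sequence

# Default behaviour: strip common punctuation, lower-case, split on spaces.
print(text_to_word_sequence('Hello, World! How are you?'))
# ['hello', 'world', 'how', 'are', 'you']

# Both the filter set and the split token can be overridden.
print(text_to_word_sequence('hello-world', filters='', split='-'))
# ['hello', 'world']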
Example #1
Source File: feature.py    From text-classifier with Apache License 2.0
import logging
import re

import numpy as np

logger = logging.getLogger(__name__)


def doc_vec_feature(self, data_set, max_sentences=16):
    from keras.preprocessing.text import Tokenizer, text_to_word_sequence
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data_set)
    # (documents, sentences, tokens) tensor of vocabulary indices; 0 = padding.
    data_feature = np.zeros((len(data_set), max_sentences, self.max_len), dtype='int32')
    # Build a regex character class such as "[.!?]" from the sentence delimiters.
    sentence_symbols = "".join(self.sentence_symbol)
    split = "[" + sentence_symbols + "]"
    for i, sentence in enumerate(data_set):
        short_sents = re.split(split, sentence)
        for j, sent in enumerate(short_sents):
            if j < max_sentences and sent.strip():
                words = text_to_word_sequence(sent)
                k = 0
                for w in words:
                    if k < self.max_len:
                        if w in tokenizer.word_index:
                            data_feature[i, j, k] = tokenizer.word_index[w]
                        # Advance even for unknown words so positions stay aligned.
                        k += 1
    word_index = tokenizer.word_index
    logger.info('Number of Unique Tokens: %d' % len(word_index))
    print('Shape of Data Tensor:', data_feature.shape)
    return data_feature
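The same pattern can be exercised outside the class. The following self-contained sketch uses assumed values for max_sentences and max_len and plain English sentence delimiters:

import re
import numpy as np
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

docs = ['First sentence. Second one!', 'Only one sentence.']
max_sentences, max_len = 4, 10

tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)

# (documents, sentences, tokens) tensor of vocabulary indices; 0 = padding.
data = np.zeros((len(docs), max_sentences, max_len), dtype='int32')
for i, doc in enumerate(docs):
    for j, sent in enumerate(re.split(r'[.!?]', doc)[:max_sentences]):
        for k, w in enumerate(text_to_word_sequence(sent)[:max_len]):
            data[i, j, k] = tokenizer.word_index.get(w, 0)
print(data.shape)  # (2, 4, 10)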
Example #2
Source File: punctuator.py    From keras-punctuator with MIT License
from keras.preprocessing.text import text_to_word_sequence


def texts_to_sequences(wordIndex, texts, num_words):
    # The highest in-vocabulary index doubles as the rare/unknown-word bucket.
    lastWord = num_words - 1
    sequences = []
    for text in texts:
        seq = text_to_word_sequence(text)
        vect = []
        for w in seq:
            i = wordIndex.get(w)
            if i is not None:
                if num_words and i >= num_words:
                    vect.append(lastWord)
                else:
                    vect.append(i)
            else:
                vect.append(lastWord)
        sequences.append(vect)
    return sequences
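A hypothetical call to the function above, where word_index stands in for a fitted Tokenizer's word_index; out-of-range and unknown words both collapse to the last in-vocabulary index:

word_index = {'hello': 1, 'world': 2, 'rare': 9}
print(texts_to_sequences(word_index, ['Hello world!', 'rare unseen word'], num_words=5))
# [[1, 2], [4, 4, 4]]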
Example #3
Source File: preprocessors.py    From keras-image-captioning with MIT License
from keras.preprocessing.text import Tokenizer, text_to_word_sequence


def _handle_rare_words(self, captions):
    if self._rare_words_handling == 'nothing':
        return captions
    elif self._rare_words_handling == 'discard':
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(captions)
        new_captions = []
        for caption in captions:
            words = text_to_word_sequence(caption)
            # Keep only words that occur often enough in the whole corpus.
            new_words = [w for w in words
                         if tokenizer.word_counts.get(w, 0) >=
                         self._words_min_occur]
            new_captions.append(' '.join(new_words))
        return new_captions

    raise NotImplementedError('rare_words_handling={} is not implemented '
                              'yet!'.format(self._rare_words_handling))
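The 'discard' branch can be sketched in isolation; the toy corpus and the minimum-occurrence threshold below are assumptions for illustration:

from keras.preprocessing.text import Tokenizer, text_to_word_sequence

captions = ['a dog runs', 'a dog sleeps', 'a rare okapi']
min_occur = 2

tokenizer = Tokenizer()
tokenizer.fit_on_texts(captions)
kept = [' '.join(w for w in text_to_word_sequence(c)
                 if tokenizer.word_counts.get(w, 0) >= min_occur)
        for c in captions]
print(kept)  # ['a dog', 'a dog', 'a']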
Example #4
Source File: text_test.py    From DeepLearning_Wavelet-LSTM with MIT License
def test_text_to_word_sequence_unicode():
    text = u'ali! veli? kırk dokuz elli'
    assert text_to_word_sequence(text) == [u'ali', u'veli', u'kırk', u'dokuz', u'elli'] 
Example #5
Source File: sample_size_NN.py    From robotreviewer with GNU General Public License v3.0
from keras.preprocessing.text import Tokenizer, text_to_word_sequence


def __init__(self, max_features, wvs, all_texts, unk=True, unk_symbol="unkunk"):
    '''
    max_features: the upper bound to be placed on the vocabulary size.
    wvs: pre-trained word vectors used for initialization; the embedding
         dimensionality is taken from them.
    all_texts: the corpus the tokenizer is fit on.
    unk: whether to rewrite out-of-vocabulary tokens as `unk_symbol`.
    '''
    self.unk = unk
    self.unk_symbol = unk_symbol
    self.max_features = max_features
    # `nb_words` is the Keras 1 spelling; Keras 2 renamed it to `num_words`.
    self.tokenizer = Tokenizer(nb_words=self.max_features)

    self.embedding_dims = wvs.vector_size
    self.word_embeddings = wvs

    self.raw_texts = all_texts
    self.unked_texts = []
    self.fit_tokenizer()

    if self.unk:
        # Rewrite the 'raw texts' with unked versions, where tokens not in
        # the top max_features are replaced by the UNK symbol.
        sorted_tokens = sorted(self.tokenizer.word_index, key=self.tokenizer.word_index.get)
        self.known_tokens = sorted_tokens[:self.max_features]
        self.tokens_to_unk = sorted_tokens[self.max_features:]

        for idx, text in enumerate(self.raw_texts):
            cur_text = text_to_word_sequence(text, split=self.tokenizer.split)
            t_or_unk = lambda t: t if t in self.known_tokens else self.unk_symbol
            unked_text = [t_or_unk(t) for t in cur_text]
            unked_text = self.tokenizer.split.join(unked_text)

            self.unked_texts.append(unked_text)

        self.raw_texts = self.unked_texts
        self.fit_tokenizer()

    self.init_word_vectors()
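The UNK rewrite at the heart of this constructor can be shown on its own; the toy corpus and vocabulary size are assumptions:

from keras.preprocessing.text import Tokenizer, text_to_word_sequence

texts = ['the cat sat', 'the dog sat', 'the cat ran']
max_features, unk_symbol = 3, 'unkunk'

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
# word_index is ordered by frequency, so the lowest indices are the
# most common tokens.
sorted_tokens = sorted(tokenizer.word_index, key=tokenizer.word_index.get)
known = set(sorted_tokens[:max_features])
print([' '.join(t if t in known else unk_symbol
                for t in text_to_word_sequence(s)) for s in texts])
# ['the cat sat', 'the unkunk sat', 'the cat unkunk']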
Example #6
Source File: text_test.py    From DeepLearning_Wavelet-LSTM with MIT License
def test_text_to_word_sequence_unicode_multichar_split():
    text = u'ali!stopveli?stopkırkstopdokuzstopelli'
    assert text_to_word_sequence(text, split='stop') == [u'ali', u'veli', u'kırk', u'dokuz', u'elli'] 
Example #7
Source File: text_test.py    From DeepLearning_Wavelet-LSTM with MIT License
def test_text_to_word_sequence():
    text = 'hello! ? world!'
    assert text_to_word_sequence(text) == ['hello', 'world'] 
Example #8
Source File: text_test.py    From DeepLearning_Wavelet-LSTM with MIT License
def test_text_to_word_sequence_multichar_split():
    text = 'hello!stop?world!'
    assert text_to_word_sequence(text, split='stop') == ['hello', 'world'] 
Example #9
Source File: HAN.py    From DeepResearch with MIT License
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from nltk import tokenize


def preprocessing(self):
    """Preprocess the raw text into a (documents, sentences, words)
    tensor for training.
    """
    paras = []
    labels = []
    texts = []
    for idx in range(self.text.shape[0]):
        text = self.clean_string(self.text[idx])
        texts.append(text)
        sentences = tokenize.sent_tokenize(text)
        paras.append(sentences)
    # Note: the original passes oov_token=True (a bool) rather than a string.
    tokenizer = Tokenizer(num_words=self.max_features, oov_token=True)
    tokenizer.fit_on_texts(texts)
    data = np.zeros((len(texts), self.max_senten_num,
                     self.max_senten_len), dtype='int32')
    for i, sentences in enumerate(paras):
        for j, sent in enumerate(sentences):
            if j < self.max_senten_num:
                wordTokens = text_to_word_sequence(sent)
                k = 0
                for _, word in enumerate(wordTokens):
                    if (k < self.max_senten_len
                            and word in tokenizer.word_index
                            and tokenizer.word_index[word] < self.max_features):
                        data[i, j, k] = tokenizer.word_index[word]
                        k = k + 1
    self.word_index = tokenizer.word_index
    if self.verbose == 1:
        print('Total %s unique tokens.' % len(self.word_index))
    labels = pd.get_dummies(self.categories)
    if self.verbose == 1:
        print('Shape of data tensor:', data.shape)
        print('Shape of labels tensor:', labels.shape)
    assert (len(self.classes) == labels.shape[1])
    assert (data.shape[0] == labels.shape[0])
    return data, labels
Example #10
Source File: preprocessors.py    From keras-image-captioning with MIT License
from keras.preprocessing.text import text_to_word_sequence


def normalize_captions(self, captions_txt):
    captions_txt = self._add_eos(captions_txt)
    # Tokenize each caption and re-join, so every caption ends up
    # lower-cased with punctuation stripped.
    word_sequences = map(text_to_word_sequence, captions_txt)
    result = map(' '.join, word_sequences)
    return result
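A quick stand-alone illustration of the same mapping (note that under Python 3 map returns a lazy iterator, so materialise it with list() if it will be consumed more than once):

from keras.preprocessing.text import text_to_word_sequence

caps = ['A dog runs.', 'Two dogs sleep!']
print(list(map(' '.join, map(text_to_word_sequence, caps))))
# ['a dog runs', 'two dogs sleep']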
Example #11
Source File: handler.py    From tensorflow-chatbot-chinese with MIT License
import re

import numpy as np

from keras.preprocessing.text import text_to_word_sequence


def prep(self, data):
    init = True
    for i in range(len(data)):
        reg = re.findall(r"[\w']+", data[i])
        if len(reg) == 0:  # dialogue separator line (+++$+++)
            init = True
            continue

        sent = text_to_word_sequence(data[i], lower=True, split=' ')
        if len(sent) > 15 or len(sent) < 2:  # too long or too short
            init = True
            continue
        idx_list = self.sentence_to_idx(sent)
        if len(idx_list) == 0:  # too many <UNK> tokens
            init = True
            continue

        if init:
            _in = idx_list
            init = False
        else:
            _out = idx_list
            # Pair each line with the one before it; the first <EOS> is part
            # of the loss. `special_tokens` is a module-level dict in the source.
            self.data.append([_in, _out + [special_tokens['<EOS>']]])
            _in = idx_list
        if i % 100000 == 0:
            print("building data list: " + str(i) + "/" + str(len(data)) + " done.")

    print('original line num:', len(data))
    print('prep data num: ', len(self.data))
    self.data = np.array(self.data)
    self.perm = np.arange(len(self.data), dtype=int)
    self.shuffle_perm()
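The consecutive-pairing idea can be shown on its own; the token lists below stand in for the index lists produced by sentence_to_idx():

# Each line becomes the target of the line before it, with <EOS> appended.
lines = [['hi', 'there'], ['how', 'are', 'you'], ['fine', 'thanks']]
pairs = [(a, b + ['<EOS>']) for a, b in zip(lines, lines[1:])]
print(pairs)
# [(['hi', 'there'], ['how', 'are', 'you', '<EOS>']),
#  (['how', 'are', 'you'], ['fine', 'thanks', '<EOS>'])]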
Example #12
Source File: handler.py    From tensorflow-chatbot-chinese with MIT License
import re

from keras.preprocessing.text import text_to_word_sequence


def prep(self, data):
    for i in range(len(data)):
        line = data[i]
        reg = re.findall(r"[\w']+", line)
        if len(reg) == 0:  # skip separator/empty lines
            continue
        sent = text_to_word_sequence(line, lower=True, split=" ")
        _in = self.sentence_to_idx(sent, is_test=True)
        self.test_data.append(_in)

    print('test data num: ', len(self.test_data))