Python keras.preprocessing.text.text_to_word_sequence() Examples
The following are 12 code examples of keras.preprocessing.text.text_to_word_sequence(), drawn from open source projects. The source file, project, and license are noted above each example.
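For orientation: text_to_word_sequence() lower-cases a string, strips punctuation, and splits it into a list of word tokens; the lower, filters, and split arguments control each of those steps. Below is a minimal usage sketch of the default behaviour, mirroring the unit tests shown later on this page and assuming a Keras 2.x-era install where keras.preprocessing.text is available.

from keras.preprocessing.text import text_to_word_sequence

# Default behaviour: lower-case, drop punctuation, split on whitespace.
print(text_to_word_sequence('hello! ? world!'))
# ['hello', 'world']

# A custom (possibly multi-character) separator can be supplied via `split`.
print(text_to_word_sequence('hello!stop?world!', split='stop'))
# ['hello', 'world']

In recent TensorFlow releases the same helper is also exposed as tf.keras.preprocessing.text.text_to_word_sequence.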
Example #1
Source File: feature.py From text-classifier with Apache License 2.0 | 7 votes |
def doc_vec_feature(self, data_set, max_sentences=16):
    from keras.preprocessing.text import Tokenizer, text_to_word_sequence
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data_set)
    data_feature = np.zeros((len(data_set), max_sentences, self.max_len), dtype='int32')
    sentence_symbols = "".join(self.sentence_symbol)
    split = "[" + sentence_symbols + "]"
    for i, sentence in enumerate(data_set):
        short_sents = re.split(split, sentence)
        for j, sent in enumerate(short_sents):
            if j < max_sentences and sent.strip():
                words = text_to_word_sequence(sent)
                k = 0
                for w in words:
                    if k < self.max_len:
                        if w in tokenizer.word_index:
                            data_feature[i, j, k] = tokenizer.word_index[w]
                            k += 1
    word_index = tokenizer.word_index
    logger.info('Number of Unique Tokens: %d' % len(word_index))
    print('Shape of Data Tensor:', data_feature.shape)
    return data_feature
Example #2
Source File: punctuator.py From keras-punctuator with MIT License | 6 votes |
def texts_to_sequences(wordIndex, texts, num_words):
    lastWord = num_words - 1
    sequences = []
    for text in texts:
        seq = text_to_word_sequence(text)
        vect = []
        for w in seq:
            i = wordIndex.get(w)
            if i is not None:
                if num_words and i >= num_words:
                    vect.append(lastWord)
                else:
                    vect.append(i)
            else:
                vect.append(lastWord)
        sequences.append(vect)
    return sequences
Example #3
Source File: preprocessors.py From keras-image-captioning with MIT License | 6 votes |
def _handle_rare_words(self, captions):
    if self._rare_words_handling == 'nothing':
        return captions
    elif self._rare_words_handling == 'discard':
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(captions)
        new_captions = []
        for caption in captions:
            words = text_to_word_sequence(caption)
            new_words = [w for w in words
                         if tokenizer.word_counts.get(w, 0) >= self._words_min_occur]
            new_captions.append(' '.join(new_words))
        return new_captions

    raise NotImplementedError('rare_words_handling={} is not implemented '
                              'yet!'.format(self._rare_words_handling))
Example #4
Source File: text_test.py From DeepLearning_Wavelet-LSTM with MIT License | 5 votes |
def test_text_to_word_sequence_unicode():
    text = u'ali! veli? kırk dokuz elli'
    assert text_to_word_sequence(text) == [u'ali', u'veli', u'kırk', u'dokuz', u'elli']
Example #5
Source File: sample_size_NN.py From robotreviewer with GNU General Public License v3.0 | 5 votes |
def __init__(self, max_features, wvs, all_texts, unk=True, unk_symbol="unkunk"):
    '''
    max_features: the upper bound to be placed on the vocabulary size.
    embedding_dims: size of the token embeddings; over-ridden if pre-trained
        vectors is provided (if wvs is not None).
    wvs: set of word vectors to be used for initialization
    '''
    self.unk = unk
    self.unk_symbol = unk_symbol

    self.max_features = max_features
    self.tokenizer = Tokenizer(nb_words=self.max_features)

    self.embedding_dims = wvs.vector_size
    self.word_embeddings = wvs

    self.raw_texts = all_texts
    self.unked_texts = []
    self.fit_tokenizer()

    if self.unk:
        # rewrite the 'raw texts' with unked versions, where tokens not in the
        # top max_features are unked.
        sorted_tokens = sorted(self.tokenizer.word_index, key=self.tokenizer.word_index.get)
        self.known_tokens = sorted_tokens[:self.max_features]
        self.tokens_to_unk = sorted_tokens[self.max_features:]

        for idx, text in enumerate(self.raw_texts):
            cur_text = text_to_word_sequence(text, split=self.tokenizer.split)
            t_or_unk = lambda t: t if t in self.known_tokens else self.unk_symbol
            unked_text = [t_or_unk(t) for t in cur_text]
            unked_text = self.tokenizer.split.join(unked_text)
            self.unked_texts.append(unked_text)

        self.raw_texts = self.unked_texts
        self.fit_tokenizer()

    self.init_word_vectors()
Example #6
Source File: text_test.py From DeepLearning_Wavelet-LSTM with MIT License | 5 votes |
def test_text_to_word_sequence_unicode_multichar_split():
    text = u'ali!stopveli?stopkırkstopdokuzstopelli'
    assert text_to_word_sequence(text, split='stop') == [u'ali', u'veli', u'kırk', u'dokuz', u'elli']
Example #7
Source File: text_test.py From DeepLearning_Wavelet-LSTM with MIT License | 5 votes |
def test_text_to_word_sequence():
    text = 'hello! ? world!'
    assert text_to_word_sequence(text) == ['hello', 'world']
Example #8
Source File: text_test.py From DeepLearning_Wavelet-LSTM with MIT License | 5 votes |
def test_text_to_word_sequence_multichar_split():
    text = 'hello!stop?world!'
    assert text_to_word_sequence(text, split='stop') == ['hello', 'world']
Example #9
Source File: HAN.py From DeepResearch with MIT License | 5 votes |
def preprocessing(self):
    """Preprocessing of the text to make it more resonant for training"""
    paras = []
    labels = []
    texts = []
    for idx in range(self.text.shape[0]):
        text = self.clean_string(self.text[idx])
        texts.append(text)
        sentences = tokenize.sent_tokenize(text)
        paras.append(sentences)
    tokenizer = Tokenizer(num_words=self.max_features, oov_token=True)
    tokenizer.fit_on_texts(texts)
    data = np.zeros((len(texts), self.max_senten_num, self.max_senten_len), dtype='int32')
    for i, sentences in enumerate(paras):
        for j, sent in enumerate(sentences):
            if j < self.max_senten_num:
                wordTokens = text_to_word_sequence(sent)
                k = 0
                for _, word in enumerate(wordTokens):
                    if k < self.max_senten_len and word in tokenizer.word_index and tokenizer.word_index[word] < self.max_features:
                        data[i, j, k] = tokenizer.word_index[word]
                        k = k+1
    self.word_index = tokenizer.word_index
    if self.verbose == 1:
        print('Total %s unique tokens.' % len(self.word_index))
    labels = pd.get_dummies(self.categories)
    if self.verbose == 1:
        print('Shape of data tensor:', data.shape)
        print('Shape of labels tensor:', labels.shape)
    assert (len(self.classes) == labels.shape[1])
    assert (data.shape[0] == labels.shape[0])
    return data, labels
Example #10
Source File: preprocessors.py From keras-image-captioning with MIT License | 5 votes |
def normalize_captions(self, captions_txt):
    captions_txt = self._add_eos(captions_txt)
    word_sequences = map(text_to_word_sequence, captions_txt)
    result = map(' '.join, word_sequences)
    return result
Example #11
Source File: handler.py From tensorflow-chatbot-chinese with MIT License | 5 votes |
def prep(self, data):
    init = True
    for i in range(len(data)):
        reg = re.findall(r"[\w']+", data[i])
        if len(reg) == 0:  # +++$+++
            init = True
            continue
        sent = text_to_word_sequence(data[i], lower=True, split=' ')
        if len(sent) > 15 or len(sent) < 2:  # too long
            init = True
            continue
        idx_list = self.sentence_to_idx(sent)
        if len(idx_list) == 0:  # <UNK> too many
            init = True
            continue
        if init:
            _in = idx_list
            init = False
        else:
            _out = idx_list
            # _rev_in = list(reversed(_in))
            # (the first EOS is part of the loss)
            self.data.append([_in, _out + [special_tokens['<EOS>']]])
            _in = idx_list
        if i % 100000 == 0:
            print("building data list: " + str(i) + "/" + str(len(data)) + " done.")
    print('original line num:', len(data))
    print('prep data num: ', len(self.data))
    self.data = np.array(self.data)
    self.perm = np.arange(len(self.data), dtype=np.int)
    self.shuffle_perm()
Example #12
Source File: handler.py From tensorflow-chatbot-chinese with MIT License | 5 votes |
def prep(self, data):
    for i in range(len(data)):
        line = data[i]
        reg = re.findall(r"[\w']+", line)
        if len(reg) == 0:
            continue
        sent = text_to_word_sequence(line, lower=True, split=" ")
        _in = self.sentence_to_idx(sent, is_test=True)
        # self.test_data.append(list(reversed(_in)))
        self.test_data.append(_in)
    print('test data num: ', len(self.test_data))