Python keras.preprocessing.text.Tokenizer() Examples
The following are 14 code examples of keras.preprocessing.text.Tokenizer(). You can go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module keras.preprocessing.text.
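A minimal usage sketch (assuming a recent Keras or keras-preprocessing install): fit the tokenizer on raw strings, then map texts to integer index sequences.

    from keras.preprocessing.text import Tokenizer

    texts = ['The cat sat on the mat.', 'The dog sat on the log.']
    tokenizer = Tokenizer(num_words=100)  # keep only the 100 most frequent words
    tokenizer.fit_on_texts(texts)

    print(tokenizer.word_index)                 # word -> index, ordered by descending frequency
    print(tokenizer.texts_to_sequences(texts))  # each text as a list of word indexes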
Example #1
Source File: feature.py From text-classifier with Apache License 2.0
def doc_vec_feature(self, data_set, max_sentences=16):
    # np, re and logger come from the enclosing module; self.max_len and
    # self.sentence_symbol are attributes of the enclosing class.
    from keras.preprocessing.text import Tokenizer, text_to_word_sequence
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data_set)
    data_feature = np.zeros((len(data_set), max_sentences, self.max_len), dtype='int32')
    sentence_symbols = "".join(self.sentence_symbol)
    split = "[" + sentence_symbols + "]"
    for i, sentence in enumerate(data_set):
        # split each document into short sentences on the configured symbols
        short_sents = re.split(split, sentence)
        for j, sent in enumerate(short_sents):
            if j < max_sentences and sent.strip():
                words = text_to_word_sequence(sent)
                k = 0
                for w in words:
                    if k < self.max_len:
                        if w in tokenizer.word_index:
                            data_feature[i, j, k] = tokenizer.word_index[w]
                            k += 1
    word_index = tokenizer.word_index
    logger.info('Number of Unique Tokens: %d' % len(word_index))
    print('Shape of Data Tensor:', data_feature.shape)
    return data_feature
Example #2
Source File: text_test.py From DeepLearning_Wavelet-LSTM with MIT License
def test_tokenizer():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    tokenizer = Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(texts):
        sequences.append(seq)
    assert np.max(np.max(sequences)) < 10
    assert np.min(np.min(sequences)) == 1

    tokenizer.fit_on_sequences(sequences)

    for mode in ['binary', 'count', 'tfidf', 'freq']:
        matrix = tokenizer.texts_to_matrix(texts, mode)
Example #3
Source File: datasets.py From DEC-keras with MIT License
def load_imdb():
    from keras.preprocessing.text import Tokenizer
    from keras.datasets import imdb
    max_words = 1000

    print('Loading data...')
    (x1, y1), (x2, y2) = imdb.load_data(num_words=max_words)
    x = np.concatenate((x1, x2))
    y = np.concatenate((y1, y2))
    print(len(x), 'train sequences')

    num_classes = np.max(y) + 1
    print(num_classes, 'classes')

    print('Vectorizing sequence data...')
    tokenizer = Tokenizer(num_words=max_words)
    x = tokenizer.sequences_to_matrix(x, mode='binary')
    print('x_train shape:', x.shape)

    return x.astype(float), y
Example #4
Source File: make_vocab.py From GPT2-Chinese with MIT License
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--raw_data_path', default='../data/train.json', type=str, required=False,
                        help='path to the raw training corpus')
    parser.add_argument('--vocab_file', default='vocab_processed.txt', type=str, required=False,
                        help='path of the generated vocab file')
    parser.add_argument('--vocab_size', default=50000, type=int, required=False,
                        help='vocabulary size')
    args = parser.parse_args()

    lac = thulac.thulac(seg_only=True)  # THULAC Chinese segmenter, segmentation only
    tokenizer = Tokenizer(num_words=args.vocab_size)
    print('args:\n' + args.__repr__())
    print('This script is extremely slow especially for large corpus. Take a break.')

    f = open(args.raw_data_path, 'r')
    lines = json.load(f)
    for i, line in enumerate(tqdm(lines)):
        lines[i] = lac.cut(line, text=True)  # segment each line into space-separated words
    tokenizer.fit_on_texts(lines)

    vocab = list(tokenizer.index_word.values())  # words in descending frequency order
    pre = ['[SEP]', '[CLS]', '[MASK]', '[PAD]', '[UNK]']
    vocab = pre + vocab
    with open(args.vocab_file, 'w') as f:
        for word in vocab[:args.vocab_size + 5]:
            f.write(word + '\n')
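The script relies on tokenizer.index_word, the index-to-word mapping available in recent Keras versions; indexes are assigned in descending frequency order, so iterating its values yields a frequency-ranked vocabulary. A toy sketch (not from the project):

    from keras.preprocessing.text import Tokenizer

    tok = Tokenizer()
    tok.fit_on_texts(['a a a b b c'])
    print(tok.index_word)  # {1: 'a', 2: 'b', 3: 'c'} -- most frequent word gets index 1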
Example #5
Source File: inputHandler.py From lstm-siamese-text-similarity with MIT License
def create_embedding_matrix(tokenizer, word_vectors, embedding_dim):
    """
    Create an embedding matrix from word vectors, indexed by the tokenizer's word indexes.

    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object containing word indexes
        word_vectors (dict): dict containing words and their respective vectors
        embedding_dim (int): dimension of word vectors

    Returns:
        embedding_matrix (np.ndarray): matrix of shape (vocab_size + 1, embedding_dim)
    """
    nb_words = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((nb_words, embedding_dim))
    print("Embedding matrix shape: %s" % str(embedding_matrix.shape))
    for word, i in word_index.items():
        try:
            embedding_vector = word_vectors[word]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        except KeyError:
            print("vector not found for word - %s" % word)
    print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return embedding_matrix
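A hypothetical usage sketch for the function above; the toy word_vectors dict stands in for real trained vectors (any dict-like word-to-vector mapping works, since missing words are caught via KeyError):

    import numpy as np
    from keras.preprocessing.text import Tokenizer

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(['the quick brown fox'])
    # toy stand-in for trained vectors; values are random 50-dim arrays
    word_vectors = {w: np.random.rand(50) for w in tokenizer.word_index}
    emb = create_embedding_matrix(tokenizer, word_vectors, embedding_dim=50)
    # emb.shape == (len(tokenizer.word_index) + 1, 50); row 0 stays all-zero (padding index)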
Example #6
Source File: textAnalysis.py From deep_learning with MIT License
def train_wordtoVect(train_inputTexts):
    """Function for training word vectors."""
    texts = []
    for doc in train_inputTexts:
        seg_doc = jieba.lcut(doc.replace('\n', ''))  # segment Chinese text with jieba
        d = " ".join(seg_doc)
        texts.append(d)
    tokenizer = text.Tokenizer()  # word-level tokenizer (MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    text_sequences = tokenizer.texts_to_sequences(texts)  # affected by num_words
    word_index = tokenizer.word_index  # word -> index mapping
    data = sequence.pad_sequences(text_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return word_index, data
Example #7
Source File: preprocessors.py From keras-image-captioning with MIT License
def _handle_rare_words(self, captions):
    if self._rare_words_handling == 'nothing':
        return captions
    elif self._rare_words_handling == 'discard':
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(captions)
        new_captions = []
        for caption in captions:
            words = text_to_word_sequence(caption)
            new_words = [w for w in words
                         if tokenizer.word_counts.get(w, 0) >= self._words_min_occur]
            new_captions.append(' '.join(new_words))
        return new_captions

    raise NotImplementedError('rare_words_handling={} is not implemented '
                              'yet!'.format(self._rare_words_handling))
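The discard branch relies on tokenizer.word_counts, which after fit_on_texts holds the raw occurrence count of every word. A quick sketch:

    from keras.preprocessing.text import Tokenizer

    tok = Tokenizer()
    tok.fit_on_texts(['a dog and a cat'])
    print(tok.word_counts)  # OrderedDict([('a', 2), ('dog', 1), ('and', 1), ('cat', 1)])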
Example #8
Source File: pipeline_invoke_python.py From models with Apache License 2.0
def _transform_request(request):
    request_str = request.decode('utf-8')

    # tokenize the csv request and create json
    X = pandas.read_csv(io.StringIO(request_str), engine='python',
                        quotechar='|', header=None).values[:, 0]
    for index, item in enumerate(X):
        reqJson = json.loads(item, object_pairs_hook=OrderedDict)
        del reqJson['http']['timestamp']
        del reqJson['http']['headers']
        del reqJson['http']['source']
        del reqJson['http']['route']
        del reqJson['http']['responsePayload']
        X[index] = json.dumps(reqJson, separators=(',', ':'))

    tokenizer = Tokenizer(filters='\t\n', char_level=True)
    tokenizer.fit_on_texts(X)

    # this used to be [log_entry]
    seq = tokenizer.texts_to_sequences([request_str])
    max_log_length = 1024
    log_entry_processed = sequence.pad_sequences(seq, maxlen=max_log_length)
    return log_entry_processed
Example #9
Source File: datasets.py From DEC-keras with MIT License
def load_retures_keras():
    from keras.preprocessing.text import Tokenizer
    from keras.datasets import reuters
    max_words = 1000

    print('Loading data...')
    (x, y), (_, _) = reuters.load_data(num_words=max_words, test_split=0.)
    print(len(x), 'train sequences')

    num_classes = np.max(y) + 1
    print(num_classes, 'classes')

    print('Vectorizing sequence data...')
    tokenizer = Tokenizer(num_words=max_words)
    x = tokenizer.sequences_to_matrix(x, mode='binary')
    print('x_train shape:', x.shape)
    return x.astype(float), y
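Note that sequences_to_matrix requires no prior fit when num_words is set: in 'binary' mode it simply marks which of the top-num_words indexes occur in each sequence. A minimal sketch:

    from keras.preprocessing.text import Tokenizer

    tok = Tokenizer(num_words=5)
    m = tok.sequences_to_matrix([[1, 2], [3, 4, 4]], mode='binary')
    print(m.shape)  # (2, 5)
    print(m[1])     # [0. 0. 0. 1. 1.]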
Example #10
Source File: inputHandler.py From lstm-siamese-text-similarity with MIT License
def word_embed_meta_data(documents, embedding_dim):
    """
    Build a tokenizer and embedding matrix for the given list of documents.

    Args:
        documents (list): list of documents
        embedding_dim (int): embedding dimension

    Returns:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        embedding_matrix (np.ndarray): matrix mapping word indexes to vectors
    """
    documents = [x.lower().split() for x in documents]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(documents)
    word_vector = train_word2vec(documents, embedding_dim)
    embedding_matrix = create_embedding_matrix(tokenizer, word_vector, embedding_dim)
    del word_vector
    gc.collect()
    return tokenizer, embedding_matrix
Example #11
Source File: text_test.py From DeepLearning_Wavelet-LSTM with MIT License | 6 votes |
def test_sequential_fit():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    word_sequences = [
        ['The', 'cat', 'is', 'sitting'],
        ['The', 'dog', 'is', 'standing']
    ]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    tokenizer.fit_on_texts(word_sequences)

    assert tokenizer.document_count == 5

    tokenizer.texts_to_matrix(texts)
    tokenizer.texts_to_matrix(word_sequences)
Example #12
Source File: text_test.py From DeepLearning_Wavelet-LSTM with MIT License | 6 votes |
def test_tokenizer_oov_flag():
    """Test of Out of Vocabulary (OOV) flag in Tokenizer."""
    x_train = ['This text has only known words']
    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown

    # Default, without OOV flag
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    assert len(x_test_seq[0]) == 4  # discards 2 OOVs

    # With OOV feature
    tokenizer = Tokenizer(oov_token='<unk>')
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    assert len(x_test_seq[0]) == 6  # OOVs marked in place
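The difference is whether unseen words are dropped or substituted: with oov_token set, every unknown word maps to the OOV index instead of disappearing. A sketch (the exact OOV index depends on the Keras version; recent releases reserve index 1):

    from keras.preprocessing.text import Tokenizer

    tok = Tokenizer(oov_token='<unk>')
    tok.fit_on_texts(['all known words'])
    print(tok.word_index['<unk>'])                    # typically 1 in recent versions
    print(tok.texts_to_sequences(['known and new']))  # 'and'/'new' map to the <unk> index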
Example #13
Source File: inputHandler.py From lstm-siamese-text-similarity with MIT License
def create_test_data(tokenizer, test_sentences_pair, max_sequence_length):
    """
    Create test dataset from sentence pairs.

    Args:
        tokenizer (keras.preprocessing.text.Tokenizer): keras tokenizer object
        test_sentences_pair (list): list of tuples of sentence pairs
        max_sequence_length (int): max sequence length of sentences to apply padding

    Returns:
        test_data_1 (np.ndarray): padded input features for the test set from sentences1
        test_data_2 (np.ndarray): padded input features for the test set from sentences2
        leaks_test (np.ndarray): hand-crafted "leak" features per sentence pair
    """
    test_sentences1 = [x[0].lower() for x in test_sentences_pair]
    test_sentences2 = [x[1].lower() for x in test_sentences_pair]

    test_sequences_1 = tokenizer.texts_to_sequences(test_sentences1)
    test_sequences_2 = tokenizer.texts_to_sequences(test_sentences2)

    # per-pair "leak" features: unique-token counts and the overlap between the two sequences
    leaks_test = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
                  for x1, x2 in zip(test_sequences_1, test_sequences_2)]
    leaks_test = np.array(leaks_test)

    test_data_1 = pad_sequences(test_sequences_1, maxlen=max_sequence_length)
    test_data_2 = pad_sequences(test_sequences_2, maxlen=max_sequence_length)

    return test_data_1, test_data_2, leaks_test
Example #14
Source File: loaders.py From open-solution-mapping-challenge with MIT License
def __init__(self, char_level, maxlen, num_words):
    self.char_level = char_level
    self.maxlen = maxlen
    self.num_words = num_words
    self.tokenizer = text.Tokenizer(char_level=self.char_level,
                                    num_words=self.num_words)
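With char_level=True the tokenizer assigns an index to each character rather than each word, which suits inputs such as the request logs in Example #8. A quick sketch:

    from keras.preprocessing import text

    tok = text.Tokenizer(char_level=True, num_words=50)
    tok.fit_on_texts(['GET /index', 'POST /login'])
    print(tok.texts_to_sequences(['GET']))  # one integer per character (lowercased by default)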