Python gensim.utils.simple_preprocess() Examples
The following are 16 code examples of gensim.utils.simple_preprocess(). Each example notes the project and source file it was taken from. You may also want to check out all available functions and classes of the gensim.utils module, or try the search function.
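Before the examples, a quick illustration of what simple_preprocess() itself does: it lowercases a document, tokenizes it, and keeps only tokens whose length falls between min_len and max_len (2 and 15 by default); deacc=True additionally strips accent marks. A minimal sketch (the exact token output is illustrative):

from gensim.utils import simple_preprocess

# Lowercase, tokenize, and drop tokens shorter than min_len or longer than max_len.
tokens = simple_preprocess("NLP is FUN, isn't it?", deacc=True)
print(tokens)  # e.g. ['nlp', 'is', 'fun', 'isn', 'it']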

Example #1
Source File: doc2vec.py From asreview with Apache License 2.0
def fit(self, texts):
    model_param = {
        "vector_size": self.vector_size,
        "epochs": self.epochs,
        "min_count": self.min_count,
        "workers": self.n_jobs,
        "window": self.window,
        "dm_concat": self.dm_concat,
        "dbow_words": self.dbow_words,
    }
    corpus = [TaggedDocument(simple_preprocess(text), [i])
              for i, text in enumerate(texts)]

    # If self.dm is 2, train both models and concatenate the feature
    # vectors later. Resulting vector size should be the same.
    if self.dm == 2:
        model_param["vector_size"] = int(model_param["vector_size"] / 2)
        self.model_dm = _train_model(corpus, **model_param, dm=1)
        self.model_dbow = _train_model(corpus, **model_param, dm=0)
    else:
        self.model = _train_model(corpus, **model_param, dm=self.dm)
Example #2
Source File: transformations.py From keras-pandas with MIT License
def fit(self, X, y=None):
    # Format text for processing, by creating a list of strings
    observations = self.prepare_input(X)

    # Preprocess & tokenize
    observations = list(map(lambda x: simple_preprocess(x), observations))

    # Generate embedding_sequence_length, if necessary
    if self.max_sequence_length is None:
        self.max_sequence_length = self.generate_embedding_sequence_length(observations)

    # Update index_lookup
    tokens = set()
    for observation in observations:
        tokens.update(observation)
    logging.debug('Fitting with tokens: {}'.format(tokens))

    current_max_index = max(self.token_index_lookup.values())
    index_range = range(current_max_index, len(tokens) + current_max_index)
    learned_token_index_lookup = dict(zip(tokens, index_range))
    self.token_index_lookup.update(learned_token_index_lookup)
    new_max_token_index = max(self.token_index_lookup.values())
    logging.info('Learned tokens, new_max_token_index: {}'.format(new_max_token_index))

    return self
Example #3
Source File: parseundp.py From Semantic-Search-for-Sustainable-Development with Apache License 2.0
def read_corpus(path='.', exclude=[], targets=None):
    i = 0
    for file in os.listdir(path):
        if file[-4:] == '.txt' and file not in exclude and 'no_en' not in file:  # ensure file is an english txt file
            print(file)
            with open(os.path.join(path, file), encoding="utf8") as document_text:
                for line in document_text:
                    count = 0
                    words = simple_preprocess(line)
                    # count the number of words with <= 3 characters
                    for word in words:
                        if len(word) <= 3:
                            count += 1
                    # exclude lines in which half the words have fewer than
                    # 3 characters or which have fewer than 10 words
                    if count < len(words) / 2 and len(words) > 10:
                        yield doc2vec.TaggedDocument(words, [i])
                        i += 1
    if targets:
        for key, val in targets.items():
            yield doc2vec.TaggedDocument(simple_preprocess(val), [i])
            i += 1
Example #4
Source File: CustomParVec.py From Semantic-Search-for-Sustainable-Development with Apache License 2.0
def inferVector1(self, line):
    '''
    Given a new line, infer a custom vector representation using the corpus tfidf.

    Args:
        line : new sentence to be inferred

    Returns:
        numpy.ndarray : vector representation of the line
    '''
    line = ' '.join(simple_preprocess(line))         # pre-process the line
    line_tf_idf = self.tf_idf_obj.transform([line])  # infer the tf-idf values for the words in the line
    rows, cols = line_tf_idf.nonzero()

    new_vec = np.zeros(self.dimensions)
    # Apply the same sentence to vector conversion as above.
    for col in cols:
        try:
            new_vec += (self.word2vec_model[(self.word_index[col])] * line_tf_idf[0, col])
        except:
            continue
    return np.asarray(new_vec)
Example #5
Source File: test_word2vec.py From topical_word_embeddings with MIT License
def __iter__(self):
    with open(datapath('lee_background.cor')) as f:
        for line in f:
            yield utils.simple_preprocess(line)
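Iterators like this stream one preprocessed sentence at a time, so the corpus never has to fit in memory. Below is a minimal sketch of how such an iterable is typically fed to Word2Vec; the class name LeeCorpus is assumed (it is not shown in the snippet), gensim 4.x parameter names (vector_size, epochs) are used, and datapath is taken from gensim.test.utils, which resolves files bundled with gensim's test data:

from gensim import utils
from gensim.models import Word2Vec
from gensim.test.utils import datapath

class LeeCorpus:
    """Stream the Lee background corpus one preprocessed sentence at a time."""
    def __iter__(self):
        with open(datapath('lee_background.cor')) as f:
            for line in f:
                yield utils.simple_preprocess(line)

# gensim iterates over the corpus several times:
# once to build the vocabulary, then once per training epoch.
model = Word2Vec(sentences=LeeCorpus(), vector_size=100, min_count=2, epochs=5)
print(model.wv.most_similar('police', topn=3))  # query word is illustrative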
Example #6
Source File: test_word2vec.py From topical_word_embeddings with MIT License
def __iter__(self):
    with open(datapath('lee_background.cor')) as f:
        for line in f:
            yield utils.simple_preprocess(line)
Example #7
Source File: test_word2vec.py From topical_word_embeddings with MIT License
def __iter__(self):
    with open(datapath('lee_background.cor')) as f:
        for line in f:
            yield utils.simple_preprocess(line)
Example #8
Source File: transformations.py From keras-pandas with MIT License
def process_string(self, input_string):
    """
    Turn a string into padded sequences, consistent with Keras's Embedding layer

     - Simple preprocess & tokenize
     - Convert tokens to indices
     - Pad sequence to be the correct length

    :param input_string: A string, to be converted into a padded sequence of token indices
    :type input_string: str
    :return: A padded, fixed-length array of token indices
    :rtype: [int]
    """
    logging.debug('Processing string: {}'.format(input_string))

    # Convert to tokens
    tokens = simple_preprocess(input_string)
    logging.debug('Tokens: {}'.format(tokens))

    # Convert to indices
    indices = list(map(lambda x: self.token_index_lookup[x], tokens))
    logging.debug('Indices: {}'.format(indices))

    # Pad indices
    padding_index = self.token_index_lookup['__PAD__']
    padding_length = self.max_sequence_length
    padded_indices = self.pad(indices, length=padding_length, pad_char=padding_index)
    logging.debug('Padded indices: {}'.format(padded_indices))

    return padded_indices
Example #9
Source File: CustomParVec.py From Semantic-Search-for-Sustainable-Development with Apache License 2.0
def inferVector2(self, line):
    '''
    Given a new line, infer a custom vector representation using the ground truth tfidf.

    Args:
        line : new sentence to be inferred

    Returns:
        numpy.ndarray : vector representation of the line
    '''
    line = ' '.join(simple_preprocess(line))  # pre-process the line

    replacement_words = []
    for word in line.split():
        if word not in self.extra_tf_idf_obj.vocabulary_:
            try:
                similar_words = self.word2vec_model.similar_by_word(word, topn=10, restrict_vocab=None)
                for sim in similar_words:
                    if sim[0] in self.extra_tf_idf_obj.vocabulary_:
                        replacement_words.append((word, sim[0]))
                        break
            except:
                continue

    for old, new in replacement_words:
        line = line.replace(old, new)

    line_tf_idf = self.extra_tf_idf_obj.transform([line])  # infer the tf-idf values for the words in the line
    rows, cols = line_tf_idf.nonzero()

    new_vec = np.zeros(self.dimensions)
    # Apply the same sentence to vector conversion as above.
    for col in cols:
        try:
            new_vec += (self.word2vec_model[(self.extra_word_index[col])] * line_tf_idf[0, col])
        except:
            continue
    return np.asarray(new_vec)
Example #10
Source File: test_vec4ir.py From vec4ir with MIT License
def test_doc2vec_inference():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
Example #11
Source File: test_vec4ir.py From vec4ir with MIT License
def test_doc2vec_inference_saveload():
    tagged_docs = [TaggedDocument(simple_preprocess(doc), [i])
                   for i, doc in enumerate(documents)]
    model = Doc2Vec(tagged_docs, epochs=1, min_count=1, vector_size=10)
    model.save(TEST_FILE)
    del model
    model = Doc2Vec.load(TEST_FILE)
    os.remove(TEST_FILE)
    d2v = Doc2VecInference(model, DEFAULT_ANALYZER)
    match_op = Matching()
    retrieval = Retrieval(d2v, matching=match_op).fit(documents)
    result = retrieval.query("scientists")
    assert result[0] == 1
Example #12
Source File: doc2vec.py From asreview with Apache License 2.0
def transform(self, texts):
    corpus = [TaggedDocument(simple_preprocess(text), [i])
              for i, text in enumerate(texts)]

    if self.dm == 2:
        X_dm = _transform_text(self.model_dm, corpus)
        X_dbow = _transform_text(self.model_dbow, corpus)
        X = np.concatenate((X_dm, X_dbow), axis=1)
    else:
        X = _transform_text(self.model, corpus)
    return X
Example #13
Source File: test_word2vec.py From topical_word_embeddings with MIT License
def __iter__(self):
    with open(datapath('lee_background.cor')) as f:
        for line in f:
            yield utils.simple_preprocess(line)
Example #14
Source File: test_word2vec.py From topical_word_embeddings with MIT License
def __iter__(self):
    with open(datapath('lee_background.cor')) as f:
        for line in f:
            yield utils.simple_preprocess(line)
Example #15
Source File: test_word2vec.py From topical_word_embeddings with MIT License
def __iter__(self):
    with open(datapath('lee_background.cor')) as f:
        for line in f:
            yield utils.simple_preprocess(line)
Example #16
Source File: wordtwovec.py From aristo-mini with Apache License 2.0
def tokenizer(sentence: str) -> List[str]:
    """use gensim's `simple_preprocess` and `STOPWORDS` list"""
    return [stem(token) for token in simple_preprocess(sentence)
            if token not in STOPWORDS]
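The snippet's imports are not shown. Below is a hedged reconstruction with the function body repeated so the sketch runs on its own, assuming stem is gensim.parsing.preprocessing.stem (an alias for the Porter-stemming stem_text) and STOPWORDS comes from the same module; the printed tokens are illustrative:

from typing import List

from gensim.parsing.preprocessing import STOPWORDS, stem  # assumed imports
from gensim.utils import simple_preprocess

def tokenizer(sentence: str) -> List[str]:
    """use gensim's `simple_preprocess` and `STOPWORDS` list"""
    # Stopwords are dropped, remaining tokens are Porter-stemmed.
    return [stem(token) for token in simple_preprocess(sentence)
            if token not in STOPWORDS]

print(tokenizer("The scientists are studying climate models"))
# e.g. ['scientist', 'studi', 'climat', 'model']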