Python gensim.models.Phrases() Examples
The following are 8 code examples of gensim.models.Phrases().
The original project, source file, and license are noted above each example.
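Before the examples, a quick orientation: Phrases learns which adjacent token pairs co-occur often enough to be treated as a single unit, and applying the trained model to a tokenized sentence joins those pairs with a delimiter. A minimal sketch, assuming gensim >= 4.0 (the toy sentences and the low min_count/threshold values are illustrative only):

from gensim.models import Phrases

# Each sentence is a list of tokens.
sentences = [
    ["new", "york", "is", "big"],
    ["i", "love", "new", "york"],
    ["she", "moved", "to", "new", "york"],
]

# min_count and threshold control how eagerly pairs are merged;
# values this low only make sense for a toy corpus this small.
bigram = Phrases(sentences, min_count=1, threshold=0.1, delimiter="_")

print(bigram[["i", "flew", "to", "new", "york"]])
# -> ['i', 'flew', 'to', 'new_york'] once the pair scores above the threshold

The same model can also be applied to a whole iterable of sentences at once, as in bigram[sentences]; that streaming pattern appears in most of the examples below.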
Example #1
Source File: Word2VecFromParsedCorpus.py From scattertext with Apache License 2.0
def add_phrases(self, corpus):
    '''
    Parameters
    ----------
    corpus: Corpus for phrase augmentation

    Returns
    -------
    New ParsedCorpus containing unigrams in corpus and new phrases
    '''
    from gensim.models import Phrases
    assert isinstance(corpus, ParsedCorpus)
    self.phrases = [Phrases(CorpusAdapterForGensim.get_sentences(corpus), delimiter=' ')]
    for i in range(1, self.max_tokens_per_phrase):
        # Each pass can merge one more token into the phrases found so far.
        self.phrases.append(Phrases(self.phrases[-1][CorpusAdapterForGensim.get_sentences(corpus)]))
    return self
Example #2
Source File: phrases.py From broca with MIT License
def train_phrases(paths, out='data/bigram_model.phrases', tokenizer=word_tokenize, **kwargs):
    """
    Train a bigram phrase model on a list of files.
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        n += sum(1 for line in open(path, 'r'))
    print('Processing {0} lines...'.format(n))

    # Default parameters; lower max_vocab_size to use less memory
    # (40m is also gensim's default). Caller-supplied kwargs take precedence.
    params = {
        'max_vocab_size': 40000000,
        'threshold': 8.,
    }
    params.update(kwargs)

    print('Training bigrams...')
    bigram = Phrases(_phrase_doc_stream(paths, n, tokenizer=tokenizer), **params)

    print('Saving...')
    bigram.save(out)
Example #3
Source File: Word2VecFromParsedCorpus.py From scattertext with Apache License 2.0
def __init__(self, phrases, gram_size):
    '''
    Parameters
    ----------
    phrases : gensim.models.Phrases
    gram_size : int, maximum number of words per phrase
    '''
    from gensim.models import Phrases
    assert isinstance(phrases, Phrases)
    self.gram_size = gram_size
    self.phrases = phrases
Example #4
Source File: Word2VecFromParsedCorpus.py From scattertext with Apache License 2.0
def _scan_and_build_vocab(self):
    from gensim.models import Phrases
    bigram_transformer = Phrases(CorpusAdapterForGensim.get_sentences(self.corpus))
    try:
        self.model.scan_vocab(CorpusAdapterForGensim.get_sentences(self.corpus))
    except Exception:
        # scan_vocab may be unavailable or fail on some gensim versions; ignore.
        pass
    self.model.build_vocab(bigram_transformer[CorpusAdapterForGensim.get_sentences(self.corpus)])
Example #5
Source File: train_vectors.py From Blackstone with Apache License 2.0
def compute_vectors(input_path: Path, output_path: Path):
    """
    Builds word embeddings using gensim Word2Vec. This function takes a file
    containing one sentence per line and writes the computed vectors in text
    format to the specified output path.
    """
    print(f"Processing {input_path}")
    sentences = LineSentence(input_path)
    bigram_transformer = Phrases(sentences)
    model = Word2Vec(
        bigram_transformer[sentences], size=150, window=5, min_count=5, workers=4
    )
    print(f"Saving vectors to {output_path}")
    model.wv.save_word2vec_format(output_path, binary=False)
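This example targets gensim < 4.0. In gensim >= 4.0 the size argument to Word2Vec was renamed to vector_size, so the same pipeline would look roughly like the sketch below (the file names are hypothetical):

from gensim.models import Phrases, Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence("sentences.txt")  # hypothetical input, one sentence per line
bigram_transformer = Phrases(sentences)
model = Word2Vec(
    bigram_transformer[sentences],  # stream with detected bigrams joined
    vector_size=150, window=5, min_count=5, workers=4,
)
model.wv.save_word2vec_format("vectors.txt", binary=False)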
Example #6
Source File: overkill.py From broca with MIT License
def tokenize(self, docs):
    lem = WordNetLemmatizer() if self.lemmatize else None

    # RAKE tokenizing.
    pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs)

    # Debug output: surface tokens starting with 'one'.
    for i, tdoc in enumerate(pre_tdocs):
        for t in tdoc:
            if t.startswith('one'):
                print(t)
                print(i)

    # Additional tokenizing of the docs.
    if self.n_jobs == 1:
        tdocs = [pre_tokenize(doc, tdoc, lem=lem)
                 for doc, tdoc in zip(docs, pre_tdocs)]
    else:
        tdocs = parallel(partial(pre_tokenize, lem=lem),
                         zip(docs, pre_tdocs), self.n_jobs, expand_args=True)

    # Train (or incrementally update) the bigram model.
    if self.bigram is None:
        self.bigram = Phrases(tdocs,
                              min_count=self.min_count,
                              threshold=self.threshold,
                              delimiter=b' ')
    else:
        self.bigram.add_vocab(tdocs)

    # Train (or incrementally update) the trigram model on bigram-joined tokens.
    if self.trigram is None:
        self.trigram = Phrases(self.bigram[tdocs],
                               min_count=self.min_count,
                               threshold=self.threshold,
                               delimiter=b' ')
    else:
        self.trigram.add_vocab(self.bigram[tdocs])

    return [tdoc for tdoc in self.trigram[self.bigram[tdocs]]]
Example #7
Source File: cf.py From Seq2Seq_Upgrade_TensorFlow with Apache License 2.0
def quad_gram_words(tokenized_sentences_tokenized_words, minimum_count_for_vectorization):
    print("performing bi gram")
    bigram = Phrases(tokenized_sentences_tokenized_words,
                     min_count=minimum_count_for_vectorization,
                     delimiter='_', threshold=10)
    print("performing tri gram")
    trigram = Phrases(list(bigram[tokenized_sentences_tokenized_words]),
                      min_count=minimum_count_for_vectorization,
                      delimiter='_', threshold=10)
    print("performing quad gram")
    quadgram = Phrases(list(trigram[list(bigram[tokenized_sentences_tokenized_words])]),
                       min_count=minimum_count_for_vectorization,
                       delimiter='_', threshold=10)
    quadgramprocessed = quadgram[list(trigram[list(bigram[tokenized_sentences_tokenized_words])])]
    return quadgramprocessed
Example #8
Source File: cf.py From Seq2Seq_Upgrade_TensorFlow with Apache License 2.0
def bi_gram_words(tokenized_sentences_tokenized_words, minimum_count_for_vectorization):
    print("performing bi gram")
    bigram = Phrases(tokenized_sentences_tokenized_words,
                     min_count=minimum_count_for_vectorization,
                     delimiter='_', threshold=10)
    bigramprocessed = bigram[tokenized_sentences_tokenized_words]
    return bigramprocessed
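A closing note that applies to all of the examples above: a trained Phrases model keeps full co-occurrence counts so it can keep learning via add_vocab(). If the model will only ever be applied, it can be exported to a smaller, faster object. A minimal sketch, again assuming gensim >= 4.0 (Phraser is the long-standing class name; 4.0 also spells this as bigram.freeze(), and the toy corpus is illustrative only):

from gensim.models import Phrases
from gensim.models.phrases import Phraser

sentences = [["new", "york", "city"], ["new", "york", "state"], ["big", "city"]]
bigram = Phrases(sentences, min_count=1, threshold=0.1)

# Drop the raw counts needed only for further training; keep the phrase table.
bigram_frozen = Phraser(bigram)  # equivalently bigram.freeze() in gensim >= 4.0
print(bigram_frozen[["new", "york", "weather"]])
# -> ['new_york', 'weather']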