Python gensim.models.doc2vec.LabeledSentence() Examples
The following are 8 code examples of gensim.models.doc2vec.LabeledSentence().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module gensim.models.doc2vec, or try the search function.
Example #1
Source File: doc2vec.py From broca with MIT License | 6 votes |
def _doc2vec_doc_stream(paths, n, tokenizer=word_tokenize, sentences=True):
    """
    Generator feeding labeled token lists to the doc2vec model.

    Lines are counted from 1 across all files. Each line is lowercased and
    then either split into sentences (``sentences`` truthy), each sentence
    being yielded on its own, or yielded whole. Every item produced from the
    same line carries the same ``SENT_<line number>`` label.

    Args:
        paths: iterable of file paths, each opened in text mode
        n: divisor for the progress fraction reported after every line
            (presumably the total number of lines -- confirm with caller)
        tokenizer: callable turning a string into a list of tokens
        sentences: split each line with ``sent_tokenize`` before tokenizing

    Yields:
        LabeledSentence(tokens, ['SENT_<line number>'])
    """
    line_no = 0
    progress = Progress()
    for path in paths:
        with open(path, 'r') as handle:
            for raw_line in handle:
                line_no += 1
                progress.print_progress(line_no/n)

                # Lowercasing is the only pre-processing applied, so the
                # model still gets to see (and learn) punctuation.
                lowered = raw_line.lower()
                pieces = sent_tokenize(lowered) if sentences else [lowered]
                for piece in pieces:
                    yield LabeledSentence(tokenizer(piece),
                                          ['SENT_{}'.format(line_no)])
Example #2
Source File: corpora.py From Document2Vec with MIT License | 5 votes |
def __iter__(self):
    """
    Iterate over the rows of ``self.series``.

    Yields:
        When ``self.labels`` is truthy, one ``LabeledSentence`` per row: the
        row text split on single spaces, labeled ``['SENT_<index>']`` where
        ``<index>`` is the row's entry in ``self.series.index``.
        Otherwise, just the row text split on single spaces (a plain list).
    """
    if self.labels:
        for index, line in zip(self.series.index, self.series.values):
            yield LabeledSentence(line.split(' '), ['SENT_%s' % str(index)])
    else:
        # Only the values are needed here. Iterate them directly: looping
        # over the pair (index, values) without zip() would try to unpack
        # each whole array into two names and fail.
        for line in self.series.values:
            yield line.split(' ')
Example #3
Source File: embedding_trainer.py From kaggle-HomeDepot with MIT License | 5 votes |
def __iter__(self):
    """
    Yield a ``LabeledSentence`` for every cell of every configured column.

    Each distinct sentence gets a ``SENT_<n>`` tag the first time it is
    seen (``self.cnt`` is incremented and the tag stored in
    ``self.sent_label``); repeated sentences reuse their stored tag.
    """
    for col_name in self.columns:
        for text in self.df[col_name]:
            if text not in self.sent_label:
                # First occurrence: allocate the next running tag number.
                self.cnt += 1
                self.sent_label[text] = "SENT_%d"%self.cnt
            words = nlp_utils._tokenize(text, token_pattern)
            yield LabeledSentence(
                words=words,
                tags=[self.sent_label[text]],
            )
Example #4
Source File: doc2vectrain.py From NETL-Automatic-Topic-Labelling- with Apache License 2.0 | 5 votes |
def __iter__(self):
    """
    Yield one ``LabeledSentence`` per titled ``<doc ...> ... </doc>`` block.

    Each file in ``self.sources`` is read as UTF-8, line by line:

    * a line containing ``<doc`` opens a document: the word buffer is reset
      and the tag is taken from its ``title="..."`` attribute (lowercased,
      NFKD-normalised, spaces replaced by underscores, UTF-8 encoded);
    * a line containing ``</doc`` closes it: the collected words are yielded
      with that tag, unless no title was found;
    * any other line is split on single spaces and its stripped words are
      appended to the buffer.
    """
    # Start in a well-defined "no document open" state so that stray text
    # before the first <doc tag cannot hit unbound locals; it is discarded
    # when the next <doc tag resets the buffer.
    found = ""
    values = []
    for source in self.sources:
        with codecs.open(source, "r", "utf-8") as fin:
            for line in fin:
                if "<doc" in line:
                    # Every new document starts with this tag.
                    values = []
                    # The title attribute holds the Wikipedia document title.
                    m = re.search('title="(.*)">', line)
                    if m:
                        found = m.group(1).lower()
                        found = unicodedata.normalize("NFKD", found)
                        found = found.replace(" ", "_")
                        # NOTE(review): on Python 3 this makes the tag a
                        # bytes object -- confirm that is what consumers
                        # of the tags expect.
                        found = found.encode('utf-8')
                    else:
                        found = ""
                elif "</doc" in line:
                    # End of document: emit it only if it had a title.
                    if found != "":
                        yield LabeledSentence(words=values, tags=[found])
                else:
                    # Still inside the same document: collect its words.
                    values.extend(word.strip() for word in line.split(" "))
Example #5
Source File: doc2vec_model.py From doc2vec with MIT License | 5 votes |
def label_sentences(corpus, label_type):
    """
    Wrap every document of ``corpus`` in a gensim ``LabeledSentence``.

    Doc2Vec needs a label attached to each document/paragraph. The label is
    ``"<label_type>_<i>"`` (e.g. "TRAIN_0", "TEST_17"), where ``i`` is simply
    the position of the document in ``corpus``.

    Args:
        corpus: iterable of strings; each one is split on whitespace
        label_type: string prefix for the labels

    Returns:
        list of ``LabeledSentence`` objects, in corpus order
    """
    return [
        doc2vec.LabeledSentence(text.split(), [label_type + '_' + str(position)])
        for position, text in enumerate(corpus)
    ]
Example #6
Source File: doc2vec.py From broca with MIT License | 5 votes |
def _gen_sentence(self, assetid_body_tuple):
    '''
    Takes an assetid_body_tuple and returns a Doc2Vec LabeledSentence

    Args:
        assetid_body_tuple (tuple): (assetid, bodytext) pair

    Returns:
        LabeledSentence holding the result of ``self._process(bodytext)``
        and the single label ``'DOC_<assetid>'``
    '''
    asset_id, body = assetid_body_tuple
    text = self._process(body)
    # Pass the label list positionally: the keyword for the second field was
    # `labels` in early gensim releases and `tags` afterwards, so naming it
    # breaks on one side or the other.
    return LabeledSentence(text, ['DOC_%s' % str(asset_id)])
Example #7
Source File: generate_d2v.py From kaggle-word2vec-movie-reviews with GNU General Public License v2.0 | 5 votes |
def getCleanLabeledReviews(reviews):
    """
    Turn a reviews table into a list of ``LabeledSentence`` objects.

    All entries of ``reviews["review"]`` are first converted to word lists
    with ``KaggleWord2VecUtility.review_to_wordlist``; then each entry of
    ``reviews["id"]`` is paired, by position, with the corresponding word
    list and used as that sentence's single label.
    """
    word_lists = [
        KaggleWord2VecUtility.review_to_wordlist(text)
        for text in reviews["review"]
    ]
    # Positional lookup (rather than zip) keeps the id column in charge of
    # how many sentences are produced.
    return [
        LabeledSentence(word_lists[position], [review_id])
        for position, review_id in enumerate(reviews["id"])
    ]
Example #8
Source File: predict.py From kaggle-word2vec-movie-reviews with GNU General Public License v2.0 | 5 votes |
def getCleanLabeledReviews(reviews):
    """
    Turn a reviews table into a list of ``LabeledSentence`` objects.

    All entries of ``reviews["review"]`` are first converted to word lists
    with ``KaggleWord2VecUtility.review_to_wordlist(review, True)``; then
    each entry of ``reviews["id"]`` is paired, by position, with the
    corresponding word list and used as that sentence's single label.
    """
    word_lists = [
        KaggleWord2VecUtility.review_to_wordlist(text, True)
        for text in reviews["review"]
    ]
    # Positional lookup (rather than zip) keeps the id column in charge of
    # how many sentences are produced.
    return [
        LabeledSentence(word_lists[position], [review_id])
        for position, review_id in enumerate(reviews["id"])
    ]