Python gensim.models.doc2vec.LabeledSentence() Examples

The following are 8 code examples of gensim.models.doc2vec.LabeledSentence(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.models.doc2vec, or try the search function.
Example #1
Source File: doc2vec.py    From broca with MIT License 6 votes vote down vote up
def _doc2vec_doc_stream(paths, n, tokenizer=word_tokenize, sentences=True):
    """
    Generator that feeds labeled sentences to the doc2vec model.

    Each line of each file in `paths` is lowercased and tokenized with
    `tokenizer`. When `sentences` is true the line is first split with
    `sent_tokenize` and one LabeledSentence is yielded per sentence;
    otherwise a single LabeledSentence is yielded for the whole line.
    Everything produced from one line shares the label 'SENT_<k>', where
    k is the 1-based running line count across all files. Progress is
    reported as k/n after each line is read.
    """
    progress = Progress()
    line_no = 0
    for path in paths:
        with open(path, 'r') as handle:
            for raw in handle:
                line_no += 1
                progress.print_progress(line_no/n)

                # Lowercasing is the only pre-processing applied, so the
                # model still gets to learn punctuation.
                text = raw.lower()
                label = 'SENT_{}'.format(line_no)

                # Either the individual sentences or the line as one unit.
                chunks = sent_tokenize(text) if sentences else [text]
                for chunk in chunks:
                    yield LabeledSentence(tokenizer(chunk), [label])
Example #2
Source File: corpora.py    From Document2Vec with MIT License 5 votes vote down vote up
def __iter__(self):
        """
        Iterate over the rows of ``self.series``.

        When ``self.labels`` is truthy, each row is yielded as a
        LabeledSentence whose words are the row text split on single spaces
        and whose label is ``'SENT_<index>'``. Otherwise each row is yielded
        as the plain list of space-separated tokens.
        """
        if self.labels:
            for index, line in zip(self.series.index, self.series.values):
                label = ['SENT_%s' % str(index)]
                yield LabeledSentence(line.split(' '), label)
        else:
            # Only the text is needed here. The previous code unpacked the
            # tuple (index, values) itself instead of zipping the two
            # sequences, so it never iterated over the actual rows.
            for line in self.series.values:
                yield line.split(' ')
Example #3
Source File: embedding_trainer.py    From kaggle-HomeDepot with MIT License 5 votes vote down vote up
def __iter__(self):
        """
        Yield a LabeledSentence for every entry of every configured column.

        Walks ``self.df[column]`` for each column in ``self.columns``. The
        first time a given sentence is seen, ``self.cnt`` is incremented and
        the sentence is assigned the tag ``"SENT_<cnt>"`` in
        ``self.sent_label``; repeated sentences reuse their stored tag.
        Words come from ``nlp_utils._tokenize(sentence, token_pattern)``.
        """
        for col_name in self.columns:
            for text in self.df[col_name]:
                # Assign a fresh tag only on first sight of this sentence.
                if text not in self.sent_label:
                    self.cnt += 1
                    self.sent_label[text] = "SENT_%d" % self.cnt
                words = nlp_utils._tokenize(text, token_pattern)
                tag = self.sent_label[text]
                yield LabeledSentence(words=words, tags=[tag])
Example #4
Source File: doc2vectrain.py    From NETL-Automatic-Topic-Labelling- with Apache License 2.0 5 votes vote down vote up
def __iter__(self):
        """
        Yield one LabeledSentence per ``<doc ...> ... </doc>`` block.

        Each file in ``self.sources`` is read as UTF-8, line by line:

        * A line containing ``<doc`` opens a new document. Its tag is taken
          from the ``title="..."`` attribute (the Wikipedia document title),
          lowercased, NFKD-normalized, with spaces replaced by underscores
          and encoded to UTF-8. A header without a title gets an empty tag.
        * Any other line not containing ``</doc`` contributes its
          space-separated, stripped words to the current document.
        * A line containing ``</doc`` closes the document; it is yielded as
          ``LabeledSentence(words=..., tags=[tag])`` unless the tag is empty.
        """
        # Start with no open document, so a stray body or "</doc" line that
        # precedes the first "<doc" header is skipped instead of raising
        # NameError on an unbound variable.
        found = ""
        values = []

        for source in self.sources:
            with codecs.open(source, "r", "utf-8") as fin:
                for line in fin:
                    if "<doc" in line:           # Every new document starts with this format
                        m = re.search('title="(.*)">', line)    # Document title of Wikipedia
                        if m:
                            found = m.group(1).lower()
                            found = unicodedata.normalize("NFKD", found)
                            found = found.replace(" ", "_")
                            found = found.encode('utf-8')
                        else:
                            found = ""
                        values = []
                    elif "</doc" in line:
                        # End of document. A truthiness test skips both an
                        # empty str and an empty bytes tag, whereas
                        # `found != ""` is always true once the tag is bytes.
                        if found:
                            yield LabeledSentence(words=values, tags=[found])
                    else:
                        # Still inside the same document: collect its words.
                        for word in line.split(" "):
                            values.append(word.strip())
Example #5
Source File: doc2vec_model.py    From doc2vec with MIT License 5 votes vote down vote up
def label_sentences(corpus, label_type):
        """
        Attach a label to every document so Gensim's Doc2Vec can use it.

        Each item of `corpus` is split on whitespace and wrapped in a
        doc2vec.LabeledSentence whose single label is
        "<label_type>_<i>" (e.g. "TRAIN_0" or "TEST_3"), where "i" is the
        position of the item within `corpus`.

        Returns the list of labeled sentences in corpus order.
        """
        return [
            doc2vec.LabeledSentence(text.split(), [label_type + '_' + str(pos)])
            for pos, text in enumerate(corpus)
        ]
Example #6
Source File: doc2vec.py    From broca with MIT License 5 votes vote down vote up
def _gen_sentence(self, assetid_body_tuple):
        '''
        Build the Doc2Vec LabeledSentence for a single asset.

        The body text is passed through self._process and the result is
        labeled 'DOC_<assetid>'.

        Args:
            assetid_body_tuple (tuple): (assetid, bodytext) pair

        Returns:
            The LabeledSentence built from the processed body text.
        '''
        asset_id, body = assetid_body_tuple
        return LabeledSentence(
            self._process(body),
            labels=['DOC_%s' % str(asset_id)],
        )
Example #7
Source File: generate_d2v.py    From kaggle-word2vec-movie-reviews with GNU General Public License v2.0 5 votes vote down vote up
def getCleanLabeledReviews(reviews):
    """
    Convert a review table into a list of LabeledSentence objects.

    Every entry of reviews["review"] is first turned into a word list with
    KaggleWord2VecUtility.review_to_wordlist. Then each entry of
    reviews["id"] is paired, by position, with the cleaned review at the
    same position and used as that sentence's single label.
    """
    # Clean everything up front; labeling below indexes into this list.
    clean_reviews = [
        KaggleWord2VecUtility.review_to_wordlist(text)
        for text in reviews["review"]
    ]
    return [
        LabeledSentence(clean_reviews[pos], [review_id])
        for pos, review_id in enumerate(reviews["id"])
    ]
Example #8
Source File: predict.py    From kaggle-word2vec-movie-reviews with GNU General Public License v2.0 5 votes vote down vote up
def getCleanLabeledReviews(reviews):
    """
    Convert a review table into a list of LabeledSentence objects.

    Every entry of reviews["review"] is first turned into a word list with
    KaggleWord2VecUtility.review_to_wordlist(review, True); the meaning of
    the second argument is defined by that helper (presumably a cleaning
    flag -- confirm against its definition). Then each entry of
    reviews["id"] is paired, by position, with the cleaned review at the
    same position and used as that sentence's single label.
    """
    # Clean everything up front; labeling below indexes into this list.
    clean_reviews = [
        KaggleWord2VecUtility.review_to_wordlist(text, True)
        for text in reviews["review"]
    ]
    return [
        LabeledSentence(clean_reviews[pos], [review_id])
        for pos, review_id in enumerate(reviews["id"])
    ]