Python gensim.models.doc2vec.LabeledSentence() Examples

The following are 8 code examples of gensim.models.doc2vec.LabeledSentence(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.models.doc2vec, or try the search function.

Example #1

Source File: doc2vec.py From broca with MIT License

6 votes

def _doc2vec_doc_stream(paths, n, tokenizer=word_tokenize, sentences=True):
    """
    Generator that feeds labeled token lists to the doc2vec model.

    Every line of every file in `paths` is lower-cased and turned into one or
    more `LabeledSentence` objects labeled ``SENT_<k>``, where ``k`` is the
    1-based running count of lines read so far across all files.

    Args:
        paths: iterable of file paths to read line by line.
        n: divisor used for the progress fraction ``k / n`` reported after
            each line (presumably the total number of lines -- confirm with
            the caller).
        tokenizer: callable turning a string into a list of tokens.
        sentences: if true, each line is split with `sent_tokenize` and one
            item is yielded per sentence (all sentences of a line share that
            line's label); otherwise one item is yielded per line.
    """
    line_no = 0
    progress = Progress()
    for path in paths:
        with open(path, 'r') as handle:
            for raw in handle:
                line_no += 1
                progress.print_progress(line_no/n)

                # Only lower-casing is applied; punctuation is kept on
                # purpose so the model can learn it.
                lowered = raw.lower()
                label = 'SENT_{}'.format(line_no)

                # Either the individual sentences of the line, or the line
                # itself as a single chunk.
                chunks = sent_tokenize(lowered) if sentences else [lowered]
                for chunk in chunks:
                    yield LabeledSentence(tokenizer(chunk), [label])

Example #2

Source File: corpora.py From Document2Vec with MIT License

5 votes

def __iter__(self):
        """
        Iterate over the rows of ``self.series``.

        When ``self.labels`` is truthy, each row is yielded as a
        ``LabeledSentence`` whose words are the row text split on single
        spaces and whose label list is ``['SENT_<index>']``, with ``<index>``
        taken from ``self.series.index``.

        Otherwise each row is yielded as the bare list of words (the row text
        split on single spaces).
        """
        if self.labels:
            for index, line in zip(self.series.index, self.series.values):
                label = ['SENT_%s' % str(index)]
                yield LabeledSentence(line.split(' '), label)
        else:
            # The index is not needed for unlabeled output. Iterating the
            # values directly also avoids the previous bug of looping over the
            # 2-tuple ``(index, values)`` instead of over the rows.
            for line in self.series.values:
                yield line.split(' ')

Example #3

Source File: embedding_trainer.py From kaggle-HomeDepot with MIT License

5 votes

def __iter__(self):
        """
        Yield a LabeledSentence for every entry of every column in
        ``self.columns``.

        Each distinct sentence gets a tag of the form ``SENT_<k>``. The tag is
        created the first time the sentence is seen (incrementing
        ``self.cnt``) and is looked up in ``self.sent_label`` on every later
        occurrence, so repeated sentences share one tag.
        """
        for column in self.columns:
            for sentence in self.df[column]:
                if sentence not in self.sent_label:
                    # First occurrence: allocate the next running tag.
                    self.cnt += 1
                    self.sent_label[sentence] = "SENT_%d" % self.cnt
                words = nlp_utils._tokenize(sentence, token_pattern)
                tag = self.sent_label[sentence]
                yield LabeledSentence(words=words, tags=[tag])

Example #4

Source File: doc2vectrain.py From NETL-Automatic-Topic-Labelling- with Apache License 2.0

5 votes

def __iter__(self):
        """
        Yield one LabeledSentence per ``<doc ...> ... </doc>`` section found
        in the files listed in ``self.sources``.

        A line containing ``<doc`` starts a new document; its tag is taken
        from the ``title="..."`` attribute, lower-cased, NFKD-normalised, with
        spaces replaced by underscores and then UTF-8 encoded. The following
        lines, up to the one containing ``</doc``, are split on single spaces
        and their stripped pieces collected as the document's words.
        Documents whose header has no title are skipped.
        """
        # Initialised up front so that content lines appearing before the
        # first "<doc" header are ignored instead of raising NameError.
        found = ""
        values = []

        for source in self.sources:
            with codecs.open(source, "r", "utf-8") as fin:
                for line in fin:
                    if "<doc" in line:           # Every new document starts with this format
                        m = re.search('title="(.*)">', line)    # This gives the document title of Wikipedia
                        if m:
                            found = m.group(1).lower()
                            found = unicodedata.normalize("NFKD", found)
                            found = found.replace(" ", "_")
                            # NOTE(review): encoding yields a byte string; kept
                            # as-is so the tag type seen by callers is unchanged.
                            found = found.encode('utf-8')
                        else:
                            found = ""
                        values = []
                    elif "</doc" not in line:    # </doc marks the end of the document; until then it is the same document
                        values.extend(word.strip() for word in line.split(" "))
                    elif found != "":
                        # End of a titled document: emit its collected words.
                        yield LabeledSentence(words=values, tags=[found])

Example #5

Source File: doc2vec_model.py From doc2vec with MIT License

5 votes

def label_sentences(corpus, label_type):
        """
        Wrap every document of `corpus` in a labeled sentence for Doc2Vec.

        Gensim's Doc2Vec implementation needs a label attached to each
        document/paragraph. Each document is split on whitespace and paired
        with the single label ``<label_type>_<i>`` (e.g. "TRAIN_0" or
        "TEST_3"), where ``i`` is the document's position in `corpus`.

        Returns the list of ``doc2vec.LabeledSentence`` objects, in corpus
        order.
        """
        def _tagged():
            # One labeled sentence per document, tagged with its position.
            for position, text in enumerate(corpus):
                tag = label_type + '_' + str(position)
                yield doc2vec.LabeledSentence(text.split(), [tag])

        return list(_tagged())

Example #6

Source File: doc2vec.py From broca with MIT License

5 votes

def _gen_sentence(self, assetid_body_tuple):
        '''
        Takes an assetid_body_tuple and returns a Doc2Vec LabeledSentence
        labeled ``DOC_<assetid>``.

        Args:
            assetid_body_tuple (tuple): (assetid, bodytext) pair

        Returns:
            LabeledSentence built from ``self._process(bodytext)`` and the
            single label ``'DOC_<assetid>'``.
        '''
        asset_id, body = assetid_body_tuple
        text = self._process(body)
        # The label list is passed positionally rather than as ``labels=``:
        # gensim renamed that field to ``tags``, so the keyword form only
        # works on very old releases, while the positional form works on both.
        return LabeledSentence(text, ['DOC_%s' % str(asset_id)])

Example #7

Source File: generate_d2v.py From kaggle-word2vec-movie-reviews with GNU General Public License v2.0

5 votes

def getCleanLabeledReviews(reviews):
    """
    Convert a reviews table into a list of LabeledSentence objects.

    Every entry of ``reviews["review"]`` is turned into a word list with
    ``KaggleWord2VecUtility.review_to_wordlist``; the i-th word list is then
    paired with the i-th entry of ``reviews["id"]`` as its single label.
    """
    clean_reviews = [
        KaggleWord2VecUtility.review_to_wordlist(text)
        for text in reviews["review"]
    ]

    # Pair by position so the i-th id labels the i-th cleaned review.
    return [
        LabeledSentence(clean_reviews[pos], [review_id])
        for pos, review_id in enumerate(reviews["id"])
    ]

Example #8

Source File: predict.py From kaggle-word2vec-movie-reviews with GNU General Public License v2.0

5 votes

def getCleanLabeledReviews(reviews):
    """
    Convert a reviews table into a list of LabeledSentence objects.

    Every entry of ``reviews["review"]`` is turned into a word list with
    ``KaggleWord2VecUtility.review_to_wordlist(review, True)``; the i-th word
    list is then paired with the i-th entry of ``reviews["id"]`` as its
    single label.
    """
    to_wordlist = KaggleWord2VecUtility.review_to_wordlist
    clean_reviews = [to_wordlist(text, True) for text in reviews["review"]]

    labelized = []
    # Pair by position so the i-th id labels the i-th cleaned review.
    for pos, review_id in enumerate(reviews["id"]):
        labelized += [LabeledSentence(clean_reviews[pos], [review_id])]
    return labelized