Python gensim.utils.smart_open() Examples
The following are 17 code examples of gensim.utils.smart_open(), collected from open-source projects. Each example lists its original project and source file, so you can trace it back to its full context. You may also want to check out the other available functions and classes of the gensim.utils module.
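Before the examples, a quick orientation: smart_open opens a path for reading or writing and transparently handles on-the-fly (de)compression for files ending in .gz or .bz2, returning an ordinary file object usable as a context manager. The sketch below is illustrative only; the file names are hypothetical, and it assumes a gensim version that still ships utils.smart_open (newer releases delegate to the standalone smart_open package).

from gensim import utils

# A plain file and a bz2-compressed file are opened the same way;
# the compression codec is chosen from the file extension.
with utils.smart_open('corpus.txt', 'rb') as fin:
    first_line = fin.readline()

with utils.smart_open('corpus.txt.bz2') as fin:
    num_lines = sum(1 for _ in fin)  # iterate line by line, decompressing lazily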
Example #1
Source File: ldamallet.py from topical_word_embeddings (MIT License)

def load_word_topics(self):
    logger.info("loading assigned topics from %s" % self.fstate())
    wordtopics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)
    with utils.smart_open(self.fstate()) as fin:
        _ = next(fin)  # header
        self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
        assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
        _ = next(fin)  # beta
        for lineno, line in enumerate(fin):
            line = utils.to_unicode(line)
            doc, source, pos, typeindex, token, topic = line.split()
            tokenid = self.id2word.token2id[token] if hasattr(self.id2word, 'token2id') else int(token)
            wordtopics[int(topic), tokenid] += 1
    logger.info("loaded assigned topics for %i tokens" % wordtopics.sum())
    self.wordtopics = wordtopics
    self.print_topics(15)
Example #2
Source File: dictionary.py from topical_word_embeddings (MIT License)

def save_as_text(self, fname, sort_by_word=True):
    """
    Save this Dictionary to a text file, in format:
    `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word,
    or by decreasing word frequency.

    Note: text format should be used for corpus inspection. Use `save`/`load`
    to store in binary format (pickle) for improved performance.
    """
    logger.info("saving dictionary mapping to %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        if sort_by_word:
            for token, tokenid in sorted(iteritems(self.token2id)):
                line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                fout.write(utils.to_utf8(line))
        else:
            for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                fout.write(utils.to_utf8(line))
Example #3
Source File: hashdictionary.py from topical_word_embeddings (MIT License)

def save_as_text(self, fname):
    """
    Save this HashDictionary to a text file, for easier debugging.

    The format is:
    `id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`.

    Note: use `save`/`load` to store in binary format instead (pickle).
    """
    logger.info("saving HashDictionary mapping to %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        for tokenid in self.keys():
            words = sorted(self[tokenid])
            if words:
                words_df = [(word, self.dfs_debug.get(word, 0)) for word in words]
                words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda item: -item[1])]
                fout.write(utils.to_utf8("%i\t%i\t%s\n" % (tokenid, self.dfs.get(tokenid, 0), '\t'.join(words_df))))
#endclass HashDictionary
Example #4
Source File: ucicorpus.py from topical_word_embeddings (MIT License)

def __init__(self, input):
    """
    Initialize the reader.

    The `input` parameter refers to a file on the local filesystem,
    which is expected to be in the UCI Bag-of-Words format.
    """
    logger.info('Initializing corpus reader from %s' % input)
    self.input = input
    with utils.smart_open(self.input) as fin:
        self.num_docs = self.num_terms = self.num_nnz = 0
        try:
            self.num_docs = int(next(fin).strip())
            self.num_terms = int(next(fin).strip())
            self.num_nnz = int(next(fin).strip())
        except StopIteration:
            pass
    logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' %
                (self.num_docs, self.num_terms, self.num_nnz))
Example #5
Source File: svmlightcorpus.py from topical_word_embeddings (MIT License)

def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
    """
    Save a corpus in the SVMlight format.

    The SVMlight `<target>` class tag is taken from the `labels` array, or
    set to 0 for all documents if `labels` is not supplied.

    This function is automatically called by `SvmLightCorpus.serialize`;
    don't call it directly, call `serialize` instead.
    """
    logger.info("converting corpus to SVMlight format: %s" % fname)
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        for docno, doc in enumerate(corpus):
            label = labels[docno] if labels else 0  # target class is 0 by default
            offsets.append(fout.tell())
            fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label)))
    return offsets
Example #6
Source File: ucicorpus.py from topical_word_embeddings (MIT License)

def __init__(self, fname, fname_vocab=None):
    IndexedCorpus.__init__(self, fname)
    UciReader.__init__(self, fname)

    if fname_vocab is None:
        fname_vocab = fname + '.vocab'

    self.fname = fname
    with utils.smart_open(fname_vocab) as fin:
        words = [word.strip() for word in fin]
    self.id2word = dict(enumerate(words))
    self.transposed = True
Example #7
Source File: test_word2vec.py from topical_word_embeddings (MIT License)

def testLineSentenceWorksWithCompressedFile(self):
    """Does LineSentence work with a compressed file object argument?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #8
Source File: GensimLDA.py from termite-data-server (BSD 3-Clause "New" or "Revised" License)

def get_texts(self):
    total_docs = 0
    if os.path.isdir(self.input):
        # Read two levels of files
        filenames = glob.glob('{}/*'.format(self.input))
        for filename in filenames:
            if os.path.isdir(filename):
                filenames += glob.glob('{}/*'.format(filename))
        for filename in filenames:
            if not os.path.isdir(filename):
                with utils.smart_open(filename) as f:
                    docId = filename
                    docContent = u' '.join(f.read().decode('utf-8', 'ignore').splitlines())
                    tokens = self.tokenRegex.findall(docContent)
                    tokens = [token.lower().encode('utf-8') for token in tokens if token not in STOPWORDS]
                    yield tokens
                    self.docIds.append(docId)
                    total_docs += 1
    else:
        with utils.smart_open(self.input) as f:
            for line in f:
                docId, docContent = line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
                tokens = self.tokenRegex.findall(docContent)
                tokens = [token.lower().encode('utf-8') for token in tokens if token not in STOPWORDS]
                yield tokens
                self.docIds.append(docId)
                total_docs += 1
    self.length = total_docs
Example #9
Source File: test_word2vec.py from topical_word_embeddings (MIT License)

def testLineSentenceWorksWithNormalFile(self):
    """Does LineSentence work with a file object argument, rather than filename?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
            sentences = word2vec.LineSentence(fin)
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split())
#endclass TestWord2VecSentenceIterators
Example #10
Source File: lowcorpus.py from topical_word_embeddings (MIT License)

def docbyoffset(self, offset):
    """
    Return the document stored at file position `offset`.
    """
    with utils.smart_open(self.fname) as f:
        f.seek(offset)
        return self.line2doc(f.readline())
# endclass LowCorpus
Example #11
Source File: malletcorpus.py from topical_word_embeddings (MIT License)

def docbyoffset(self, offset):
    """
    Return the document stored at file position `offset`.
    """
    with utils.smart_open(self.fname) as f:
        f.seek(offset)
        return self.line2doc(f.readline())
# endclass MalletCorpus
Example #12
Source File: malletcorpus.py from topical_word_embeddings (MIT License)

def __iter__(self):
    """
    Iterate over the corpus at the given filename.

    Yields a bag-of-words, a.k.a list of tuples of (word id, word count),
    based on the given id2word dictionary.
    """
    with utils.smart_open(self.fname) as f:
        for line in f:
            yield self.line2doc(line)
Example #13
Source File: svmlightcorpus.py from topical_word_embeddings (MIT License)

def docbyoffset(self, offset):
    """
    Return the document stored at file position `offset`.
    """
    with utils.smart_open(self.fname) as f:
        f.seek(offset)
        return self.line2doc(f.readline())[0]
Example #14
Source File: svmlightcorpus.py from topical_word_embeddings (MIT License)

def __iter__(self):
    """
    Iterate over the corpus, returning one sparse vector at a time.
    """
    lineno = -1
    self.labels = []
    with utils.smart_open(self.fname) as fin:
        for lineno, line in enumerate(fin):
            doc = self.line2doc(line)
            if doc is not None:
                if self.store_labels:
                    self.labels.append(doc[1])
                yield doc[0]
    self.length = lineno + 1
Example #15
Source File: interfaces.py from topical_word_embeddings (MIT License)

def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save an existing `corpus` to disk.

    Some formats also support saving the dictionary (`feature_id->word`
    mapping), which can in this case be provided by the optional `id2word`
    parameter.

    >>> MmCorpus.save_corpus('file.mm', corpus)

    Some corpora also support an index of where each document begins, so
    that the documents on disk can be accessed in O(1) time (see the
    `corpora.IndexedCorpus` base class). In this case, `save_corpus` is
    automatically called internally by `serialize`, which does `save_corpus`
    plus saves the index at the same time, so you want to store the corpus
    with::

    >>> MmCorpus.serialize('file.mm', corpus)  # stores index as well, allowing random access to individual documents

    Calling `serialize()` is preferred to calling `save_corpus()`.
    """
    raise NotImplementedError('cannot instantiate abstract base class')

    # example code:
    logger.info("converting corpus to ??? format: %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        for doc in corpus:  # iterate over the document stream
            fmt = str(doc)  # format the document appropriately...
            fout.write(utils.to_utf8("%s\n" % fmt))  # serialize the formatted document to disk
#endclass CorpusABC
Example #16
Source File: test_word2vec.py from topical_word_embeddings (MIT License)

def testLineSentenceWorksWithFilename(self):
    """Does LineSentence work with a filename argument?"""
    with utils.smart_open(datapath('lee_background.cor')) as orig:
        sentences = word2vec.LineSentence(datapath('lee_background.cor'))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #17
Source File: preprocessing.py from topical_word_embeddings (MIT License)

def read_file(path):
    with utils.smart_open(path) as fin:
        return fin.read()
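A closing note: in gensim 3.8 and later, gensim.utils.smart_open is deprecated in favor of the standalone smart_open package, and newer releases drop it entirely. Under that assumption, the modern equivalent of the calls above is the package's own open() function; a minimal sketch, with a hypothetical file name:

from smart_open import open  # drop-in replacement for the built-in open()

with open('corpus.txt.bz2', 'rb') as fin:
    data = fin.read()  # decompressed bytes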