Python gensim.utils.smart_open() Examples
The following are 17 code examples of gensim.utils.smart_open(), collected from open-source projects. Each example lists its original project and source file, so you can trace it back to its full context. You may also want to check out the other available functions and classes of the gensim.utils module.
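Before the examples, a quick orientation: smart_open opens a path for reading or writing and transparently handles on-the-fly (de)compression for files ending in .gz or .bz2, returning an ordinary file object usable as a context manager. The sketch below is illustrative only; the file names are hypothetical, and it assumes a gensim version that still ships utils.smart_open (newer releases delegate to the standalone smart_open package).

from gensim import utils

# A plain file and a bz2-compressed file are opened the same way;
# the compression codec is chosen from the file extension.
with utils.smart_open('corpus.txt', 'rb') as fin:
    first_line = fin.readline()

with utils.smart_open('corpus.txt.bz2') as fin:
    num_lines = sum(1 for _ in fin)  # iterate line by line, decompressing lazily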
Example #1
Source File: ldamallet.py from topical_word_embeddings (MIT License)

def load_word_topics(self):
    logger.info("loading assigned topics from %s" % self.fstate())
    wordtopics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)
    with utils.smart_open(self.fstate()) as fin:
        _ = next(fin)  # header
        self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
        assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
        _ = next(fin)  # beta
        for lineno, line in enumerate(fin):
            line = utils.to_unicode(line)
            doc, source, pos, typeindex, token, topic = line.split()
            tokenid = self.id2word.token2id[token] if hasattr(self.id2word, 'token2id') else int(token)
            wordtopics[int(topic), tokenid] += 1
    logger.info("loaded assigned topics for %i tokens" % wordtopics.sum())
    self.wordtopics = wordtopics
    self.print_topics(15)
Example #2
Source File: dictionary.py from topical_word_embeddings (MIT License)

def save_as_text(self, fname, sort_by_word=True):
    """
    Save this Dictionary to a text file, in format:
    `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`. Sorted by word,
    or by decreasing word frequency.

    Note: text format should be used for corpus inspection. Use `save`/`load`
    to store in binary format (pickle) for improved performance.
    """
    logger.info("saving dictionary mapping to %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        if sort_by_word:
            for token, tokenid in sorted(iteritems(self.token2id)):
                line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                fout.write(utils.to_utf8(line))
        else:
            for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                fout.write(utils.to_utf8(line))
Example #3
Source File: hashdictionary.py from topical_word_embeddings (MIT License)

def save_as_text(self, fname):
    """
    Save this HashDictionary to a text file, for easier debugging.

    The format is:
    `id[TAB]document frequency of this id[TAB]tab-separated set of words in UTF8 that map to this id[NEWLINE]`.

    Note: use `save`/`load` to store in binary format instead (pickle).
    """
    logger.info("saving HashDictionary mapping to %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        for tokenid in self.keys():
            words = sorted(self[tokenid])
            if words:
                words_df = [(word, self.dfs_debug.get(word, 0)) for word in words]
                words_df = ["%s(%i)" % item for item in sorted(words_df, key=lambda item: -item[1])]
                fout.write(utils.to_utf8("%i\t%i\t%s\n" % (tokenid, self.dfs.get(tokenid, 0), '\t'.join(words_df))))
#endclass HashDictionary
Example #4
Source File: ucicorpus.py from topical_word_embeddings (MIT License)

def __init__(self, input):
    """
    Initialize the reader.

    The `input` parameter refers to a file on the local filesystem,
    which is expected to be in the UCI Bag-of-Words format.
    """
    logger.info('Initializing corpus reader from %s' % input)
    self.input = input
    with utils.smart_open(self.input) as fin:
        self.num_docs = self.num_terms = self.num_nnz = 0
        try:
            self.num_docs = int(next(fin).strip())
            self.num_terms = int(next(fin).strip())
            self.num_nnz = int(next(fin).strip())
        except StopIteration:
            pass
    logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' %
                (self.num_docs, self.num_terms, self.num_nnz))
Example #5
Source File: svmlightcorpus.py from topical_word_embeddings (MIT License)

def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
    """
    Save a corpus in the SVMlight format.

    The SVMlight `<target>` class tag is taken from the `labels` array, or
    set to 0 for all documents if `labels` is not supplied.

    This function is automatically called by `SvmLightCorpus.serialize`;
    don't call it directly, call `serialize` instead.
    """
    logger.info("converting corpus to SVMlight format: %s" % fname)
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        for docno, doc in enumerate(corpus):
            label = labels[docno] if labels else 0  # target class is 0 by default
            offsets.append(fout.tell())
            fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label)))
    return offsets
Example #6
Source File: ucicorpus.py from topical_word_embeddings (MIT License)

def __init__(self, fname, fname_vocab=None):
    IndexedCorpus.__init__(self, fname)
    UciReader.__init__(self, fname)

    if fname_vocab is None:
        fname_vocab = fname + '.vocab'

    self.fname = fname
    with utils.smart_open(fname_vocab) as fin:
        words = [word.strip() for word in fin]
    self.id2word = dict(enumerate(words))
    self.transposed = True
Example #7
Source File: test_word2vec.py from topical_word_embeddings (MIT License)

def testLineSentenceWorksWithCompressedFile(self):
    """Does LineSentence work with a compressed file object argument?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #8
Source File: GensimLDA.py from termite-data-server (BSD 3-Clause "New" or "Revised" License)

def get_texts(self):
    total_docs = 0
    if os.path.isdir(self.input):
        # Read two levels of files
        filenames = glob.glob('{}/*'.format(self.input))
        for filename in filenames:
            if os.path.isdir(filename):
                filenames += glob.glob('{}/*'.format(filename))
        for filename in filenames:
            if not os.path.isdir(filename):
                with utils.smart_open(filename) as f:
                    docId = filename
                    docContent = u' '.join(f.read().decode('utf-8', 'ignore').splitlines())
                    tokens = self.tokenRegex.findall(docContent)
                    tokens = [token.lower().encode('utf-8') for token in tokens if token not in STOPWORDS]
                    yield tokens
                    self.docIds.append(docId)
                    total_docs += 1
    else:
        with utils.smart_open(self.input) as f:
            for line in f:
                docId, docContent = line.decode('utf-8', 'ignore').rstrip('\n').split('\t')
                tokens = self.tokenRegex.findall(docContent)
                tokens = [token.lower().encode('utf-8') for token in tokens if token not in STOPWORDS]
                yield tokens
                self.docIds.append(docId)
                total_docs += 1
    self.length = total_docs
Example #9
Source File: test_word2vec.py from topical_word_embeddings (MIT License)

def testLineSentenceWorksWithNormalFile(self):
    """Does LineSentence work with a file object argument, rather than filename?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
            sentences = word2vec.LineSentence(fin)
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split())
#endclass TestWord2VecSentenceIterators
Example #10
Source File: lowcorpus.py from topical_word_embeddings (MIT License)

def docbyoffset(self, offset):
    """
    Return the document stored at file position `offset`.
    """
    with utils.smart_open(self.fname) as f:
        f.seek(offset)
        return self.line2doc(f.readline())
# endclass LowCorpus
Example #11
Source File: malletcorpus.py from topical_word_embeddings (MIT License)

def docbyoffset(self, offset):
    """
    Return the document stored at file position `offset`.
    """
    with utils.smart_open(self.fname) as f:
        f.seek(offset)
        return self.line2doc(f.readline())
# endclass MalletCorpus
Example #12
Source File: malletcorpus.py from topical_word_embeddings (MIT License)

def __iter__(self):
    """
    Iterate over the corpus at the given filename.

    Yields a bag-of-words, a.k.a list of tuples of (word id, word count),
    based on the given id2word dictionary.
    """
    with utils.smart_open(self.fname) as f:
        for line in f:
            yield self.line2doc(line)
Example #13
Source File: svmlightcorpus.py from topical_word_embeddings (MIT License)

def docbyoffset(self, offset):
    """
    Return the document stored at file position `offset`.
    """
    with utils.smart_open(self.fname) as f:
        f.seek(offset)
        return self.line2doc(f.readline())[0]
Example #14
Source File: svmlightcorpus.py from topical_word_embeddings (MIT License)

def __iter__(self):
    """
    Iterate over the corpus, returning one sparse vector at a time.
    """
    lineno = -1
    self.labels = []
    with utils.smart_open(self.fname) as fin:
        for lineno, line in enumerate(fin):
            doc = self.line2doc(line)
            if doc is not None:
                if self.store_labels:
                    self.labels.append(doc[1])
                yield doc[0]
    self.length = lineno + 1
Example #15
Source File: interfaces.py from topical_word_embeddings (MIT License)

def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save an existing `corpus` to disk.

    Some formats also support saving the dictionary (`feature_id->word`
    mapping), which can in this case be provided by the optional `id2word`
    parameter.

    >>> MmCorpus.save_corpus('file.mm', corpus)

    Some corpora also support an index of where each document begins, so
    that the documents on disk can be accessed in O(1) time (see the
    `corpora.IndexedCorpus` base class). In this case, `save_corpus` is
    automatically called internally by `serialize`, which does `save_corpus`
    plus saves the index at the same time, so you want to store the corpus
    with::

    >>> MmCorpus.serialize('file.mm', corpus)  # stores index as well, allowing random access to individual documents

    Calling `serialize()` is preferred to calling `save_corpus()`.
    """
    raise NotImplementedError('cannot instantiate abstract base class')

    # example code:
    logger.info("converting corpus to ??? format: %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        for doc in corpus:  # iterate over the document stream
            fmt = str(doc)  # format the document appropriately...
            fout.write(utils.to_utf8("%s\n" % fmt))  # serialize the formatted document to disk
#endclass CorpusABC
Example #16
Source File: test_word2vec.py from topical_word_embeddings (MIT License)

def testLineSentenceWorksWithFilename(self):
    """Does LineSentence work with a filename argument?"""
    with utils.smart_open(datapath('lee_background.cor')) as orig:
        sentences = word2vec.LineSentence(datapath('lee_background.cor'))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #17
Source File: preprocessing.py from topical_word_embeddings (MIT License)

def read_file(path):
    with utils.smart_open(path) as fin:
        return fin.read()
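A closing note: in gensim 3.8 and later, gensim.utils.smart_open is deprecated in favor of the standalone smart_open package, and newer releases drop it entirely. Under that assumption, the modern equivalent of the calls above is the package's own open() function; a minimal sketch, with a hypothetical file name:

from smart_open import open  # drop-in replacement for the built-in open()

with open('corpus.txt.bz2', 'rb') as fin:
    data = fin.read()  # decompressed bytes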