Python gensim.utils.to_unicode() Examples
The following are 30 code examples of gensim.utils.to_unicode(), collected from open-source projects; the source file and license are noted above each example. You may also want to check out the other available functions and classes of the gensim.utils module.
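In the gensim versions these examples target, to_unicode is an alias of gensim.utils.any2unicode: it decodes a byte string into a unicode string (str objects pass through unchanged), using UTF-8 by default. A minimal sketch of standalone usage, assuming Python 3; the exact set of helpers exposed by gensim.utils varies across gensim releases:

    from gensim import utils

    raw = b'caf\xc3\xa9'                 # UTF-8 encoded bytes
    text = utils.to_unicode(raw)         # decode with the default 'utf8' encoding
    assert text == 'café' and isinstance(text, str)

    # an explicit encoding and error policy can also be passed
    latin_text = utils.to_unicode(b'caf\xe9', encoding='latin1', errors='strict')
    assert latin_text == 'café'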
Example #1
Source File: ldamallet.py From topical_word_embeddings with MIT License | 6 votes |
def load_word_topics(self):
    logger.info("loading assigned topics from %s" % self.fstate())
    wordtopics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)
    with utils.smart_open(self.fstate()) as fin:
        _ = next(fin)  # header
        self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
        assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
        _ = next(fin)  # beta
        for lineno, line in enumerate(fin):
            line = utils.to_unicode(line)
            doc, source, pos, typeindex, token, topic = line.split()
            tokenid = self.id2word.token2id[token] if hasattr(self.id2word, 'token2id') else int(token)
            wordtopics[int(topic), tokenid] += 1
    logger.info("loaded assigned topics for %i tokens" % wordtopics.sum())
    self.wordtopics = wordtopics
    self.print_topics(15)
Example #2
Source File: dictionary.py From topical_word_embeddings with MIT License | 6 votes |
def load_from_text(fname):
    """
    Load a previously stored Dictionary from a text file.
    Mirror function to `save_as_text`.
    """
    result = Dictionary()
    with utils.smart_open(fname) as f:
        for lineno, line in enumerate(f):
            line = utils.to_unicode(line)
            try:
                wordid, word, docfreq = line[:-1].split('\t')
            except Exception:
                raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
            wordid = int(wordid)
            if word in result.token2id:
                raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
            result.token2id[word] = wordid
            result.dfs[wordid] = int(docfreq)
    return result
Example #3
Source File: test_lee.py From topical_word_embeddings with MIT License | 6 votes |
def setUp(self):
    """setup lee test corpora"""
    global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    bg_corpus_file = 'lee_background.cor'
    corpus_file = 'lee.cor'
    sim_file = 'similarities0-1.txt'

    # read in the corpora
    latin1 = lambda line: utils.to_unicode(line, encoding='latin1')
    with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus = preprocess_documents(latin1(line) for line in f)
    with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
        corpus = preprocess_documents(latin1(line) for line in f)
    with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]
    with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
        corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]

    # read the human similarity data
    sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
    sim_m_size = np.shape(sim_matrix)[0]
    human_sim_vector = sim_matrix[matutils.triu_indices(sim_m_size, 1)]
Example #4
Source File: test_lee.py From topical_word_embeddings with MIT License | 6 votes |
def setUp(self):
    """setup lee test corpora"""
    global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    bg_corpus_file = 'lee_background.cor'
    corpus_file = 'lee.cor'
    sim_file = 'similarities0-1.txt'

    # read in the corpora
    latin1 = lambda line: utils.to_unicode(line, encoding='latin1')
    with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus = preprocess_documents(latin1(line) for line in f)
    with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
        corpus = preprocess_documents(latin1(line) for line in f)
    with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]
    with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
        corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]

    # read the human similarity data
    sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
    sim_m_size = np.shape(sim_matrix)[0]
    human_sim_vector = sim_matrix[matutils.triu_indices(sim_m_size, 1)]
Example #5
Source File: dump.py From embedding with MIT License | 6 votes |
def tokenize(content, token_min_len=2, token_max_len=100, lower=True):
    content = re.sub(EMAIL_PATTERN, ' ', content)  # remove email pattern
    content = re.sub(URL_PATTERN, ' ', content)  # remove url pattern
    content = re.sub(WIKI_REMOVE_CHARS, ' ', content)  # remove unnecessary chars
    content = re.sub(WIKI_SPACE_CHARS, ' ', content)
    content = re.sub(MULTIPLE_SPACES, ' ', content)
    tokens = content.replace(", )", "").split(" ")
    result = []
    for token in tokens:
        if not token.startswith('_'):
            token_candidate = to_unicode(re.sub(WIKI_REMOVE_TOKEN_CHARS, '', token))
        else:
            token_candidate = ""
        if len(token_candidate) > 0:
            result.append(token_candidate)
    return result
Example #6
Source File: dictionary.py From topical_word_embeddings with MIT License | 6 votes |
def load_from_text(fname):
    """
    Load a previously stored Dictionary from a text file.
    Mirror function to `save_as_text`.
    """
    result = Dictionary()
    with utils.smart_open(fname) as f:
        for lineno, line in enumerate(f):
            line = utils.to_unicode(line)
            try:
                wordid, word, docfreq = line[:-1].split('\t')
            except Exception:
                raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
            wordid = int(wordid)
            if word in result.token2id:
                raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
            result.token2id[word] = wordid
            result.dfs[wordid] = int(docfreq)
    return result
Example #7
Source File: ldamallet.py From topical_word_embeddings with MIT License | 6 votes |
def load_word_topics(self):
    logger.info("loading assigned topics from %s" % self.fstate())
    wordtopics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)
    with utils.smart_open(self.fstate()) as fin:
        _ = next(fin)  # header
        self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
        assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
        _ = next(fin)  # beta
        for lineno, line in enumerate(fin):
            line = utils.to_unicode(line)
            doc, source, pos, typeindex, token, topic = line.split()
            tokenid = self.id2word.token2id[token] if hasattr(self.id2word, 'token2id') else int(token)
            wordtopics[int(topic), tokenid] += 1
    logger.info("loaded assigned topics for %i tokens" % wordtopics.sum())
    self.wordtopics = wordtopics
    self.print_topics(15)
Example #8
Source File: test_lee.py From topical_word_embeddings with MIT License | 6 votes |
def setUp(self):
    """setup lee test corpora"""
    global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    bg_corpus_file = 'lee_background.cor'
    corpus_file = 'lee.cor'
    sim_file = 'similarities0-1.txt'

    # read in the corpora
    latin1 = lambda line: utils.to_unicode(line, encoding='latin1')
    with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus = preprocess_documents(latin1(line) for line in f)
    with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
        corpus = preprocess_documents(latin1(line) for line in f)
    with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]
    with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
        corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]

    # read the human similarity data
    sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
    sim_m_size = np.shape(sim_matrix)[0]
    human_sim_vector = sim_matrix[matutils.triu_indices(sim_m_size, 1)]
Example #9
Source File: dictionary.py From topical_word_embeddings with MIT License | 6 votes |
def load_from_text(fname):
    """
    Load a previously stored Dictionary from a text file.
    Mirror function to `save_as_text`.
    """
    result = Dictionary()
    with utils.smart_open(fname) as f:
        for lineno, line in enumerate(f):
            line = utils.to_unicode(line)
            try:
                wordid, word, docfreq = line[:-1].split('\t')
            except Exception:
                raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
            wordid = int(wordid)
            if word in result.token2id:
                raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
            result.token2id[word] = wordid
            result.dfs[wordid] = int(docfreq)
    return result
Example #10
Source File: test_word2vec.py From topical_word_embeddings with MIT License | 5 votes |
def testLineSentenceWorksWithCompressedFile(self):
    """Does LineSentence work with a compressed file object argument?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #11
Source File: preprocessing.py From topical_word_embeddings with MIT License | 5 votes |
def strip_multiple_whitespaces(s):
    s = utils.to_unicode(s)
    return RE_WHITESPACE.sub(" ", s)
Example #12
Source File: matutils.py From topical_word_embeddings with MIT License | 5 votes |
def __init__(self, input, transposed=True):
    """
    Initialize the matrix reader.

    The `input` refers to a file on local filesystem, which is expected to be
    in the sparse (coordinate) Matrix Market format. Documents are assumed to
    be rows of the matrix (and document features are columns).

    `input` is either a string (file path) or a file-like object that supports
    `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).
    """
    logger.info("initializing corpus reader from %s" % input)
    self.input, self.transposed = input, transposed
    with utils.file_or_filename(self.input) as lines:
        try:
            header = utils.to_unicode(next(lines)).strip()
            if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" % (self.input, header))
        except StopIteration:
            pass

        self.num_docs = self.num_terms = self.num_nnz = 0
        for lineno, line in enumerate(lines):
            line = utils.to_unicode(line)
            if not line.startswith('%'):
                self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                if not self.transposed:
                    self.num_docs, self.num_terms = self.num_terms, self.num_docs
                break

    logger.info("accepted corpus with %i documents, %i features, %i non-zero entries" % (self.num_docs, self.num_terms, self.num_nnz))
Example #13
Source File: svmlightcorpus.py From topical_word_embeddings with MIT License | 5 votes |
def line2doc(self, line):
    """
    Create a document from a single line (string) in SVMlight format
    """
    line = utils.to_unicode(line)
    line = line[: line.find('#')].strip()
    if not line:
        return None  # ignore comments and empty lines
    parts = line.split()
    if not parts:
        raise ValueError('invalid line format in %s' % self.fname)
    target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]]
    # ignore 'qid' features, convert 1-based feature ids to 0-based
    doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid']
    return doc, target
Example #14
Source File: wikicorpus.py From topical_word_embeddings with MIT License | 5 votes |
def filter_wiki(raw):
    """
    Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode
    or utf-8 encoded string.
    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&nbsp;' --> '\xa0'
    return remove_markup(text)
Example #15
Source File: preprocessing.py From topical_word_embeddings with MIT License | 5 votes |
def strip_short(s, minsize=3):
    s = utils.to_unicode(s)
    return " ".join(e for e in s.split() if len(e) >= minsize)
Example #16
Source File: preprocessing.py From topical_word_embeddings with MIT License | 5 votes |
def remove_stopwords(s):
    s = utils.to_unicode(s)
    return " ".join(w for w in s.split() if w not in STOPWORDS)
Example #17
Source File: preprocessing.py From topical_word_embeddings with MIT License | 5 votes |
def strip_punctuation(s):
    s = utils.to_unicode(s)
    return RE_PUNCT.sub(" ", s)  # unicode.translate cannot delete characters like str can
Example #18
Source File: preprocessing.py From topical_word_embeddings with MIT License | 5 votes |
def strip_tags(s):
    s = utils.to_unicode(s)
    return RE_TAGS.sub("", s)
Example #19
Source File: preprocessing.py From topical_word_embeddings with MIT License | 5 votes |
def strip_numeric(s):
    s = utils.to_unicode(s)
    return RE_NUMERIC.sub("", s)
Example #20
Source File: test_word2vec.py From topical_word_embeddings with MIT License | 5 votes |
def testLineSentenceWorksWithNormalFile(self):
    """Does LineSentence work with a file object argument, rather than filename?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
            sentences = word2vec.LineSentence(fin)
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split())
#endclass TestWord2VecSentenceIterators
Example #21
Source File: test_word2vec.py From topical_word_embeddings with MIT License | 5 votes |
def testLineSentenceWorksWithCompressedFile(self):
    """Does LineSentence work with a compressed file object argument?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2')))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #22
Source File: test_word2vec.py From topical_word_embeddings with MIT License | 5 votes |
def testLineSentenceWorksWithFilename(self):
    """Does LineSentence work with a filename argument?"""
    with utils.smart_open(datapath('lee_background.cor')) as orig:
        sentences = word2vec.LineSentence(datapath('lee_background.cor'))
        for words in sentences:
            self.assertEqual(words, utils.to_unicode(orig.readline()).split())
Example #23
Source File: test_word2vec.py From topical_word_embeddings with MIT License | 5 votes |
def testLineSentenceWorksWithNormalFile(self):
    """Does LineSentence work with a file object argument, rather than filename?"""
    with utils.smart_open(datapath('head500.noblanks.cor')) as orig:
        with utils.smart_open(datapath('head500.noblanks.cor')) as fin:
            sentences = word2vec.LineSentence(fin)
            for words in sentences:
                self.assertEqual(words, utils.to_unicode(orig.readline()).split())
#endclass TestWord2VecSentenceIterators
Example #24
Source File: preprocessing.py From topical_word_embeddings with MIT License | 5 votes |
def preprocess_string(s, filters=DEFAULT_FILTERS):
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s.split()
Example #25
Source File: preprocessing.py From topical_word_embeddings with MIT License | 5 votes |
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
Example #26
Source File: preprocessing.py From topical_word_embeddings with MIT License | 5 votes |
def remove_stopwords(s):
    s = utils.to_unicode(s)
    return " ".join(w for w in s.split() if w not in STOPWORDS)
Example #27
Source File: preprocessing.py From topical_word_embeddings with MIT License | 5 votes |
def split_alphanum(s):
    s = utils.to_unicode(s)
    s = RE_AL_NUM.sub(r"\1 \2", s)
    return RE_NUM_AL.sub(r"\1 \2", s)
Example #28
Source File: preprocessing.py From topical_word_embeddings with MIT License | 5 votes |
def strip_multiple_whitespaces(s):
    s = utils.to_unicode(s)
    return RE_WHITESPACE.sub(" ", s)
Example #29
Source File: preprocessing.py From topical_word_embeddings with MIT License | 5 votes |
def strip_non_alphanum(s):
    s = utils.to_unicode(s)
    return RE_NONALPHA.sub(" ", s)
Example #30
Source File: malletcorpus.py From topical_word_embeddings with MIT License | 5 votes |
def line2doc(self, line):
    l = [word for word in utils.to_unicode(line).strip().split(' ') if word]
    docid, doclang, words = l[0], l[1], l[2:]
    doc = super(MalletCorpus, self).line2doc(' '.join(words))
    if self.metadata:
        return doc, (docid, doclang)
    else:
        return doc