Python gensim.models.keyedvectors.KeyedVectors.load_word2vec_format() Examples
The following are 22 code examples of gensim.models.keyedvectors.KeyedVectors.load_word2vec_format().
You can go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the gensim.models.keyedvectors.KeyedVectors module.
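Before the project examples, here is a minimal usage sketch. It is illustrative only: the file name vectors.bin and the query word are placeholders, not taken from any example below. It uses only the documented KeyedVectors API: binary=True reads the binary word2vec format, binary=False reads the plain-text format, and limit caps how many vectors are loaded.

from gensim.models.keyedvectors import KeyedVectors

# Hypothetical path: any file in word2vec format works; use binary=True for .bin files,
# binary=False for plain-text .txt/.vec files. limit keeps memory usage down.
model = KeyedVectors.load_word2vec_format("vectors.bin", binary=True, limit=50000)

# Assumes "king" is in the loaded vocabulary; a missing word raises KeyError.
vector = model["king"]                       # raw embedding vector
print(model.most_similar("king", topn=5))    # nearest neighbours by cosine similarity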
Example #1
Source File: build_w2v.py From text-classifier with Apache License 2.0
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('大', '小')
    # print('大 vs 小 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    save_pkl(word_dict, out_path, overwrite=True)
Example #2
Source File: word2vec.py From nlp-recipes with MIT License
def load_pretrained_vectors(
    dir_path, file_name="GoogleNews-vectors-negative300.bin", limit=None
):
    """
    Method that loads word2vec vectors. Downloads if it doesn't exist.

    Args:
        file_name(str): Name of the word2vec file.
        dir_path(str): Path to the directory where word2vec vectors exist or will be
            downloaded.
        limit(int): Number of word vectors that is loaded from gensim. This option
            allows us to save RAM space and avoid memory errors.

    Returns:
        gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word2vectors
    """
    file_path = _maybe_download_and_extract(dir_path, file_name)
    word2vec_vectors = KeyedVectors.load_word2vec_format(
        file_path, binary=True, limit=limit
    )

    return word2vec_vectors
Example #3
Source File: utils.py From text-summarization-tensorflow with MIT License
def get_init_embedding(reversed_dict, embedding_size):
    glove_file = "glove/glove.42B.300d.txt"
    word2vec_file = get_tmpfile("word2vec_format.vec")
    glove2word2vec(glove_file, word2vec_file)
    print("Loading Glove vectors...")
    word_vectors = KeyedVectors.load_word2vec_format(word2vec_file)

    word_vec_list = list()
    for _, word in sorted(reversed_dict.items()):
        try:
            word_vec = word_vectors.word_vec(word)
        except KeyError:
            word_vec = np.zeros([embedding_size], dtype=np.float32)
        word_vec_list.append(word_vec)

    # Assign random vector to <s>, </s> token
    word_vec_list[2] = np.random.normal(0, 1, embedding_size)
    word_vec_list[3] = np.random.normal(0, 1, embedding_size)

    return np.array(word_vec_list)
Example #4
Source File: vectorizers.py From revscoring with MIT License
def load_word2vec(filename=None, path=None, binary=False, limit=None):
    if path is not None:
        return KeyedVectors.load_word2vec_format(
            path, binary=binary, limit=limit)
    elif filename is not None:
        for dir_path in ASSET_SEARCH_DIRS:
            try:
                path = os.path.join(dir_path, filename)
                return KeyedVectors.load_word2vec_format(
                    path, binary=binary, limit=limit)
            except FileNotFoundError:
                continue
        raise FileNotFoundError("Please make sure that 'filename' \
                                 specifies the word vector binary name \
                                 in default search paths or 'path' \
                                 specifies file path of the binary")
    else:
        raise TypeError(
            "load_word2vec() requires either 'filename' or 'path' to be set.")
Example #5
Source File: wordvec.py From OpenNIR with MIT License
def gensim_w2v_handler(url):
    def wrapped(logger):
        with tempfile.TemporaryDirectory() as p:
            vocab_path = os.path.join(p, 'vocab')
            with logger.duration(f'downloading {url}'):
                util.download(url, vocab_path)
            with logger.duration(f'loading binary {vocab_path}'):
                vectors = KeyedVectors.load_word2vec_format(vocab_path, binary=True)
            vocab_path += '.txt'
            with logger.duration(f'saving text {vocab_path}'):
                vectors.save_word2vec_format(vocab_path)
            with logger.duration(f'reading embedding'):
                weights = None
                terms = []
                for i, values in enumerate(plaintext.read_sv(vocab_path, sep=' ')):
                    if i == 0:
                        weights = np.ndarray((int(values[0]), int(values[1])))
                    else:
                        term, values = values[0], values[1:]
                        terms.append(term)
                        weights[i-1] = [float(v) for v in values]
        return terms, np.array(weights)
    return wrapped
Example #6
Source File: build_w2v.py From castor with Apache License 2.0
def convert(fname, save_file):
    with open(fname, 'rb') as dim_file:
        vocab_size, dim = (int(x) for x in dim_file.readline().split())

    word_vectors = KeyedVectors.load_word2vec_format(fname, binary=True)
    print("Loading vectors from {}".format(fname))

    vectors = []
    for line in tqdm(word_vectors.syn0, total=len(word_vectors.syn0)):
        vectors.extend(line.tolist())
    vectors = torch.Tensor(vectors).view(-1, dim)

    stoi = {word.strip(): voc.index for word, voc in word_vectors.vocab.items()}

    print('saving vectors to', save_file)
    torch.save((stoi, vectors, dim), save_file)
Example #7
Source File: node2vec_recommender.py From entity2rec with Apache License 2.0
def __init__(self, dataset, p=1, q=4, walk_length=100, num_walks=50,
             dimensions=200, window_size=30, workers=8, iterations=5):
    Node2Vec.__init__(self, False, True, False, p, q, walk_length, num_walks,
                      dimensions, window_size, workers, iterations)

    self.dataset = dataset

    file = 'num%d_p%d_q%d_l%d_d%d_iter%d_winsize%d.emd' % (
        num_walks, p, q, walk_length, dimensions, iterations, window_size)
    self.path = 'datasets/%s/node2vec/' % self.dataset + file

    if file not in os.listdir('datasets/%s/node2vec/' % self.dataset):
        self.run('datasets/%s/node2vec/altogether.edgelist' % self.dataset, self.path)

    self.node2vec_model = KeyedVectors.load_word2vec_format(self.path, binary=True)
Example #8
Source File: entity2rel.py From entity2rec with Apache License 2.0
def add_embedding(self, property, embedding_file):
    self.embedding_files[property] = KeyedVectors.load_word2vec_format(embedding_file,
                                                                       binary=self.binary)
Example #9
Source File: test_DocSim.py From document-similarity with MIT License
def setUpClass(cls):
    test_model_path = './data/test_data.txt'
    cls.w2v_model = KeyedVectors.load_word2vec_format(test_model_path, binary=False)
    cls.stopwords = ['to', 'an', 'a']
    cls.doc_sim = DocSim(cls.w2v_model, cls.stopwords)
Example #10
Source File: feature_extraction.py From nlp-architect with Apache License 2.0
def load_word2vec_model_from_path(self):
    """
    Load Word2Vec model

    Returns:
        the Word2Vec model
    """
    word_embeddings_model = KeyedVectors.load_word2vec_format(
        self.word2vec_model_path, binary=True
    )
    if not word_embeddings_model:
        return None
    return word_embeddings_model
Example #11
Source File: embeddings.py From danlp with BSD 3-Clause "New" or "Revised" License
def load_wv_with_gensim(pretrained_embedding: str, cache_dir=DEFAULT_CACHE_DIR,
                        verbose: bool = False):
    """
    Available wordembeddings:
    - wiki.da.wv
    - cc.da.wv
    - conll17.da.wv
    - news.da.wv
    - sketchengine.da.wv

    Available subwordembeddings:
    - wiki.da.swv
    - cc.da.swv
    - sketchengine.da.swv

    :param pretrained_embedding:
    :param cache_dir: the directory for storing cached data
    :param verbose:
    :return: KeyedVectors or FastTextKeyedVectors
    """
    word_embeddings_available(pretrained_embedding, can_use_subword=True)
    download_model(pretrained_embedding, cache_dir,
                   _process_downloaded_embeddings, verbose=verbose)
    wv_path = os.path.join(cache_dir, pretrained_embedding + ".bin")

    if pretrained_embedding.split(".")[-1] == 'wv':
        return KeyedVectors.load_word2vec_format(wv_path, binary=True)
    elif pretrained_embedding.split(".")[-1] == 'swv':
        from gensim.models.fasttext import load_facebook_vectors
        return load_facebook_vectors(wv_path)
Example #12
Source File: gensim_word2vec.py From seq2vec with GNU General Public License v3.0
def __init__(self, model_path):
    self.word2vec = KeyedVectors.load_word2vec_format(
        model_path, binary=True
    )
Example #13
Source File: entity2rel.py From entity2vec with Apache License 2.0
def add_embedding(self, embedding_file):
    self.embedding_files.append(KeyedVectors.load_word2vec_format(embedding_file,
                                                                  binary=self.binary))

# access a particular embedding file and get the relatedness score
Example #14
Source File: multi_class_classification.py From edge2vec with BSD 3-Clause "New" or "Revised" License
def load_word2vec_model(file):
    '''
    load node embedding model
    '''
    model = KeyedVectors.load_word2vec_format(file, binary=False)
    # print model.wv["1"]
    return model
Example #15
Source File: train.py From DeepNews with Apache License 2.0
def check_for_similar_words(self,):
    from gensim.models.keyedvectors import KeyedVectors
    model = KeyedVectors.load_word2vec_format("../../temp_results/word2vec_hindi.txt",
                                              binary=False)
    self.pretty_print(u"भारत", model.most_similar(u"भारत"))
    self.pretty_print(u"सिंह", model.most_similar(u"सिंह"))
    self.pretty_print(u"क्रिकेट", model.most_similar(u"क्रिकेट"))
    self.pretty_print(u"रुपये", model.most_similar(u"रुपये"))
Example #16
Source File: prepare_d2d.py From NPRF with Apache License 2.0
def sim_mat_and_kernel_d2d(relevance_file, topic_file, corpus_file, topk_corpus_file,
                           embedding_file, stop_file, sim_output_path, kernel_output_path,
                           kernel_mu_list, kernel_sigma_list, topk_supervised, d2d, test):
    '''Simultaneously compute similarity matrix and RBF kernel features

    Args:
        relevance_file: A dumped relevance dict file
        topic_file: a single line format topic file. format: qid term1 term2 ...
        corpus_file: corpus corresponding to docnolist file. format: docno\tdoclen\tterm1 term2
        topk_corpus_file: corpus that contain only the topk terms for each document, format: same as corpus_file
        embedding_file: output file from word2vec toolkit, boolean=True
        stop_file: a stopword list file, one word per line
        sim_output_path:
        kernel_output_path:
        kernel_mu_list:
        kernel_sigma_list:
        topk_supervised: number of top-n documents for each query
        d2d: True for NPRF, False for simple query-document matching used by e.g. DRMM, K-NRM
        test: control the temporary output. Set false

    Returns:

    '''
    relevance_dict = load_pickle(relevance_file)
    topic_dict = parse_topic(topic_file)
    corpus = parse_corpus(corpus_file)
    topk_corpus = parse_corpus(topk_corpus_file)
    embeddings = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
    stoplist = parse_stoplist(stop_file)

    qid_list = relevance_dict.keys()
    for qid in qid_list:
        sim_mat_and_kernel_per_query(relevance_dict, topic_dict, corpus, topk_corpus,
                                     embeddings, stoplist, sim_output_path,
                                     kernel_output_path, kernel_mu_list, kernel_sigma_list,
                                     topk_supervised, d2d, test, qid)
Example #17
Source File: word_embeddings.py From chameleon_recsys with MIT License
def load_word_embeddings(path, binary=True):
    w2v_model = KeyedVectors.load_word2vec_format(path, binary=binary)
    return w2v_model
Example #18
Source File: word_model.py From coqa-baselines with MIT License
def set_model(self, filename, embed_type='glove'):
    timer = Timer('Load {}'.format(filename))
    if embed_type == 'glove':
        self._model = GloveModel(filename)
    else:
        self._model = KeyedVectors.load_word2vec_format(
            filename, binary=True if embed_type == 'word2vec' else False)
    print('Embeddings: vocab = {}, embed_size = {}'.format(len(self._model.vocab),
                                                           self._model.vector_size))
    timer.finish()
Example #19
Source File: fasttext_embedding.py From SOQAL with MIT License
def __init__(self, model_path):
    self.model_path = model_path
    print("loading fastText model ...")
    # self.model = pickle.load(open(self.model_path, "rb"))
    self.model = KeyedVectors.load_word2vec_format(self.model_path,
                                                   encoding='utf-8',
                                                   unicode_errors='ignore')
    print("done fastText loading model")
    self.tokenizer = WordPunctTokenizer()
    self.stemmer = ARLSTem()
    self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
    self.vocab = self.model.vocab
Example #20
Source File: link_prediction.py From edge2vec with BSD 3-Clause "New" or "Revised" License
def load_word2vec_model(file):
    '''
    return node embedding model
    '''
    model = KeyedVectors.load_word2vec_format(file, binary=False)
    # print model.wv["1"]
    return model
Example #21
Source File: corpus_utils.py From NeuronBlocks with MIT License
def load_embedding(embedding_path, embedding_dim, format, file_type, with_head=False, word_set=None):
    """
    Args:
        format: 'glove', 'word2vec', 'fasttext'
        file_type: 'text' or 'binary'
    """
    embedding_dict = dict()

    if format == 'word2vec' or format == 'fasttext':
        if file_type == 'text':
            vector_total = KeyedVectors.load_word2vec_format(embedding_path, binary=False,
                                                             unicode_errors='ignore')
        else:
            if format == 'word2vec':
                vector_total = KeyedVectors.load_word2vec_format(embedding_path, binary=True,
                                                                 unicode_errors='ignore')
            elif format == 'fasttext':
                vector_total = FastText.load_fasttext_format(embedding_path, encoding='utf8')
        assert vector_total.vector_size == embedding_dim

        if word_set is None:
            embedding_dict = vector_total
        else:
            if not (format == 'fasttext' and file_type == 'binary'):
                word_total = vector_total.index2word  # actually, vector_total.index2word is the word list
            else:
                word_total = vector_total.wv.index2word
            for word in word_total:
                if word in word_set:
                    embedding_dict[word] = vector_total[word]
    elif format == 'glove':
        with codecs.open(embedding_path, 'r', encoding='utf-8') as fin:
            if with_head == True:
                _ = fin.readline()
            for idx, line in enumerate(fin):
                line = line.rstrip()
                if idx == 0 and len(line.split()) == 2:
                    continue
                if len(line) > 0:
                    word, vec = line.split(" ", 1)
                    if (word_set and word in word_set) or (word_set is None):
                        vector = [float(num) for num in vec.split(" ")]
                        assert len(vector) == embedding_dim
                        embedding_dict[word] = vector
    else:
        raise Exception('The formats supported are glove, word2vec or fasttext; %s is not supported now.' % format)

    return embedding_dict
Example #22
Source File: embeddings.py From danlp with BSD 3-Clause "New" or "Revised" License
def _process_embeddings_for_spacy(tmp_file_path: str, meta_info: dict,
                                  cache_dir: str = DEFAULT_CACHE_DIR,
                                  clean_up_raw_data: bool = True,
                                  verbose: bool = False):
    """
    To use pretrained embeddings with spaCy the embeddings need to be stored in
    a specific format. This function converts embeddings saved in the binary
    word2vec format to a spaCy model with the init_model() function from spaCy.
    The generated files will be saved in the cache_dir under a folder called
    <pretrained_embedding>.spacy

    More information on converting pretrained word embeddings to spaCy models here:
    https://spacy.io/usage/vectors-similarity#custom

    :param str tmp_file_path: the file name of the embedding binary file
    :param str cache_dir: the directory for storing cached data
    :param bool verbose:
    """
    from pathlib import Path
    from spacy.cli import init_model

    embeddings = meta_info['name']

    bin_file_path = os.path.join(cache_dir, embeddings + ".bin")
    if not os.path.isfile(bin_file_path):  # Preprocess to transform to word2vec .bin format
        _process_downloaded_embeddings(tmp_file_path, meta_info, cache_dir,
                                       clean_up_raw_data, verbose)

    vec_file = embeddings + ".vec"

    word_vecs = KeyedVectors.load_word2vec_format(bin_file_path, binary=True,
                                                  encoding='utf8')
    assert_wv_dimensions(word_vecs, embeddings)
    word_vecs.save_word2vec_format(vec_file, binary=False)

    spacy_dir = os.path.join(cache_dir, embeddings + '.spacy')
    os.makedirs(spacy_dir, exist_ok=True)

    if os.path.isabs(spacy_dir):
        full_spacy_dir = Path(spacy_dir)
    else:
        full_spacy_dir = Path(os.path.join(os.getcwd(), spacy_dir))

    init_model('da', full_spacy_dir, vectors_loc=vec_file)

    os.remove(vec_file)  # Clean up the vec file