Python gensim.models.KeyedVectors.load_word2vec_format() Examples
The following are 30 code examples of gensim.models.KeyedVectors.load_word2vec_format(), each taken from an open-source project; the project, source file, and license are listed above each example.
You may also want to check out all available functions and classes of the module gensim.models.KeyedVectors.
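Before the project examples, here is a minimal usage sketch of the function itself. It is not drawn from any of the projects below; the file name and the probe word "king" are illustrative assumptions (any file in word2vec format works).

from gensim.models import KeyedVectors

# Hypothetical path; binary=True expects the word2vec C binary format,
# binary=False the plain-text format.
vectors = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin", binary=True)

print(vectors.vector_size)                   # dimensionality of each vector
print(vectors["king"][:5])                   # first five components of the vector for "king"
print(vectors.most_similar("king", topn=3))  # nearest neighbours by cosine similarity

The returned object is a KeyedVectors instance, so the dictionary-style lookups and similarity queries used throughout the examples below are available on it.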
Example #1
Source File: embeddings.py From steppy-toolkit with MIT License | 11 votes |
def load_word2vec_embeddings(filepath, tokenizer, max_features, embedding_size):
    model = KeyedVectors.load_word2vec_format(filepath, binary=True)
    emb_mean, emb_std = model.wv.syn0.mean(), model.wv.syn0.std()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        try:
            embedding_vector = model[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            continue
    return embedding_matrix
Example #2
Source File: loadEmbeddings.py From acl2017-interactive_summarizer with Apache License 2.0 | 6 votes |
def loadEmbeddings(self, filepath, data_path, vocab_size, binary_val):
    embed_short = os.path.normpath("%s/embed.dat" % data_path)
    if not os.path.exists(embed_short):
        print("Caching word embeddings in memmapped format...")
        print(binary_val, filepath)
        wv = KeyedVectors.load_word2vec_format("%s" % (filepath), binary=binary_val)
        fp = np.memmap(embed_short, dtype=np.double, mode='w+', shape=wv.syn0.shape)
        fp[:] = wv.syn0[:]
        with open(os.path.normpath("%s/embed.vocab" % data_path), "w") as fp:
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                fp.write("%s\n" % (w.encode("utf8")))
        del fp, wv
    self.W = np.memmap(os.path.normpath("%s/embed.dat" % data_path), dtype=np.double,
                       mode="r", shape=(vocab_size, self.embedding_size))
    with codecs.open(os.path.normpath("%s/embed.vocab" % data_path), 'r', 'utf-8') as f:
        vocab_list = [x.strip() for x in f.readlines()]
    self.vocab_dict = {w: k for k, w in enumerate(vocab_list)}
Example #3
Source File: models.py From open-solution-toxic-comments with MIT License | 6 votes |
def _get_embedding_matrix(self, tokenizer):
    model = KeyedVectors.load_word2vec_format(self.pretrained_filepath, binary=True)
    emb_mean, emb_std = model.syn0.mean(), model.syn0.std()

    word_index = tokenizer.word_index
    nb_words = min(self.max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, self.embedding_size))
    for word, i in word_index.items():
        if i >= self.max_features:
            continue
        try:
            embedding_vector = model[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            continue
    return embedding_matrix
Example #4
Source File: preprocessing.py From R-NET-in-Keras with MIT License | 6 votes |
def word2vec(word2vec_path):
    # Download word2vec data if it's not present yet
    if not path.exists(word2vec_path):
        glove_file_path = get_glove_file_path()
        print('Converting Glove to word2vec...', end='')
        glove2word2vec(glove_file_path, word2vec_path)  # Convert glove to word2vec
        os.remove(glove_file_path)  # Remove glove file and keep only word2vec
        print('Done')

    print('Reading word2vec data... ', end='')
    model = KeyedVectors.load_word2vec_format(word2vec_path)
    print('Done')

    def get_word_vector(word):
        try:
            return model[word]
        except KeyError:
            return np.zeros(model.vector_size)

    return get_word_vector
Example #5
Source File: embeddings.py From open-solution-mapping-challenge with MIT License | 6 votes |
def load_word2vec_embeddings(filepath, tokenizer, max_features, embedding_size):
    model = KeyedVectors.load_word2vec_format(filepath, binary=True)
    emb_mean, emb_std = model.wv.syn0.mean(), model.wv.syn0.std()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        try:
            embedding_vector = model[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            continue
    return embedding_matrix
Example #6
Source File: pick_wordvec.py From CAMP_iccv19 with Apache License 2.0 | 6 votes |
def main(opt):
    vocab = pickle.load(open(opt.vocab_path, 'rb'))
    num = len(vocab)
    print(num)
    model = KeyedVectors.load_word2vec_format(opt.embed_weight, binary=True)
    matrix_len = num
    weights_matrix = np.zeros((num, 300))
    words_found = 0
    mask = np.zeros(num, dtype=int)

    for i, word in enumerate(vocab.idx2word):
        try:
            weights_matrix[i] = model[vocab.idx2word[i]]
            words_found += 1
            mask[i] = 1
        except KeyError:
            weights_matrix[i] = np.random.normal(scale=0.1, size=(300,))

    print(words_found)
    np.save("./embed/f30kword2vec300dim_3.npy", weights_matrix)
    np.save("./embed/f30kword2vecmask_3.npy", mask)
Example #7
Source File: file_manage.py From resilient-community-apps with MIT License | 6 votes |
def get_summary_nlp(self):
    """
    Return a summary of an NLP model file
    :return:
    """
    ret = []
    try:
        word2vec = KeyedVectors.load_word2vec_format(self.filename, binary=False)
        mtime = self._get_mtime()
        dim_vectors = word2vec.vector_size
        word_count = len(word2vec.vectors)

        ret.append("---------------------------")
        ret.append("Summary for NLP model file:")
        ret.append("---------------------------")
        ret.append(self.FILE_NAME_OUTPUT.format(self.filename))
        ret.append(self.LAST_MODIFICATION_TIME.format(mtime))
        ret.append(self.FEATURE_DIMENSION.format(dim_vectors))
        ret.append(self.NUM_SENTENCES.format(word_count))
        ret.append("\n")
    except Exception as e:
        ret.append("Failed to read NLP model {}.".format(self.filename))
        ret.append("Error: {}".format(e))

    return ret
Example #8
Source File: np2vec.py From nlp-architect with Apache License 2.0 | 6 votes |
def load(cls, np2vec_model_file, binary=False, word_ngrams=0, word2vec_format=True):
    """
    Load the np2vec model.

    Args:
        np2vec_model_file (str): the file containing the np2vec model to load
        binary (bool): boolean indicating whether the np2vec model to load is in binary format
        word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with
            subword (ngrams) information.
        word2vec_format (bool): boolean indicating whether the model to load has been stored
            in original word2vec format.

    Returns:
        np2vec model to load
    """
    if word_ngrams == 0:
        if word2vec_format:
            return KeyedVectors.load_word2vec_format(np2vec_model_file, binary=binary)
        return KeyedVectors.load(np2vec_model_file, mmap="r")
    if word_ngrams == 1:
        return FastText.load(np2vec_model_file)
    logger.error("invalid value for 'word_ngrams'")
    return None
Example #9
Source File: embeddings.py From saber with MIT License | 6 votes |
def _prepare_embedding_index(self, binary=True):
    """Returns an embedding index for pre-trained token embeddings.

    For pre-trained word embeddings given at `self.filepath`, returns a dictionary mapping
    words to their embedding (an 'embedding index'). If `self.debug` is True, only the first
    ten thousand vectors are loaded.

    Args:
        binary (bool): True if pre-trained embeddings are in C binary format, False if they
            are in C text format. Defaults to True.

    Returns:
        Dictionary mapping words to pre-trained word embeddings, known as an
        'embedding index'.
    """
    limit = 10000 if self.__dict__.get("debug", False) else None
    vectors = KeyedVectors.load_word2vec_format(self.filepath, binary=binary, limit=limit)
    embedding_idx = {word: vectors[word] for word in vectors.vocab}

    return embedding_idx
Example #10
Source File: embed.py From verejne.digital with Apache License 2.0 | 6 votes |
def __init__(self, all_texts):
    # Creating the model
    print("Reading the pretrained model for Word2VecEmbedder")
    self.sk_model = KeyedVectors.load_word2vec_format(
        '/data/verejne/datautils/embedding_data/slovak.vec',
        encoding='utf-8',
        unicode_errors='ignore')
    print("Model contains", len(self.sk_model.vocab), "tokens")
    print(self.sk_model.similarity("mesto", "mesta"))
    self.dimension = len(self.sk_model["auto"])
    print("sídlisk" in self.sk_model)
    print("sídlisk".encode('utf8') in self.sk_model)
    print("Dimension of embedding of 'auto' is", self.dimension)

    # Create frequency table for words
    if all_texts is None:
        return
    for text in all_texts:
        self.add_text_to_corpus(text)
    self.print_corpus_stats()
Example #11
Source File: matcher.py From HIT-SCIR-CoNLL2019 with Apache License 2.0 | 5 votes |
def init_word2vec(filename, binary=False):
    global _word2vec
    _word2vec = KeyedVectors.load_word2vec_format(filename, binary=binary)
Example #12
Source File: contractions.py From pycontractions with BSD 3-Clause "New" or "Revised" License | 5 votes |
def load_models(self):
    """Attempt to find/load/download keyedvector model."""
    if self.kv_model is not None:
        if not hasattr(self.kv_model, 'wmdistance'):
            raise AttributeError("Model does not support Word Mover's Distance, must be in keyedvectors format")
    elif self.w2v_path is not None:
        if not os.path.exists(self.w2v_path):
            print("Word2Vec model not found at {}".format(self.w2v_path))
            sys.exit(1)
        try:
            self.kv_model = KeyedVectors.load_word2vec_format(self.w2v_path, binary=True)
        except:
            print("Error loading Word2Vec model")
            raise
    elif self.api_key is not None:
        try:
            self.kv_model = api.load(self.api_key)
        except:
            print("Error downloading model {}".format(self.api_key))
            raise
        if not hasattr(self.kv_model, 'wmdistance'):
            raise AttributeError("Model does not support Word Mover's Distance, must be in keyedvectors format")
    else:
        raise AttributeError("No model given")

    try:
        self.lc_tool = language_check.LanguageTool(self.lang_code)
    except:
        print("Error initializing LanguageTool")
        raise
Example #13
Source File: corpus.py From ner with Apache License 2.0 | 5 votes |
def load_embeddings(self, file_path):
    # Embeddings must be in fastText format, either binary (.bin) or word2vec text
    print('Loading embeddings...')
    if file_path.endswith('.bin'):
        from gensim.models.wrappers import FastText
        embeddings = FastText.load_fasttext_format(file_path)
    else:
        from gensim.models import KeyedVectors
        embeddings = KeyedVectors.load_word2vec_format(file_path)
    return embeddings
Example #14
Source File: nlp.py From bugbug with Mozilla Public License 2.0 | 5 votes |
def get_word_embeddings():
    word_embeddings = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subword.vec")
    word_embeddings.init_sims(replace=True)
    return word_embeddings
Example #15
Source File: NER_model.py From DeepPavlov with Apache License 2.0 | 5 votes |
def load_pretrained_word_emb(self, model_path, model_name, word_dim, word2id=None, vocab_size=None):
    loaded_words = 0
    if word2id is not None:
        vocab_size = len(word2id)
    word_embeddings = np.zeros(shape=(vocab_size, word_dim))

    if model_name == "glove":
        model = KeyedVectors.load_word2vec_format(model_path, binary=False)
        for word in word2id:
            if word in model:
                word_embeddings[word2id[word]] = model[word]
                loaded_words += 1
    elif model_name == "baomoi":
        model = KeyedVectors.load_word2vec_format(model_path, binary=True, unicode_errors='ignore')
        for word in word2id:
            if len(word) == 1:
                if word[0] in string.punctuation:
                    word_embeddings[word2id[word]] = model["<punct>"]
                    loaded_words += 1
                elif word.isdigit():
                    word_embeddings[word2id[word]] = model["<number>"]
                    loaded_words += 1
            elif word in model.vocab:
                word_embeddings[word2id[word]] = model[word]
                loaded_words += 1
    elif model_name is not None:
        raise RuntimeError(f'got an unexpected value for model_name: `{model_name}`')

    log.info(f"{loaded_words}/{vocab_size} words were loaded from {model_path}.")
    return word_embeddings
Example #16
Source File: glove_embedder.py From DeepPavlov with Apache License 2.0 | 5 votes |
def load(self) -> None:
    """
    Load dict of embeddings from given file
    """
    log.info(f"[loading GloVe embeddings from `{self.load_path}`]")
    if not self.load_path.exists():
        log.warning(f'{self.load_path} does not exist, cannot load embeddings from it!')
        return
    self.model = KeyedVectors.load_word2vec_format(str(self.load_path))
    self.dim = self.model.vector_size
Example #17
Source File: nlp_word2vec.py From resilient-community-apps with MIT License | 5 votes |
def load_model(self, file_name=None):
    """
    Load a saved model
    :param file_name: [optional] model file. Use default if None
    :return:
    """
    model_file = file_name if file_name else FileManage.DEFAULT_NLP_FILE
    try:
        self.word2vec = KeyedVectors.load_word2vec_format(model_file, binary=False)
    except Exception as e:
        self.log.error("Failed to load a saved model {}".format(model_file))
Example #18
Source File: feature_engineering.py From CIKM-AnalytiCup-2018 with Apache License 2.0 | 5 votes |
def __init__(self, train_df, test_df, unlabeled_df, data_loader, normalization=True):
    self.train_df = train_df
    self.test_df = test_df
    self.unlabeled_df = unlabeled_df
    self.data_loader = data_loader
    self.stopwords = self.data_loader.load_stopwords()

    self.train_df['splited_spn_1'] = self.train_df['spn_1'].apply(lambda v: v.split())
    self.train_df['splited_spn_2'] = self.train_df['spn_2'].apply(lambda v: v.split())
    self.unlabeled_df['splited_spn_1'] = self.unlabeled_df['spn_1'].apply(lambda v: v.split())
    self.test_df['splited_spn_1'] = self.test_df['spn_1'].apply(lambda v: v.split())
    self.test_df['splited_spn_2'] = self.test_df['spn_2'].apply(lambda v: v.split())

    self.normalization = normalization

    docs = self.train_df['splited_spn_1'].values.tolist() + self.train_df['splited_spn_2'].values.tolist() + \
        self.test_df['splited_spn_1'].values.tolist() + self.test_df['splited_spn_2'].values.tolist() + \
        self.unlabeled_df['splited_spn_1'].tolist()
    docs = np.array(docs)
    docs = np.unique(docs)
    docs = docs.tolist()

    docs_raw = self.train_df['spn_1'].values.tolist() + self.train_df['spn_2'].values.tolist() + \
        self.test_df['spn_1'].values.tolist() + self.test_df['spn_2'].values.tolist() + \
        self.unlabeled_df['spn_1'].tolist()
    docs_raw = np.array(docs_raw)
    docs_raw = np.unique(docs_raw)
    docs_raw = docs_raw.tolist()

    self.tfidf_vectorizer = TfidfVectorizer()
    self.tfidf_vectorizer.fit(docs_raw)

    self.bm25_scorer = bm25.bm25Scorer(docs=docs)

    # print("[FE] Loading the word2vec model")
    # self.word2vec_model = KeyedVectors.load_word2vec_format(dataset_config.SPANISH_WORDVEC_PATH)
    # self.word2vec_model.init_sims(replace=True)
    # print("[FE] Loaded the word2vec model")

    self.build_statistic()
Example #19
Source File: models.py From caml-mimic with MIT License | 5 votes |
def _code_emb_init(self, code_emb, dicts):
    code_embs = KeyedVectors.load_word2vec_format(code_emb)
    weights = np.zeros(self.final.weight.size())
    for i in range(self.Y):
        code = dicts['ind2c'][i]
        weights[i] = code_embs[code]
    self.U.weight.data = torch.Tensor(weights).clone()
    self.final.weight.data = torch.Tensor(weights).clone()
Example #20
Source File: models.py From caml-mimic with MIT License | 5 votes |
def _code_emb_init(self, code_emb, dicts):
    code_embs = KeyedVectors.load_word2vec_format(code_emb)
    weights = np.zeros(self.final.weight.size())
    for i in range(self.Y):
        code = dicts['ind2c'][i]
        weights[i] = code_embs[code]
    self.final.weight.data = torch.Tensor(weights).clone()
Example #21
Source File: glove.py From nlp-recipes with MIT License | 5 votes |
def load_pretrained_vectors(dir_path, file_name="glove.840B.300d.txt", limit=None):
    """
    Method that loads gloVe vectors. Downloads if it doesn't exist.

    Args:
        file_name(str): Name of the gloVe file.
        dir_path(str): Path to the directory where gloVe vectors exist or will be downloaded.
        limit(int): Number of word vectors that is loaded from gensim. This option
            allows us to save RAM space and avoid memory errors.

    Returns:
        gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word2vectors
    """
    file_path = _maybe_download_and_extract(dir_path, file_name)
    tmp_file = get_tmpfile("test_word2vec.txt")

    # Convert GloVe format to word2vec
    _ = glove2word2vec(file_path, tmp_file)
    model = KeyedVectors.load_word2vec_format(tmp_file, limit=limit)
    os.remove(tmp_file)

    return model
Example #22
Source File: document_embedder.py From fake-news-detection-pipeline with Apache License 2.0 | 5 votes |
def _set_word2vec(self):
    if self.pretrained is None:
        raise ValueError("Pretrained word2vec path is not specified during instantiation")
    self._w2v = KeyedVectors.load_word2vec_format(self.pretrained, binary=True)
Example #23
Source File: vectorizer.py From medaCy with GNU General Public License v3.0 | 5 votes |
def load_word_embeddings(self, embeddings_file):
    """Uses self.word_embeddings_file and gensim to load word embeddings into memory.

    :param embeddings_file: Word embeddings file to use. Can be .bin or other common formats.
    """
    is_binary = embeddings_file.endswith('.bin')
    word_vectors = KeyedVectors.load_word2vec_format(embeddings_file, binary=is_binary)
    self.word_vectors = word_vectors
Example #24
Source File: config.py From BREDS with GNU Lesser General Public License v3.0 | 5 votes |
def read_word2vec(self):
    print("Loading word2vec model ...\n")
    self.word2vec = KeyedVectors.load_word2vec_format(self.word2vecmodelpath, binary=True)
    self.vec_dim = self.word2vec.vector_size
    print(self.vec_dim, "dimensions")
Example #25
Source File: auxiliary_word2vec.py From ZeroShotVideoClassification with Apache License 2.0 | 5 votes |
def load_word2vec():
    try:
        wv_model = Word2Vec.load('/workplace/GoogleNews', mmap='r')
    except:
        wv_model = Word2Vec.load_word2vec_format(
            '/workplace/GoogleNews-vectors-negative300.bin', binary=True)
        wv_model.init_sims(replace=True)
        wv_model.save('assets/GoogleNews')
    return wv_model
Example #26
Source File: wordembed.py From PyShortTextCategorization with MIT License | 5 votes |
def load_word2vec_model(path, binary=True):
    """ Load a pre-trained Word2Vec model.

    :param path: path of the file of the pre-trained Word2Vec model
    :param binary: whether the file is in binary format (Default: True)
    :return: a pre-trained Word2Vec model
    :type path: str
    :type binary: bool
    :rtype: gensim.models.keyedvectors.KeyedVectors
    """
    return KeyedVectors.load_word2vec_format(path, binary=binary)
Example #27
Source File: wordembed.py From PyShortTextCategorization with MIT License | 5 votes |
def load_poincare_model(path, word2vec_format=True, binary=False):
    """ Load a Poincare embedding model.

    :param path: path of the file of the pre-trained Poincare embedding model
    :param word2vec_format: whether to load from word2vec format (default: True)
    :param binary: binary format (default: False)
    :return: a pre-trained Poincare embedding model
    :type path: str
    :type word2vec_format: bool
    :type binary: bool
    :rtype: gensim.models.poincare.PoincareKeyedVectors
    """
    if word2vec_format:
        return PoincareKeyedVectors.load_word2vec_format(path, binary=binary)
    else:
        return PoincareModel.load(path).kv
Example #28
Source File: utils.py From Text-Classification-Models-Pytorch with MIT License | 5 votes |
def get_word_embeddings(w2vfile, word_to_index, embedsize=300):
    '''
    For each word in our vocabulary, get the word2vec encoding of the word

    Inputs:
        w2vfile (string) : Path to the file containing (pre-trained) word embeddings
        embedsize (int) : Length of each word vector

    Returns:
        word_embeddings : Dictionary mapping each word to corresponding embedding
    '''
    word_embeddings = {}
    if w2vfile.endswith('.txt'):
        f = open(w2vfile)
        for line in tqdm(f):
            values = line.split(" ")
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            if word in word_to_index:
                word_embeddings[word] = coefs
        f.close()
    elif w2vfile.endswith('.bin'):
        word2vec = KeyedVectors.load_word2vec_format(w2vfile, binary=True, limit=1000000)
        for word in tqdm(word_to_index):
            try:
                word_embeddings[word] = word2vec[word.lower()]
            except KeyError:
                pass
    else:
        print('Can\'t load word embeddings.')
        exit(-1)

    print('Found {0}/{1} word vectors.'.format(len(word_embeddings), len(word_to_index)))
    if len(word_to_index) > len(word_embeddings):
        print('Initializing remaining {} word vectors with zeros.'.format(len(word_to_index) - len(word_embeddings)))
        for word in word_to_index:
            if word not in word_embeddings:
                word_embeddings[word] = np.zeros((embedsize,))
    return word_embeddings
Example #29
Source File: utils.py From Text-Classification-Models-Pytorch with MIT License | 5 votes |
def get_word_embeddings(w2vfile, word_to_index, embedsize=300):
    '''
    For each word in our vocabulary, get the word2vec encoding of the word

    Inputs:
        w2vfile (string) : Path to the file containing (pre-trained) word embeddings
        embedsize (int) : Length of each word vector

    Returns:
        word_embeddings : Dictionary mapping each word to corresponding embedding
    '''
    word_embeddings = {}
    if w2vfile.endswith('.txt'):
        f = open(w2vfile)
        for line in tqdm(f):
            values = line.split(" ")
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            if word in word_to_index:
                word_embeddings[word] = coefs
        f.close()
    elif w2vfile.endswith('.bin'):
        word2vec = KeyedVectors.load_word2vec_format(w2vfile, binary=True, limit=1000000)
        for word in tqdm(word_to_index):
            try:
                word_embeddings[word] = word2vec[word.lower()]
            except KeyError:
                pass
    else:
        print('Can\'t load word embeddings.')
        exit(-1)

    print('Found {0}/{1} word vectors.'.format(len(word_embeddings), len(word_to_index)))
    if len(word_to_index) > len(word_embeddings):
        print('Initializing remaining {} word vectors with zeros.'.format(len(word_to_index) - len(word_embeddings)))
        for word in word_to_index:
            if word not in word_embeddings:
                word_embeddings[word] = np.zeros((embedsize,))
    return word_embeddings
Example #30
Source File: word2vec.py From nlp-journey with Apache License 2.0 | 5 votes |
def load_text(self):
    try:
        model = KeyedVectors.load_word2vec_format(self.model_path, self.vocab_path, binary=False)
    except FileNotFoundError:
        model = None
    return model