Python gensim.models.Word2Vec.load() Examples
The following are 30 code examples of gensim.models.Word2Vec.load(), drawn from open-source projects. The originating project, source file, and license are noted above each example.
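Before the project examples, here is a minimal sketch of the typical save/load round trip. The toy corpus, file name, and parameter choices below are illustrative assumptions rather than code from any of the listed projects.

from gensim.models import Word2Vec

# Toy corpus and file name, assumed purely for illustration.
sentences = [["hello", "world"], ["machine", "learning", "with", "gensim"]]

# Train a small model and persist it in gensim's native format.
model = Word2Vec(sentences, min_count=1)
model.save("example_word2vec.model")

# Word2Vec.load() restores a model saved with .save(). Models stored in the
# plain word2vec text/binary format require load_word2vec_format() instead
# (available on KeyedVectors in recent gensim releases).
loaded = Word2Vec.load("example_word2vec.model")
print(loaded.wv.most_similar("hello", topn=1))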
Example #1
Source File: data_utils.py From CCKS2019-Chinese-Clinical-NER with MIT License | 6 votes |
def get_embedding_matrix(model_filepath, word2id):
    """
    Get the embedding matrix of the word2vec model
    :param model_filepath: the file path to the pre-built word2vec model
    :param word2id: the dictionary mapping from word to id
    :return: the embedding matrix of the word2vec model
    """
    word2vec_model = Word2Vec.load(model_filepath)
    embeddings_dict = __get_embedding_dict(model_filepath)
    embedding_matrix = np.zeros((len(word2id) + 1, word2vec_model.vector_size))
    for word, idx in word2id.items():
        embedding_vector = embeddings_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector
    return embedding_matrix
Example #2
Source File: word2vec_helpers.py From chinese-text-classification-with-cnn-tf with Apache License 2.0 | 6 votes |
def embedding_sentences(sentences, embedding_size = 128, window = 5, min_count = 5, file_to_load = None, file_to_save = None):
    if file_to_load is not None:
        w2vModel = Word2Vec.load(file_to_load)
    else:
        w2vModel = Word2Vec(sentences, size = embedding_size, window = window, min_count = min_count,
                            workers = multiprocessing.cpu_count())
        if file_to_save is not None:
            w2vModel.save(file_to_save)
    all_vectors = []
    embeddingDim = w2vModel.vector_size
    embeddingUnknown = [0 for i in range(embeddingDim)]
    for sentence in sentences:
        this_vector = []
        for word in sentence:
            if word in w2vModel.wv.vocab:
                this_vector.append(w2vModel[word])
            else:
                this_vector.append(embeddingUnknown)
        all_vectors.append(this_vector)
    return all_vectors
Example #3
Source File: word_eval.py From embedding with MIT License | 6 votes |
def load_vectors(self, vecs_fname, method):
    if method == "word2vec":
        model = Word2Vec.load(vecs_fname)
        words = model.wv.index2word
        vecs = model.wv.vectors
    else:
        words, vecs = [], []
        with open(vecs_fname, 'r', encoding='utf-8') as f:
            if "fasttext" in method:
                next(f)  # skip head line
            for line in f:
                if method == "swivel":
                    splited_line = line.strip().split("\t")
                else:
                    splited_line = line.strip().split(" ")
                words.append(splited_line[0])
                vec = [float(el) for el in splited_line[1:]]
                vecs.append(vec)
    unit_vecs = normalize(vecs, norm='l2', axis=1)
    dictionary = {}
    for word, vec in zip(words, unit_vecs):
        dictionary[word] = vec
    return dictionary, words, unit_vecs
Example #4
Source File: word_utils.py From embedding with MIT License | 6 votes |
def load_word_embeddings(self, vecs_fname, method):
    if method == "word2vec":
        model = Word2Vec.load(vecs_fname)
        words = model.wv.index2word
        vecs = model.wv.vectors
    else:
        words, vecs = [], []
        with open(vecs_fname, 'r', encoding='utf-8') as f1:
            if "fasttext" in method:
                next(f1)  # skip head line
            for line in f1:
                if method == "swivel":
                    splited_line = line.replace("\n", "").strip().split("\t")
                else:
                    splited_line = line.replace("\n", "").strip().split(" ")
                words.append(splited_line[0])
                vec = [float(el) for el in splited_line[1:]]
                vecs.append(vec)
    return words, vecs
Example #5
Source File: utils.py From mat2vec with MIT License | 6 votes |
def compute_epoch_accuracies(root, prefix, analogy_file):
    filenames = glob.glob(os.path.join(root, prefix + "_epoch*.model"))
    nr_epochs = len(filenames)
    accuracies = dict()
    losses = [0] * nr_epochs
    for filename in filenames:
        epoch = int(re.search("\d+\.model", filename).group()[:-6])
        m = Word2Vec.load(filename)
        losses[epoch] = m.get_latest_training_loss()
        sections = m.wv.accuracy(analogy_file)
        for sec in sections:
            if sec["section"] not in accuracies:
                accuracies[sec["section"]] = [0] * nr_epochs
            correct, incorrect = len(sec["correct"]), len(sec["incorrect"])
            if incorrect > 0:
                accuracy = correct / (correct + incorrect)
            else:
                accuracy = 0
            accuracies[sec["section"]][epoch] = (correct, incorrect, accuracy)
    save_obj(accuracies, os.path.join("models", prefix + "_accuracies"))
    save_obj(np.concatenate([np.array([losses[0]]), np.diff(losses)]),
             os.path.join("models", prefix + "_loss"))
Example #6
Source File: main.py From nonce2vec with MIT License | 6 votes |
def _check_men(args):
    """Check embeddings quality.

    Calculate correlation with the similarity ratings in the MEN dataset.
    """
    logger.info('Checking embeddings quality against MEN similarity ratings')
    logger.info('Loading word2vec model...')
    model = Word2Vec.load(args.w2v_model)
    logger.info('Model loaded')
    system_actual = []
    # This is needed because we may not be able to calculate cosine for
    # all pairs
    human_actual = []
    count = 0
    for (first, second), human in Samples(source='men', shuffle=False):
        if first not in model.wv.vocab or second not in model.wv.vocab:
            logger.error('Could not find one of more pair item in model '
                         'vocabulary: {}, {}'.format(first, second))
            continue
        sim = _cosine_similarity(model.wv[first], model.wv[second])
        system_actual.append(sim)
        human_actual.append(human)
        count += 1
    spr = _spearman(human_actual, system_actual)
    logger.info('SPEARMAN: {} calculated over {} items'.format(spr, count))
Example #7
Source File: word2vec_helpers.py From question-classification-cnn-rnn-attention with Apache License 2.0 | 6 votes |
def __init__(self, test_model=False, verify_model=True):
    model = Word2Vec.load(modelfile)
    if(test_model):
        acc = model.accuracy(questionfile)
        logger.info("Test model " + modelfile + " in " + questionfile)
    self.vector_size = model.vector_size
    self.vocab_size = len(model.wv.vocab) + 1
    self.word2index = self.GetWord2Index(model)
    self.index2word = self.GetIndex2Word(model)
    self.wordvector = self.GetWordVector(model)
    if(verify_model):
        logger.info("Verifing imported word2vec model")
        random_state = check_random_state(12)
        check_index = random_state.randint(low=0, high=self.vocab_size-2, size=1000)
        for index in check_index:
            word_wv = model.wv.index2word[index]
            word_our = self.index2word[index+1]
            # print(index, word_wv, word_our)
            assert word_wv == word_our
            assert model.wv.vocab[word_our].index == self.word2index[word_our] - 1
            assert np.array_equal(model.wv[word_our], self.wordvector[self.word2index[word_our]])
        logger.info("Imported word2vec model is verified")
Example #8
Source File: word2vec_helpers.py From question-classification-cnn-rnn-attention with Apache License 2.0 | 6 votes |
def __init__(self, test_model=False, verify_model=True):
    model = Word2Vec.load(modelfile)
    if(test_model):
        acc = model.accuracy(questionfile)
        logger.info("Test model " + modelfile + " in " + questionfile)
    self.vector_size = model.vector_size
    self.vocab_size = len(model.wv.vocab) + 1
    self.word2index = self.GetWord2Index(model)
    self.index2word = self.GetIndex2Word(model)
    self.wordvector = self.GetWordVector(model)
    if(verify_model):
        logger.info("Verifing imported word2vec model")
        random_state = check_random_state(12)
        check_index = random_state.randint(low=0, high=self.vocab_size-2, size=1000)
        for index in check_index:
            word_wv = model.wv.index2word[index]
            word_our = self.index2word[index+1]
            # print(index, word_wv, word_our)
            assert word_wv == word_our
            assert model.wv.vocab[word_our].index == self.word2index[word_our] - 1
            assert np.array_equal(model.wv[word_our], self.wordvector[self.word2index[word_our]])
        logger.info("Imported word2vec model is verified")
Example #9
Source File: word2vec_helpers.py From question-classification-cnn-rnn-attention with Apache License 2.0 | 6 votes |
def __init__(self, test_model=False, verify_model=True):
    model = Word2Vec.load(modelfile)
    if(test_model):
        acc = model.accuracy(questionfile)
        logger.info("Test model " + modelfile + " in " + questionfile)
    self.vector_size = model.vector_size
    self.vocab_size = len(model.wv.vocab) + 1
    self.word2index = self.GetWord2Index(model)
    self.index2word = self.GetIndex2Word(model)
    self.wordvector = self.GetWordVector(model)
    if(verify_model):
        logger.info("Verifing imported word2vec model")
        random_state = check_random_state(12)
        check_index = random_state.randint(low=0, high=self.vocab_size-2, size=1000)
        for index in check_index:
            word_wv = model.wv.index2word[index]
            word_our = self.index2word[index+1]
            # print(index, word_wv, word_our)
            assert word_wv == word_our
            assert model.wv.vocab[word_our].index == self.word2index[word_our] - 1
            assert np.array_equal(model.wv[word_our], self.wordvector[self.word2index[word_our]])
        logger.info("Imported word2vec model is verified")
Example #10
Source File: train_word2vec.py From text-classification with Apache License 2.0 | 5 votes |
def demo():
    model = Word2Vec.load(config['model_file'])
    print("Provide three testing modes\n")
    print("Input a word, return 10 most similar words")
    print("Input two words, return their cosine similarity")
    print("Input three words, return the inference word")

    while True:
        try:
            query = input()
            q_list = query.split()

            if len(q_list) == 1:
                print("The 10 most similar words:")
                res = model.most_similar(q_list[0], topn = 10)
                for item in res:
                    print(item[0] + "," + str(item[1]))

            elif len(q_list) == 2:
                print("Cosine similarity:")
                res = model.similarity(q_list[0], q_list[1])
                print(res)

            else:
                print("%s to %s, is like %s to " % (q_list[0], q_list[2], q_list[1]))
                res = model.most_similar([q_list[0], q_list[1]], [q_list[2]], topn = 10)
                for item in res:
                    print(item[0] + "," + str(item[1]))
            print("----------------------------")
        except Exception as e:
            print(repr(e))
Example #11
Source File: wordtwovec.py From aristo-mini with Apache License 2.0 | 5 votes |
def __init__(self, model_file: str) -> None:
    if model_file.endswith(".bin"):
        self.model = Word2Vec.load_word2vec_format(model_file, binary=True)
    else:
        self.model = Word2Vec.load(model_file)
Example #12
Source File: SCDV.py From SCDV with MIT License | 5 votes |
def read_GMM(idx_name, idx_proba_name):
    # Loads cluster assignments and probability of cluster assignments.
    idx = joblib.load(idx_name)
    idx_proba = joblib.load(idx_proba_name)
    print("Cluster Model Loaded...")
    return (idx, idx_proba)
Example #13
Source File: TopicCoherence.py From SCDV with MIT License | 5 votes |
def read_GMM(idx_name, idx_proba_name):
    # Loads cluster assignments and probability of cluster assignments.
    idx = joblib.load(idx_name)
    idx_proba = joblib.load(idx_proba_name)
    print("Cluster Model Loaded...")
    return (idx, idx_proba)
Example #14
Source File: SCDV.py From SCDV with MIT License | 5 votes |
def read_GMM(idx_name, idx_proba_name):
    # Loads cluster assignments and probability of cluster assignments.
    idx = joblib.load(idx_name)
    idx_proba = joblib.load(idx_proba_name)
    print("Cluster Model Loaded...")
    return (idx, idx_proba)
Example #15
Source File: SCDV.py From SCDV with MIT License | 5 votes |
def read_GMM(idx_name, idx_proba_name):
    # Loads cluster assignments and probability of cluster assignments.
    idx = joblib.load(idx_name)
    idx_proba = joblib.load(idx_proba_name)
    print("Cluster Model Loaded...")
    return (idx, idx_proba)
Example #16
Source File: load_data.py From pynlp with MIT License | 5 votes |
def load_char_word_static_data(file, data_size=None):
    model = Word2Vec.load('../output/word2vec/word2vec.model')

    path = os.path.join(os.path.dirname(__file__), file)
    df = pd.read_csv(path)
    p = df['sentence1'].values[0:data_size]
    h = df['sentence2'].values[0:data_size]
    label = df['label'].values[0:data_size]

    p, h, label = shuffle(p, h, label)

    p_c_index, h_c_index = char_index(p, h)

    p_seg = map(lambda x: list(jieba.cut(x)), p)
    h_seg = map(lambda x: list(jieba.cut(x)), h)

    p_w_vec = list(map(lambda x: w2v(x, model), p_seg))
    h_w_vec = list(map(lambda x: w2v(x, model), h_seg))

    p_w_vec = list(map(lambda x: w2v_process(x), p_w_vec))
    h_w_vec = list(map(lambda x: w2v_process(x), h_w_vec))

    return p_c_index, h_c_index, p_w_vec, h_w_vec, label

# Load the training data with char_index and dynamic word vectors
Example #17
Source File: model.py From cakechat with Apache License 2.0 | 5 votes |
def _get_w2v_model(corpus_name,
                   voc_size,
                   model_resolver_factory=None,
                   tokenized_lines=None,
                   vec_size=TOKEN_REPRESENTATION_SIZE,
                   window_size=W2V_WINDOW_SIZE,
                   skip_gram=USE_SKIP_GRAM):
    _logger.info('Getting w2v model')

    model_path = get_w2v_model_path(corpus_name, voc_size, vec_size, window_size, skip_gram)
    model_resolver = model_resolver_factory(model_path) if model_resolver_factory else DummyFileResolver(model_path)

    if not model_resolver.resolve():
        if not tokenized_lines:
            raise ModelLoaderException(
                'Tokenized corpus "{}" was not provided, so w2v model can\'t be trained.'.format(corpus_name))

        # bin model is not present on the disk, so get it
        model = _train_model(tokenized_lines, voc_size, vec_size, window_size, skip_gram)
        _save_model(model, model_path)
    else:
        # bin model is on the disk, load it
        model = _load_model(model_path)

    _logger.info('Successfully got w2v model\n')

    return model
Example #18
Source File: Step6_all_feature_extract.py From resume_job_matching with Apache License 2.0 | 5 votes |
def getAllFeatures(train, mapper):
    print "this is getAllFeatures"
    # every record has a cluster value calculated by lda
    w2c_f, w2c_w = 10, 14
    lda_dict_1 = util.read_dict(util.features_prefix + 'id_lda_256.pkl')
    lda_dict_2 = util.read_dict(util.features_prefix + 'id_lda_512.pkl')
    k_mean_dict_1 = util.read_dict(util.features_prefix + 'c_k_all_64.pkl')
    k_mean_dict_2 = util.read_dict(util.features_prefix + 'c_k_all_128.pkl')
    sentence_dict_path = util.txt_prefix + 'id_sentences.pkl'
    word2vec_path = util.txt_prefix + str(w2c_f) + 'features_1minwords_' + str(w2c_w) + 'context.pkl'
    sentence_dic = util.read_dict(sentence_dict_path)
    model = Word2Vec.load(word2vec_path)
    train_X = train[features]
    train_X = mapper.transform(train_X)  # .values
    new_train_X = []
    for i in xrange(len(train_X)):
        id = train_X[i][0]
        lda_1 = lda_dict_1[id]
        lda_2 = lda_dict_2[id]
        s = sentence_dic.get(id)
        f = np.concatenate(([train_X[i][1:].astype(np.float32)],
                            [sentence_to_matrix_vec(s, model, w2c_f, k_mean_dict_1, k_mean_dict_2)]), axis=1)[0]
        f = np.concatenate(([f], [[lda_1, lda_2]]), axis=1)[0]
        new_train_X.append(f)
    new_train_X = np.array(new_train_X)
    return new_train_X
Example #19
Source File: train_word2vec.py From text-classification with Apache License 2.0 | 5 votes |
def segment():
    # jieba custom setting.
    DATA_DIR = os.getcwd() + '/data/user_dict'
    jieba.load_userdict(os.path.join(DATA_DIR, 'Company.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Concept.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Consumer.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Holder.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'HoldingCompany.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'MainComposition.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Manager.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Material.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'OtherCompetitor.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Supplier.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Finance.txt'))

    # load stopwords set
    stopword_set = set()
    with open(os.getcwd() + '/data/user_dict/stopWord.txt', 'r', encoding='utf-8') as stopwords:
        for stopword in stopwords:
            stopword_set.add(stopword.strip('\n'))

    output = open(config['input_seg'], 'w', encoding='utf-8')
    with open(config['input_raw'], 'r', encoding='utf-8') as content:
        for texts_num, line in enumerate(content):
            line = line.strip('\n')
            words = jieba.cut(line, cut_all=False)
            for word in words:
                if word not in stopword_set:
                    output.write(word + ' ')
            output.write('\n')
            if (texts_num + 1) % 10000 == 0:
                logging.info("Segmented %d th articles" % (texts_num + 1))
    output.close()
Example #20
Source File: similarity.py From 4lang with MIT License | 5 votes |
def get_vec_sim(self):
    model_fn = self.config.get('vectors', 'model')
    model_type = self.config.get('vectors', 'model_type')
    logging.warning('Loading model: {0}'.format(model_fn))
    if model_type == 'word2vec':
        self.vec_model = Word2Vec.load_word2vec_format(model_fn, binary=True)
    elif model_type == 'gensim':
        self.vec_model = Word2Vec.load(model_fn)
    else:
        raise Exception('Unknown LSA model format')
    logging.warning('Model loaded: {0}'.format(model_fn))
Example #21
Source File: evaluate.py From embeddings with Apache License 2.0 | 5 votes |
def evaluate(filename, fname):
    counter = 0
    top3Counter = 0
    top5c = 0
    linec = 0
    model = Word2Vec.load(fname)
    questions = []
    with open(filename) as test:
        for line in test:
            questions.append(line)
    for line in questions:
        if line.startswith(':'):
            try:
                print('Accuracy: ' + str(100*counter/linec) + '\n')
            except:
                pass
            print("Evaluating " + line.rstrip('\n'))
        else:
            try:
                linec += 1
                source, target, question, answer = line.encode('utf-8').decode('utf-8').lower().rstrip('\n').replace('(', '').replace(')', '').split(' ')
                # word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
                result = model.wv.most_similar(positive=[target, question], negative=[source])
                if result[0][0] == answer:
                    counter += 1
                print('Accuracy: ' + str(100*counter/linec), end = '\r')
                for i in range(0, 3):
                    if result[i][0] == answer:
                        top3Counter += 1
                        break
                for i in range(0, 10):
                    if result[i][0] == answer:
                        top5c += 1
                        break
            except:
                pass
    print('Correct guess: ' + str(100 * counter / linec))
    print('Correct guess in top 3:' + str(100 * top3Counter / linec))
    print('Correct guess in top 10:' + str(100 * top5c / linec))
    return None
Example #22
Source File: load_data.py From text_matching with Apache License 2.0 | 5 votes |
def load_char_word_static_data(file, data_size=None):
    model = Word2Vec.load('../output/word2vec/word2vec.model')

    path = os.path.join(os.path.dirname(__file__), file)
    df = pd.read_csv(path)
    p = df['sentence1'].values[0:data_size]
    h = df['sentence2'].values[0:data_size]
    label = df['label'].values[0:data_size]

    p, h, label = shuffle(p, h, label)

    p_c_index, h_c_index = char_index(p, h)

    p_seg = map(lambda x: list(jieba.cut(x)), p)
    h_seg = map(lambda x: list(jieba.cut(x)), h)

    p_w_vec = list(map(lambda x: w2v(x, model), p_seg))
    h_w_vec = list(map(lambda x: w2v(x, model), h_seg))

    p_w_vec = list(map(lambda x: w2v_process(x), p_w_vec))
    h_w_vec = list(map(lambda x: w2v_process(x), h_w_vec))

    return p_c_index, h_c_index, p_w_vec, h_w_vec, label

# Load the training data with char_index and dynamic word vectors
Example #23
Source File: word2vec_helpers.py From DetectMaliciousURL with Apache License 2.0 | 5 votes |
def embedding_sentences(sentences, embedding_size = 128, window = 5, min_count = 5, file_to_load = None, file_to_save = None):
    '''
    embedding_size: word embedding dimensionality
    window: context window size
    min_count: words occurring fewer than min_count times are dropped
    '''
    if file_to_load is not None:
        w2vModel = Word2Vec.load(file_to_load)
    else:
        w2vModel = Word2Vec(sentences, size = embedding_size, window = window, min_count = min_count,
                            workers = multiprocessing.cpu_count())
        if file_to_save is not None:
            w2vModel.save(file_to_save)
    all_vectors = []
    embeddingDim = w2vModel.vector_size  # embedding dimensionality
    embeddingUnknown = [0 for i in range(embeddingDim)]
    for sentence in sentences:
        this_vector = []
        for word in sentence:
            if word in w2vModel.wv.vocab:
                this_vector.append(w2vModel[word])
            else:
                this_vector.append(embeddingUnknown)
        all_vectors.append(this_vector)
    return all_vectors
Example #24
Source File: word2vec_helpers.py From DetectMaliciousURL with Apache License 2.0 | 5 votes |
def embedding_sentences(sentences, embedding_size = 128, window = 5, min_count = 5, file_to_load = None, file_to_save = None):
    '''
    embedding_size: word embedding dimensionality
    window: context window size
    min_count: words occurring fewer than min_count times are dropped
    '''
    if file_to_load is not None:
        w2vModel = Word2Vec.load(file_to_load)
    else:
        w2vModel = Word2Vec(sentences, size = embedding_size, window = window, min_count = min_count,
                            workers = multiprocessing.cpu_count())
        if file_to_save is not None:
            w2vModel.save(file_to_save)
    all_vectors = []
    embeddingDim = w2vModel.vector_size  # embedding dimensionality
    embeddingUnknown = [0 for i in range(embeddingDim)]
    for sentence in sentences:
        this_vector = []
        for word in sentence:
            if word in w2vModel.wv.vocab:
                this_vector.append(w2vModel[word])
            else:
                this_vector.append(embeddingUnknown)
        all_vectors.append(this_vector)
    return all_vectors
Example #25
Source File: informativeness.py From nonce2vec with MIT License | 5 votes |
def __init__(self, model_path, sum_filter=None, sum_thresh=None,
             train_filter=None, train_thresh=None, sort_by=None):
    """Initialize the Informativeness instance.

    Args:
        model_path (str): The absolute path to the gensim w2v CBOW model.
        sum_filter (str): Filter for the sum initialization phase.
        sum_thresh (int): Threshold for sum filter (self and cwi filters only).
        train_filter (str): Filter for the training phase.
        train_thresh (int): Threshold for the train filter (self and cwi filters only).
        sort_by (str): Sort context items in asc or desc of cwi values before training.
    """
    self._sum_filter = sum_filter
    if sum_filter and sum_filter != 'random' and sum_thresh is None:
        raise Exception('Setting sum_filter as \'{}\' requires specifying '
                        'a threshold parameter'.format(sum_filter))
    self._sum_thresh = sum_thresh
    self._train_filter = train_filter
    if train_filter and train_filter != 'random' and train_thresh is None:
        raise Exception('Setting train_filter as \'{}\' requires '
                        'specifying a threshold parameter'
                        .format(train_filter))
    self._train_thresh = train_thresh
    self._model = Word2Vec.load(model_path)
    self._sort_by = sort_by
Example #26
Source File: model.py From cakechat with Apache License 2.0 | 5 votes |
def _load_model(model_path):
    _logger.info('Loading model from {}'.format(model_path))
    model = Word2Vec.load(model_path, mmap='r')
    _logger.info('Model "{}" has been loaded.'.format(os.path.basename(model_path)))
    return model
Example #27
Source File: data_utils.py From CCKS2019-Chinese-Clinical-NER with MIT License | 5 votes |
def load_vocab(vocab_filepath):
    """
    Load the dictionary mapping from word to id
    :param vocab_filepath: the file path to the pre-built dictionary
    :return: the dictionary mapping from word to id
    """
    with open(vocab_filepath, "rb") as fr:
        word2id = pickle.load(fr)
    return word2id
Example #28
Source File: main.py From IDEA with MIT License | 5 votes |
def build_phrase(doc):
    # load phrase model
    return trigram[bigram[doc]]
Example #29
Source File: main.py From IDEA with MIT License | 5 votes |
def load_phrase():
    global bigram
    global trigram
    bigram = Phrases.load(os.path.join("..", "model", "bigram.model"))
    trigram = Phrases.load(os.path.join("..", "model", "trigram.model"))
Example #30
Source File: main.py From IDEA with MIT License | 5 votes |
def load_obj(filename):
    with open(filename) as fin:
        return cPickle.load(fin)