Python gensim.models.Word2Vec.load() Examples

The following are 25 code examples of gensim.models.Word2Vec.load(), collected from open-source projects. The originating project and source file are noted above each example. You may also want to check out all available functions and classes of the gensim.models.Word2Vec module.
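Most of these examples target the pre-4.0 gensim API, where the Word2Vec constructor takes size= and the vocabulary is exposed as model.wv.vocab; in gensim 4+ these became vector_size= and model.wv.key_to_index. As a minimal sketch of the save/load round trip the examples rely on (gensim 3.x keyword names, hypothetical file name):

from gensim.models import Word2Vec

# Toy corpus: a list of already-tokenized sentences.
sentences = [["hello", "world"], ["machine", "learning", "is", "fun"]]

# Train a small model (use vector_size= instead of size= on gensim 4+).
model = Word2Vec(sentences, size=100, window=5, min_count=1, workers=2)

# Persist in gensim's native format and restore it with Word2Vec.load().
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")
print(model.wv["hello"].shape)  # (100,)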
Example #1
Source File: data_utils.py    From CCKS2019-Chinese-Clinical-NER with MIT License
def get_embedding_matrix(model_filepath, word2id):
    """
    Get the embedding matrix of the word2vec model
    :param model_filepath: the file path to the pre-build word2vec model
    :param word2id: the directory mapping from word to id
    :return: the embedding matrix of the word2vec model
    """
    word2vec_model = Word2Vec.load(model_filepath)
    embeddings_dict = __get_embedding_dict(model_filepath)
    embedding_matrix = np.zeros((len(word2id) + 1, word2vec_model.vector_size))
    for word, idx in word2id.items():
        embedding_vector = embeddings_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector

    return embedding_matrix 
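A matrix laid out this way, with one extra all-zero row for the padding id, is typically handed to an embedding layer as frozen initial weights. A hedged sketch with tf.keras, which is an assumption here rather than something this project necessarily uses:

import numpy as np
import tensorflow as tf

# Stand-in for the matrix returned by get_embedding_matrix(): row 0 stays zero
# for the padding id, the remaining rows would hold the word2vec vectors.
embedding_matrix = np.zeros((5001, 100), dtype=np.float32)

embedding_layer = tf.keras.layers.Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,  # keep the pre-trained vectors frozen
)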
Example #2
Source File: word2vec_helpers.py    From chinese-text-classification-with-cnn-tf with Apache License 2.0
def embedding_sentences(sentences, embedding_size = 128, window = 5, min_count = 5, file_to_load = None, file_to_save = None):
    if file_to_load is not None:
        w2vModel = Word2Vec.load(file_to_load)
    else:
        w2vModel = Word2Vec(sentences, size = embedding_size, window = window, min_count = min_count, workers = multiprocessing.cpu_count())
        if file_to_save is not None:
            w2vModel.save(file_to_save)
    all_vectors = []
    embeddingDim = w2vModel.vector_size
    embeddingUnknown = [0 for i in range(embeddingDim)]
    for sentence in sentences:
        this_vector = []
        for word in sentence:
            if word in w2vModel.wv.vocab:
                this_vector.append(w2vModel[word])
            else:
                this_vector.append(embeddingUnknown)
        all_vectors.append(this_vector)
    return all_vectors 
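Note that sentences must already be tokenized into lists of tokens, out-of-vocabulary words fall back to an all-zero vector, and the result is ragged (one list of vectors per sentence), so padding to a fixed length is left to the caller. A hypothetical call:

tokenized = [["the", "cat", "sat"], ["dogs", "bark"]]
vectors = embedding_sentences(tokenized, embedding_size=128, min_count=1,
                              file_to_save="w2v.model")
# len(vectors) == 2, len(vectors[0]) == 3, and every entry has 128 dimensions.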
Example #3
Source File: word_eval.py    From embedding with MIT License
def load_vectors(self, vecs_fname, method):
        if method == "word2vec":
            model = Word2Vec.load(vecs_fname)
            words = model.wv.index2word
            vecs = model.wv.vectors
        else:
            words, vecs = [], []
            with open(vecs_fname, 'r', encoding='utf-8') as f:
                if "fasttext" in method:
                    next(f)  # skip head line
                for line in f:
                    if method == "swivel":
                        splited_line = line.strip().split("\t")
                    else:
                        splited_line = line.strip().split(" ")
                    words.append(splited_line[0])
                    vec = [float(el) for el in splited_line[1:]]
                    vecs.append(vec)
        unit_vecs = normalize(vecs, norm='l2', axis=1)
        dictionary = {}
        for word, vec in zip(words, unit_vecs):
            dictionary[word] = vec
        return dictionary, words, unit_vecs 
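Because the returned vectors are L2-normalized, cosine similarity between two words reduces to a plain dot product of their entries in dictionary. A toy illustration with made-up unit vectors:

import numpy as np

# dictionary maps word -> unit-length vector, as returned by load_vectors().
dictionary = {"king": np.array([0.6, 0.8]), "queen": np.array([0.8, 0.6])}
similarity = float(np.dot(dictionary["king"], dictionary["queen"]))  # 0.96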
Example #4
Source File: word_utils.py    From embedding with MIT License
def load_word_embeddings(self, vecs_fname, method):
        if method == "word2vec":
            model = Word2Vec.load(vecs_fname)
            words = model.wv.index2word
            vecs = model.wv.vectors
        else:
            words, vecs = [], []
            with open(vecs_fname, 'r', encoding='utf-8') as f1:
                if "fasttext" in method:
                    next(f1)  # skip head line
                for line in f1:
                    if method == "swivel":
                        splited_line = line.replace("\n", "").strip().split("\t")
                    else:
                        splited_line = line.replace("\n", "").strip().split(" ")
                    words.append(splited_line[0])
                    vec = [float(el) for el in splited_line[1:]]
                    vecs.append(vec)
        return words, vecs 
Example #5
Source File: utils.py    From mat2vec with MIT License
def compute_epoch_accuracies(root, prefix, analogy_file):
    filenames = glob.glob(os.path.join(root, prefix+"_epoch*.model"))
    nr_epochs = len(filenames)
    accuracies = dict()
    losses = [0] * nr_epochs
    for filename in filenames:
        epoch = int(re.search(r"\d+\.model", filename).group()[:-6])
        m = Word2Vec.load(filename)
        losses[epoch] = m.get_latest_training_loss()
        sections = m.wv.accuracy(analogy_file)
        for sec in sections:
            if sec["section"] not in accuracies:
                accuracies[sec["section"]] = [0] * nr_epochs
            correct, incorrect = len(sec["correct"]), len(sec["incorrect"])
            if incorrect > 0:
                accuracy = correct / (correct + incorrect)
            else:
                accuracy = 0
            accuracies[sec["section"]][epoch] = (correct, incorrect, accuracy)
        save_obj(accuracies, os.path.join("models", prefix + "_accuracies"))
        save_obj(np.concatenate([np.array([losses[0]]), np.diff(losses)]), os.path.join("models", prefix + "_loss")) 
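compute_epoch_accuracies expects checkpoints named <prefix>_epoch<N>.model and an analogy_file in the questions-words format read by wv.accuracy() (renamed evaluate_word_analogies() in later gensim releases): section headers starting with ':' followed by four space-separated words per line, read as "a is to b as c is to d". A tiny hypothetical file:

# Hypothetical analogy file in the questions-words format.
analogy_questions = """\
: capital-common-countries
Athens Greece Berlin Germany
Paris France Rome Italy
: family
boy girl man woman
"""
with open("analogies.txt", "w", encoding="utf-8") as f:
    f.write(analogy_questions)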
Example #6
Source File: main.py    From nonce2vec with MIT License
def _check_men(args):
    """Check embeddings quality.

    Calculate correlation with the similarity ratings in the MEN dataset.
    """
    logger.info('Checking embeddings quality against MEN similarity ratings')
    logger.info('Loading word2vec model...')
    model = Word2Vec.load(args.w2v_model)
    logger.info('Model loaded')
    system_actual = []
    # This is needed because we may not be able to calculate cosine for
    # all pairs
    human_actual = []
    count = 0
    for (first, second), human in Samples(source='men', shuffle=False):
        if first not in model.wv.vocab or second not in model.wv.vocab:
            logger.error('Could not find one or more pair items in model '
                         'vocabulary: {}, {}'.format(first, second))
            continue
        sim = _cosine_similarity(model.wv[first], model.wv[second])
        system_actual.append(sim)
        human_actual.append(human)
        count += 1
    spr = _spearman(human_actual, system_actual)
    logger.info('SPEARMAN: {} calculated over {} items'.format(spr, count)) 
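_cosine_similarity, _spearman, and Samples are nonce2vec internals not shown on this page; equivalent helpers can be sketched with numpy and scipy (an assumption, not the project's actual code):

import numpy as np
from scipy import stats

def _cosine_similarity(v1, v2):
    # Cosine of the angle between two word vectors.
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

def _spearman(human_scores, system_scores):
    # Spearman rank correlation between the two score lists.
    return stats.spearmanr(human_scores, system_scores).correlation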
Example #7
Source File: word2vec_helpers.py    From question-classification-cnn-rnn-attention with Apache License 2.0
def __init__(self, test_model=False, verify_model=True):
        model = Word2Vec.load(modelfile)

        if(test_model):
            acc = model.accuracy(questionfile)
            logger.info("Test model " + modelfile + " in " + questionfile)

        self.vector_size = model.vector_size
        self.vocab_size = len(model.wv.vocab) + 1
        self.word2index = self.GetWord2Index(model)
        self.index2word = self.GetIndex2Word(model)
        self.wordvector = self.GetWordVector(model)

        if(verify_model):
            logger.info("Verifing imported word2vec model")
            random_state = check_random_state(12)
            check_index = random_state.randint(low=0, high=self.vocab_size-2,size=1000)
            for index in check_index:
                word_wv = model.wv.index2word[index]
                word_our = self.index2word[index+1]
                #print(index, word_wv, word_our)
                assert word_wv == word_our
                assert model.wv.vocab[word_our].index == self.word2index[word_our] - 1
                assert np.array_equal(model.wv[word_our], self.wordvector[self.word2index[word_our]])
            logger.info("Imported word2vec model is verified") 
Example #8
Source File: train_word2vec.py    From text-classification with Apache License 2.0
def demo():
    model = Word2Vec.load(config['model_file'])

    print("Provide three testing modes\n")
    print("Input a word, return 10 most similar words")
    print("Input two words, return their cosine similarity")
    print("Input three words, return the inference word")

    while True:
        try:
            query = input()
            q_list = query.split()

            if len(q_list) == 1:
                print("The 10 most similar words:")
                res = model.most_similar(q_list[0],topn = 10)
                for item in res:
                    print(item[0]+","+str(item[1]))

            elif len(q_list) == 2:
                print("Cosine similarity:")
                res = model.similarity(q_list[0],q_list[1])
                print(res)
            
            else:
                print("%s to %s, is like %s to " % (q_list[0],q_list[2],q_list[1]))
                res = model.most_similar([q_list[0],q_list[1]], [q_list[2]], topn= 10)
                for item in res:
                    print(item[0]+","+str(item[1]))
            print("----------------------------")
        except Exception as e:
            print(repr(e)) 
Example #9
Source File: wordtwovec.py    From aristo-mini with Apache License 2.0
def __init__(self, model_file: str) -> None:
        if model_file.endswith(".bin"):
            self.model = Word2Vec.load_word2vec_format(model_file, binary=True)
        else:
            self.model = Word2Vec.load(model_file) 
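Word2Vec.load_word2vec_format() reads the original C-tool vector format, while Word2Vec.load() reads gensim's own pickled models; the former was deprecated and later moved to KeyedVectors. A hedged sketch of the replacement call on recent gensim versions (hypothetical file name):

from gensim.models import KeyedVectors

# Reads a C-format .bin file on gensim versions where
# Word2Vec.load_word2vec_format is no longer available.
vectors = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)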
Example #10
Source File: SCDV.py    From SCDV with MIT License
def read_GMM(idx_name, idx_proba_name):
    # Loads cluster assignments and probability of cluster assignments. 
    idx = joblib.load(idx_name)
    idx_proba = joblib.load(idx_proba_name)
    print("Cluster Model Loaded...")
    return (idx, idx_proba) 
Example #11
Source File: TopicCoherence.py    From SCDV with MIT License
def read_GMM(idx_name, idx_proba_name):
    # Loads cluster assignments and probability of cluster assignments.
    idx = joblib.load(idx_name)
    idx_proba = joblib.load(idx_proba_name)
    print("Cluster Model Loaded...")
    return (idx, idx_proba) 
Example #12
Source File: load_data.py    From pynlp with MIT License
def load_char_word_static_data(file, data_size=None):
    model = Word2Vec.load('../output/word2vec/word2vec.model')

    path = os.path.join(os.path.dirname(__file__), file)
    df = pd.read_csv(path)
    p = df['sentence1'].values[0:data_size]
    h = df['sentence2'].values[0:data_size]
    label = df['label'].values[0:data_size]

    p, h, label = shuffle(p, h, label)

    p_c_index, h_c_index = char_index(p, h)

    p_seg = map(lambda x: list(jieba.cut(x)), p)
    h_seg = map(lambda x: list(jieba.cut(x)), h)

    p_w_vec = list(map(lambda x: w2v(x, model), p_seg))
    h_w_vec = list(map(lambda x: w2v(x, model), h_seg))

    p_w_vec = list(map(lambda x: w2v_process(x), p_w_vec))
    h_w_vec = list(map(lambda x: w2v_process(x), h_w_vec))

    return p_c_index, h_c_index, p_w_vec, h_w_vec, label


# Load the training data with char_index and dynamic word vectors
Example #13
Source File: model.py    From cakechat with Apache License 2.0
def _get_w2v_model(corpus_name,
                   voc_size,
                   model_resolver_factory=None,
                   tokenized_lines=None,
                   vec_size=TOKEN_REPRESENTATION_SIZE,
                   window_size=W2V_WINDOW_SIZE,
                   skip_gram=USE_SKIP_GRAM):
    _logger.info('Getting w2v model')

    model_path = get_w2v_model_path(corpus_name, voc_size, vec_size, window_size, skip_gram)
    model_resolver = model_resolver_factory(model_path) if model_resolver_factory else DummyFileResolver(model_path)

    if not model_resolver.resolve():
        if not tokenized_lines:
            raise ModelLoaderException(
                'Tokenized corpus "{}" was not provided, so w2v model can\'t be trained.'.format(corpus_name))

        # bin model is not present on the disk, so get it
        model = _train_model(tokenized_lines, voc_size, vec_size, window_size, skip_gram)
        _save_model(model, model_path)
    else:
        # bin model is on the disk, load it
        model = _load_model(model_path)

    _logger.info('Successfully got w2v model\n')

    return model 
Example #14
Source File: Step6_all_feature_extract.py    From resume_job_matching with Apache License 2.0
def getAllFeatures(train, mapper):
    print "this is getAllFeatures"
    # every record has a cluster value calculated by lda
    w2c_f, w2c_w = 10, 14
    lda_dict_1 = util.read_dict(util.features_prefix + 'id_lda_256.pkl')
    lda_dict_2 = util.read_dict(util.features_prefix + 'id_lda_512.pkl')
    k_mean_dict_1 = util.read_dict(util.features_prefix + 'c_k_all_64.pkl')
    k_mean_dict_2 = util.read_dict(util.features_prefix + 'c_k_all_128.pkl')
    sentence_dict_path = util.txt_prefix + 'id_sentences.pkl'
    word2vec_path = util.txt_prefix + str(w2c_f) + 'features_1minwords_' + str(w2c_w) + 'context.pkl'
    sentence_dic = util.read_dict(sentence_dict_path)
    model = Word2Vec.load(word2vec_path)

    train_X = train[features]
    train_X = mapper.transform(train_X)  # .values
    new_train_X = []
    for i in xrange(len(train_X)):
        id = train_X[i][0]
        lda_1 = lda_dict_1[id]
        lda_2 = lda_dict_2[id]
        s = sentence_dic.get(id)
        f = np.concatenate(([train_X[i][1:].astype(np.float32)],
                            [sentence_to_matrix_vec(s, model, w2c_f, k_mean_dict_1, k_mean_dict_2)]), axis=1)[0]
        f = np.concatenate(([f], [[lda_1, lda_2]]), axis=1)[0]
        new_train_X.append(f)
    new_train_X = np.array(new_train_X)
    return new_train_X 
Example #15
Source File: train_word2vec.py    From text-classification with Apache License 2.0
def segment():
    # jieba custom setting.
    DATA_DIR = os.getcwd() + '/data/user_dict'
    jieba.load_userdict(os.path.join(DATA_DIR, 'Company.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Concept.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Consumer.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Holder.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'HoldingCompany.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'MainComposition.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Manager.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Material.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'OtherCompetitor.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Supplier.txt'))
    jieba.load_userdict(os.path.join(DATA_DIR, 'Finance.txt'))

    # load stopwords set
    stopword_set = set()
    with open(os.getcwd()+'/data/user_dict/stopWord.txt', 'r', encoding='utf-8') as stopwords:
        for stopword in stopwords:
            stopword_set.add(stopword.strip('\n'))

    output = open(config['input_seg'], 'w', encoding='utf-8')
    with open(config['input_raw'], 'r', encoding='utf-8') as content :
        for texts_num, line in enumerate(content):
            line = line.strip('\n')
            words = jieba.cut(line, cut_all=False)
            for word in words:
                if word not in stopword_set:
                    output.write(word + ' ')
            output.write('\n')

            if (texts_num + 1) % 10000 == 0:
                logging.info("Segmented %d th articles" % (texts_num + 1))
    output.close() 
Example #16
Source File: similarity.py    From 4lang with MIT License
def get_vec_sim(self):
        model_fn = self.config.get('vectors', 'model')
        model_type = self.config.get('vectors', 'model_type')
        logging.warning('Loading model: {0}'.format(model_fn))
        if model_type == 'word2vec':
            self.vec_model = Word2Vec.load_word2vec_format(model_fn,
                                                           binary=True)
        elif model_type == 'gensim':
            self.vec_model = Word2Vec.load(model_fn)
        else:
            raise Exception('Unknown LSA model format')
        logging.warning('Model loaded: {0}'.format(model_fn)) 
Example #17
Source File: evaluate.py    From embeddings with Apache License 2.0
def evaluate(filename, fname):
	counter = 0
	top3Counter = 0
	top5c = 0
	linec = 0
	model = Word2Vec.load(fname)
	questions = []
	with open(filename) as test:
		for line in test:
			questions.append(line)

	for line in questions:
		if line.startswith(':'):
			try:
				print('Accuracy: ' + str(100*counter/linec) + '\n')
			except:
				pass
			print("Evaluating " + line.rstrip('\n'))
		else:	
			try:
				linec += 1
				source, target, question, answer = line.encode('utf-8').decode('utf-8').lower().rstrip('\n').replace('(', '').replace(')', '').split(' ')
				# word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
				result = model.wv.most_similar(positive=[target, question], negative=[source])
				if result[0][0] == answer:
					counter += 1
				print('Accuracy: ' + str(100*counter/linec), end = '\r')
				for i in range(0, 3):
					if result[i][0] == answer:
						top3Counter += 1
						break
				for i in range(0, 10):
					if result[i][0] == answer:
						top5c += 1
						break
			except:
				pass
	print('Correct guess: ' + str(100 * counter / linec))
	print('Correct guess in top 3:' + str(100 * top3Counter / linec))
	print('Correct guess in top 10:' + str(100 * top5c / linec))
	return None 
Example #18
Source File: load_data.py    From text_matching with Apache License 2.0
def load_char_word_static_data(file, data_size=None):
    model = Word2Vec.load('../output/word2vec/word2vec.model')

    path = os.path.join(os.path.dirname(__file__), file)
    df = pd.read_csv(path)
    p = df['sentence1'].values[0:data_size]
    h = df['sentence2'].values[0:data_size]
    label = df['label'].values[0:data_size]

    p, h, label = shuffle(p, h, label)

    p_c_index, h_c_index = char_index(p, h)

    p_seg = map(lambda x: list(jieba.cut(x)), p)
    h_seg = map(lambda x: list(jieba.cut(x)), h)

    p_w_vec = list(map(lambda x: w2v(x, model), p_seg))
    h_w_vec = list(map(lambda x: w2v(x, model), h_seg))

    p_w_vec = list(map(lambda x: w2v_process(x), p_w_vec))
    h_w_vec = list(map(lambda x: w2v_process(x), h_w_vec))

    return p_c_index, h_c_index, p_w_vec, h_w_vec, label


# Load the training data with char_index and dynamic word vectors
Example #19
Source File: word2vec_helpers.py    From DetectMaliciousURL with Apache License 2.0
def embedding_sentences(sentences, embedding_size = 128, window = 5, min_count = 5, file_to_load = None, file_to_save = None):
    '''
    embedding_size : dimensionality of the word embeddings
    window : context window size
    min_count : words occurring fewer than min_count times are dropped
    '''
    if file_to_load is not None:
        w2vModel = Word2Vec.load(file_to_load)
    else:
        w2vModel = Word2Vec(sentences, size = embedding_size, window = window, min_count = min_count, workers = multiprocessing.cpu_count())
        if file_to_save is not None:
            w2vModel.save(file_to_save)

    all_vectors = []
    embeddingDim = w2vModel.vector_size
    # embedding dimensionality
    embeddingUnknown = [0 for i in range(embeddingDim)]
    for sentence in sentences:
        this_vector = []
        for word in sentence:
            if word in w2vModel.wv.vocab:
                this_vector.append(w2vModel[word])
            else:
                this_vector.append(embeddingUnknown)
        all_vectors.append(this_vector)
    return all_vectors 
Example #20
Source File: informativeness.py    From nonce2vec with MIT License
def __init__(self, model_path, sum_filter=None, sum_thresh=None,
                 train_filter=None, train_thresh=None, sort_by=None):
        """Initialize the Informativeness instance.

        Args:
            model_path (str): The absolute path to the gensim w2v CBOW model.
            sum_filter (str): Filter for the sum initialization phase.
            sum_thresh (int): Threshold for sum filter (self and cwi filters
                              only).
            train_filter (str): Filter for the training phase.
            train_thresh (int): Threshold for the train filter (self and cwi
                                filters only).
            sort_by (str): Sort context items in asc or desc of cwi values
                           before training.
        """
        self._sum_filter = sum_filter
        if sum_filter and sum_filter != 'random' and sum_thresh is None:
            raise Exception('Setting sum_filter as \'{}\' requires specifying '
                            'a threshold parameter'.format(sum_filter))
        self._sum_thresh = sum_thresh
        self._train_filter = train_filter
        if train_filter and train_filter != 'random' and train_thresh is None:
            raise Exception('Setting train_filter as \'{}\' requires '
                            'specifying a threshold parameter'
                            .format(train_filter))
        self._train_thresh = train_thresh
        self._model = Word2Vec.load(model_path)
        self._sort_by = sort_by 
Example #21
Source File: model.py    From cakechat with Apache License 2.0
def _load_model(model_path):
    _logger.info('Loading model from {}'.format(model_path))
    model = Word2Vec.load(model_path, mmap='r')
    _logger.info('Model "{}" has been loaded.'.format(os.path.basename(model_path)))
    return model 
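Passing mmap='r' memory-maps the model's large numpy arrays read-only instead of copying them into RAM, so loading is cheap and several processes can share the same pages.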
Example #22
Source File: data_utils.py    From CCKS2019-Chinese-Clinical-NER with MIT License
def load_vocab(vocab_filepath):
    """
    Load the dictionary mapping from word to id
    :param vocab_filepath: the file path to the pre-built dictionary
    :return: the dictionary mapping from word to id
    """
    with open(vocab_filepath, "rb") as fr:
        word2id = pickle.load(fr)

    return word2id 
Example #23
Source File: main.py    From IDEA with MIT License
def build_phrase(doc):
    # load phrase model
    return trigram[bigram[doc]] 
Example #24
Source File: main.py    From IDEA with MIT License
def load_phrase():
    global bigram
    global trigram
    bigram = Phrases.load(os.path.join("..", "model", "bigram.model"))
    trigram = Phrases.load(os.path.join("..", "model", "trigram.model")) 
Example #25
Source File: main.py    From IDEA with MIT License
def load_obj(filename):
    with open(filename) as fin:
        return cPickle.load(fin)