Python gensim.models.keyedvectors.KeyedVectors.load_word2vec_format() Examples

The following are 22 code examples of gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(). You can vote up the examples you like or vote down the ones you don't, and you can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.models.keyedvectors.KeyedVectors, or try the search function.
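For orientation, here is a minimal usage sketch of the function itself before the project examples; the file names and the limit value are placeholders, not taken from any project below.

from gensim.models.keyedvectors import KeyedVectors

# Binary word2vec format (e.g. a .bin file); 'limit' caps how many vectors are read,
# which keeps memory usage down for very large files.
vectors = KeyedVectors.load_word2vec_format("vectors.bin", binary=True, limit=500000)

# Plain-text word2vec format (first line holds "<vocab_size> <vector_dim>").
text_vectors = KeyedVectors.load_word2vec_format("vectors.txt", binary=False)

# The returned KeyedVectors object supports ordinary lookups and similarity queries.
vector = vectors["king"]
similar = vectors.most_similar("king", topn=5)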
Example #1
Source File: build_w2v.py    From text-classifier with Apache License 2.0 (7 votes)
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('大', '小')
    # print('大 vs 小 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    save_pkl(word_dict, out_path, overwrite=True) 
Example #2
Source File: word2vec.py    From nlp-recipes with MIT License (6 votes)
def load_pretrained_vectors(
    dir_path, file_name="GoogleNews-vectors-negative300.bin", limit=None
):
    """ Method that loads word2vec vectors. Downloads if it doesn't exist.

    Args:
        file_name(str): Name of the word2vec file.
        dir_path(str): Path to the directory where word2vec vectors exist or will be
        downloaded.
        limit(int): Number of word vectors that is loaded from gensim. This option
        allows us to save RAM space and avoid memory errors.

    Returns:
        gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word2vectors

    """
    file_path = _maybe_download_and_extract(dir_path, file_name)
    word2vec_vectors = KeyedVectors.load_word2vec_format(
        file_path, binary=True, limit=limit
    )

    return word2vec_vectors 
Example #3
Source File: utils.py    From text-summarization-tensorflow with MIT License (6 votes)
def get_init_embedding(reversed_dict, embedding_size):
    glove_file = "glove/glove.42B.300d.txt"
    word2vec_file = get_tmpfile("word2vec_format.vec")
    glove2word2vec(glove_file, word2vec_file)
    print("Loading Glove vectors...")
    word_vectors = KeyedVectors.load_word2vec_format(word2vec_file)

    word_vec_list = list()
    for _, word in sorted(reversed_dict.items()):
        try:
            word_vec = word_vectors.word_vec(word)
        except KeyError:
            word_vec = np.zeros([embedding_size], dtype=np.float32)

        word_vec_list.append(word_vec)

    # Assign random vector to <s>, </s> token
    word_vec_list[2] = np.random.normal(0, 1, embedding_size)
    word_vec_list[3] = np.random.normal(0, 1, embedding_size)

    return np.array(word_vec_list) 
Example #4
Source File: vectorizers.py    From revscoring with MIT License (6 votes)
def load_word2vec(filename=None, path=None, binary=False, limit=None):
        if path is not None:
            return KeyedVectors.load_word2vec_format(
                path, binary=binary, limit=limit)
        elif filename is not None:
            for dir_path in ASSET_SEARCH_DIRS:
                try:
                    path = os.path.join(dir_path, filename)
                    return KeyedVectors.load_word2vec_format(
                        path, binary=binary, limit=limit)
                except FileNotFoundError:
                    continue
            raise FileNotFoundError("Please make sure that 'filename' \
                                    specifies the word vector binary name \
                                    in default search paths or 'path' \
                                    specifies the file path of the binary")
        else:
            raise TypeError(
                "load_word2vec() requires either 'filename' or 'path' to be set.") 
Example #5
Source File: wordvec.py    From OpenNIR with MIT License (6 votes)
def gensim_w2v_handler(url):
    def wrapped(logger):
        with tempfile.TemporaryDirectory() as p:
            vocab_path = os.path.join(p, 'vocab')
            with logger.duration(f'downloading {url}'):
                util.download(url, vocab_path)
            with logger.duration(f'loading binary {vocab_path}'):
                vectors = KeyedVectors.load_word2vec_format(vocab_path, binary=True)
            vocab_path += '.txt'
            with logger.duration(f'saving text {vocab_path}'):
                vectors.save_word2vec_format(vocab_path)
            with logger.duration(f'reading embedding'):
                weights = None
                terms = []
                for i, values in enumerate(plaintext.read_sv(vocab_path, sep=' ')):
                    if i == 0:
                        weights = np.ndarray((int(values[0]), int(values[1])))
                    else:
                        term, values = values[0], values[1:]
                        terms.append(term)
                        weights[i-1] = [float(v) for v in values]
            return terms, np.array(weights)
    return wrapped 
Example #6
Source File: build_w2v.py    From castor with Apache License 2.0 (6 votes)
def convert(fname, save_file):
    with open(fname, 'rb') as dim_file:
        vocab_size, dim = (int(x) for x in dim_file.readline().split())

    word_vectors = KeyedVectors.load_word2vec_format(fname, binary=True)

    print("Loading vectors from {}".format(fname))
    vectors = []
    for line in tqdm(word_vectors.syn0, total=len(word_vectors.syn0)):
        vectors.extend(line.tolist())
    vectors = torch.Tensor(vectors).view(-1, dim)

    stoi = {word.strip():voc.index for word, voc in word_vectors.vocab.items()}

    print('saving vectors to', save_file)
    torch.save((stoi, vectors, dim), save_file) 
Example #7
Source File: node2vec_recommender.py    From entity2rec with Apache License 2.0 (6 votes)
def __init__(self, dataset, p=1, q=4, walk_length=100,
                 num_walks=50, dimensions=200, window_size=30, workers=8, iterations=5):

        Node2Vec.__init__(self, False, True, False, p, q, walk_length, num_walks, dimensions, window_size,
                          workers, iterations)

        self.dataset = dataset

        file = 'num%d_p%d_q%d_l%d_d%d_iter%d_winsize%d.emd' % (num_walks, p, q,
                                                               walk_length, dimensions,
                                                               iterations, window_size)

        self.path = 'datasets/%s/node2vec/' % self.dataset + file

        if file not in os.listdir('datasets/%s/node2vec/' % self.dataset):

            self.run('datasets/%s/node2vec/altogether.edgelist' % self.dataset,
                             self.path)

        self.node2vec_model = KeyedVectors.load_word2vec_format(self.path, binary=True) 
Example #8
Source File: entity2rel.py    From entity2rec with Apache License 2.0 (5 votes)
def add_embedding(self, property, embedding_file):

        self.embedding_files[property] = KeyedVectors.load_word2vec_format(embedding_file, binary=self.binary) 
Example #9
Source File: test_DocSim.py    From document-similarity with MIT License (5 votes)
def setUpClass(cls):
        test_model_path = './data/test_data.txt'
        cls.w2v_model = KeyedVectors.load_word2vec_format(test_model_path, binary=False)
        cls.stopwords = ['to', 'an', 'a']
        cls.doc_sim = DocSim(cls.w2v_model, cls.stopwords) 
Example #10
Source File: feature_extraction.py    From nlp-architect with Apache License 2.0 (5 votes)
def load_word2vec_model_from_path(self):
        """
        Load Word2Vec model

        Returns:
            the Word2Vec model
        """
        word_embeddings_model = KeyedVectors.load_word2vec_format(
            self.word2vec_model_path, binary=True
        )
        if not word_embeddings_model:
            return None
        return word_embeddings_model 
Example #11
Source File: embeddings.py    From danlp with BSD 3-Clause "New" or "Revised" License (5 votes)
def load_wv_with_gensim(pretrained_embedding: str, cache_dir=DEFAULT_CACHE_DIR,
                        verbose: bool = False):
    """

    Available word embeddings:
    - wiki.da.wv
    - cc.da.wv
    - conll17.da.wv
    - news.da.wv
    - sketchengine.da.wv

    Available subword embeddings:
    - wiki.da.swv
    - cc.da.swv
    - sketchengine.da.swv

    :param pretrained_embedding:
    :param cache_dir: the directory for storing cached data
    :param verbose:
    :return: KeyedVectors or FastTextKeyedVectors
    """
    word_embeddings_available(pretrained_embedding, can_use_subword=True)
    download_model(pretrained_embedding, cache_dir,
                   _process_downloaded_embeddings, verbose=verbose)
    wv_path = os.path.join(cache_dir, pretrained_embedding + ".bin")

    if pretrained_embedding.split(".")[-1] == 'wv':
        return KeyedVectors.load_word2vec_format(wv_path, binary=True)

    elif pretrained_embedding.split(".")[-1] == 'swv':
        from gensim.models.fasttext import load_facebook_vectors
        return load_facebook_vectors(wv_path) 
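A hypothetical usage of the helper above, with embedding names taken from its own docstring:

# '.wv' names are loaded through KeyedVectors.load_word2vec_format (binary=True),
# while '.swv' names take the FastText subword branch instead.
word_vectors = load_wv_with_gensim('conll17.da.wv')
subword_vectors = load_wv_with_gensim('cc.da.swv')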
Example #12
Source File: gensim_word2vec.py    From seq2vec with GNU General Public License v3.0 (5 votes)
def __init__(self, model_path):
        self.word2vec = KeyedVectors.load_word2vec_format(
            model_path, binary=True
        ) 
Example #13
Source File: entity2rel.py    From entity2vec with Apache License 2.0 (5 votes)
def add_embedding(self, embedding_file):

        self.embedding_files.append(KeyedVectors.load_word2vec_format(embedding_file, binary=self.binary))

    # access a particular embedding file and get the relatedness score 
Example #14
Source File: multi_class_classification.py    From edge2vec with BSD 3-Clause "New" or "Revised" License (5 votes)
def load_word2vec_model(file):
    '''
    load node embedding model
    '''
    model = KeyedVectors.load_word2vec_format(file, binary=False)
    # print model.wv["1"]
    return model 
Example #15
Source File: train.py    From DeepNews with Apache License 2.0 (5 votes)
def check_for_similar_words(self,):
        from gensim.models.keyedvectors import KeyedVectors
        model = KeyedVectors.load_word2vec_format("../../temp_results/word2vec_hindi.txt", binary=False)
        
        self.pretty_print(u"भारत",model.most_similar(u"भारत"))
        self.pretty_print(u"सिंह",model.most_similar(u"सिंह"))
        self.pretty_print(u"क्रिकेट",model.most_similar(u"क्रिकेट"))
        self.pretty_print(u"रुपये",model.most_similar(u"रुपये")) 
Example #16
Source File: prepare_d2d.py    From NPRF with Apache License 2.0 (5 votes)
def sim_mat_and_kernel_d2d(relevance_file, topic_file, corpus_file, topk_corpus_file, embedding_file, stop_file,
                           sim_output_path, kernel_output_path, kernel_mu_list, kernel_sigma_list,
                           topk_supervised, d2d, test):
  '''Simultaneously compute similarity matrix and RBF kernel features

  Args:
    relevance_file: A dumped relevance dict file
    topic_file: a single line format topic file. format: qid term1 term2 ...
    corpus_file: corpus corresponding to docnolist file. format: docno\tdoclen\tterm1 term2
    topk_corpus_file: corpus that contain only the topk terms for each document, format: same as corpus_file
    embedding_file: output file from the word2vec toolkit, in binary format (loaded with binary=True)
    stop_file: a stopword list file, one word per line
    sim_output_path:
    kernel_output_path:
    kernel_mu_list:
    kernel_sigma_list:
    topk_supervised: number of top-n documents for each query
    d2d: True for NPRF, False for simple query-document matching used by e.g. DRMM, K-NRM
    test: controls the temporary output. Set to False.

  Returns:

  '''
  relevance_dict = load_pickle(relevance_file)
  topic_dict = parse_topic(topic_file)
  corpus = parse_corpus(corpus_file)
  topk_corpus = parse_corpus(topk_corpus_file)

  embeddings = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
  stoplist = parse_stoplist(stop_file)
  qid_list = relevance_dict.keys()



  for qid in qid_list:
    sim_mat_and_kernel_per_query(relevance_dict, topic_dict, corpus, topk_corpus, embeddings, stoplist, sim_output_path,
                                 kernel_output_path, kernel_mu_list, kernel_sigma_list, topk_supervised, d2d, test, qid) 
Example #17
Source File: word_embeddings.py    From chameleon_recsys with MIT License (5 votes)
def load_word_embeddings(path, binary=True):
    w2v_model = KeyedVectors.load_word2vec_format(path, binary=binary)
    return w2v_model 
Example #18
Source File: word_model.py    From coqa-baselines with MIT License (5 votes)
def set_model(self, filename, embed_type='glove'):
        timer = Timer('Load {}'.format(filename))
        if embed_type == 'glove':
            self._model = GloveModel(filename)
        else:
            self._model = KeyedVectors.load_word2vec_format(filename, binary=True
                                                            if embed_type == 'word2vec' else False)
        print('Embeddings: vocab = {}, embed_size = {}'.format(len(self._model.vocab), self._model.vector_size))
        timer.finish() 
Example #19
Source File: fasttext_embedding.py    From SOQAL with MIT License (5 votes)
def __init__(self, model_path):
        self.model_path = model_path
        print("loading fastText model ...")
        #self.model = pickle.load(open(self.model_path,"rb"))
        self.model = KeyedVectors.load_word2vec_format(self.model_path, encoding='utf-8', unicode_errors='ignore')
        print("done fastText loading model")
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
        self.vocab = self.model.vocab 
Example #20
Source File: link_prediction.py    From edge2vec with BSD 3-Clause "New" or "Revised" License (5 votes)
def load_word2vec_model(file):
    '''
    return node embedding model
    '''
    model = KeyedVectors.load_word2vec_format(file, binary=False)
    # print model.wv["1"]
    return model 
Example #21
Source File: corpus_utils.py    From NeuronBlocks with MIT License (4 votes)
def load_embedding(embedding_path, embedding_dim, format, file_type, with_head=False, word_set=None):
    """
    Args:
        format: 'glove', 'word2vec', 'fasttext'
        file_type: 'text' or 'binary'
    """
    embedding_dict = dict()

    if format == 'word2vec' or format == 'fasttext':
        if file_type == 'text':
            vector_total = KeyedVectors.load_word2vec_format(embedding_path, binary=False, unicode_errors='ignore')
        else:
            if format == 'word2vec':
                vector_total = KeyedVectors.load_word2vec_format(embedding_path, binary=True, unicode_errors='ignore')
            elif format == 'fasttext':
                vector_total = FastText.load_fasttext_format(embedding_path, encoding='utf8')

        assert vector_total.vector_size == embedding_dim
        if word_set is None:
            embedding_dict = vector_total
        else:
            if not (format == 'fasttext' and file_type == 'binary'):
                word_total = vector_total.index2word    # actually, vector_total.index2word is the word list
            else:
                word_total = vector_total.wv.index2word
            for word in word_total:
                if word in word_set:
                    embedding_dict[word] = vector_total[word]
    elif format == 'glove':
        with codecs.open(embedding_path, 'r', encoding='utf-8') as fin:
            if with_head == True:
                _ = fin.readline()
            for idx, line in enumerate(fin):
                line = line.rstrip()
                if idx == 0 and len(line.split()) == 2:
                    continue
                if len(line) > 0:
                    word, vec = line.split(" ", 1)
                    if (word_set and word in word_set) or (word_set is None):
                        vector = [float(num) for num in vec.split(" ")]
                        assert len(vector) == embedding_dim
                        embedding_dict[word] = vector
    else:
        raise Exception('The supported formats are glove, word2vec and fasttext; %s is not supported yet.' % format)
    return embedding_dict 
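A hypothetical call to load_embedding, assuming a binary word2vec file named w2v.bin with 300-dimensional vectors:

# Restricting word_set keeps only the vectors we actually need in memory.
vocab = {'movie', 'film', 'actor'}
embedding_dict = load_embedding('w2v.bin', embedding_dim=300, format='word2vec',
                                file_type='binary', word_set=vocab)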
Example #22
Source File: embeddings.py    From danlp with BSD 3-Clause "New" or "Revised" License (4 votes)
def _process_embeddings_for_spacy(tmp_file_path: str, meta_info: dict,
                                  cache_dir: str = DEFAULT_CACHE_DIR,
                                  clean_up_raw_data: bool = True,
                                  verbose: bool = False):
    """
    To use pretrained embeddings with spaCy the embeddings need to be stored in
    a specific format. This function converts embeddings saved in the binary
    word2vec format to a spaCy model with the init_model() function from
    spaCy. The generated files will be saved in the cache_dir under a
    folder called <pretrained_embedding>.spacy

    More information on converting pretrained word embeddings to spaCy models here:
    https://spacy.io/usage/vectors-similarity#custom

    :param str tmp_file_path: the file name of the embedding binary file
    :param str cache_dir: the directory for storing cached data
    :param bool verbose:
    """
    from pathlib import Path
    from spacy.cli import init_model

    embeddings = meta_info['name']

    bin_file_path = os.path.join(cache_dir, embeddings + ".bin")

    if not os.path.isfile(
            bin_file_path):  # Preprocess to transform to word2vec .bin format
        _process_downloaded_embeddings(tmp_file_path, meta_info, cache_dir,
                                       clean_up_raw_data, verbose)

    vec_file = embeddings + ".vec"

    word_vecs = KeyedVectors.load_word2vec_format(bin_file_path, binary=True,
                                                  encoding='utf8')
    assert_wv_dimensions(word_vecs, embeddings)
    word_vecs.save_word2vec_format(vec_file, binary=False)

    spacy_dir = os.path.join(cache_dir, embeddings + '.spacy')
    os.makedirs(spacy_dir, exist_ok=True)

    if os.path.isabs(spacy_dir):
        full_spacy_dir = Path(spacy_dir)
    else:
        full_spacy_dir = Path(os.path.join(os.getcwd(), spacy_dir))

    init_model('da', full_spacy_dir, vectors_loc=vec_file)

    os.remove(vec_file)  # Clean up the vec file