Python gensim.models.keyedvectors.KeyedVectors.load_word2vec_format() Examples

The following are 22 code examples of gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(). You can vote up the examples you like or vote down the ones you don't, and you can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.models.keyedvectors.KeyedVectors, or try the search function.
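For orientation, here is a minimal usage sketch of the function itself before the project examples; the file names and the limit value are placeholders, not taken from any project below.

from gensim.models.keyedvectors import KeyedVectors

# Binary word2vec format (e.g. a .bin file); 'limit' caps how many vectors are read,
# which keeps memory usage down for very large files.
vectors = KeyedVectors.load_word2vec_format("vectors.bin", binary=True, limit=500000)

# Plain-text word2vec format (first line holds "<vocab_size> <vector_dim>").
text_vectors = KeyedVectors.load_word2vec_format("vectors.txt", binary=False)

# The returned KeyedVectors object supports ordinary lookups and similarity queries.
vector = vectors["king"]
similar = vectors.most_similar("king", topn=5)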
Example #1
Source File: build_w2v.py    From text-classifier with Apache License 2.0 (7 votes)
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('大', '小')
    # print('大 vs 小 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    save_pkl(word_dict, out_path, overwrite=True) 
Example #2
Source File: word2vec.py    From nlp-recipes with MIT License (6 votes)
def load_pretrained_vectors(
    dir_path, file_name="GoogleNews-vectors-negative300.bin", limit=None
):
    """ Method that loads word2vec vectors. Downloads if it doesn't exist.

    Args:
        file_name(str): Name of the word2vec file.
        dir_path(str): Path to the directory where word2vec vectors exist or will be
        downloaded.
        limit(int): Number of word vectors that is loaded from gensim. This option
        allows us to save RAM space and avoid memory errors.

    Returns:
        gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word2vectors

    """
    file_path = _maybe_download_and_extract(dir_path, file_name)
    word2vec_vectors = KeyedVectors.load_word2vec_format(
        file_path, binary=True, limit=limit
    )

    return word2vec_vectors 
Example #3
Source File: utils.py    From text-summarization-tensorflow with MIT License (6 votes)
def get_init_embedding(reversed_dict, embedding_size):
    glove_file = "glove/glove.42B.300d.txt"
    word2vec_file = get_tmpfile("word2vec_format.vec")
    glove2word2vec(glove_file, word2vec_file)
    print("Loading Glove vectors...")
    word_vectors = KeyedVectors.load_word2vec_format(word2vec_file)

    word_vec_list = list()
    for _, word in sorted(reversed_dict.items()):
        try:
            word_vec = word_vectors.word_vec(word)
        except KeyError:
            word_vec = np.zeros([embedding_size], dtype=np.float32)

        word_vec_list.append(word_vec)

    # Assign random vector to <s>, </s> token
    word_vec_list[2] = np.random.normal(0, 1, embedding_size)
    word_vec_list[3] = np.random.normal(0, 1, embedding_size)

    return np.array(word_vec_list) 
Example #4
Source File: vectorizers.py    From revscoring with MIT License (6 votes)
def load_word2vec(filename=None, path=None, binary=False, limit=None):
        if path is not None:
            return KeyedVectors.load_word2vec_format(
                path, binary=binary, limit=limit)
        elif filename is not None:
            for dir_path in ASSET_SEARCH_DIRS:
                try:
                    path = os.path.join(dir_path, filename)
                    return KeyedVectors.load_word2vec_format(
                        path, binary=binary, limit=limit)
                except FileNotFoundError:
                    continue
            raise FileNotFoundError("Please make sure that 'filename' \
                                    specifies the word vector binary name \
                                    in default search paths or 'path' \
                                    specifies the file path of the binary")
        else:
            raise TypeError(
                "load_word2vec() requires either 'filename' or 'path' to be set.") 
Example #5
Source File: wordvec.py    From OpenNIR with MIT License (6 votes)
def gensim_w2v_handler(url):
    def wrapped(logger):
        with tempfile.TemporaryDirectory() as p:
            vocab_path = os.path.join(p, 'vocab')
            with logger.duration(f'downloading {url}'):
                util.download(url, vocab_path)
            with logger.duration(f'loading binary {vocab_path}'):
                vectors = KeyedVectors.load_word2vec_format(vocab_path, binary=True)
            vocab_path += '.txt'
            with logger.duration(f'saving text {vocab_path}'):
                vectors.save_word2vec_format(vocab_path)
            with logger.duration(f'reading embedding'):
                weights = None
                terms = []
                for i, values in enumerate(plaintext.read_sv(vocab_path, sep=' ')):
                    if i == 0:
                        weights = np.ndarray((int(values[0]), int(values[1])))
                    else:
                        term, values = values[0], values[1:]
                        terms.append(term)
                        weights[i-1] = [float(v) for v in values]
            return terms, np.array(weights)
    return wrapped 
Example #6
Source File: build_w2v.py    From castor with Apache License 2.0 (6 votes)
def convert(fname, save_file):
    with open(fname, 'rb') as dim_file:
        vocab_size, dim = (int(x) for x in dim_file.readline().split())

    word_vectors = KeyedVectors.load_word2vec_format(fname, binary=True)

    print("Loading vectors from {}".format(fname))
    vectors = []
    for line in tqdm(word_vectors.syn0, total=len(word_vectors.syn0)):
        vectors.extend(line.tolist())
    vectors = torch.Tensor(vectors).view(-1, dim)

    stoi = {word.strip():voc.index for word, voc in word_vectors.vocab.items()}

    print('saving vectors to', save_file)
    torch.save((stoi, vectors, dim), save_file) 
Example #7
Source File: node2vec_recommender.py    From entity2rec with Apache License 2.0 (6 votes)
def __init__(self, dataset, p=1, q=4, walk_length=100,
                 num_walks=50, dimensions=200, window_size=30, workers=8, iterations=5):

        Node2Vec.__init__(self, False, True, False, p, q, walk_length, num_walks, dimensions, window_size,
                          workers, iterations)

        self.dataset = dataset

        file = 'num%d_p%d_q%d_l%d_d%d_iter%d_winsize%d.emd' % (num_walks, p, q,
                                                               walk_length, dimensions,
                                                               iterations, window_size)

        self.path = 'datasets/%s/node2vec/' % self.dataset + file

        if file not in os.listdir('datasets/%s/node2vec/' % self.dataset):

            self.run('datasets/%s/node2vec/altogether.edgelist' % self.dataset,
                             self.path)

        self.node2vec_model = KeyedVectors.load_word2vec_format(self.path, binary=True) 
Example #8
Source File: entity2rel.py    From entity2rec with Apache License 2.0 (5 votes)
def add_embedding(self, property, embedding_file):

        self.embedding_files[property] = KeyedVectors.load_word2vec_format(embedding_file, binary=self.binary) 
Example #9
Source File: test_DocSim.py    From document-similarity with MIT License (5 votes)
def setUpClass(cls):
        test_model_path = './data/test_data.txt'
        cls.w2v_model = KeyedVectors.load_word2vec_format(test_model_path, binary=False)
        cls.stopwords = ['to', 'an', 'a']
        cls.doc_sim = DocSim(cls.w2v_model, cls.stopwords) 
Example #10
Source File: feature_extraction.py    From nlp-architect with Apache License 2.0 (5 votes)
def load_word2vec_model_from_path(self):
        """
        Load Word2Vec model

        Returns:
            the Word2Vec model
        """
        word_embeddings_model = KeyedVectors.load_word2vec_format(
            self.word2vec_model_path, binary=True
        )
        if not word_embeddings_model:
            return None
        return word_embeddings_model 
Example #11
Source File: embeddings.py    From danlp with BSD 3-Clause "New" or "Revised" License (5 votes)
def load_wv_with_gensim(pretrained_embedding: str, cache_dir=DEFAULT_CACHE_DIR,
                        verbose: bool = False):
    """

    Available word embeddings:
    - wiki.da.wv
    - cc.da.wv
    - conll17.da.wv
    - news.da.wv
    - sketchengine.da.wv

    Available subword embeddings:
    - wiki.da.swv
    - cc.da.swv
    - sketchengine.da.swv

    :param pretrained_embedding:
    :param cache_dir: the directory for storing cached data
    :param verbose:
    :return: KeyedVectors or FastTextKeyedVectors
    """
    word_embeddings_available(pretrained_embedding, can_use_subword=True)
    download_model(pretrained_embedding, cache_dir,
                   _process_downloaded_embeddings, verbose=verbose)
    wv_path = os.path.join(cache_dir, pretrained_embedding + ".bin")

    if pretrained_embedding.split(".")[-1] == 'wv':
        return KeyedVectors.load_word2vec_format(wv_path, binary=True)

    elif pretrained_embedding.split(".")[-1] == 'swv':
        from gensim.models.fasttext import load_facebook_vectors
        return load_facebook_vectors(wv_path) 
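A hypothetical usage of the helper above, with embedding names taken from its own docstring:

# '.wv' names are loaded through KeyedVectors.load_word2vec_format (binary=True),
# while '.swv' names take the FastText subword branch instead.
word_vectors = load_wv_with_gensim('conll17.da.wv')
subword_vectors = load_wv_with_gensim('cc.da.swv')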
Example #12
Source File: gensim_word2vec.py    From seq2vec with GNU General Public License v3.0 (5 votes)
def __init__(self, model_path):
        self.word2vec = KeyedVectors.load_word2vec_format(
            model_path, binary=True
        ) 
Example #13
Source File: entity2rel.py    From entity2vec with Apache License 2.0 (5 votes)
def add_embedding(self, embedding_file):

        self.embedding_files.append(KeyedVectors.load_word2vec_format(embedding_file, binary=self.binary))

    # access a particular embedding file and get the relatedness score 
Example #14
Source File: multi_class_classification.py    From edge2vec with BSD 3-Clause "New" or "Revised" License (5 votes)
def load_word2vec_model(file):
    '''
    load node embedding model
    '''
    model = KeyedVectors.load_word2vec_format(file, binary=False)
    # print model.wv["1"]
    return model 
Example #15
Source File: train.py    From DeepNews with Apache License 2.0 (5 votes)
def check_for_similar_words(self,):
        from gensim.models.keyedvectors import KeyedVectors
        model = KeyedVectors.load_word2vec_format("../../temp_results/word2vec_hindi.txt", binary=False)
        
        self.pretty_print(u"भारत",model.most_similar(u"भारत"))
        self.pretty_print(u"सिंह",model.most_similar(u"सिंह"))
        self.pretty_print(u"क्रिकेट",model.most_similar(u"क्रिकेट"))
        self.pretty_print(u"रुपये",model.most_similar(u"रुपये")) 
Example #16
Source File: prepare_d2d.py    From NPRF with Apache License 2.0 (5 votes)
def sim_mat_and_kernel_d2d(relevance_file, topic_file, corpus_file, topk_corpus_file, embedding_file, stop_file,
                           sim_output_path, kernel_output_path, kernel_mu_list, kernel_sigma_list,
                           topk_supervised, d2d, test):
  '''Simultaneously compute similarity matrix and RBF kernel features

  Args:
    relevance_file: A dumped relevance dict file
    topic_file: a single line format topic file. format: qid term1 term2 ...
    corpus_file: corpus corresponding to docnolist file. format: docno\tdoclen\tterm1 term2
    topk_corpus_file: corpus that contain only the topk terms for each document, format: same as corpus_file
    embedding_file: output file from the word2vec toolkit, in binary format (loaded with binary=True)
    stop_file: a stopword list file, one word per line
    sim_output_path:
    kernel_output_path:
    kernel_mu_list:
    kernel_sigma_list:
    topk_supervised: number of top-n documents for each query
    d2d: True for NPRF, False for simple query-document matching used by e.g. DRMM, K-NRM
    test: controls the temporary output. Set to False.

  Returns:

  '''
  relevance_dict = load_pickle(relevance_file)
  topic_dict = parse_topic(topic_file)
  corpus = parse_corpus(corpus_file)
  topk_corpus = parse_corpus(topk_corpus_file)

  embeddings = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
  stoplist = parse_stoplist(stop_file)
  qid_list = relevance_dict.keys()



  for qid in qid_list:
    sim_mat_and_kernel_per_query(relevance_dict, topic_dict, corpus, topk_corpus, embeddings, stoplist, sim_output_path,
                                 kernel_output_path, kernel_mu_list, kernel_sigma_list, topk_supervised, d2d, test, qid) 
Example #17
Source File: word_embeddings.py    From chameleon_recsys with MIT License (5 votes)
def load_word_embeddings(path, binary=True):
    w2v_model = KeyedVectors.load_word2vec_format(path, binary=binary)
    return w2v_model 
Example #18
Source File: word_model.py    From coqa-baselines with MIT License (5 votes)
def set_model(self, filename, embed_type='glove'):
        timer = Timer('Load {}'.format(filename))
        if embed_type == 'glove':
            self._model = GloveModel(filename)
        else:
            self._model = KeyedVectors.load_word2vec_format(filename, binary=True
                                                            if embed_type == 'word2vec' else False)
        print('Embeddings: vocab = {}, embed_size = {}'.format(len(self._model.vocab), self._model.vector_size))
        timer.finish() 
Example #19
Source File: fasttext_embedding.py    From SOQAL with MIT License (5 votes)
def __init__(self, model_path):
        self.model_path = model_path
        print("loading fastText model ...")
        #self.model = pickle.load(open(self.model_path,"rb"))
        self.model = KeyedVectors.load_word2vec_format(self.model_path, encoding='utf-8', unicode_errors='ignore')
        print("done fastText loading model")
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
        self.vocab = self.model.vocab 
Example #20
Source File: link_prediction.py    From edge2vec with BSD 3-Clause "New" or "Revised" License (5 votes)
def load_word2vec_model(file):
    '''
    return node embedding model
    '''
    model = KeyedVectors.load_word2vec_format(file, binary=False)
    # print model.wv["1"]
    return model 
Example #21
Source File: corpus_utils.py    From NeuronBlocks with MIT License (4 votes)
def load_embedding(embedding_path, embedding_dim, format, file_type, with_head=False, word_set=None):
    """
    Args:
        format: 'glove', 'word2vec', 'fasttext'
        file_type: 'text' or 'binary'
    """
    embedding_dict = dict()

    if format == 'word2vec' or format == 'fasttext':
        if file_type == 'text':
            vector_total = KeyedVectors.load_word2vec_format(embedding_path, binary=False, unicode_errors='ignore')
        else:
            if format == 'word2vec':
                vector_total = KeyedVectors.load_word2vec_format(embedding_path, binary=True, unicode_errors='ignore')
            elif format == 'fasttext':
                vector_total = FastText.load_fasttext_format(embedding_path, encoding='utf8')

        assert vector_total.vector_size == embedding_dim
        if word_set is None:
            embedding_dict = vector_total
        else:
            if not (format == 'fasttext' and file_type == 'binary'):
                word_total = vector_total.index2word    # actually, vector_total.index2word is the word list
            else:
                word_total = vector_total.wv.index2word
            for word in word_total:
                if word in word_set:
                    embedding_dict[word] = vector_total[word]
    elif format == 'glove':
        with codecs.open(embedding_path, 'r', encoding='utf-8') as fin:
            if with_head == True:
                _ = fin.readline()
            for idx, line in enumerate(fin):
                line = line.rstrip()
                if idx == 0 and len(line.split()) == 2:
                    continue
                if len(line) > 0:
                    word, vec = line.split(" ", 1)
                    if (word_set and word in word_set) or (word_set is None):
                        vector = [float(num) for num in vec.split(" ")]
                        assert len(vector) == embedding_dim
                        embedding_dict[word] = vector
    else:
        raise Exception('The supported formats are glove, word2vec and fasttext; %s is not supported yet.' % format)
    return embedding_dict 
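A hypothetical call to load_embedding, assuming a binary word2vec file named w2v.bin with 300-dimensional vectors:

# Restricting word_set keeps only the vectors we actually need in memory.
vocab = {'movie', 'film', 'actor'}
embedding_dict = load_embedding('w2v.bin', embedding_dim=300, format='word2vec',
                                file_type='binary', word_set=vocab)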
Example #22
Source File: embeddings.py    From danlp with BSD 3-Clause "New" or "Revised" License (4 votes)
def _process_embeddings_for_spacy(tmp_file_path: str, meta_info: dict,
                                  cache_dir: str = DEFAULT_CACHE_DIR,
                                  clean_up_raw_data: bool = True,
                                  verbose: bool = False):
    """
    To use pretrained embeddings with spaCy the embeddings need to be stored in
    a specific format. This function converts embeddings saved in the binary
    word2vec format to a spaCy model with the init_model() function from
    spaCy. The generated files will be saved in the cache_dir under a
    folder called <pretrained_embedding>.spacy

    More information on converting pretrained word embeddings to spaCy models here:
    https://spacy.io/usage/vectors-similarity#custom

    :param str tmp_file_path: the file name of the embedding binary file
    :param str cache_dir: the directory for storing cached data
    :param bool verbose:
    """
    from pathlib import Path
    from spacy.cli import init_model

    embeddings = meta_info['name']

    bin_file_path = os.path.join(cache_dir, embeddings + ".bin")

    if not os.path.isfile(
            bin_file_path):  # Preprocess to transform to word2vec .bin format
        _process_downloaded_embeddings(tmp_file_path, meta_info, cache_dir,
                                       clean_up_raw_data, verbose)

    vec_file = embeddings + ".vec"

    word_vecs = KeyedVectors.load_word2vec_format(bin_file_path, binary=True,
                                                  encoding='utf8')
    assert_wv_dimensions(word_vecs, embeddings)
    word_vecs.save_word2vec_format(vec_file, binary=False)

    spacy_dir = os.path.join(cache_dir, embeddings + '.spacy')
    os.makedirs(spacy_dir, exist_ok=True)

    if os.path.isabs(spacy_dir):
        full_spacy_dir = Path(spacy_dir)
    else:
        full_spacy_dir = Path(os.path.join(os.getcwd(), spacy_dir))

    init_model('da', full_spacy_dir, vectors_loc=vec_file)

    os.remove(vec_file)  # Clean up the vec file