Python gensim.models.KeyedVectors.load_word2vec_format() Examples

The following are 30 code examples of gensim.models.KeyedVectors.load_word2vec_format(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module gensim.models.KeyedVectors, or try the search function.
Example #1
Source File: embeddings.py    From steppy-toolkit with MIT License 11 votes vote down vote up
def load_word2vec_embeddings(filepath, tokenizer, max_features, embedding_size):
    """Build an embedding matrix for the tokenizer vocabulary from a binary word2vec file.

    Rows are first drawn from a normal distribution with the mean and standard
    deviation of the pretrained vectors; rows whose word exists in the
    pretrained model are then overwritten with the pretrained vector.

    Args:
        filepath: path of the word2vec file in binary format.
        tokenizer: object exposing ``word_index``, a mapping word -> integer index.
        max_features: upper bound on the number of rows of the matrix.
        embedding_size: number of columns of the matrix.

    Returns:
        Array of shape ``(min(max_features, len(word_index)), embedding_size)``.
    """
    model = KeyedVectors.load_word2vec_format(filepath, binary=True)

    emb_mean, emb_std = model.wv.syn0.mean(), model.wv.syn0.std()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
    for word, i in word_index.items():
        # Bound by the real row count, not max_features: when the vocabulary is
        # smaller than max_features an index equal to len(word_index)
        # (presumably possible because word_index is 1-based -- confirm against
        # the tokenizer) would otherwise raise IndexError.
        if i >= nb_words:
            continue
        try:
            embedding_matrix[i] = model[word]
        except KeyError:
            # Word is missing from the pretrained model: keep the random row.
            continue
    return embedding_matrix
Example #2
Source File: loadEmbeddings.py    From acl2017-interactive_summarizer with Apache License 2.0 6 votes vote down vote up
def loadEmbeddings(self, filepath, data_path, vocab_size, binary_val):
        """Cache word2vec vectors as a memory-mapped file and load them onto ``self``.

        If ``<data_path>/embed.dat`` does not exist yet, the word2vec file at
        ``filepath`` is loaded, its vectors are written to ``embed.dat`` as
        doubles and its words (ordered by vocabulary index) are written to
        ``<data_path>/embed.vocab``, one per line. Afterwards ``self.W`` is set
        to a read-only memmap of ``embed.dat`` and ``self.vocab_dict`` to a
        mapping word -> line number in ``embed.vocab``.

        :param filepath: path of the word2vec file to cache
        :param data_path: directory holding ``embed.dat`` and ``embed.vocab``
        :param vocab_size: number of rows used to map ``embed.dat``
        :param binary_val: passed as ``binary`` to ``load_word2vec_format``
        """
        embed_short = os.path.normpath("%s/embed.dat" % data_path)
        vocab_short = os.path.normpath("%s/embed.vocab" % data_path)
        if not os.path.exists(embed_short):
            print("Caching word embeddings in memmapped format...")
            print(binary_val, filepath)
            wv = KeyedVectors.load_word2vec_format("%s" % (filepath), binary=binary_val)
            fp = np.memmap(embed_short, dtype=np.double, mode='w+', shape=wv.syn0.shape)
            fp[:] = wv.syn0[:]
            fp.flush()
            # Write the vocabulary through a UTF-8 text writer, mirroring the
            # codecs reader below. Formatting encoded bytes with "%s" yields
            # "b'word'" lines on Python 3, which corrupts the vocabulary.
            with codecs.open(vocab_short, "w", "utf-8") as vocab_file:
                for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                    vocab_file.write("%s\n" % w)
            del fp, wv

        self.W = np.memmap(embed_short, dtype=np.double, mode="r", shape=(vocab_size, self.embedding_size))
        with codecs.open(vocab_short, 'r', 'utf-8') as f:
            vocab_list = [x.strip() for x in f.readlines()]
        self.vocab_dict = {w: k for k, w in enumerate(vocab_list)}
Example #3
Source File: models.py    From open-solution-toxic-comments with MIT License 6 votes vote down vote up
def _get_embedding_matrix(self, tokenizer):
        """Build an embedding matrix for the tokenizer vocabulary.

        Pretrained vectors are read from the binary word2vec file at
        ``self.pretrained_filepath``. Rows are first drawn from a normal
        distribution with the mean and standard deviation of the pretrained
        vectors, then overwritten for every word found in the pretrained model.

        :param tokenizer: object exposing ``word_index`` (word -> integer index)
        :return: array of shape
            ``(min(self.max_features, len(word_index)), self.embedding_size)``
        """
        model = KeyedVectors.load_word2vec_format(self.pretrained_filepath, binary=True)

        emb_mean, emb_std = model.syn0.mean(), model.syn0.std()

        word_index = tokenizer.word_index
        nb_words = min(self.max_features, len(word_index))
        embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, self.embedding_size))
        for word, i in word_index.items():
            # Bound by the real row count, not self.max_features: when the
            # vocabulary is smaller than max_features an index equal to
            # len(word_index) (presumably possible because word_index is
            # 1-based -- confirm) would otherwise raise IndexError.
            if i >= nb_words:
                continue
            try:
                embedding_matrix[i] = model[word]
            except KeyError:
                # Word is missing from the pretrained model: keep the random row.
                continue
        return embedding_matrix
Example #4
Source File: preprocessing.py    From R-NET-in-Keras with MIT License 6 votes vote down vote up
def word2vec(word2vec_path):
    """Return a lookup function mapping a word to its vector.

    When ``word2vec_path`` does not exist, the GloVe file returned by
    ``get_glove_file_path()`` is converted to word2vec format at that path and
    the GloVe file is deleted. The returned function yields a zero vector of
    the model's vector size for words the model does not know.
    """
    needs_conversion = not path.exists(word2vec_path)
    if needs_conversion:
        glove_source = get_glove_file_path()
        print('Converting Glove to word2vec...', end='')
        # Convert, then drop the GloVe original so only the word2vec file stays.
        glove2word2vec(glove_source, word2vec_path)
        os.remove(glove_source)
        print('Done')

    print('Reading word2vec data... ', end='')
    vectors = KeyedVectors.load_word2vec_format(word2vec_path)
    print('Done')

    def get_word_vector(word):
        try:
            vector = vectors[word]
        except KeyError:
            vector = np.zeros(vectors.vector_size)
        return vector

    return get_word_vector
Example #5
Source File: embeddings.py    From open-solution-mapping-challenge with MIT License 6 votes vote down vote up
def load_word2vec_embeddings(filepath, tokenizer, max_features, embedding_size):
    """Build an embedding matrix for the tokenizer vocabulary from a binary word2vec file.

    Rows are first drawn from a normal distribution with the mean and standard
    deviation of the pretrained vectors; rows whose word exists in the
    pretrained model are then overwritten with the pretrained vector.

    Args:
        filepath: path of the word2vec file in binary format.
        tokenizer: object exposing ``word_index``, a mapping word -> integer index.
        max_features: upper bound on the number of rows of the matrix.
        embedding_size: number of columns of the matrix.

    Returns:
        Array of shape ``(min(max_features, len(word_index)), embedding_size)``.
    """
    model = KeyedVectors.load_word2vec_format(filepath, binary=True)

    emb_mean, emb_std = model.wv.syn0.mean(), model.wv.syn0.std()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
    for word, i in word_index.items():
        # Bound by the real row count, not max_features: when the vocabulary is
        # smaller than max_features an index equal to len(word_index)
        # (presumably possible because word_index is 1-based -- confirm against
        # the tokenizer) would otherwise raise IndexError.
        if i >= nb_words:
            continue
        try:
            embedding_matrix[i] = model[word]
        except KeyError:
            # Word is missing from the pretrained model: keep the random row.
            continue
    return embedding_matrix
Example #6
Source File: pick_wordvec.py    From CAMP_iccv19 with Apache License 2.0 6 votes vote down vote up
def main(opt):
    """Export a word2vec weight matrix and a found-mask for a pickled vocabulary.

    Loads the vocabulary pickled at ``opt.vocab_path`` and the binary word2vec
    model at ``opt.embed_weight``. For every entry of ``vocab.idx2word`` the
    pretrained 300-d vector is stored when available (mask 1); otherwise a
    random normal vector with scale 0.1 is stored (mask 0). Both arrays are
    saved under ``./embed/``.
    """
    # NOTE: pickle.load can execute arbitrary code; only use trusted vocab files.
    with open(opt.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    num = len(vocab)
    print (num)
    model = KeyedVectors.load_word2vec_format(opt.embed_weight, binary=True)

    weights_matrix = np.zeros((num, 300))
    words_found = 0
    mask = np.zeros(num, dtype=int)

    # Only the position is used; the word is looked up through idx2word[i]
    # (presumably idx2word maps index -> word -- confirm against the vocab class).
    for i, _ in enumerate(vocab.idx2word):
        try:
            weights_matrix[i] = model[vocab.idx2word[i]]
            words_found += 1
            mask[i] = 1
        except KeyError:
            weights_matrix[i] = np.random.normal(scale=0.1, size=(300, ))

    print (words_found)

    np.save("./embed/f30kword2vec300dim_3.npy", weights_matrix)
    np.save("./embed/f30kword2vecmask_3.npy", mask)
Example #7
Source File: file_manage.py    From resilient-community-apps with MIT License 6 votes vote down vote up
def get_summary_nlp(self):
        """
        Build a textual summary of the NLP model file at ``self.filename``.

        The file is read as a word2vec text-format model. On any failure the
        returned lines describe the failure instead of raising.
        :return: list of summary lines
        """
        summary = []
        add_line = summary.append
        try:
            model = KeyedVectors.load_word2vec_format(self.filename, binary=False)
            modified = self._get_mtime()
            vector_dim = model.vector_size
            vector_count = len(model.vectors)

            add_line("---------------------------")
            add_line("Summary for NLP model file:")
            add_line("---------------------------")
            add_line(self.FILE_NAME_OUTPUT.format(self.filename))
            add_line(self.LAST_MODIFICATION_TIME.format(modified))
            add_line(self.FEATURE_DIMENSION.format(vector_dim))
            add_line(self.NUM_SENTENCES.format(vector_count))
            add_line("\n")
        except Exception as err:
            add_line("Failed to read NLP model {}.".format(self.filename))
            add_line("Error: {}".format(err))

        return summary
Example #8
Source File: np2vec.py    From nlp-architect with Apache License 2.0 6 votes vote down vote up
def load(cls, np2vec_model_file, binary=False, word_ngrams=0, word2vec_format=True):
        """
        Load an np2vec model from disk.

        Args:
            np2vec_model_file (str): path of the file holding the np2vec model
            binary (bool): whether the stored model is in binary format
            word_ngrams (int {1,0}): 1 when the stored model uses word vectors with
            subword (ngrams) information, 0 otherwise.
            word2vec_format(bool): whether the model was stored in the original
            word2vec format.

        Returns:
            the loaded model, or None (after logging an error) when
            ``word_ngrams`` is neither 0 nor 1
        """
        if word_ngrams == 1:
            return FastText.load(np2vec_model_file)
        if word_ngrams != 0:
            logger.error("invalid value for 'word_ngrams'")
            return None
        if not word2vec_format:
            return KeyedVectors.load(np2vec_model_file, mmap="r")
        return KeyedVectors.load_word2vec_format(np2vec_model_file, binary=binary)
Example #9
Source File: embeddings.py    From saber with MIT License 6 votes vote down vote up
def _prepare_embedding_index(self, binary=True):
        """Load the pre-trained embeddings at `self.filepath` into a dictionary.

        When the instance has a truthy `debug` attribute, loading is limited to the
        first ten thousand vectors.

        Args:
            binary (bool): True if the embeddings are stored in C binary format, False
                for C text format. Defaults to True.

        Returns:
            Dictionary mapping each word to its pre-trained embedding (an 'embedding index').
        """
        debug_enabled = self.__dict__.get("debug", False)
        max_vectors = 10000 if debug_enabled else None
        keyed = KeyedVectors.load_word2vec_format(self.filepath, binary=binary, limit=max_vectors)

        index = {}
        for token in keyed.vocab:
            index[token] = keyed[token]
        return index
Example #10
Source File: embed.py    From verejne.digital with Apache License 2.0 6 votes vote down vote up
def __init__(self, all_texts):
        """Load the pretrained Slovak vectors and optionally ingest a corpus.

        Prints a few diagnostics about the loaded model, stores the vector
        length of the word 'auto' in ``self.dimension`` and, when ``all_texts``
        is not None, feeds every text to ``add_text_to_corpus`` before printing
        corpus statistics.
        """
        # Creating the model
        print("Reading the pretrained model for Word2VecEmbedder")
        model = KeyedVectors.load_word2vec_format(
            '/data/verejne/datautils/embedding_data/slovak.vec',
            encoding='utf-8',
            unicode_errors='ignore',
        )
        self.sk_model = model
        print("Model contains", len(self.sk_model.vocab), "tokens")
        print(self.sk_model.similarity("mesto", "mesta"))
        self.dimension = len(self.sk_model["auto"])
        print("sídlisk" in self.sk_model)
        print("sídlisk".encode('utf8') in self.sk_model)

        print("Dimension of embedding of 'auto' is", self.dimension)
        # Create frequency table for words
        if all_texts is not None:
            for document in all_texts:
                self.add_text_to_corpus(document)
            self.print_corpus_stats()
Example #11
Source File: matcher.py    From HIT-SCIR-CoNLL2019 with Apache License 2.0 5 votes vote down vote up
def init_word2vec(filename, binary=False):
    """Load word2vec-format vectors from ``filename`` into the module-level ``_word2vec``."""
    global _word2vec
    vectors = KeyedVectors.load_word2vec_format(filename, binary=binary)
    _word2vec = vectors
Example #12
Source File: contractions.py    From pycontractions with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def load_models(self):
        """Attempt to find/load/download keyedvector model.

        Resolution order: an already assigned ``self.kv_model``, then a binary
        word2vec file at ``self.w2v_path``, then a download through
        ``api.load(self.api_key)``. Finally the LanguageTool for
        ``self.lang_code`` is initialised.

        Raises:
            AttributeError: if no model source is configured, or the supplied or
                downloaded model has no ``wmdistance`` attribute.
            SystemExit: if ``self.w2v_path`` is set but does not exist.
        """
        wmd_error = "Model does not support Word Mover's Distance, must be in keyedvectors format"

        if self.kv_model is not None:
            if not hasattr(self.kv_model, 'wmdistance'):
                raise AttributeError(wmd_error)

        elif self.w2v_path is not None:
            if not os.path.exists(self.w2v_path):
                print("Word2Vec model not found at {}".format(self.w2v_path))
                sys.exit(1)
            try:
                self.kv_model = KeyedVectors.load_word2vec_format(self.w2v_path, binary=True)
            except Exception:
                # Report and propagate; KeyboardInterrupt/SystemExit pass through untouched.
                print("Error loading Word2Vec model")
                raise
        elif self.api_key is not None:
            try:
                self.kv_model = api.load(self.api_key)
            except Exception:
                print("Error downloading model {}".format(self.api_key))
                raise
            if not hasattr(self.kv_model, 'wmdistance'):
                raise AttributeError(wmd_error)
        else:
            raise AttributeError("No model given")

        try:
            self.lc_tool = language_check.LanguageTool(self.lang_code)
        except Exception:
            print("Error initializing LanguageTool")
            raise
Example #13
Source File: corpus.py    From ner with Apache License 2.0 5 votes vote down vote up
def load_embeddings(self, file_path):
        """Load embeddings from ``file_path``.

        Paths ending in '.bin' are loaded with the gensim FastText wrapper;
        every other path is loaded as a word2vec-format file.
        """
        print('Loading embeddins...')
        if not file_path.endswith('.bin'):
            from gensim.models import KeyedVectors
            return KeyedVectors.load_word2vec_format(file_path)
        from gensim.models.wrappers import FastText
        return FastText.load_fasttext_format(file_path)
Example #14
Source File: nlp.py    From bugbug with Mozilla Public License 2.0 5 votes vote down vote up
def get_word_embeddings():
    """Load 'wiki-news-300d-1M-subword.vec' and call ``init_sims(replace=True)`` on the result."""
    vectors = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subword.vec")
    vectors.init_sims(replace=True)
    return vectors
Example #15
Source File: NER_model.py    From DeepPavlov with Apache License 2.0 5 votes vote down vote up
def load_pretrained_word_emb(self, model_path, model_name, word_dim, word2id=None, vocab_size=None):
        """Fill a (vocab_size, word_dim) matrix with pretrained vectors.

        ``vocab_size`` is replaced by ``len(word2id)`` when ``word2id`` is given.
        For ``model_name == "glove"`` the text-format model is read and every
        word present in it is copied. For ``"baomoi"`` the binary model is read;
        single-character punctuation uses the ``<punct>`` vector, all-digit
        words longer than one character use ``<number>``, and other words are
        copied when present in the model vocabulary. ``None`` leaves the matrix
        at zero; any other name raises RuntimeError.

        Returns the matrix; rows without a pretrained vector stay zero.
        """
        if word2id is not None:
            vocab_size = len(word2id)
        n_loaded = 0
        word_embeddings = np.zeros(shape=(vocab_size, word_dim))

        if model_name == "glove":
            model = KeyedVectors.load_word2vec_format(model_path, binary=False)
            for token in word2id:
                if token not in model:
                    continue
                word_embeddings[word2id[token]] = model[token]
                n_loaded += 1
        elif model_name == "baomoi":
            model = KeyedVectors.load_word2vec_format(model_path, binary=True, unicode_errors='ignore')
            for token in word2id:
                if len(token) == 1:
                    # Single characters are only mapped when they are punctuation.
                    if token[0] not in string.punctuation:
                        continue
                    vector = model["<punct>"]
                elif token.isdigit():
                    vector = model["<number>"]
                elif token in model.vocab:
                    vector = model[token]
                else:
                    continue
                word_embeddings[word2id[token]] = vector
                n_loaded += 1
        elif model_name is not None:
            raise RuntimeError(f'got an unexpected value for model_name: `{model_name}`')

        log.info(f"{n_loaded}/{vocab_size} words were loaded from {model_path}.")
        return word_embeddings
Example #16
Source File: glove_embedder.py    From DeepPavlov with Apache License 2.0 5 votes vote down vote up
def load(self) -> None:
        """
        Load embeddings from ``self.load_path`` into ``self.model``.

        Sets ``self.dim`` to the vector size of the loaded model. When the path
        does not exist a warning is logged and nothing is loaded.
        """
        log.info(f"[loading GloVe embeddings from `{self.load_path}`]")
        if self.load_path.exists():
            self.model = KeyedVectors.load_word2vec_format(str(self.load_path))
            self.dim = self.model.vector_size
        else:
            log.warning(f'{self.load_path} does not exist, cannot load embeddings from it!')
Example #17
Source File: nlp_word2vec.py    From resilient-community-apps with MIT License 5 votes vote down vote up
def load_model(self, file_name=None):
        """
        Load a saved model into ``self.word2vec``.

        The file is read as a word2vec text-format model. A failure is logged
        (including its cause) and not raised; ``self.word2vec`` is then left
        untouched.
        :param file_name: [optional] model file. Use default if None
        :return:
        """
        model_file = file_name if file_name else FileManage.DEFAULT_NLP_FILE
        try:
            self.word2vec = KeyedVectors.load_word2vec_format(model_file, binary=False)
        except Exception as e:
            # Include the exception so the log explains why loading failed.
            self.log.error("Failed to load a saved model {}: {}".format(model_file, e))
Example #18
Source File: feature_engineering.py    From CIKM-AnalytiCup-2018 with Apache License 2.0 5 votes vote down vote up
def __init__(self, train_df, test_df, unlabeled_df, data_loader, normalization=True):
        """Store the data frames and fit the text scorers.

        Adds whitespace-tokenised ``splited_spn_*`` columns to the given data
        frames, loads stopwords through ``data_loader``, fits a TF-IDF
        vectorizer on the unique raw sentences, builds a BM25 scorer on the
        unique tokenised sentences, and finally calls ``build_statistic``.
        """
        self.train_df = train_df
        self.test_df = test_df
        self.unlabeled_df = unlabeled_df

        self.data_loader = data_loader
        self.stopwords = self.data_loader.load_stopwords()

        def tokenize(sentence):
            return sentence.split()

        self.train_df['splited_spn_1'] = self.train_df['spn_1'].apply(tokenize)
        self.train_df['splited_spn_2'] = self.train_df['spn_2'].apply(tokenize)
        self.unlabeled_df['splited_spn_1'] = self.unlabeled_df['spn_1'].apply(tokenize)

        self.test_df['splited_spn_1'] = self.test_df['spn_1'].apply(tokenize)
        self.test_df['splited_spn_2'] = self.test_df['spn_2'].apply(tokenize)

        self.normalization = normalization

        # Unique tokenised sentences across train, test and unlabeled data.
        tokenised = []
        tokenised = tokenised + self.train_df['splited_spn_1'].values.tolist()
        tokenised = tokenised + self.train_df['splited_spn_2'].values.tolist()
        tokenised = tokenised + self.test_df['splited_spn_1'].values.tolist()
        tokenised = tokenised + self.test_df['splited_spn_2'].values.tolist()
        tokenised = tokenised + self.unlabeled_df['splited_spn_1'].tolist()
        docs = np.unique(np.array(tokenised)).tolist()

        # Unique raw sentences across the same sources.
        raw = []
        raw = raw + self.train_df['spn_1'].values.tolist()
        raw = raw + self.train_df['spn_2'].values.tolist()
        raw = raw + self.test_df['spn_1'].values.tolist()
        raw = raw + self.test_df['spn_2'].values.tolist()
        raw = raw + self.unlabeled_df['spn_1'].tolist()
        docs_raw = np.unique(np.array(raw)).tolist()

        self.tfidf_vectorizer = TfidfVectorizer()
        self.tfidf_vectorizer.fit(docs_raw)

        self.bm25_scorer = bm25.bm25Scorer(docs=docs)

        #print("[FE] Loading the word2vec model")
        #self.word2vec_model = KeyedVectors.load_word2vec_format(dataset_config.SPANISH_WORDVEC_PATH)
        #self.word2vec_model.init_sims(replace=True)
        #print("[FE] Loaded the word2vec mdoel")

        self.build_statistic()
Example #19
Source File: models.py    From caml-mimic with MIT License 5 votes vote down vote up
def _code_emb_init(self, code_emb, dicts):
        """Initialise ``self.U`` and ``self.final`` weights from pretrained code embeddings.

        Row ``i`` of the weight matrix (shaped like ``self.final.weight``) is
        the vector stored in the word2vec file ``code_emb`` for the code
        ``dicts['ind2c'][i]``.
        """
        pretrained = KeyedVectors.load_word2vec_format(code_emb)
        weights = np.zeros(self.final.weight.size())
        for row in range(self.Y):
            weights[row] = pretrained[dicts['ind2c'][row]]
        # Each layer receives its own independent copy of the weights.
        as_tensor = torch.Tensor(weights)
        self.U.weight.data = as_tensor.clone()
        self.final.weight.data = as_tensor.clone()
Example #20
Source File: models.py    From caml-mimic with MIT License 5 votes vote down vote up
def _code_emb_init(self, code_emb, dicts):
        """Initialise ``self.final`` weights from pretrained code embeddings.

        Row ``i`` of the weight matrix (shaped like ``self.final.weight``) is
        the vector stored in the word2vec file ``code_emb`` for the code
        ``dicts['ind2c'][i]``.
        """
        pretrained = KeyedVectors.load_word2vec_format(code_emb)
        weights = np.zeros(self.final.weight.size())
        for row in range(self.Y):
            weights[row] = pretrained[dicts['ind2c'][row]]
        as_tensor = torch.Tensor(weights)
        self.final.weight.data = as_tensor.clone()
Example #21
Source File: glove.py    From nlp-recipes with MIT License 5 votes vote down vote up
def load_pretrained_vectors(
    dir_path, file_name="glove.840B.300d.txt", limit=None
):
    """ Method that loads gloVe vectors. Downloads if it doesn't exist.

    The gloVe file is converted to word2vec format in a temporary file, which
    is removed again whether or not conversion and loading succeed.

    Args:
        file_name(str): Name of the gloVe file.
        dir_path(str): Path to the directory where gloVe vectors exist or will be
        downloaded.
        limit(int): Number of word vectors that is loaded from gensim. This option
        allows us to save RAM space and avoid memory errors.

    Returns:
        gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word2vectors
    """

    file_path = _maybe_download_and_extract(dir_path, file_name)
    tmp_file = get_tmpfile("test_word2vec.txt")

    try:
        # Convert GloVe format to word2vec
        glove2word2vec(file_path, tmp_file)
        model = KeyedVectors.load_word2vec_format(tmp_file, limit=limit)
    finally:
        # Clean up even on failure so a partial conversion is not left behind.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    return model
Example #22
Source File: document_embedder.py    From fake-news-detection-pipeline with Apache License 2.0 5 votes vote down vote up
def _set_word2vec(self):
        if self.pretrained is None:
            raise ValueError("Pretrained word2vec path is not specified during instantiation")
        self._w2v = KeyedVectors.load_word2vec_format(self.pretrained, binary=True) 
Example #23
Source File: vectorizer.py    From medaCy with GNU General Public License v3.0 5 votes vote down vote up
def load_word_embeddings(self, embeddings_file):
        """Load word embeddings from ``embeddings_file`` into ``self.word_vectors`` using gensim.

        :param embeddings_file: Word embeddings file to use. A name ending in '.bin' is
            read as binary word2vec format; anything else as text format.
        """
        self.word_vectors = KeyedVectors.load_word2vec_format(
            embeddings_file,
            binary=embeddings_file.endswith('.bin'),
        )
Example #24
Source File: config.py    From BREDS with GNU Lesser General Public License v3.0 5 votes vote down vote up
def read_word2vec(self):
        """Load the binary word2vec model at ``self.word2vecmodelpath``.

        Stores the model in ``self.word2vec`` and its vector size in
        ``self.vec_dim``, printing the dimensionality.
        """
        print("Loading word2vec model ...\n")
        model = KeyedVectors.load_word2vec_format(self.word2vecmodelpath, binary=True)
        self.word2vec = model
        self.vec_dim = model.vector_size
        print(self.vec_dim, "dimensions")
Example #25
Source File: auxiliary_word2vec.py    From ZeroShotVideoClassification with Apache License 2.0 5 votes vote down vote up
def load_word2vec():
    """Return the GoogleNews word2vec model, preferring a previously saved copy.

    First tries to load a saved model from '/workplace/GoogleNews'. If that
    fails, the original binary vectors are loaded, normalised in place with
    ``init_sims(replace=True)`` and saved for later runs.
    """
    try:
        wv_model = Word2Vec.load('/workplace/GoogleNews', mmap='r')
    except Exception:
        # Narrowed from a bare except so Ctrl-C / SystemExit during the first
        # load is not swallowed and followed by a second, slower load.
        wv_model = Word2Vec.load_word2vec_format(
            '/workplace/GoogleNews-vectors-negative300.bin', binary=True)
        wv_model.init_sims(replace=True)
        # NOTE(review): the copy is saved to 'assets/GoogleNews' but loaded from
        # '/workplace/GoogleNews' above -- confirm these are meant to differ.
        wv_model.save('assets/GoogleNews')
    return wv_model
Example #26
Source File: wordembed.py    From PyShortTextCategorization with MIT License 5 votes vote down vote up
def load_word2vec_model(path, binary=True):
    """ Read a pre-trained Word2Vec model from a word2vec-format file.

    :param path: location of the pre-trained Word2Vec model file
    :param binary: True if the file is stored in binary format (Default: True)
    :return: the loaded pre-trained Word2Vec model
    :type path: str
    :type binary: bool
    :rtype: gensim.models.keyedvectors.KeyedVectors
    """
    model = KeyedVectors.load_word2vec_format(path, binary=binary)
    return model
Example #27
Source File: wordembed.py    From PyShortTextCategorization with MIT License 5 votes vote down vote up
def load_poincare_model(path, word2vec_format=True, binary=False):
    """ Read a pre-trained Poincare embedding model.

    :param path: location of the pre-trained Poincare embedding model file
    :param word2vec_format: True to read the file as word2vec format (default: True);
        otherwise a saved PoincareModel is loaded and its ``kv`` returned
    :param binary: binary flag used when reading word2vec format (default: False)
    :return: a pre-trained Poincare embedding model
    :type path: str
    :type word2vec_format: bool
    :type binary: bool
    :rtype: gensim.models.poincare.PoincareKeyedVectors
    """
    if not word2vec_format:
        return PoincareModel.load(path).kv
    return PoincareKeyedVectors.load_word2vec_format(path, binary=binary)
Example #28
Source File: utils.py    From Text-Classification-Models-Pytorch with MIT License 5 votes vote down vote up
def get_word_embeddings(w2vfile, word_to_index, embedsize=300):
    '''
    For each word in our vocabulary, get the word2vec encoding of the word
    Inputs:
        w2vfile (string) : Path to the file containing (pre-trained) word embeddings.
            '.txt' files are parsed as space-separated "word v1 v2 ..." lines;
            '.bin' files are read as binary word2vec (first 1,000,000 vectors).
        word_to_index : Vocabulary; only its words (keys) are used here
        embedsize (int) : Length of each word vector
    Returns:
        word_embeddings : Dictionary mapping each word to corresponding embedding;
            words without a pre-trained vector map to a zero vector
    Raises:
        SystemExit : if w2vfile ends with neither '.txt' nor '.bin'
    '''

    word_embeddings = {}
    if w2vfile.endswith('.txt'):
        # The context manager closes the file even if a line fails to parse.
        with open(w2vfile) as f:
            for line in tqdm(f):
                values = line.split(" ")
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                if word in word_to_index:
                    word_embeddings[word] = coefs
    elif w2vfile.endswith('.bin'):
        word2vec = KeyedVectors.load_word2vec_format(w2vfile, binary=True, limit=1000000)
        for word in tqdm(word_to_index):
            try:
                # Lookup is lower-cased, but the result is stored under the original word.
                word_embeddings[word] = word2vec[word.lower()]
            except KeyError:
                pass
    else:
        print ('Can\'t load word embeddings.')
        # Equivalent to exit(-1), without relying on the site-provided builtin.
        raise SystemExit(-1)

    print('Found {0}/{1} word vectors.'.format(len(word_embeddings), len(word_to_index)))
    if len(word_to_index) > len(word_embeddings):
        print('Initializing remaining {} word vectors with zeros.'.format(len(word_to_index) - len(word_embeddings)))

    for word in word_to_index:
        if word not in word_embeddings:
            word_embeddings[word] = np.zeros((embedsize,))
    return word_embeddings
Example #29
Source File: utils.py    From Text-Classification-Models-Pytorch with MIT License 5 votes vote down vote up
def get_word_embeddings(w2vfile, word_to_index, embedsize=300):
    '''
    For each word in our vocabulary, get the word2vec encoding of the word
    Inputs:
        w2vfile (string) : Path to the file containing (pre-trained) word embeddings.
            '.txt' files are parsed as space-separated "word v1 v2 ..." lines;
            '.bin' files are read as binary word2vec (first 1,000,000 vectors).
        word_to_index : Vocabulary; only its words (keys) are used here
        embedsize (int) : Length of each word vector
    Returns:
        word_embeddings : Dictionary mapping each word to corresponding embedding;
            words without a pre-trained vector map to a zero vector
    Raises:
        SystemExit : if w2vfile ends with neither '.txt' nor '.bin'
    '''

    word_embeddings = {}
    if w2vfile.endswith('.txt'):
        # The context manager closes the file even if a line fails to parse.
        with open(w2vfile) as f:
            for line in tqdm(f):
                values = line.split(" ")
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                if word in word_to_index:
                    word_embeddings[word] = coefs
    elif w2vfile.endswith('.bin'):
        word2vec = KeyedVectors.load_word2vec_format(w2vfile, binary=True, limit=1000000)
        for word in tqdm(word_to_index):
            try:
                # Lookup is lower-cased, but the result is stored under the original word.
                word_embeddings[word] = word2vec[word.lower()]
            except KeyError:
                pass
    else:
        print ('Can\'t load word embeddings.')
        # Equivalent to exit(-1), without relying on the site-provided builtin.
        raise SystemExit(-1)

    print('Found {0}/{1} word vectors.'.format(len(word_embeddings), len(word_to_index)))
    if len(word_to_index) > len(word_embeddings):
        print('Initializing remaining {} word vectors with zeros.'.format(len(word_to_index) - len(word_embeddings)))

    for word in word_to_index:
        if word not in word_embeddings:
            word_embeddings[word] = np.zeros((embedsize,))
    return word_embeddings
Example #30
Source File: word2vec.py    From nlp-journey with Apache License 2.0 5 votes vote down vote up
def load_text(self):
        """Load text-format word2vec vectors from ``self.model_path``.

        ``self.vocab_path`` is passed as the second positional argument of
        ``load_word2vec_format``. Returns None when a file is not found.
        """
        try:
            return KeyedVectors.load_word2vec_format(self.model_path, self.vocab_path, binary=False)
        except FileNotFoundError:
            return None