Python Examples of gensim.models.KeyedVectors.load_word2vec_format

Source File: embeddings.py From steppy-toolkit with MIT License

11 votes

def load_word2vec_embeddings(filepath, tokenizer, max_features, embedding_size):
    """Build an embedding matrix for the tokenizer's vocabulary.

    Rows start as samples from a normal distribution with the mean and
    standard deviation of the pre-trained vectors; rows whose word is found in
    the pre-trained model are overwritten with that word's vector.

    Args:
        filepath: Path to a binary word2vec-format vector file.
        tokenizer: Object exposing ``word_index``, a mapping word -> int index.
        max_features: Upper bound on the number of rows in the matrix.
        embedding_size: Number of columns in the matrix.

    Returns:
        Array of shape ``(min(max_features, len(word_index)), embedding_size)``.
    """
    model = KeyedVectors.load_word2vec_format(filepath, binary=True)

    # NOTE(review): `wv` / `syn0` look like legacy gensim attribute names;
    # confirm they exist in the installed gensim version.
    emb_mean, emb_std = model.wv.syn0.mean(), model.wv.syn0.std()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
    for word, i in word_index.items():
        # Guard on the actual row count, not on max_features: when the
        # vocabulary is smaller than max_features an index equal to
        # len(word_index) would otherwise raise IndexError.
        if i >= nb_words:
            continue
        try:
            embedding_vector = model[word]
        except KeyError:
            # Word has no pre-trained vector: keep the random initialisation.
            continue
        embedding_matrix[i] = embedding_vector
    return embedding_matrix

Source File: loadEmbeddings.py From acl2017-interactive_summarizer with Apache License 2.0

6 votes

def loadEmbeddings(self, filepath, data_path, vocab_size, binary_val):
        """Load word embeddings through an on-disk memmap cache.

        On first use (``<data_path>/embed.dat`` missing) the word2vec file is
        read with gensim, its vectors are copied into a ``np.double`` memmap
        and the vocabulary is written, one word per line and ordered by vocab
        index, to ``<data_path>/embed.vocab``. Afterwards the cache is opened
        read-only.

        Args:
            filepath: Path of the word2vec-format file to cache.
            data_path: Directory holding ``embed.dat`` and ``embed.vocab``.
            vocab_size: Number of rows used to open the cached matrix.
            binary_val: Passed as ``binary=`` to ``load_word2vec_format``.

        Sets:
            self.W: Read-only memmap of shape ``(vocab_size, self.embedding_size)``.
            self.vocab_dict: Mapping word -> line number in ``embed.vocab``.
        """
        embed_short = os.path.normpath("%s/embed.dat" % data_path)
        vocab_path = os.path.normpath("%s/embed.vocab" % data_path)
        if not os.path.exists(embed_short):
            print("Caching word embeddings in memmapped format...")
            print(binary_val, filepath)
            wv = KeyedVectors.load_word2vec_format("%s" % (filepath), binary=binary_val)
            cache = np.memmap(embed_short, dtype=np.double, mode='w+', shape=wv.syn0.shape)
            cache[:] = wv.syn0[:]
            # Push the data to disk before the cache is reopened below.
            cache.flush()
            # Write text through a UTF-8 codec instead of formatting encoded
            # bytes with "%s", which would emit "b'...'" reprs on Python 3 and
            # break the lookup table read back below.
            with codecs.open(vocab_path, "w", "utf-8") as vocab_file:
                for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                    vocab_file.write("%s\n" % w)
            del cache, wv

        # NOTE(review): the cache is written with wv.syn0.shape but opened with
        # (vocab_size, self.embedding_size); callers must pass matching values.
        self.W = np.memmap(embed_short, dtype=np.double, mode="r", shape=(vocab_size, self.embedding_size))
        with codecs.open(vocab_path, 'r', 'utf-8') as f:
            vocab_list = [x.strip() for x in f]
        self.vocab_dict = {w: k for k, w in enumerate(vocab_list)}

Source File: models.py From open-solution-toxic-comments with MIT License

6 votes

def _get_embedding_matrix(self, tokenizer):
        """Build an embedding matrix for the tokenizer's vocabulary.

        Rows start as samples from a normal distribution with the mean and
        standard deviation of the pre-trained vectors loaded from
        ``self.pretrained_filepath``; rows whose word is found in the
        pre-trained model are overwritten with that word's vector.

        Args:
            tokenizer: Object exposing ``word_index``, a mapping word -> int index.

        Returns:
            Array of shape
            ``(min(self.max_features, len(word_index)), self.embedding_size)``.
        """
        model = KeyedVectors.load_word2vec_format(self.pretrained_filepath, binary=True)

        # NOTE(review): `syn0` looks like a legacy gensim attribute name;
        # confirm it exists in the installed gensim version.
        emb_mean, emb_std = model.syn0.mean(), model.syn0.std()

        word_index = tokenizer.word_index
        nb_words = min(self.max_features, len(word_index))
        embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, self.embedding_size))
        for word, i in word_index.items():
            # Guard on the actual row count, not on max_features: when the
            # vocabulary is smaller than max_features an index equal to
            # len(word_index) would otherwise raise IndexError.
            if i >= nb_words:
                continue
            try:
                embedding_vector = model[word]
            except KeyError:
                # Word has no pre-trained vector: keep the random initialisation.
                continue
            embedding_matrix[i] = embedding_vector
        return embedding_matrix

Source File: preprocessing.py From R-NET-in-Keras with MIT License

6 votes

def word2vec(word2vec_path):
    """Return a lookup function mapping a word to its embedding vector.

    When ``word2vec_path`` does not exist, the GloVe file returned by
    ``get_glove_file_path()`` is converted to word2vec format at that path and
    the GloVe file is then deleted. The vectors are loaded with
    ``KeyedVectors.load_word2vec_format``.

    Args:
        word2vec_path: Location of the word2vec-format vector file.

    Returns:
        A function taking a word and returning ``model[word]``, or a zero
        vector of length ``model.vector_size`` when the lookup raises
        ``KeyError``.
    """
    already_converted = path.exists(word2vec_path)
    if not already_converted:
        source_glove = get_glove_file_path()
        print('Converting Glove to word2vec...', end='')
        # Convert first, then drop the GloVe file so only word2vec remains.
        glove2word2vec(source_glove, word2vec_path)
        os.remove(source_glove)
        print('Done')

    print('Reading word2vec data... ', end='')
    model = KeyedVectors.load_word2vec_format(word2vec_path)
    print('Done')

    def get_word_vector(word):
        try:
            vector = model[word]
        except KeyError:
            # Unknown words map to the all-zero vector.
            vector = np.zeros(model.vector_size)
        return vector

    return get_word_vector

Source File: embeddings.py From open-solution-mapping-challenge with MIT License

6 votes

def load_word2vec_embeddings(filepath, tokenizer, max_features, embedding_size):
    """Build an embedding matrix for the tokenizer's vocabulary.

    Rows start as samples from a normal distribution with the mean and
    standard deviation of the pre-trained vectors; rows whose word is found in
    the pre-trained model are overwritten with that word's vector.

    Args:
        filepath: Path to a binary word2vec-format vector file.
        tokenizer: Object exposing ``word_index``, a mapping word -> int index.
        max_features: Upper bound on the number of rows in the matrix.
        embedding_size: Number of columns in the matrix.

    Returns:
        Array of shape ``(min(max_features, len(word_index)), embedding_size)``.
    """
    model = KeyedVectors.load_word2vec_format(filepath, binary=True)

    # NOTE(review): `wv` / `syn0` look like legacy gensim attribute names;
    # confirm they exist in the installed gensim version.
    emb_mean, emb_std = model.wv.syn0.mean(), model.wv.syn0.std()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
    for word, i in word_index.items():
        # Guard on the actual row count, not on max_features: when the
        # vocabulary is smaller than max_features an index equal to
        # len(word_index) would otherwise raise IndexError.
        if i >= nb_words:
            continue
        try:
            embedding_vector = model[word]
        except KeyError:
            # Word has no pre-trained vector: keep the random initialisation.
            continue
        embedding_matrix[i] = embedding_vector
    return embedding_matrix

Source File: pick_wordvec.py From CAMP_iccv19 with Apache License 2.0

6 votes

def main(opt):
    """Export word2vec weights and a found/not-found mask for a vocabulary.

    Loads the pickled vocabulary at ``opt.vocab_path`` and the binary word2vec
    model at ``opt.embed_weight``. For every index of ``vocab.idx2word`` the
    pre-trained vector is copied when available (mask 1); otherwise a random
    normal vector with scale 0.1 is used (mask 0). Both arrays are saved as
    ``.npy`` files under ``./embed/``.

    Args:
        opt: Object with ``vocab_path`` and ``embed_weight`` attributes.
    """
    embed_dim = 300  # width of the exported matrix, as in the output file name

    # NOTE: pickle can execute arbitrary code; only load trusted vocab files.
    # The context manager closes the handle that was previously leaked.
    with open(opt.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    num = len(vocab)
    print (num)
    model = KeyedVectors.load_word2vec_format(opt.embed_weight, binary=True)

    weights_matrix = np.zeros((num, embed_dim))
    words_found = 0
    mask = np.zeros(num, dtype=int)

    for i, _ in enumerate(vocab.idx2word):
        try:
            weights_matrix[i] = model[vocab.idx2word[i]]
        except KeyError:
            # No pre-trained vector available: fall back to small random noise.
            weights_matrix[i] = np.random.normal(scale=0.1, size=(embed_dim, ))
        else:
            words_found += 1
            mask[i] = 1

    print (words_found)

    np.save("./embed/f30kword2vec300dim_3.npy", weights_matrix)
    np.save("./embed/f30kword2vecmask_3.npy", mask)

Source File: file_manage.py From resilient-community-apps with MIT License

6 votes

def get_summary_nlp(self):
        """
        Build a human-readable summary of the NLP model file ``self.filename``.

        The file is loaded as a text word2vec model. On success the summary
        lists the file name, the modification time, the vector dimension and
        the number of vectors. Any exception is reported as two lines
        appended to whatever was collected so far.

        :return: list of summary lines
        """
        lines = []
        try:
            vectors = KeyedVectors.load_word2vec_format(self.filename, binary=False)
            modified = self._get_mtime()
            dimension = vectors.vector_size
            vector_count = len(vectors.vectors)

            for banner_line in ("---------------------------",
                                "Summary for NLP model file:",
                                "---------------------------"):
                lines.append(banner_line)
            lines.append(self.FILE_NAME_OUTPUT.format(self.filename))
            lines.append(self.LAST_MODIFICATION_TIME.format(modified))
            lines.append(self.FEATURE_DIMENSION.format(dimension))
            # The NUM_SENTENCES template is filled with the vector count.
            lines.append(self.NUM_SENTENCES.format(vector_count))
            lines.append("\n")
        except Exception as err:
            lines.append("Failed to read NLP model {}.".format(self.filename))
            lines.append("Error: {}".format(err))

        return lines

Source File: np2vec.py From nlp-architect with Apache License 2.0

6 votes

def load(cls, np2vec_model_file, binary=False, word_ngrams=0, word2vec_format=True):
        """
        Load a stored np2vec model.

        Args:
            np2vec_model_file (str): path of the file holding the np2vec model
            binary (bool): whether the model file is in binary format (only
            used for the word2vec-format path)
            word_ngrams (int {1,0}): 1 when the model uses word vectors with
            subword (ngrams) information, 0 otherwise
            word2vec_format (bool): whether the model was stored in the
            original word2vec format (only used when word_ngrams is 0)

        Returns:
            the loaded model, or None (after logging an error) when
            word_ngrams is neither 0 nor 1
        """
        if word_ngrams == 0:
            if not word2vec_format:
                return KeyedVectors.load(np2vec_model_file, mmap="r")
            return KeyedVectors.load_word2vec_format(np2vec_model_file, binary=binary)
        if word_ngrams != 1:
            logger.error("invalid value for 'word_ngrams'")
            return None
        return FastText.load(np2vec_model_file)

Source File: embeddings.py From saber with MIT License

6 votes

def _prepare_embedding_index(self, binary=True):
        """Build an 'embedding index' from the pre-trained vectors at `self.filepath`.

        The index is a plain dictionary mapping every word of the loaded
        vocabulary to its vector. When the instance has a truthy `debug`
        attribute, loading is limited to 10000 vectors.

        Args:
            binary (bool): True if the embeddings are stored in C binary format, False for C
                text format. Defaults to True.

        Returns:
            Dictionary mapping words to their pre-trained embeddings.
        """
        debug_enabled = self.__dict__.get("debug", False)
        if debug_enabled:
            limit = 10000
        else:
            limit = None
        vectors = KeyedVectors.load_word2vec_format(self.filepath, binary=binary, limit=limit)

        embedding_idx = {}
        for word in vectors.vocab:
            embedding_idx[word] = vectors[word]
        return embedding_idx

Source File: embed.py From verejne.digital with Apache License 2.0

6 votes

def __init__(self, all_texts):
        """Load the Slovak word vectors and optionally index a corpus.

        Prints a few diagnostics about the loaded model, stores the embedding
        dimension in ``self.dimension`` and, when ``all_texts`` is not None,
        adds every text to the corpus and prints the corpus statistics.

        :param all_texts: iterable of texts to add to the corpus, or None
        """
        # Creating the model
        print("Reading the pretrained model for Word2VecEmbedder")
        vectors_path = '/data/verejne/datautils/embedding_data/slovak.vec'
        self.sk_model = KeyedVectors.load_word2vec_format(
            vectors_path, encoding='utf-8', unicode_errors='ignore')
        token_count = len(self.sk_model.vocab)
        print("Model contains", token_count, "tokens")
        print(self.sk_model.similarity("mesto", "mesta"))
        self.dimension = len(self.sk_model["auto"])
        # Diagnostic: membership of the same token as text and as UTF-8 bytes.
        probe = "sídlisk"
        print(probe in self.sk_model)
        print(probe.encode('utf8') in self.sk_model)

        print("Dimension of embedding of 'auto' is", self.dimension)
        # Create frequency table for words
        if all_texts is not None:
            for text in all_texts:
                self.add_text_to_corpus(text)
            self.print_corpus_stats()

Source File: matcher.py From HIT-SCIR-CoNLL2019 with Apache License 2.0

5 votes

def init_word2vec(filename, binary=False):
    """Load word2vec-format vectors into the module-level ``_word2vec``.

    Args:
        filename: Path of the word2vec-format file.
        binary: Passed as ``binary=`` to ``load_word2vec_format``.
    """
    global _word2vec
    loaded_vectors = KeyedVectors.load_word2vec_format(filename, binary=binary)
    _word2vec = loaded_vectors

Source File: contractions.py From pycontractions with BSD 3-Clause "New" or "Revised" License

5 votes

def load_models(self):
        """Resolve the keyed-vector model and initialise the grammar checker.

        Model sources are tried in this order: an already assigned
        ``self.kv_model``, a binary word2vec file at ``self.w2v_path``, then a
        download via ``self.api_key``. Afterwards a LanguageTool instance for
        ``self.lang_code`` is stored in ``self.lc_tool``.

        Raises AttributeError when no source is configured or when the
        preassigned/downloaded model has no ``wmdistance``. Exits the process
        with status 1 when ``self.w2v_path`` does not exist.
        """
        wmd_missing = "Model does not support Word Mover's Distance, must be in keyedvectors format"

        if self.kv_model is not None:
            if not hasattr(self.kv_model, 'wmdistance'):
                raise AttributeError(wmd_missing)

        elif self.w2v_path is not None:
            if os.path.exists(self.w2v_path):
                try:
                    self.kv_model = KeyedVectors.load_word2vec_format(self.w2v_path, binary=True)
                except:
                    print("Error loading Word2Vec model")
                    raise
            else:
                print("Word2Vec model not found at {}".format(self.w2v_path))
                sys.exit(1)
        elif self.api_key is not None:
            try:
                self.kv_model = api.load(self.api_key)
            except:
                print("Error downloading model {}".format(self.api_key))
                raise
            if not hasattr(self.kv_model, 'wmdistance'):
                raise AttributeError(wmd_missing)
        else:
            raise AttributeError("No model given")

        try:
            self.lc_tool = language_check.LanguageTool(self.lang_code)
        except:
            print("Error initializing LanguageTool")
            raise

Source File: corpus.py From ner with Apache License 2.0

5 votes

def load_embeddings(self, file_path):
        """Load embeddings from ``file_path`` with gensim.

        Paths ending in ``.bin`` are loaded with
        ``FastText.load_fasttext_format``; any other path is loaded with
        ``KeyedVectors.load_word2vec_format``. The gensim import happens
        lazily, inside the branch that needs it.

        :param file_path: path of the embeddings file
        :return: the loaded embeddings object
        """
        print('Loading embeddins...')
        if not file_path.endswith('.bin'):
            from gensim.models import KeyedVectors
            return KeyedVectors.load_word2vec_format(file_path)

        from gensim.models.wrappers import FastText
        return FastText.load_fasttext_format(file_path)

Source File: nlp.py From bugbug with Mozilla Public License 2.0

5 votes

def get_word_embeddings():
    """Load the ``wiki-news-300d-1M-subword.vec`` vectors and return them.

    ``init_sims(replace=True)`` is called on the loaded vectors before they
    are returned (presumably normalising them in place; see gensim docs).
    """
    vectors = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subword.vec")
    vectors.init_sims(replace=True)
    return vectors

Source File: NER_model.py From DeepPavlov with Apache License 2.0

5 votes

def load_pretrained_word_emb(self, model_path, model_name, word_dim, word2id=None, vocab_size=None):
        """Build a ``(vocab_size, word_dim)`` matrix of pre-trained embeddings.

        The matrix starts as zeros. Depending on ``model_name`` the rows given
        by ``word2id`` are filled from the model at ``model_path``:

        * ``"glove"``: text word2vec format; every word present in the model.
        * ``"baomoi"``: binary word2vec format; single punctuation characters
          use the ``<punct>`` vector, all-digit words use ``<number>``, other
          words use their own vector when present in the model vocabulary.
        * ``None``: nothing is loaded.

        Any other ``model_name`` raises ``RuntimeError``. When ``word2id`` is
        given, ``vocab_size`` is replaced by ``len(word2id)``.

        Returns:
            The embedding matrix.
        """
        if word2id is not None:
            vocab_size = len(word2id)
        word_embeddings = np.zeros(shape=(vocab_size, word_dim))
        loaded_words = 0

        # NOTE(review): with word2id=None both loading branches fail when
        # iterating word2id; confirm callers always pass it.
        if model_name == "glove":
            model = KeyedVectors.load_word2vec_format(model_path, binary=False)
            for word in word2id:
                if word not in model:
                    continue
                word_embeddings[word2id[word]] = model[word]
                loaded_words += 1
        elif model_name == "baomoi":
            model = KeyedVectors.load_word2vec_format(model_path, binary=True, unicode_errors='ignore')
            for word in word2id:
                if len(word) == 1:
                    # NOTE(review): single-character words that are not
                    # punctuation are skipped even if the model knows them.
                    if word[0] not in string.punctuation:
                        continue
                    source_key = "<punct>"
                elif word.isdigit():
                    source_key = "<number>"
                elif word in model.vocab:
                    source_key = word
                else:
                    continue
                word_embeddings[word2id[word]] = model[source_key]
                loaded_words += 1
        elif model_name is not None:
            raise RuntimeError(f'got an unexpected value for model_name: `{model_name}`')

        log.info(f"{loaded_words}/{vocab_size} words were loaded from {model_path}.")
        return word_embeddings

Source File: glove_embedder.py From DeepPavlov with Apache License 2.0

5 votes

def load(self) -> None:
        """
        Load the embeddings stored at ``self.load_path``.

        When the path exists, the model is stored in ``self.model`` and its
        vector size in ``self.dim``; otherwise a warning is logged and
        nothing is loaded.
        """
        log.info(f"[loading GloVe embeddings from `{self.load_path}`]")
        if self.load_path.exists():
            self.model = KeyedVectors.load_word2vec_format(str(self.load_path))
            self.dim = self.model.vector_size
        else:
            log.warning(f'{self.load_path} does not exist, cannot load embeddings from it!')

Source File: nlp_word2vec.py From resilient-community-apps with MIT License

5 votes

def load_model(self, file_name=None):
        """
        Load a saved model into ``self.word2vec``.

        Loading is best-effort: any failure is logged through ``self.log``
        together with its cause, and ``self.word2vec`` is left untouched.

        :param file_name: [optional] model file. Use default if None
        :return:
        """
        model_file = file_name or FileManage.DEFAULT_NLP_FILE
        try:
            self.word2vec = KeyedVectors.load_word2vec_format(model_file, binary=False)
        except Exception as e:
            # Include the exception so the log explains why loading failed;
            # previously the cause was silently discarded.
            self.log.error("Failed to load a saved model {}: {}".format(model_file, e))

Source File: feature_engineering.py From CIKM-AnalytiCup-2018 with Apache License 2.0

5 votes

def __init__(self, train_df, test_df, unlabeled_df, data_loader, normalization=True):
        """Prepare the shared state used for feature engineering.

        Stores the data frames, loads stopwords through ``data_loader``, adds
        whitespace-tokenised ``splited_spn_*`` columns to the frames, fits a
        TF-IDF vectorizer on the unique raw sentences, builds a BM25 scorer
        on the unique tokenised sentences and finally calls
        ``self.build_statistic()``.

        :param train_df: frame with ``spn_1`` and ``spn_2`` columns
        :param test_df: frame with ``spn_1`` and ``spn_2`` columns
        :param unlabeled_df: frame with a ``spn_1`` column
        :param data_loader: object providing ``load_stopwords()``
        :param normalization: stored as ``self.normalization``
        """
        self.train_df = train_df
        self.test_df = test_df
        self.unlabeled_df = unlabeled_df

        self.data_loader = data_loader
        self.stopwords = self.data_loader.load_stopwords()

        def split_on_whitespace(sentence):
            return sentence.split()

        # (frame, source column, tokenised column), in the original order.
        split_plan = (
            (self.train_df, 'spn_1', 'splited_spn_1'),
            (self.train_df, 'spn_2', 'splited_spn_2'),
            (self.unlabeled_df, 'spn_1', 'splited_spn_1'),
            (self.test_df, 'spn_1', 'splited_spn_1'),
            (self.test_df, 'spn_2', 'splited_spn_2'),
        )
        for frame, source, target in split_plan:
            frame[target] = frame[source].apply(split_on_whitespace)

        self.normalization = normalization

        tokenised_sentences = (
            self.train_df['splited_spn_1'].values.tolist()
            + self.train_df['splited_spn_2'].values.tolist()
            + self.test_df['splited_spn_1'].values.tolist()
            + self.test_df['splited_spn_2'].values.tolist()
            + self.unlabeled_df['splited_spn_1'].tolist()
        )
        docs = np.unique(np.array(tokenised_sentences)).tolist()

        raw_sentences = (
            self.train_df['spn_1'].values.tolist()
            + self.train_df['spn_2'].values.tolist()
            + self.test_df['spn_1'].values.tolist()
            + self.test_df['spn_2'].values.tolist()
            + self.unlabeled_df['spn_1'].tolist()
        )
        docs_raw = np.unique(np.array(raw_sentences)).tolist()

        self.tfidf_vectorizer = TfidfVectorizer()
        self.tfidf_vectorizer.fit(docs_raw)

        self.bm25_scorer = bm25.bm25Scorer(docs=docs)

        # Loading of the word2vec model is currently disabled:
        #print("[FE] Loading the word2vec model")
        #self.word2vec_model = KeyedVectors.load_word2vec_format(dataset_config.SPANISH_WORDVEC_PATH)
        #self.word2vec_model.init_sims(replace=True)
        #print("[FE] Loaded the word2vec mdoel")

        self.build_statistic()

Source File: models.py From caml-mimic with MIT License

5 votes

def _code_emb_init(self, code_emb, dicts):
        """Initialise ``self.U`` and ``self.final`` weights from code embeddings.

        Row ``i`` of the weight matrix is the pre-trained vector of the code
        ``dicts['ind2c'][i]``, for ``i`` in ``range(self.Y)``. Both layers get
        their own copy of the resulting tensor.

        :param code_emb: path of the word2vec-format code embedding file
        :param dicts: dictionary with an ``'ind2c'`` index -> code mapping
        """
        pretrained = KeyedVectors.load_word2vec_format(code_emb)
        init_weights = np.zeros(self.final.weight.size())
        for row in range(self.Y):
            init_weights[row] = pretrained[dicts['ind2c'][row]]
        as_tensor = torch.Tensor(init_weights)
        self.U.weight.data = as_tensor.clone()
        self.final.weight.data = as_tensor.clone()

Source File: models.py From caml-mimic with MIT License

5 votes

def _code_emb_init(self, code_emb, dicts):
        """Initialise the ``self.final`` weights from code embeddings.

        Row ``i`` of the weight matrix is the pre-trained vector of the code
        ``dicts['ind2c'][i]``, for ``i`` in ``range(self.Y)``.

        :param code_emb: path of the word2vec-format code embedding file
        :param dicts: dictionary with an ``'ind2c'`` index -> code mapping
        """
        pretrained = KeyedVectors.load_word2vec_format(code_emb)
        init_weights = np.zeros(self.final.weight.size())
        for row in range(self.Y):
            init_weights[row] = pretrained[dicts['ind2c'][row]]
        as_tensor = torch.Tensor(init_weights)
        self.final.weight.data = as_tensor.clone()

Source File: glove.py From nlp-recipes with MIT License

5 votes

def load_pretrained_vectors(
    dir_path, file_name="glove.840B.300d.txt", limit=None
):
    """ Method that loads gloVe vectors.

    The gloVe file is located through ``_maybe_download_and_extract``
    (presumably downloading it when missing), converted to word2vec format
    in a temporary file and loaded with gensim. The temporary file is removed
    even when conversion or loading fails.

    Args:
        dir_path(str): Path to the directory where gloVe vectors exist or will be
        downloaded.
        file_name(str): Name of the gloVe file.
        limit(int): Number of word vectors that is loaded from gensim. This option
        allows us to save RAM space and avoid memory errors.

    Returns:
        gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word2vectors
    """

    file_path = _maybe_download_and_extract(dir_path, file_name)
    tmp_file = get_tmpfile("test_word2vec.txt")

    try:
        # Convert GloVe format to word2vec
        glove2word2vec(file_path, tmp_file)
        return KeyedVectors.load_word2vec_format(tmp_file, limit=limit)
    finally:
        # Clean up on success and on failure; the file may not exist if the
        # conversion itself failed.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

Source File: document_embedder.py From fake-news-detection-pipeline with Apache License 2.0

5 votes

def _set_word2vec(self):
        """Load the binary word2vec model at ``self.pretrained`` into ``self._w2v``.

        Raises ValueError when ``self.pretrained`` is None.
        """
        pretrained_path = self.pretrained
        if pretrained_path is None:
            raise ValueError("Pretrained word2vec path is not specified during instantiation")
        self._w2v = KeyedVectors.load_word2vec_format(pretrained_path, binary=True)

Source File: vectorizer.py From medaCy with GNU General Public License v3.0

5 votes

def load_word_embeddings(self, embeddings_file):
        """Load word embeddings with gensim and store them in self.word_vectors.

        The file is read in binary mode when its name ends with '.bin' and in
        text mode otherwise.

        :param embeddings_file: Word embeddings file to use.
        """
        self.word_vectors = KeyedVectors.load_word2vec_format(
            embeddings_file,
            binary=embeddings_file.endswith('.bin'),
        )

Source File: config.py From BREDS with GNU Lesser General Public License v3.0

5 votes

def read_word2vec(self):
        """Load the binary word2vec model at ``self.word2vecmodelpath``.

        Stores the model in ``self.word2vec`` and its vector size in
        ``self.vec_dim``, printing progress information along the way.
        """
        print("Loading word2vec model ...\n")
        vectors = KeyedVectors.load_word2vec_format(self.word2vecmodelpath, binary=True)
        self.word2vec = vectors
        self.vec_dim = self.word2vec.vector_size
        print(self.vec_dim, "dimensions")

Source File: auxiliary_word2vec.py From ZeroShotVideoClassification with Apache License 2.0

5 votes

def load_word2vec():
    """Return the GoogleNews word2vec model, rebuilding the cache if needed.

    First tries to memory-map a previously saved model. If that fails, the
    original binary vectors are loaded, ``init_sims(replace=True)`` is
    applied and the result is saved for later runs.

    Returns:
        The loaded model.
    """
    try:
        wv_model = Word2Vec.load('/workplace/GoogleNews', mmap='r')
    except Exception:
        # Catch only ordinary errors: a bare ``except`` would also swallow
        # KeyboardInterrupt/SystemExit and then start a very large reload.
        # NOTE(review): `Word2Vec.load_word2vec_format` may be deprecated in
        # newer gensim in favour of `KeyedVectors.load_word2vec_format`
        # (confirm against the installed version).
        wv_model = Word2Vec.load_word2vec_format(
            '/workplace/GoogleNews-vectors-negative300.bin', binary=True)
        wv_model.init_sims(replace=True)
        # NOTE(review): the cache is saved to 'assets/GoogleNews' but read
        # from '/workplace/GoogleNews' above; confirm which path is intended.
        wv_model.save('assets/GoogleNews')
    return wv_model

Source File: wordembed.py From PyShortTextCategorization with MIT License

5 votes

def load_word2vec_model(path, binary=True):
    """ Read a pre-trained Word2Vec model from a word2vec-format file.

    :param path: location of the pre-trained Word2Vec model file
    :param binary: True when the file is stored in binary format (Default: True)
    :return: the loaded keyed vectors
    :type path: str
    :type binary: bool
    :rtype: gensim.models.keyedvectors.KeyedVectors
    """
    keyed_vectors = KeyedVectors.load_word2vec_format(path, binary=binary)
    return keyed_vectors

Source File: wordembed.py From PyShortTextCategorization with MIT License

5 votes

def load_poincare_model(path, word2vec_format=True, binary=False):
    """ Read a pre-trained Poincare embedding model.

    With ``word2vec_format`` the file is read as word2vec-format keyed
    vectors; otherwise a saved PoincareModel is loaded and its ``kv``
    attribute is returned.

    :param path: location of the pre-trained Poincare embedding model file
    :param word2vec_format: whether to load from word2vec format (default: True)
    :param binary: binary format, used only with word2vec format (default: False)
    :return: a pre-trained Poincare embedding model
    :type path: str
    :type word2vec_format: bool
    :type binary: bool
    :rtype: gensim.models.poincare.PoincareKeyedVectors
    """
    if not word2vec_format:
        return PoincareModel.load(path).kv
    return PoincareKeyedVectors.load_word2vec_format(path, binary=binary)

Source File: utils.py From Text-Classification-Models-Pytorch with MIT License

5 votes

def get_word_embeddings(w2vfile, word_to_index, embedsize=300):
    '''
    For each word in our vocabulary, get the word2vec encoding of the word.
    Words without a pre-trained vector are mapped to a zero vector.
    Inputs:
        w2vfile (string) : Path to the file containing (pre-trained) word embeddings.
            '.txt' files are parsed as space-separated text, '.bin' files are
            loaded with gensim (first 1000000 vectors, lower-cased lookup);
            any other extension prints an error and exits with status -1.
        word_to_index : Container of vocabulary words (membership and iteration are used)
        embedsize (int) : Length of each word vector (used for the zero vectors)
    Returns:
        word_embeddings : Dictionary mapping each word to corresponding embedding
    '''

    word_embeddings = {}
    if w2vfile.endswith('.txt'):
        # The context manager closes the file even if parsing raises.
        with open(w2vfile) as f:
            for line in tqdm(f):
                # Strip the line ending so it does not stick to the last value.
                values = line.rstrip().split(" ")
                word = values[0]
                # Only parse the floats for words we actually keep.
                if word in word_to_index:
                    word_embeddings[word] = np.asarray(values[1:], dtype='float32')
    elif w2vfile.endswith('.bin'):
        word2vec = KeyedVectors.load_word2vec_format(w2vfile, binary=True, limit=1000000)
        for word in tqdm(word_to_index):
            try:
                word_embeddings[word] = word2vec[word.lower()]
            except KeyError:
                # Missing words are zero-filled below.
                pass
    else:
        print ('Can\'t load word embeddings.')
        exit(-1)

    print('Found {0}/{1} word vectors.'.format(len(word_embeddings), len(word_to_index)))
    if len(word_to_index) > len(word_embeddings):
        print('Initializing remaining {} word vectors with zeros.'.format(len(word_to_index) - len(word_embeddings)))

    for word in word_to_index:
        if word not in word_embeddings:
            word_embeddings[word] = np.zeros((embedsize,))
    return word_embeddings

Source File: utils.py From Text-Classification-Models-Pytorch with MIT License

5 votes

def get_word_embeddings(w2vfile, word_to_index, embedsize=300):
    '''
    For each word in our vocabulary, get the word2vec encoding of the word.
    Words without a pre-trained vector are mapped to a zero vector.
    Inputs:
        w2vfile (string) : Path to the file containing (pre-trained) word embeddings.
            '.txt' files are parsed as space-separated text, '.bin' files are
            loaded with gensim (first 1000000 vectors, lower-cased lookup);
            any other extension prints an error and exits with status -1.
        word_to_index : Container of vocabulary words (membership and iteration are used)
        embedsize (int) : Length of each word vector (used for the zero vectors)
    Returns:
        word_embeddings : Dictionary mapping each word to corresponding embedding
    '''

    word_embeddings = {}
    if w2vfile.endswith('.txt'):
        # The context manager closes the file even if parsing raises.
        with open(w2vfile) as f:
            for line in tqdm(f):
                # Strip the line ending so it does not stick to the last value.
                values = line.rstrip().split(" ")
                word = values[0]
                # Only parse the floats for words we actually keep.
                if word in word_to_index:
                    word_embeddings[word] = np.asarray(values[1:], dtype='float32')
    elif w2vfile.endswith('.bin'):
        word2vec = KeyedVectors.load_word2vec_format(w2vfile, binary=True, limit=1000000)
        for word in tqdm(word_to_index):
            try:
                word_embeddings[word] = word2vec[word.lower()]
            except KeyError:
                # Missing words are zero-filled below.
                pass
    else:
        print ('Can\'t load word embeddings.')
        exit(-1)

    print('Found {0}/{1} word vectors.'.format(len(word_embeddings), len(word_to_index)))
    if len(word_to_index) > len(word_embeddings):
        print('Initializing remaining {} word vectors with zeros.'.format(len(word_to_index) - len(word_embeddings)))

    for word in word_to_index:
        if word not in word_embeddings:
            word_embeddings[word] = np.zeros((embedsize,))
    return word_embeddings

Source File: word2vec.py From nlp-journey with Apache License 2.0

5 votes

def load_text(self):
        """Load the text-format word2vec model at ``self.model_path``.

        ``self.vocab_path`` is passed as the second positional argument of
        ``load_word2vec_format``.

        :return: the loaded model, or None when FileNotFoundError is raised
        """
        try:
            return KeyedVectors.load_word2vec_format(self.model_path, self.vocab_path, binary=False)
        except FileNotFoundError:
            return None

Python gensim.models.KeyedVectors.load_word2vec_format() Examples