Python gensim.models.Word2Vec() Examples
The following are 30 code examples of gensim.models.Word2Vec(), drawn from open-source projects; the originating project, source file, and license are noted above each example. You may also want to check out the other available functions and classes of the gensim.models module.
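Note that the examples below use the pre-4.0 gensim keyword names (size, iter) and the old vocabulary attributes. If you run them against gensim 4.0 or later (an assumption about your environment; the original projects do not state version pins here), the main renames are shown in this minimal sketch with a toy corpus:

    from gensim.models import Word2Vec

    sentences = [["hello", "world"], ["word2vec", "example"]]  # illustrative toy corpus

    # gensim 3.x style, as used throughout the examples below:
    # model = Word2Vec(sentences, size=100, window=5, min_count=1, sg=1, iter=5)

    # gensim 4.x equivalent: size -> vector_size, iter -> epochs
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1, epochs=5)

    # Further 4.x changes: model[word] -> model.wv[word],
    # model.wv.vocab -> model.wv.key_to_index, model.wv.index2word -> model.wv.index_to_key
    vector = model.wv["word2vec"]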
Example #1
Source File: build_w2v.py From text-classifier with Apache License 2.0 | 7 votes |
def build(train_seg_path, test_seg_path, out_path=None, sentence_path='',
          w2v_bin_path="w2v.bin", min_count=1, col_sep='\t'):
    sentences = extract_sentence(train_seg_path, test_seg_path, col_sep=col_sep)
    save_sentence(sentences, sentence_path)
    print('train w2v model...')
    # train model
    w2v = Word2Vec(sg=1, sentences=LineSentence(sentence_path),
                   size=256, window=5, min_count=min_count, iter=40)
    w2v.wv.save_word2vec_format(w2v_bin_path, binary=True)
    print("save %s ok." % w2v_bin_path)
    # test
    # sim = w2v.wv.similarity('大', '小')
    # print('大 vs 小 similarity score:', sim)
    # load model
    model = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
    word_dict = {}
    for word in model.vocab:
        word_dict[word] = model[word]
    save_pkl(word_dict, out_path, overwrite=True)
Example #2
Source File: node2vec.py From GraphEmbedding with MIT License | 6 votes |
def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):

    kwargs["sentences"] = self.sentences
    kwargs["min_count"] = kwargs.get("min_count", 0)
    kwargs["size"] = embed_size
    kwargs["sg"] = 1
    kwargs["hs"] = 0  # node2vec does not use Hierarchical Softmax
    kwargs["workers"] = workers
    kwargs["window"] = window_size
    kwargs["iter"] = iter

    print("Learning embedding vectors...")
    model = Word2Vec(**kwargs)
    print("Learning embedding vectors done!")

    self.w2v_model = model
    return model
Example #3
Source File: pre_train.py From embeddings with Apache License 2.0 | 6 votes |
def train_word2vec(input_file, output_file, skipgram, loss, size, epochs):
    """
    train_word2vec(args**) -> Takes the input file, the output file and the model
    hyperparameters as arguments and trains the model accordingly.
    The model is saved at the output location.

    Arguments
    ---------
    input_file : Input pre-processed wiki dump
    output_file : Output directory to save the model.
    skipgram : Layers of the model (0 - CBOW, 1 - Skipgram)
    loss : Loss Function (0 - Negative Sampling, 1 - Hierarchical Softmax)
    size : Embedding size (100 ~ 300)
    epochs : Number of epochs
    """
    sentence = LineSentence(input_file)
    model = Word2Vec(sentence, sg=skipgram, hs=loss, size=size, alpha=0.05,
                     window=5, min_count=5, workers=3, iter=epochs)
    model.save(output_file)
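A minimal usage sketch of train_word2vec(); the file paths are illustrative assumptions, not part of the original project, and the hyperparameter values simply follow the docstring above:

    # Illustrative call only; the paths are assumed, not taken from the project.
    train_word2vec(
        input_file="wiki_preprocessed.txt",  # pre-processed wiki dump (assumed path)
        output_file="wiki_w2v.model",        # where the model is saved (assumed path)
        skipgram=1,                          # 1 = skip-gram, 0 = CBOW
        loss=0,                              # 0 = negative sampling, 1 = hierarchical softmax
        size=300,                            # embedding size in the 100-300 range
        epochs=5,
    )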
Example #4
Source File: create_word2vec.py From dutchembeddings with GNU General Public License v2.0 | 6 votes |
def create(basedir, num_workers=12, size=320, threshold=5):
    """
    Creates a word2vec model using the Gensim word2vec implementation.

    :param basedir: the dir from which to get the documents.
    :param num_workers: the number of workers to use for training word2vec
    :param size: the size of the resulting vectors.
    :param threshold: the frequency threshold.
    :return: the model.
    """
    logging.basicConfig(level=logging.INFO)

    sentences = SentenceIter(root=basedir)
    model = Word2Vec(sentences=sentences,
                     sg=True,
                     size=size,
                     workers=num_workers,
                     min_count=threshold,
                     window=11,
                     negative=15)
    model.save_word2vec_format("{0}-{1}.wordvecs", "{0}-{1}.vocab")
    return model
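A hedged usage sketch of create(); the directory name is an assumption, and SentenceIter is expected to yield tokenised sentences from the documents under that directory:

    # Illustrative call; "corpus_dir" is an assumed path.
    model = create(basedir="corpus_dir", num_workers=4, size=320, threshold=5)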
Example #5
Source File: diffusion_2_vec.py From diff2vec with GNU General Public License v3.0 | 6 votes |
def learn_pooled_embeddings(walks, counts, args):
    """
    Method to learn an embedding given the sequences and arguments.

    :param walks: Linear vertex sequences.
    :param counts: Number of nodes.
    :param args: Arguments.
    """
    model = Word2Vec(walks,
                     size=args.dimensions,
                     window=args.window_size,
                     min_count=1,
                     sg=1,
                     workers=args.workers,
                     iter=args.iter,
                     alpha=args.alpha)
    save_embedding(args, model, counts)
Example #6
Source File: metapath2vec.py From cogdl with MIT License | 6 votes |
def train(self, G, node_type):
    self.G = G
    self.node_type = [str(a) for a in node_type]
    walks = self._simulate_walks(self.walk_length, self.walk_num, self.schema)
    walks = [[str(node) for node in walk] for walk in walks]
    model = Word2Vec(
        walks,
        size=self.dimension,
        window=self.window_size,
        min_count=0,
        sg=1,
        workers=self.worker,
        iter=self.iteration,
    )
    id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
    embeddings = np.asarray([model.wv[str(id2node[i])] for i in range(len(id2node))])
    return embeddings
Example #7
Source File: lex_sem_ft.py From DL-text with MIT License | 6 votes |
def sum_trigram(sent, model):
    sent = sent.split()
    first = True
    second = True
    tot = 0
    for i in range(len(sent)):
        try:
            if first:
                tot += model[None, None][sent[i]]
                first = False
            elif second:
                tot += model[None, sent[i-1]][sent[i]]
                second = False
            else:
                tot += model[sent[i-2], sent[i-1]][sent[i]]
        except:
            continue
    return tot

# Word2Vec Training (Returns Vector):
Example #8
Source File: node2vec.py From cogdl with MIT License | 6 votes |
def train(self, G):
    self.G = G
    is_directed = nx.is_directed(self.G)
    for i, j in G.edges():
        G[i][j]["weight"] = G[i][j].get("weight", 1.0)
        if not is_directed:
            G[j][i]["weight"] = G[j][i].get("weight", 1.0)
    self._preprocess_transition_probs()
    walks = self._simulate_walks(self.walk_num, self.walk_length)
    walks = [[str(node) for node in walk] for walk in walks]
    model = Word2Vec(
        walks,
        size=self.dimension,
        window=self.window_size,
        min_count=0,
        sg=1,
        workers=self.worker,
        iter=self.iteration,
    )
    id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
    self.embeddings = np.asarray(
        [model.wv[str(id2node[i])] for i in range(len(id2node))]
    )
    return self.embeddings
Example #9
Source File: node2vec.py From entity2vec with Apache License 2.0 | 6 votes |
def learn_embeddings(self, output, output_format='binary'):
    """
    Learn embeddings by optimizing the Skipgram objective using SGD.
    """
    self._simulate_walks()  # simulate random walks

    model = Word2Vec(self._walks,
                     size=self.dimensions,
                     window=self.window_size,
                     min_count=0,
                     workers=self.workers,
                     iter=self.iter,
                     negative=25,
                     sg=1)

    print("defined model using w2v")

    is_binary = output_format != 'text'
    model.wv.save_word2vec_format(output, binary=is_binary)

    actual_format = 'text' if output_format == 'text' else 'binary'
    print("saved model in word2vec %s format" % actual_format)

    return
Example #10
Source File: deepwalk.py From cogdl with MIT License | 6 votes |
def train(self, G):
    self.G = G
    walks = self._simulate_walks(self.walk_length, self.walk_num)
    walks = [[str(node) for node in walk] for walk in walks]
    model = Word2Vec(
        walks,
        size=self.dimension,
        window=self.window_size,
        min_count=0,
        sg=1,
        workers=self.worker,
        iter=self.iteration,
    )
    id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
    embeddings = np.asarray([model.wv[str(id2node[i])] for i in range(len(id2node))])
    return embeddings
Example #11
Source File: keyword_word2vec.py From nlg-yongzhuo with MIT License | 6 votes |
def train_word2vec_by_word():
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running")

    inp = "cut_zhwiki_wiki_parse.txt"
    outp1 = "w2v_model_wiki.model"
    outp2 = "w2v_model_wiki_word.vec"
    print(multiprocessing.cpu_count())
    model = Word2Vec(LineSentence(inp), size=300, window=10,
                     # skip-gram with hierarchical softmax is used here
                     min_count=1, sg=1, hs=1, iter=10,
                     workers=multiprocessing.cpu_count())
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
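As an illustrative follow-up (not part of the original project), the text-format vectors saved above can be reloaded with gensim's KeyedVectors and queried for nearest neighbours; the query word is an assumption about the trained vocabulary:

    from gensim.models import KeyedVectors

    # Load the plain-text vectors written by train_word2vec_by_word() above.
    wv = KeyedVectors.load_word2vec_format("w2v_model_wiki_word.vec", binary=False)
    print(wv.most_similar("中国", topn=5))  # example query word (assumed to be in the vocabulary)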
Example #12
Source File: walkers.py From Splitter with GNU General Public License v3.0 | 6 votes |
def learn_base_embedding(self):
    """
    Learning an embedding of nodes in the base graph.

    :return self.embedding: Embedding of nodes in the latent space.
    """
    self.paths = [[str(node) for node in walk] for walk in self.paths]

    model = Word2Vec(self.paths,
                     size=self.args.dimensions,
                     window=self.args.window_size,
                     min_count=1,
                     sg=1,
                     workers=self.args.workers,
                     iter=1)

    self.embedding = np.array([list(model[str(n)]) for n in self.graph.nodes()])
    return self.embedding
Example #13
Source File: node2vec.py From entity2rec with Apache License 2.0 | 6 votes |
def learn_embeddings(self, output):
    """
    Learn embeddings by optimizing the Skipgram objective using SGD.
    """
    walks = self._simulate_walks()  # simulate random walks

    model = Word2Vec(walks,
                     size=self.dimensions,
                     window=self.window_size,
                     min_count=0,
                     workers=self.workers,
                     iter=self.iter,
                     negative=25,
                     sg=1)

    print("defined model using w2v")

    model.wv.save_word2vec_format(output, binary=True)

    # free memory
    del walks
    self.alias_nodes = None
    self.alias_edges = None
    self.G = None

    print("saved model in word2vec binary format")

    return
Example #14
Source File: deepwalk.py From GraphEmbedding with MIT License | 6 votes |
def train(self, embed_size=128, window_size=5, workers=3, iter=5, **kwargs):

    kwargs["sentences"] = self.sentences
    kwargs["min_count"] = kwargs.get("min_count", 0)
    kwargs["size"] = embed_size
    kwargs["sg"] = 1  # skip-gram
    kwargs["hs"] = 1  # DeepWalk uses Hierarchical Softmax
    kwargs["workers"] = workers
    kwargs["window"] = window_size
    kwargs["iter"] = iter

    print("Learning embedding vectors...")
    model = Word2Vec(**kwargs)
    print("Learning embedding vectors done!")

    self.w2v_model = model
    return model
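Stripped of the kwargs plumbing, the call above boils down to a direct Word2Vec invocation along these lines; this is a sketch, and walk_corpus is an assumed stand-in for self.sentences (the random-walk corpus):

    # Equivalent direct call (sketch): skip-gram with hierarchical softmax, as DeepWalk uses.
    model = Word2Vec(sentences=walk_corpus, size=128, window=5, min_count=0,
                     sg=1, hs=1, workers=3, iter=5)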
Example #15
Source File: graph2vec.py From PyTorchText with MIT License | 6 votes |
def train_save(self, list_csv):
    sentences = MySentences(list_csv)
    num_features = 256
    min_word_count = 1
    num_workers = 20
    context = 5
    epoch = 20
    sample = 1e-5

    model = Word2Vec(
        sentences,
        size=num_features,
        min_count=min_word_count,
        workers=num_workers,
        sample=sample,
        window=context,
        iter=epoch,
    )
    # model.save(model_fn)
    return model
Example #16
Source File: lex_sem_ft.py From DeepLearn with MIT License | 6 votes |
def sum_trigram(sent, model):
    sent = sent.split()
    first = True
    second = True
    tot = 0
    for i in range(len(sent)):
        try:
            if first:
                tot += model[None, None][sent[i]]
                first = False
            elif second:
                tot += model[None, sent[i-1]][sent[i]]
                second = False
            else:
                tot += model[sent[i-2], sent[i-1]][sent[i]]
        except:
            continue
    return tot

# Word2Vec Training (Returns Vector):
Example #17
Source File: deepwalk.py From CogDL-TensorFlow with MIT License | 6 votes |
def train(self, G):
    self.G = G
    walks = self._simulate_walks(self.walk_length, self.walk_num)
    walks = [[str(node) for node in walk] for walk in walks]
    model = Word2Vec(
        walks,
        size=self.dimension,
        window=self.window_size,
        min_count=0,
        sg=1,
        workers=self.worker,
        iter=self.iteration,
    )
    id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
    embeddings = np.asarray([model[str(id2node[i])] for i in range(len(id2node))])
    return embeddings
Example #18
Source File: baseline.py From HARP with MIT License | 6 votes |
def skipgram_baseline(graph, **kwargs):
    scale = kwargs.get('scale', -1)
    representation_size = kwargs.get('representation_size', 128)

    if scale == 1:
        edges, weights = graph.get_edges()
    else:
        path_length = kwargs.get('path_length', 40)
        num_paths = kwargs.get('num_paths', 80)
        output = kwargs.get('output', 'default')
        edges = graph_coarsening.build_deepwalk_corpus(graph, num_paths, path_length, output)

    if kwargs['hs'] == 0:
        print('Training the Negative Sampling Model...')
        model = Word2Vec(edges, size=representation_size, window=kwargs['window_size'],
                         min_count=0, sg=1, hs=0, iter=kwargs['iter_count'],
                         negative=5, workers=20)
    else:
        print('Training the Hierarchical Softmax Model...')
        model = Word2Vec(edges, size=kwargs['representation_size'], window=kwargs['window_size'],
                         min_count=0, sg=1, hs=1, iter=kwargs['iter_count'], workers=20)

    print('Finish training the Skip-gram model.')
    return model
Example #19
Source File: node2vec.py From CogDL-TensorFlow with MIT License | 6 votes |
def train(self, G):
    self.G = G
    is_directed = nx.is_directed(self.G)
    for i, j in G.edges():
        G[i][j]["weight"] = G[i][j].get("weight", 1.0)
        if not is_directed:
            G[j][i]["weight"] = G[j][i].get("weight", 1.0)
    self._preprocess_transition_probs()
    walks = self._simulate_walks(self.walk_num, self.walk_length)
    walks = [[str(node) for node in walk] for walk in walks]
    model = Word2Vec(
        walks,
        size=self.dimension,
        window=self.window_size,
        min_count=0,
        sg=1,
        workers=self.worker,
        iter=self.iteration,
    )
    id2node = dict([(vid, node) for vid, node in enumerate(G.nodes())])
    self.embeddings = np.asarray(
        [model[str(id2node[i])] for i in range(len(id2node))]
    )
    return self.embeddings
Example #20
Source File: test_average.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 6 votes |
def test_cy_equal_np_w2v_random(self):
    w2v = Word2Vec(min_count=1, size=DIM)  # Random initialization
    w2v.build_vocab(SENTENCES)

    m1 = Average(w2v)
    m1.prep.prepare_vectors(
        sv=m1.sv, total_sentences=len(self.sentences), update=False
    )
    m1._pre_train_calls()
    mem1 = m1._get_thread_working_mem()
    o1 = train_average_np(m1, self.sentences, m1.sv.vectors, mem1)

    m2 = Average(w2v)
    m2.prep.prepare_vectors(
        sv=m2.sv, total_sentences=len(self.sentences), update=False
    )
    m2._pre_train_calls()
    mem2 = m2._get_thread_working_mem()

    from fse.models.average_inner import train_average_cy
    o2 = train_average_cy(m2, self.sentences, m2.sv.vectors, mem2)

    self.assertTrue(np.allclose(m1.sv.vectors, m2.sv.vectors, atol=1e-6))
Example #21
Source File: postprocessing.py From vec4ir with MIT License | 6 votes |
def uptrain(corpus, model_path=None, binary=True, lockf=0.0,
            min_count=1, size=300, **word2vec_params):
    wv = Word2Vec(min_count=min_count, size=size, **word2vec_params)

    print("Building vocabulary...")
    wv.build_vocab(corpus)
    print("Found %d distinct words." % len(wv.index2word))

    if model_path is not None:
        print("Intersecting with", model_path, "...")
        wv.intersect_word2vec_format(model_path, binary=binary, lockf=lockf)
        print("Intersected vectors locked with", lockf)

    total_examples = len(corpus)
    print("Training on %d documents..." % total_examples)
    wv.train(corpus, total_examples=total_examples)

    return wv
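A hedged usage sketch of uptrain(); the pretrained model path is an assumption, and the corpus must be a list of token lists because len(corpus) is passed as total_examples. Setting lockf=1.0 leaves the intersected vectors free to be updated during training, whereas the default 0.0 keeps them frozen:

    # Illustrative only; "pretrained_vectors.bin" is an assumed path.
    corpus = [["the", "quick", "brown", "fox"], ["lazy", "dogs", "sleep"]]
    model = uptrain(corpus, model_path="pretrained_vectors.bin", binary=True,
                    lockf=1.0, min_count=1, size=300)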
Example #22
Source File: test_sif.py From Fast_Sentence_Embeddings with GNU General Public License v3.0 | 5 votes |
def test_broken_vocab(self):
    w2v = Word2Vec(min_count=1, size=DIM)
    w2v.build_vocab([l.split() for l in open(CORPUS, "r")])
    for k in w2v.wv.vocab:
        w2v.wv.vocab[k].count = np.nan

    model = SIF(w2v)
    with self.assertRaises(RuntimeError):
        model.train(self.sentences)
Example #23
Source File: pretrain_embedding.py From tf_CFO with MIT License | 5 votes |
def train(data_path, save_dir):
    sentences = []
    data_files = [os.path.join(os.path.dirname(data_path), file) for file in os.listdir(data_path)]
    for data_file in data_files:
        with open(data_file, 'r') as reader:
            for line in reader:
                question = line.strip().split('\t')[-1].lower()
                sentences.append(nltk.word_tokenize(question))

    model = Word2Vec(sentences, size=300, min_count=1, window=5, sg=1, iter=10)
    weights = model.wv.syn0
    d = dict([(k, v.index) for k, v in model.wv.vocab.items()])

    embeddings_index = {}
    for item in d:
        embeddings_index[item] = weights[d[item], :]
    pickle_save(embeddings_index, os.path.join(save_dir, 'fb_word2vec_300d.pkl'))

    word2idx = {}
    for idx, word in enumerate(embeddings_index.keys()):
        word2idx[word] = idx + 1  # index 0 refers to unknown token
    pickle_save(word2idx, os.path.join(save_dir, 'fb_word2idx.pkl'))

    char2idx = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9,
                'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17,
                'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25,
                'z': 26, '0': 27, '1': 28, '2': 29, '3': 30, '4': 31, '5': 32, '6': 33,
                '7': 34, '8': 35, '9': 36}
    pickle_save(char2idx, os.path.join(save_dir, 'fb_char2idx.pkl'))
Example #24
Source File: __main__.py From GraphEmbeddingRecommendationSystem with MIT License | 5 votes |
def process(args):
    # Create a graph from the training set
    nodedict = graph.records_to_graph()
    # print(args)

    # Build the model using DeepWalk and Word2Vec
    G = graph.load_adjacencylist("out.adj", undirected=True)
    # YOUR CODE HERE
    # print(args.number_walks)
    # walk = graph.build_deepwalk_corpus(G, 2, 4, alpha=0, rand=random.Random(0))
    walk = graph.build_deepwalk_corpus(G, args.number_walks, args.walk_length,
                                       alpha=0, rand=random.Random(0))
    print len(walk)
    model = Word2Vec(walk, size=args.representation_size, window=args.window_size,
                     min_count=0, workers=args.workers)
    print model
    # Namespace(csv_to_graph=True, loo=True, max_memory_data_size=1000000000, number_walks=10, representation_size=64, seed=0, walk_length=40, window_size=5, workers=1)

    # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        fin.next()
        groundtruth = [line.strip().split("\t")[:3] for line in fin]  # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    # print(groundtruth)
    pr = [predict_rating(model, nodedict, "u" + g[0], "m" + g[1]) for g in groundtruth]
    # print(pr)
    print "MSE = %f" % mean_squared_error(tr, pr)
    print "accuracy = %f" % accuracy_score(tr, pr)
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print cm
Example #25
Source File: link_prediction.py From node2vec_linkprediction with MIT License | 5 votes |
def learn_embeddings(self, walks, dimensions, window_size=10, niter=5):
    '''
    Learn embeddings by optimizing the Skipgram objective using SGD.
    '''
    # TODO: Python27 only
    walks = [map(str, walk) for walk in walks]
    model = Word2Vec(walks, size=dimensions, window=window_size, min_count=0,
                     sg=1, workers=self.workers, iter=niter)
    self.wvecs = model.wv
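The TODO above exists because map() returns a one-shot iterator on Python 3, so each walk would be exhausted after the vocabulary-building pass. If a Python 3 port were needed, the usual drop-in replacement for that line would be a list comprehension (a suggestion, not part of the original project):

    # Python-3-safe equivalent of the map(...) line above:
    walks = [[str(node) for node in walk] for walk in walks]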
Example #26
Source File: test_query_expansion.py From vec4ir with MIT License | 5 votes |
def test_embedded_query_expansion():
    model = Word2Vec([doc.split() for doc in DOCUMENTS], iter=1, min_count=1)
    m = 2
    expansion = EmbeddedQueryExpansion(model.wv, m=m)
    expansion.fit(DOCUMENTS)
    query = "surf"
    expanded_query = expansion.transform(query)
    # surf => surf surf Surfing
    print(query, expanded_query, sep='=>')
    assert len(expanded_query.split()) == len(query.split()) + m
Example #27
Source File: test_query_expansion.py From vec4ir with MIT License | 5 votes |
def test_centroid_expansion():
    model = Word2Vec([doc.split() for doc in DOCUMENTS], iter=1, min_count=1)
    m = 2
    expansion = CentroidExpansion(model.wv, m=m)
    expansion.fit(DOCUMENTS)
    query = "surf"
    expanded_query = expansion.transform(query)
    # surf => surf surf Surfing
    print(query, expanded_query, sep='=>')
    assert len(expanded_query.split()) == len(query.split()) + m
Example #28
Source File: test_vec4ir.py From vec4ir with MIT License | 5 votes |
def test_combined():
    model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
    wcd = WordCentroidDistance(model.wv)
    tfidf = Tfidf()

    wcd.fit(documents)
    # they can operate on different fields
    tfidf.fit(['fox', 'scientists'])
    match_op = Matching().fit(documents)

    combined = wcd + tfidf ** 2

    retrieval = Retrieval(combined, matching=match_op, labels=[7, 42])
    result = retrieval.query('fox')
    assert result[0] == 7
    result = retrieval.query('scientists')
    assert result[0] == 42

# # PYEMD is required
# def test_wordmovers():
#     model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
#     match_op = Matching()
#     wmd = WordMoversDistance(model.wv)
#     retrieval = Retrieval(wmd, matching=match_op)
#     retrieval.fit(documents)
#     result = retrieval.query('dog')
#     assert result[0] == 0
Example #29
Source File: test_vec4ir.py From vec4ir with MIT License | 5 votes |
def test_word2vec():
    model = Word2Vec([doc.split() for doc in documents], iter=1, min_count=1)
    match_op = Matching()
    with pytest.raises(ValueError):
        wcd = WordCentroidDistance(model)
    wcd = WordCentroidDistance(model.wv)
    retrieval = Retrieval(wcd, matching=match_op)
    retrieval.fit(documents)
    result = retrieval.query('dog')
    assert result[0] == 0
Example #30
Source File: train.py From DeepNews with Apache License 2.0 | 5 votes |
def train_word_2_vec(self, model_save_file_name='../../temp_results/word2vec_hindi.txt'):
    model = Word2Vec(LineSentence(self.raw_file_name), size=300,
                     workers=multiprocessing.cpu_count())
    model.wv.save_word2vec_format(model_save_file_name, binary=False)