Python gensim.models.word2vec() Examples
The following are 16 code examples of gensim.models.word2vec(), collected from open-source projects. Each example is preceded by its source file, originating project, and license. You may also want to check out all available functions/classes of the module gensim.models.
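Before diving in, a minimal end-to-end sketch may help orient the API the examples share. This is a hypothetical snippet written against the gensim 3.x API used throughout the examples below (the corpus path and model name are placeholders):

from gensim.models import word2vec

# train on a whitespace-tokenized corpus file, one sentence per line
sentences = word2vec.LineSentence('corpus.txt')  # placeholder path
model = word2vec.Word2Vec(sentences, size=100, min_count=5, workers=4)

# persist in gensim's native format so Word2Vec.load() can restore it
model.save('my_word2vec.model')  # placeholder name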
Example #1
Source File: rulebase.py From Chatbot with GNU General Public License v3.0

def load_model(self, path):
    """
    Load a trained word2vec model (binary format only).

    Args:
        path: the path of the model.
    """
    try:
        # current loading method
        self.model = models.Word2Vec.load(path)
    except FileNotFoundError as file_not_found_err:
        print("[Gensim] FileNotFoundError", file_not_found_err)
        exit()
    except UnicodeDecodeError as unicode_decode_err:
        print("[Gensim] UnicodeDecodeError", unicode_decode_err)
        # old loading method
        self.model = models.KeyedVectors.load_word2vec_format(path, binary=True)
    except Exception as ex:
        print("[Gensim] Exception", ex)
        exit()
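For context, the fallback above distinguishes gensim's native save format from the original word2vec C binary format. A hypothetical standalone sketch of the two loading paths (file names are placeholders):

from gensim import models

# gensim-native format, written by model.save(...)
model = models.Word2Vec.load('my_word2vec.model')

# original word2vec C binary format (e.g. pre-trained GoogleNews vectors)
kv = models.KeyedVectors.load_word2vec_format('vectors.bin', binary=True)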
Example #2
Source File: data_helpers.py From Text-Pairs-Relation-Classification with Apache License 2.0

def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file based on the corpus file (used for the embedding visualization later).

    Args:
        word2vec_file: The word2vec file
        output_file: The metadata file path
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = gensim.models.Word2Vec.load(word2vec_file)
    word2idx = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    word2idx_sorted = [(k, word2idx[k]) for k in sorted(word2idx, key=word2idx.get, reverse=False)]
    with open(output_file, 'w+') as fout:
        for word in word2idx_sorted:
            if word[0] is None:
                print("[Warning] Empty line; it should be replaced by something else, or it will trigger a TensorBoard bug.")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word[0] + '\n')
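A hypothetical call (paths are placeholders). TensorBoard's embedding projector expects exactly one label per line in a single-column metadata file, which is why the function writes a '<Empty Line>' placeholder instead of skipping the entry:

create_metadata_file('my_word2vec.model', 'metadata.tsv')  # placeholder paths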
Example #3
Source File: data_helpers.py From Text-Pairs-Relation-Classification with Apache License 2.0

def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        The word2vec model matrix
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = gensim.models.Word2Vec.load(word2vec_file)
    vocab_size = model.wv.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            embedding_matrix[value] = model.wv[key]  # model[key] is deprecated; model.wv[key] is equivalent
    return vocab_size, embedding_size, embedding_matrix
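A hypothetical usage sketch (the path is a placeholder): the returned matrix is row-aligned with the vocabulary, so row i holds the vector of the word whose vocab index is i, ready to initialize an embedding layer.

vocab_size, embedding_size, embedding_matrix = load_word2vec_matrix('my_word2vec.model')
print(embedding_matrix.shape)  # (vocab_size, embedding_size)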
Example #4
Source File: data_helpers.py From Text-Pairs-Relation-Classification with Apache License 2.0

def load_data_and_labels(data_file, word2vec_file):
    """
    Load research data from files, split the data into words and generate labels.
    Return split sentences, labels and the max sentence length of the research data.

    Args:
        data_file: The research data
        word2vec_file: The word2vec model file
    Returns:
        The class Data
    """
    # Load word2vec file
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = word2vec.Word2Vec.load(word2vec_file)
    # Load data from files and split by words
    data = data_word2vec(input_file=data_file, word2vec_model=model)
    # plot_seq_len(data_file, data)
    return data
Example #5
Source File: data_helpers.py From Multi-Label-Text-Classification with Apache License 2.0

def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file based on the corpus file (used for the embedding visualization later).

    Args:
        word2vec_file: The word2vec file
        output_file: The metadata file path
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = gensim.models.Word2Vec.load(word2vec_file)
    word2idx = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    word2idx_sorted = [(k, word2idx[k]) for k in sorted(word2idx, key=word2idx.get, reverse=False)]
    with open(output_file, 'w+') as fout:
        for word in word2idx_sorted:
            if word[0] is None:
                print("[Warning] Empty line; it should be replaced by something else, or it will trigger a TensorBoard bug.")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word[0] + '\n')
Example #6
Source File: data_helpers.py From Multi-Label-Text-Classification with Apache License 2.0

def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        The word2vec model matrix
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = gensim.models.Word2Vec.load(word2vec_file)
    vocab_size = model.wv.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            embedding_matrix[value] = model.wv[key]  # model[key] is deprecated; model.wv[key] is equivalent
    return vocab_size, embedding_size, embedding_matrix
Example #7
Source File: nlp_chinese.py From simple_nlp_chinese with MIT License

def train_model(file_input, file_output):
    file_intermediate = os.path.join(
        os.path.dirname(file_input), os.path.splitext(file_input)[0])
    process_corpus_extraction(
        file_input, file_intermediate + '.extracted')
    process_chinese_filtering(
        file_intermediate + '.extracted', file_intermediate + '.filtered')
    process_chinese_transformation(
        file_intermediate + '.filtered', file_intermediate + '.transformed')
    process_chinese_transformation(
        file_intermediate + '.transformed', file_intermediate + '.segmented')
    # we can train for either word2vec or doc2vec
    # process_word_training(
    #     file_intermediate + '.segmented', file_output)
    process_doc_training(
        file_intermediate + '.segmented', file_output)
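The commented-out branch above trains word2vec instead of doc2vec (the word2vec variant appears as Example #13 below). For the doc2vec path, here is a hedged guess at the shape of process_doc_training, written against the gensim 3.x API; this is an assumption for illustration, not the project's actual implementation:

import multiprocessing
import gensim

def process_doc_training(file_input, file_output):
    # one segmented document per line; TaggedLineDocument tags each line by its line number
    documents = gensim.models.doc2vec.TaggedLineDocument(file_input)
    model = gensim.models.Doc2Vec(
        documents, vector_size=400, workers=multiprocessing.cpu_count())
    model.save(file_output)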
Example #8
Source File: data_helpers.py From Hierarchical-Multi-Label-Text-Classification with Apache License 2.0

def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file based on the corpus file (used for the embedding visualization later).

    Args:
        word2vec_file: The word2vec file
        output_file: The metadata file path
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = gensim.models.Word2Vec.load(word2vec_file)
    word2idx = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    word2idx_sorted = [(k, word2idx[k]) for k in sorted(word2idx, key=word2idx.get, reverse=False)]
    with open(output_file, 'w+') as fout:
        for word in word2idx_sorted:
            if word[0] is None:
                print("[Warning] Empty line; it should be replaced by something else, or it will trigger a TensorBoard bug.")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word[0] + '\n')
Example #9
Source File: data_helpers.py From Hierarchical-Multi-Label-Text-Classification with Apache License 2.0

def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        The word2vec model matrix
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = gensim.models.Word2Vec.load(word2vec_file)
    vocab_size = model.wv.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            embedding_matrix[value] = model.wv[key]  # model[key] is deprecated; model.wv[key] is equivalent
    return vocab_size, embedding_size, embedding_matrix
Example #10
Source File: NLP.py From Financial-NLP with Apache License 2.0

def load_model(self, save_model_name):
    """
    Load a model into the object (self.model).
    """
    self.model = word2vec.Word2Vec.load(save_model_name)
    self.len_vector = self.model.trainables.layer1_size
    try:
        self.renew_label_vec()
    except Exception:
        self.safe_renew_label_vec()
Example #11
Source File: NLP.py From Financial-NLP with Apache License 2.0

def safe_nlp_vector(self, words):
    """
    Parameters
    ----------
    words : list of str / str
        wordbag
    Returns
    -------
    ndarray(float)
        The corresponding vectors of the words in the wordbag.
        A vector contains the similarities calculated by word2vec and WordNet.
    """
    if isinstance(words, string_types):
        synonym = self.synonym_label(words)
        similarity = self.similarity_label(words)
    else:
        synonym = np.empty((len(self.Label_index), len(words)))
        similarity = np.empty((len(self.Label_index), len(words)))
        for i in range(len(words)):
            try:
                synonym[:, i] = self.synonym_label(words[i])
            except Exception:
                synonym[:, i] = np.zeros((len(self.Label_index), 1))[:, 0]
            try:
                similarity[:, i] = self.similarity_label(words[i])[:, 0]
            except Exception:
                similarity[:, i] = np.zeros((len(self.Label_index), 1))[:, 0]
    vector = np.concatenate((similarity, synonym))
    return vector
Example #12
Source File: data_helpers.py From Multi-Label-Text-Classification with Apache License 2.0

def load_data_and_labels(data_file, num_labels, word2vec_file, data_aug_flag):
    """
    Load research data from files, split the data into words and generate labels.
    Return split sentences, labels and the max sentence length of the research data.

    Args:
        data_file: The research data
        num_labels: The number of classes
        word2vec_file: The word2vec model file
        data_aug_flag: The flag for data augmentation
    Returns:
        The class _Data()
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    # Load word2vec file
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = word2vec.Word2Vec.load(word2vec_file)
    # Load data from files and split by words
    data = data_word2vec(input_file=data_file, num_labels=num_labels, word2vec_model=model)
    if data_aug_flag:
        data = data_augmented(data)
    # plot_seq_len(data_file, data)
    return data
Example #13
Source File: nlp_chinese.py From simple_nlp_chinese with MIT License

def process_word_training(file_input, file_output):
    model = gensim.models.Word2Vec(
        gensim.models.word2vec.LineSentence(file_input),
        size=400, workers=multiprocessing.cpu_count())
    # trim unneeded model memory; use (much) less RAM
    model.init_sims(replace=True)
    model.save(file_output)
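Once saved, the model can be restored and queried; a hypothetical follow-up (the path and query word are placeholders). Note that init_sims(replace=True) above discards the un-normalized vectors to save memory, so the reloaded model can still be queried but not trained further:

model = gensim.models.Word2Vec.load('zhwiki.word2vec.model')  # placeholder path
print(model.wv.most_similar('placeholder_word', topn=5))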
Example #14
Source File: data_helpers.py From Hierarchical-Multi-Label-Text-Classification with Apache License 2.0

def load_data_and_labels(data_file, num_classes_list, total_classes, word2vec_file, data_aug_flag):
    """
    Load research data from files, split the data into words and generate labels.
    Return split sentences, labels and the max sentence length of the research data.

    Args:
        data_file: The research data
        num_classes_list: <list> The number of classes
        total_classes: The total number of classes
        word2vec_file: The word2vec file
        data_aug_flag: The flag for data augmentation
    Returns:
        The class _Data()
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    # Load word2vec file
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = word2vec.Word2Vec.load(word2vec_file)
    # Load data from files and split by words
    data = data_word2vec(data_file, num_classes_list, total_classes, word2vec_model=model)
    if data_aug_flag:
        data = data_augmented(data)
    # plot_seq_len(data_file, data)
    return data
Example #15
Source File: NLP.py From Financial-NLP with Apache License 2.0

def train_Word2Vec(self, train_corpus, saveflag=False, save_model_name='NLP_model',
                   Size=100, Min_count=5):  # , show_process=True):
    """
    Train the word2vec model from the processed corpus.

    Parameters
    ----------
    train_corpus : str / list of lists
        name (absolute path) of the train corpus, or a list of sentences
        (a sentence is a list of words).
    saveflag : bool
        whether to save the trained model locally.
    save_model_name : str
        the model name (absolute path), default 'NLP_model'.
    Size : int
        length of the word vector.
    Min_count : int
        minimum frequency for a word to be recorded in the dictionary.
    Returns
    -------
    Nothing
    """
    print('start training...')
    prev_time = datetime.datetime.now()  # current time
    self.len_vector = Size
    # if show_process == True:
    #     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    if isinstance(train_corpus, string_types):
        sentences = self.txt2sentence(train_corpus)
    else:
        sentences = train_corpus
    self.model = gensim.models.Word2Vec(sentences, size=Size, min_count=Min_count)  # word to vector in R^Size
    if saveflag:
        self.save_model(save_model_name)  # save model locally
    try:
        self.renew_label_vec()
    except Exception:
        self.safe_renew_label_vec()
    cur_time = datetime.datetime.now()  # time after training
    h, remainder = divmod((cur_time - prev_time).seconds, 3600)
    m, s = divmod(remainder, 60)
    print('done.')
    print("It costs %02d:%02d:%02d to train the word2vec model." % (h, m, s))
    # model.wv.save_word2vec_format(save_model_name + ".bin", binary=True)
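A hypothetical usage sketch, assuming an instance of the project's NLP wrapper class (the instantiation, corpus path, and model name are placeholders; the constructor's actual signature is not shown here):

nlp = NLP()  # hypothetical instantiation of the Financial-NLP wrapper
nlp.train_Word2Vec('corpus_segmented.txt', saveflag=True,
                   save_model_name='NLP_model', Size=100, Min_count=5)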
Example #16
Source File: NLP.py From Financial-NLP with Apache License 2.0

def show_Word2Vec(self, s, k=1, mode='topk'):
    """
    Not often used now.

    Parameters
    ----------
    s : str
    k : int / str
        if mode='similarity', it's a string.
        if mode='topk', it's a number, defaulting to 1.
    mode : str
        'similarity' : calculate the similarity between s and k; note that k is a string.
        'topk' (default) : find the top k most similar words to s; note that k is an integer.
    Returns
    -------
    float
        if mode='similarity', this is the similarity between s and k.
        if mode='return_topk', it returns an iterator rather than a number.
        if mode='topk', it prints the k most similar words.
    """
    if self.model is None:
        raise Exception("no model")
    # model = word2vec.Word2Vec.load(save_model_name)
    if mode == 'topk':
        y = self.model.wv.most_similar(s, topn=k)  # model.most_similar is deprecated; model.wv.most_similar is equivalent
        print('The words most related to "%s" are:\n' % s)
        for item in y:
            print(item[0], item[1])
    elif mode == 'return_topk':
        return self.model.wv.most_similar(s, topn=k)
        # return model.most_similar(s, topn=k)
    elif mode == 'similarity':
        # cosine similarity: for two vectors v1 and v2, normalize to unit length, then take the inner product
        y = self.model.wv.similarity(s, k)
        print('The similarity between "%s" and "%s" is %f%%' % (s, k, (y * 100)))
        return y
    elif mode == 'vector':
        print(self.model.wv[s])  # model[s] is deprecated; model.wv[s] is equivalent
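A hypothetical usage of the main modes, continuing the nlp wrapper instance sketched under Example #15 (the query words are placeholders):

nlp.show_Word2Vec('bank', k=5, mode='topk')                 # print the 5 most related words
top = nlp.show_Word2Vec('bank', k=5, mode='return_topk')    # get [(word, score), ...] back
sim = nlp.show_Word2Vec('bank', 'loan', mode='similarity')  # cosine similarity of two words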