Python gensim.models.word2vec() Examples
The following are 16 code examples of gensim.models.word2vec(), collected from open-source projects. Each example is preceded by its source file, originating project, and license. You may also want to check out all available functions/classes of the module gensim.models.
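Before diving in, a minimal end-to-end sketch may help orient the API the examples share. This is a hypothetical snippet written against the gensim 3.x API used throughout the examples below (the corpus path and model name are placeholders):

from gensim.models import word2vec

# train on a whitespace-tokenized corpus file, one sentence per line
sentences = word2vec.LineSentence('corpus.txt')  # placeholder path
model = word2vec.Word2Vec(sentences, size=100, min_count=5, workers=4)

# persist in gensim's native format so Word2Vec.load() can restore it
model.save('my_word2vec.model')  # placeholder name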
Example #1
Source File: rulebase.py From Chatbot with GNU General Public License v3.0

def load_model(self, path):
    """
    Load a trained word2vec model (binary format only).

    Args:
        path: the path of the model.
    """
    try:
        # current loading method
        self.model = models.Word2Vec.load(path)
    except FileNotFoundError as file_not_found_err:
        print("[Gensim] FileNotFoundError", file_not_found_err)
        exit()
    except UnicodeDecodeError as unicode_decode_err:
        print("[Gensim] UnicodeDecodeError", unicode_decode_err)
        # old loading method
        self.model = models.KeyedVectors.load_word2vec_format(path, binary=True)
    except Exception as ex:
        print("[Gensim] Exception", ex)
        exit()
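For context, the fallback above distinguishes gensim's native save format from the original word2vec C binary format. A hypothetical standalone sketch of the two loading paths (file names are placeholders):

from gensim import models

# gensim-native format, written by model.save(...)
model = models.Word2Vec.load('my_word2vec.model')

# original word2vec C binary format (e.g. pre-trained GoogleNews vectors)
kv = models.KeyedVectors.load_word2vec_format('vectors.bin', binary=True)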
Example #2
Source File: data_helpers.py From Text-Pairs-Relation-Classification with Apache License 2.0

def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file based on the corpus file (used for the embedding visualization later).

    Args:
        word2vec_file: The word2vec file
        output_file: The metadata file path
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = gensim.models.Word2Vec.load(word2vec_file)
    word2idx = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    word2idx_sorted = [(k, word2idx[k]) for k in sorted(word2idx, key=word2idx.get, reverse=False)]
    with open(output_file, 'w+') as fout:
        for word in word2idx_sorted:
            if word[0] is None:
                print("[Warning] Empty line; it should be replaced by something else, or it will trigger a TensorBoard bug.")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word[0] + '\n')
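A hypothetical call (paths are placeholders). TensorBoard's embedding projector expects exactly one label per line in a single-column metadata file, which is why the function writes a '<Empty Line>' placeholder instead of skipping the entry:

create_metadata_file('my_word2vec.model', 'metadata.tsv')  # placeholder paths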
Example #3
Source File: data_helpers.py From Text-Pairs-Relation-Classification with Apache License 2.0

def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        The word2vec model matrix
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = gensim.models.Word2Vec.load(word2vec_file)
    vocab_size = model.wv.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            embedding_matrix[value] = model.wv[key]  # model[key] is deprecated; model.wv[key] is equivalent
    return vocab_size, embedding_size, embedding_matrix
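A hypothetical usage sketch (the path is a placeholder): the returned matrix is row-aligned with the vocabulary, so row i holds the vector of the word whose vocab index is i, ready to initialize an embedding layer.

vocab_size, embedding_size, embedding_matrix = load_word2vec_matrix('my_word2vec.model')
print(embedding_matrix.shape)  # (vocab_size, embedding_size)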
Example #4
Source File: data_helpers.py From Text-Pairs-Relation-Classification with Apache License 2.0

def load_data_and_labels(data_file, word2vec_file):
    """
    Load research data from files, split the data into words and generate labels.
    Return split sentences, labels and the max sentence length of the research data.

    Args:
        data_file: The research data
        word2vec_file: The word2vec model file
    Returns:
        The class Data
    """
    # Load word2vec file
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = word2vec.Word2Vec.load(word2vec_file)
    # Load data from files and split by words
    data = data_word2vec(input_file=data_file, word2vec_model=model)
    # plot_seq_len(data_file, data)
    return data
Example #5
Source File: data_helpers.py From Multi-Label-Text-Classification with Apache License 2.0

def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file based on the corpus file (used for the embedding visualization later).

    Args:
        word2vec_file: The word2vec file
        output_file: The metadata file path
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = gensim.models.Word2Vec.load(word2vec_file)
    word2idx = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    word2idx_sorted = [(k, word2idx[k]) for k in sorted(word2idx, key=word2idx.get, reverse=False)]
    with open(output_file, 'w+') as fout:
        for word in word2idx_sorted:
            if word[0] is None:
                print("[Warning] Empty line; it should be replaced by something else, or it will trigger a TensorBoard bug.")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word[0] + '\n')
Example #6
Source File: data_helpers.py From Multi-Label-Text-Classification with Apache License 2.0

def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        The word2vec model matrix
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = gensim.models.Word2Vec.load(word2vec_file)
    vocab_size = model.wv.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            embedding_matrix[value] = model.wv[key]  # model[key] is deprecated; model.wv[key] is equivalent
    return vocab_size, embedding_size, embedding_matrix
Example #7
Source File: nlp_chinese.py From simple_nlp_chinese with MIT License

def train_model(file_input, file_output):
    file_intermediate = os.path.join(
        os.path.dirname(file_input), os.path.splitext(file_input)[0])
    process_corpus_extraction(
        file_input, file_intermediate + '.extracted')
    process_chinese_filtering(
        file_intermediate + '.extracted', file_intermediate + '.filtered')
    process_chinese_transformation(
        file_intermediate + '.filtered', file_intermediate + '.transformed')
    process_chinese_transformation(
        file_intermediate + '.transformed', file_intermediate + '.segmented')
    # we can train for either word2vec or doc2vec
    # process_word_training(
    #     file_intermediate + '.segmented', file_output)
    process_doc_training(
        file_intermediate + '.segmented', file_output)
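The commented-out branch above trains word2vec instead of doc2vec (the word2vec variant appears as Example #13 below). For the doc2vec path, here is a hedged guess at the shape of process_doc_training, written against the gensim 3.x API; this is an assumption for illustration, not the project's actual implementation:

import multiprocessing
import gensim

def process_doc_training(file_input, file_output):
    # one segmented document per line; TaggedLineDocument tags each line by its line number
    documents = gensim.models.doc2vec.TaggedLineDocument(file_input)
    model = gensim.models.Doc2Vec(
        documents, vector_size=400, workers=multiprocessing.cpu_count())
    model.save(file_output)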
Example #8
Source File: data_helpers.py From Hierarchical-Multi-Label-Text-Classification with Apache License 2.0

def create_metadata_file(word2vec_file, output_file):
    """
    Create the metadata file based on the corpus file (used for the embedding visualization later).

    Args:
        word2vec_file: The word2vec file
        output_file: The metadata file path
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = gensim.models.Word2Vec.load(word2vec_file)
    word2idx = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    word2idx_sorted = [(k, word2idx[k]) for k in sorted(word2idx, key=word2idx.get, reverse=False)]
    with open(output_file, 'w+') as fout:
        for word in word2idx_sorted:
            if word[0] is None:
                print("[Warning] Empty line; it should be replaced by something else, or it will trigger a TensorBoard bug.")
                fout.write('<Empty Line>' + '\n')
            else:
                fout.write(word[0] + '\n')
Example #9
Source File: data_helpers.py From Hierarchical-Multi-Label-Text-Classification with Apache License 2.0

def load_word2vec_matrix(word2vec_file):
    """
    Return the word2vec model matrix.

    Args:
        word2vec_file: The word2vec file
    Returns:
        The word2vec model matrix
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = gensim.models.Word2Vec.load(word2vec_file)
    vocab_size = model.wv.vectors.shape[0]
    embedding_size = model.vector_size
    vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
    embedding_matrix = np.zeros([vocab_size, embedding_size])
    for key, value in vocab.items():
        if key is not None:
            embedding_matrix[value] = model.wv[key]  # model[key] is deprecated; model.wv[key] is equivalent
    return vocab_size, embedding_size, embedding_matrix
Example #10
Source File: NLP.py From Financial-NLP with Apache License 2.0

def load_model(self, save_model_name):
    """
    Load a model into the object (self.model).
    """
    self.model = word2vec.Word2Vec.load(save_model_name)
    self.len_vector = self.model.trainables.layer1_size
    try:
        self.renew_label_vec()
    except Exception:
        self.safe_renew_label_vec()
Example #11
Source File: NLP.py From Financial-NLP with Apache License 2.0

def safe_nlp_vector(self, words):
    """
    Parameters
    ----------
    words : list of str / str
        wordbag
    Returns
    -------
    ndarray(float)
        The corresponding vectors of the words in the wordbag.
        A vector contains the similarities calculated by word2vec and WordNet.
    """
    if isinstance(words, string_types):
        synonym = self.synonym_label(words)
        similarity = self.similarity_label(words)
    else:
        synonym = np.empty((len(self.Label_index), len(words)))
        similarity = np.empty((len(self.Label_index), len(words)))
        for i in range(len(words)):
            try:
                synonym[:, i] = self.synonym_label(words[i])
            except Exception:
                synonym[:, i] = np.zeros((len(self.Label_index), 1))[:, 0]
            try:
                similarity[:, i] = self.similarity_label(words[i])[:, 0]
            except Exception:
                similarity[:, i] = np.zeros((len(self.Label_index), 1))[:, 0]
    vector = np.concatenate((similarity, synonym))
    return vector
Example #12
Source File: data_helpers.py From Multi-Label-Text-Classification with Apache License 2.0

def load_data_and_labels(data_file, num_labels, word2vec_file, data_aug_flag):
    """
    Load research data from files, split the data into words and generate labels.
    Return split sentences, labels and the max sentence length of the research data.

    Args:
        data_file: The research data
        num_labels: The number of classes
        word2vec_file: The word2vec model file
        data_aug_flag: The flag for data augmentation
    Returns:
        The class _Data()
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    # Load word2vec file
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = word2vec.Word2Vec.load(word2vec_file)
    # Load data from files and split by words
    data = data_word2vec(input_file=data_file, num_labels=num_labels, word2vec_model=model)
    if data_aug_flag:
        data = data_augmented(data)
    # plot_seq_len(data_file, data)
    return data
Example #13
Source File: nlp_chinese.py From simple_nlp_chinese with MIT License

def process_word_training(file_input, file_output):
    model = gensim.models.Word2Vec(
        gensim.models.word2vec.LineSentence(file_input),
        size=400, workers=multiprocessing.cpu_count())
    # trim unneeded model memory; use (much) less RAM
    model.init_sims(replace=True)
    model.save(file_output)
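Once saved, the model can be restored and queried; a hypothetical follow-up (the path and query word are placeholders). Note that init_sims(replace=True) above discards the un-normalized vectors to save memory, so the reloaded model can still be queried but not trained further:

model = gensim.models.Word2Vec.load('zhwiki.word2vec.model')  # placeholder path
print(model.wv.most_similar('placeholder_word', topn=5))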
Example #14
Source File: data_helpers.py From Hierarchical-Multi-Label-Text-Classification with Apache License 2.0

def load_data_and_labels(data_file, num_classes_list, total_classes, word2vec_file, data_aug_flag):
    """
    Load research data from files, split the data into words and generate labels.
    Return split sentences, labels and the max sentence length of the research data.

    Args:
        data_file: The research data
        num_classes_list: <list> The number of classes
        total_classes: The total number of classes
        word2vec_file: The word2vec file
        data_aug_flag: The flag for data augmentation
    Returns:
        The class _Data()
    Raises:
        IOError: If the word2vec model file doesn't exist
    """
    # Load word2vec file
    if not os.path.isfile(word2vec_file):
        raise IOError("[Error] The word2vec file doesn't exist.")
    model = word2vec.Word2Vec.load(word2vec_file)
    # Load data from files and split by words
    data = data_word2vec(data_file, num_classes_list, total_classes, word2vec_model=model)
    if data_aug_flag:
        data = data_augmented(data)
    # plot_seq_len(data_file, data)
    return data
Example #15
Source File: NLP.py From Financial-NLP with Apache License 2.0

def train_Word2Vec(self, train_corpus, saveflag=False, save_model_name='NLP_model',
                   Size=100, Min_count=5):  # , show_process=True):
    """
    Train the word2vec model from the processed corpus.

    Parameters
    ----------
    train_corpus : str / list of lists
        name (absolute path) of the train corpus, or a list of sentences
        (a sentence is a list of words).
    saveflag : bool
        whether to save the trained model locally.
    save_model_name : str
        the model name (absolute path), default 'NLP_model'.
    Size : int
        length of the word vector.
    Min_count : int
        minimum frequency for a word to be recorded in the dictionary.
    Returns
    -------
    Nothing
    """
    print('start training...')
    prev_time = datetime.datetime.now()  # current time
    self.len_vector = Size
    # if show_process == True:
    #     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    if isinstance(train_corpus, string_types):
        sentences = self.txt2sentence(train_corpus)
    else:
        sentences = train_corpus
    self.model = gensim.models.Word2Vec(sentences, size=Size, min_count=Min_count)  # word to vector in R^Size
    if saveflag:
        self.save_model(save_model_name)  # save model locally
    try:
        self.renew_label_vec()
    except Exception:
        self.safe_renew_label_vec()
    cur_time = datetime.datetime.now()  # time after training
    h, remainder = divmod((cur_time - prev_time).seconds, 3600)
    m, s = divmod(remainder, 60)
    print('done.')
    print("It costs %02d:%02d:%02d to train the word2vec model." % (h, m, s))
    # model.wv.save_word2vec_format(save_model_name + ".bin", binary=True)
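A hypothetical usage sketch, assuming an instance of the project's NLP wrapper class (the instantiation, corpus path, and model name are placeholders; the constructor's actual signature is not shown here):

nlp = NLP()  # hypothetical instantiation of the Financial-NLP wrapper
nlp.train_Word2Vec('corpus_segmented.txt', saveflag=True,
                   save_model_name='NLP_model', Size=100, Min_count=5)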
Example #16
Source File: NLP.py From Financial-NLP with Apache License 2.0

def show_Word2Vec(self, s, k=1, mode='topk'):
    """
    Not often used now.

    Parameters
    ----------
    s : str
    k : int / str
        if mode='similarity', it's a string.
        if mode='topk', it's a number, defaulting to 1.
    mode : str
        'similarity' : calculate the similarity between s and k; note that k is a string.
        'topk' (default) : find the top k most similar words to s; note that k is an integer.
    Returns
    -------
    float
        if mode='similarity', this is the similarity between s and k.
        if mode='return_topk', it returns an iterator rather than a number.
        if mode='topk', it prints the k most similar words.
    """
    if self.model is None:
        raise Exception("no model")
    # model = word2vec.Word2Vec.load(save_model_name)
    if mode == 'topk':
        y = self.model.wv.most_similar(s, topn=k)  # model.most_similar is deprecated; model.wv.most_similar is equivalent
        print('The words most related to "%s" are:\n' % s)
        for item in y:
            print(item[0], item[1])
    elif mode == 'return_topk':
        return self.model.wv.most_similar(s, topn=k)
        # return model.most_similar(s, topn=k)
    elif mode == 'similarity':
        # cosine similarity: for two vectors v1 and v2, normalize to unit length, then take the inner product
        y = self.model.wv.similarity(s, k)
        print('The similarity between "%s" and "%s" is %f%%' % (s, k, (y * 100)))
        return y
    elif mode == 'vector':
        print(self.model.wv[s])  # model[s] is deprecated; model.wv[s] is equivalent
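A hypothetical usage of the main modes, continuing the nlp wrapper instance sketched under Example #15 (the query words are placeholders):

nlp.show_Word2Vec('bank', k=5, mode='topk')                 # print the 5 most related words
top = nlp.show_Word2Vec('bank', k=5, mode='return_topk')    # get [(word, score), ...] back
sim = nlp.show_Word2Vec('bank', 'loan', mode='similarity')  # cosine similarity of two words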