Python word2vec.load() Examples
The following are 30
code examples of word2vec.load().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module word2vec, or try the search function.
Example #1
Source File: data_process_aug.py From AIchallenger2018_MachineReadingComprehension with MIT License | 6 votes |
def transfer(model_path, embedding_size):
    """Build word->id and id->vector lookup tables from a word2vec model.

    Id 0 is reserved for an all-zero vector of length ``embedding_size``;
    the words of ``model.vocab`` receive ids 1, 2, ... in vocabulary order.
    The elapsed time is printed before returning.

    :param model_path: path passed to ``word2vec.load``.
    :param embedding_size: length of the all-zero vector stored at id 0.
    :return: ``(word2id_dic, id2vec_dic)`` where ``word2id_dic`` maps each
        word to its id and ``id2vec_dic`` is a list whose element ``k`` is
        the vector (as a plain list) of the word with id ``k``.
    """
    start_time = time.time()
    model = word2vec.load(model_path)

    word2id_dic = {}
    # Slot 0 holds the zero vector, so real words start at id 1.
    id2vec_dic = [[0.0] * embedding_size]
    for word_id, word in enumerate(model.vocab, start=1):
        word2id_dic[word] = word_id
        id2vec_dic.append(model[word].tolist())

    end_time = time.time()
    print('词转id,id转向量完成')
    print(end_time - start_time)
    return word2id_dic, id2vec_dic
# Save to a JSON file
Example #2
Source File: a8_train.py From text_classification with MIT License | 5 votes |
def assign_pretrained_word_embedding(sess,vocabulary_index2word,vocab_size,model,word2vec_model_path=None):
    """Overwrite ``model.Embedding`` with vectors from a binary word2vec model.

    Row 0 is all zeros (the 'PAD' entry). For each index ``i`` in
    ``1 .. vocab_size - 1`` the word ``vocabulary_index2word[i]`` is looked
    up in the word2vec model: a known word gets its pre-trained vector, any
    other word gets a vector drawn uniformly from ``[-bound, bound]`` with
    ``bound = sqrt(6) / sqrt(vocab_size)``. The matrix is then assigned to
    ``model.Embedding`` by running a ``tf.assign`` op in ``sess``.

    :param sess: TensorFlow session used to run the assign op.
    :param vocabulary_index2word: mapping from word index to word.
    :param vocab_size: number of rows of the embedding matrix.
    :param model: object exposing the ``Embedding`` variable to overwrite.
    :param word2vec_model_path: path passed to ``word2vec.load(..., kind='bin')``.
    :return: None
    """
    print("using pre-trained word emebedding.started.word2vec_model_path:",word2vec_model_path)
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    # word -> pre-trained vector
    word2vec_dict = dict(zip(word2vec_model.vocab, word2vec_model.vectors))

    embed_size = FLAGS.embed_size
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for the random vectors
    # Independent slots (``[[]] * n`` would alias one list n times).
    word_embedding_2dlist = [None] * vocab_size
    word_embedding_2dlist[0] = np.zeros(embed_size)  # first word: 'PAD'

    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):
        word = vocabulary_index2word[i]
        # A missing word is the only expected failure, so use dict.get
        # instead of swallowing every Exception around the lookup.
        embedding = word2vec_dict.get(word)
        if embedding is not None:
            word_embedding_2dlist[i] = embedding
            count_exist += 1
        else:
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, embed_size)
            count_not_exist += 1

    word_embedding_final = np.array(word_embedding_2dlist)  # to a 2-d array
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)
    t_assign_embedding = tf.assign(model.Embedding,word_embedding)
    sess.run(t_assign_embedding)
    print("word. exists embedding:", count_exist, " ;word not exist embedding:", count_not_exist)
    print("using pre-trained word emebedding.ended...")
# Do evaluation on the validation dataset; report loss and accuracy.
Example #3
Source File: p7_TextCNN_train_exp512.py From text_classification with MIT License | 5 votes |
def assign_pretrained_word_embedding(sess,vocabulary_index2word,vocab_size,textCNN,word2vec_model_path=None):
    """Overwrite ``textCNN.Embedding`` with vectors from a binary word2vec model.

    Row 0 is all zeros (the 'PAD' entry). For each index ``i`` in
    ``1 .. vocab_size - 1`` the word ``vocabulary_index2word[i]`` is looked
    up in the word2vec model: a known word gets its pre-trained vector, any
    other word gets a vector drawn uniformly from ``[-bound, bound]`` with
    ``bound = sqrt(6) / sqrt(vocab_size)``. The matrix is then assigned to
    ``textCNN.Embedding`` by running a ``tf.assign`` op in ``sess``.

    :param sess: TensorFlow session used to run the assign op.
    :param vocabulary_index2word: mapping from word index to word.
    :param vocab_size: number of rows of the embedding matrix.
    :param textCNN: object exposing the ``Embedding`` variable to overwrite.
    :param word2vec_model_path: path passed to ``word2vec.load(..., kind='bin')``.
    :return: None
    """
    print("using pre-trained word emebedding.started.word2vec_model_path:",word2vec_model_path)
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    # word -> pre-trained vector
    word2vec_dict = dict(zip(word2vec_model.vocab, word2vec_model.vectors))

    embed_size = FLAGS.embed_size
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for the random vectors
    # Independent slots (``[[]] * n`` would alias one list n times).
    word_embedding_2dlist = [None] * vocab_size
    word_embedding_2dlist[0] = np.zeros(embed_size)  # first word: 'PAD'

    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):
        word = vocabulary_index2word[i]
        # A missing word is the only expected failure, so use dict.get
        # instead of swallowing every Exception around the lookup.
        embedding = word2vec_dict.get(word)
        if embedding is not None:
            word_embedding_2dlist[i] = embedding
            count_exist += 1
        else:
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, embed_size)
            count_not_exist += 1

    word_embedding_final = np.array(word_embedding_2dlist)  # to a 2-d array
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)
    t_assign_embedding = tf.assign(textCNN.Embedding,word_embedding)
    sess.run(t_assign_embedding)
    print("word. exists embedding:", count_exist, " ;word not exist embedding:", count_not_exist)
    print("using pre-trained word emebedding.ended...")
# Evaluate on the validation set; report loss and accuracy.
Example #4
Source File: embedding2matrix.py From PyTorchText with MIT License | 5 votes |
def main(em_file, em_result):
    """Convert a word2vec embedding file into a compressed NumPy archive.

    The archive written to ``em_result`` holds two entries: ``vector`` (the
    model's ``vectors`` attribute) and ``word2id`` (its ``vocab_hash``
    attribute).

    :param em_file: path passed to ``word2vec.load``.
    :param em_result: destination passed to ``np.savez_compressed``.
    """
    embedding = word2vec.load(em_file)
    vectors = embedding.vectors
    word_to_id = embedding.vocab_hash
    np.savez_compressed(em_result, vector=vectors, word2id=word_to_id)
Example #5
Source File: p71_TextRCNN_train.py From text_classification with MIT License | 5 votes |
def assign_pretrained_word_embedding(sess,vocabulary_index2word,vocab_size,textRCNN,word2vec_model_path=None):
    """Overwrite ``textRCNN.Embedding`` with vectors from a binary word2vec model.

    Row 0 is all zeros (the 'PAD' entry). For each index ``i`` in
    ``1 .. vocab_size - 1`` the word ``vocabulary_index2word[i]`` is looked
    up in the word2vec model: a known word gets its pre-trained vector, any
    other word gets a vector drawn uniformly from ``[-bound, bound]`` with
    ``bound = sqrt(6) / sqrt(vocab_size)``. The matrix is then assigned to
    ``textRCNN.Embedding`` by running a ``tf.assign`` op in ``sess``.

    :param sess: TensorFlow session used to run the assign op.
    :param vocabulary_index2word: mapping from word index to word.
    :param vocab_size: number of rows of the embedding matrix.
    :param textRCNN: object exposing the ``Embedding`` variable to overwrite.
    :param word2vec_model_path: path passed to ``word2vec.load(..., kind='bin')``.
    :return: None
    """
    print("using pre-trained word emebedding.started.word2vec_model_path:",word2vec_model_path)
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    # word -> pre-trained vector
    word2vec_dict = dict(zip(word2vec_model.vocab, word2vec_model.vectors))

    embed_size = FLAGS.embed_size
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for the random vectors
    # Independent slots (``[[]] * n`` would alias one list n times).
    word_embedding_2dlist = [None] * vocab_size
    word_embedding_2dlist[0] = np.zeros(embed_size)  # first word: 'PAD'

    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):
        word = vocabulary_index2word[i]
        # A missing word is the only expected failure, so use dict.get
        # instead of swallowing every Exception around the lookup.
        embedding = word2vec_dict.get(word)
        if embedding is not None:
            word_embedding_2dlist[i] = embedding
            count_exist += 1
        else:
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, embed_size)
            count_not_exist += 1

    word_embedding_final = np.array(word_embedding_2dlist)  # to a 2-d array
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)
    t_assign_embedding = tf.assign(textRCNN.Embedding,word_embedding)
    sess.run(t_assign_embedding)
    print("word. exists embedding:", count_exist, " ;word not exist embedding:", count_not_exist)
    print("using pre-trained word emebedding.ended...")
# Evaluate on the validation set; report loss and accuracy.
Example #6
Source File: data_util_zhihu.py From text_classification with MIT License | 5 votes |
def create_voabulary(simple=None,word2vec_model_path='../zhihu-word2vec-title-desc.bin-100',name_scope=''):
    """Return ``(word2index, index2word)`` vocabularies, cached on disk.

    If the cache file for ``name_scope`` exists it is unpickled and returned.
    Otherwise the vocabulary is built from a binary word2vec model:
    index 0 is ``'PAD_ID'``; when ``name_scope`` contains
    ``'biLstmTextRelation'`` index 1 is ``'EOS'``; the model's words follow
    in vocabulary order. The result is then pickled to the cache file.

    :param simple: when not None, the model path is replaced by
        ``'../zhihu-word2vec.bin-100'``.
    :param word2vec_model_path: path passed to ``word2vec.load(..., kind='bin')``.
    :param name_scope: prefix of the cache file name; also selects the
        extra ``'EOS'`` token.
    :return: tuple ``(vocabulary_word2index, vocabulary_index2word)``.
    """
    cache_path ='../cache_vocabulary_label_pik/'+ name_scope + "_word_voabulary.pik"
    cache_exists = os.path.exists(cache_path)
    print("cache_path:",cache_path,"file_exists:",cache_exists)
    if cache_exists:
        # Pickle data is bytes: the file must be opened in binary mode
        # (text mode fails on Python 3).
        # NOTE: pickle.load executes arbitrary code; only use trusted caches.
        with open(cache_path, 'rb') as data_f:
            vocabulary_word2index, vocabulary_index2word = pickle.load(data_f)
        return vocabulary_word2index, vocabulary_index2word

    vocabulary_word2index = {}
    vocabulary_index2word = {}
    if simple is not None:
        word2vec_model_path = '../zhihu-word2vec.bin-100'
    print("create vocabulary. word2vec_model_path:",word2vec_model_path)
    model = word2vec.load(word2vec_model_path,kind='bin')

    vocabulary_word2index['PAD_ID'] = 0
    vocabulary_index2word[0] = 'PAD_ID'
    special_index = 0
    if 'biLstmTextRelation' in name_scope:
        # Special token for the biLstmTextRelation model, used between
        # two sentences.
        vocabulary_word2index['EOS'] = 1
        vocabulary_index2word[1] = 'EOS'
        special_index = 1
    for index, vocab in enumerate(model.vocab, start=1 + special_index):
        vocabulary_word2index[vocab] = index
        vocabulary_index2word[index] = vocab

    # Write the cache only if it still does not exist.
    if not os.path.exists(cache_path):
        with open(cache_path, 'wb') as data_f:
            pickle.dump((vocabulary_word2index,vocabulary_index2word), data_f)
    return vocabulary_word2index,vocabulary_index2word
# create vocabulary of labels. label is sorted. 1 is high frequency, 2 is low frequency.
Example #7
Source File: data_util_zhihu.py From text_classification with MIT License | 5 votes |
def load_data(vocabulary_word2index,vocabulary_word2index_label,valid_portion=0.05,max_training_data=1000000,training_data_path='train-zhihu4-only-title-all.txt'):
    """Load a labelled text file and split it into train and held-out parts.

    Each line has the form ``"<words> __label__<label>"``, for example
    ``"w305 w6651 w3974 __label__-400525901828896492"``. Tabs in the word
    part are replaced by an ``EOS`` token, the words are mapped to indices
    through ``vocabulary_word2index`` (unknown words map to 0, the PAD id)
    and the label through ``vocabulary_word2index_label``.

    :param vocabulary_word2index: mapping word -> index.
    :param vocabulary_word2index_label: mapping label -> index.
    :param valid_portion: fraction of the examples held out at the end.
    :param max_training_data: accepted for compatibility; not used here.
    :param training_data_path: UTF-8 text file to read.
    :return: ``(train, test, test)``; each is ``(X, Y)`` with ``X`` a list
        of index lists and ``Y`` a list of label indices. No separate
        validation split is made: the third element is the second one.
    :raises ValueError: if a line does not contain exactly one ``__label__``.
    :raises KeyError: if a label is missing from ``vocabulary_word2index_label``.
    """
    print("load_data.started...")
    # 1. read the whole file; the context manager closes the handle.
    with codecs.open(training_data_path, 'r', 'utf8') as zhihu_f:
        lines = zhihu_f.readlines()

    # 2. transform X to indices, 3. transform y to a scalar
    X = []
    Y = []
    for i, line in enumerate(lines):
        x, y = line.split('__label__')  # x='w17314 w5521 w7729 w767 w10147 w111'
        y = y.replace('\n','')
        x = x.replace("\t",' EOS ').strip()
        if i<5:
            print("x0:",x)  # raw x
        x = x.split(" ")
        # A word that cannot be found gets index 0 (PAD_ID).
        x = [vocabulary_word2index.get(e,0) for e in x]
        if i<5:
            print("x1:",x)  # words as indices
        y = vocabulary_word2index_label[y]
        X.append(x)
        Y.append(y)

    # 4. split into train and held-out data
    number_examples = len(X)
    print("number_examples:",number_examples)
    split = int((1 - valid_portion) * number_examples)
    train = (X[:split], Y[:split])
    # Start exactly at ``split``; starting at ``split + 1`` dropped one example.
    test = (X[split:], Y[split:])

    # 5. return
    print("load_data.ended...")
    return train, test, test
# Convert one sentence into its (unigram, bigram, trigram) string.
Example #8
Source File: data_util_zhihu.py From text_classification with MIT License | 5 votes |
def process_one_sentence_to_get_ui_bi_tri_gram(sentence,n_gram=3):
    """Expand a sentence into its unigrams, bigrams, ... up to ``n_gram``-grams.

    For each word the output contains the word itself followed by every
    n-gram (words concatenated without a separator) that starts at that word
    and fits inside the sentence, for n = 2 .. ``n_gram``.

    :param sentence: string of words separated by single spaces, e.g.
        ``'w17314 w5521 w7729 w767 w10147 w111'``.
    :param n_gram: largest n-gram size to generate. Any size is supported.
    :return: space-joined string, e.g. for the sentence above and
        ``n_gram=3``: ``'w17314 w17314w5521 w17314w5521w7729 w5521
        w5521w7729 w5521w7729w767 ... w10147 w10147w111 w111'``.
    """
    word_list = sentence.split(" ")
    length_sentence = len(word_list)
    result = []
    for i, word in enumerate(word_list):
        grams = [word]  # the unigram is always present
        for size in range(2, n_gram + 1):
            if i + size > length_sentence:
                # Larger n-grams cannot fit either.
                break
            grams.append("".join(word_list[i:i + size]))
        result.append(" ".join(grams))
    return " ".join(result)
# Load data whose labels contain multiple labels: load data with multi-labels
Example #9
Source File: a3_train.py From text_classification with MIT License | 5 votes |
def assign_pretrained_word_embedding(sess,vocabulary_index2word,vocab_size,model,word2vec_model_path=None):
    """Overwrite ``model.Embedding`` with vectors from a binary word2vec model.

    Row 0 is all zeros (the 'PAD' entry). For each index ``i`` in
    ``1 .. vocab_size - 1`` the word ``vocabulary_index2word[i]`` is looked
    up in the word2vec model: a known word gets its pre-trained vector, any
    other word gets a vector drawn uniformly from ``[-bound, bound]`` with
    ``bound = sqrt(6) / sqrt(vocab_size)``. The matrix is then assigned to
    ``model.Embedding`` by running a ``tf.assign`` op in ``sess``.

    :param sess: TensorFlow session used to run the assign op.
    :param vocabulary_index2word: mapping from word index to word.
    :param vocab_size: number of rows of the embedding matrix.
    :param model: object exposing the ``Embedding`` variable to overwrite.
    :param word2vec_model_path: path passed to ``word2vec.load(..., kind='bin')``.
    :return: None
    """
    print("using pre-trained word emebedding.started.word2vec_model_path:",word2vec_model_path)
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    # word -> pre-trained vector
    word2vec_dict = dict(zip(word2vec_model.vocab, word2vec_model.vectors))

    embed_size = FLAGS.embed_size
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for the random vectors
    # Independent slots (``[[]] * n`` would alias one list n times).
    word_embedding_2dlist = [None] * vocab_size
    word_embedding_2dlist[0] = np.zeros(embed_size)  # first word: 'PAD'

    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):
        word = vocabulary_index2word[i]
        # A missing word is the only expected failure, so use dict.get
        # instead of swallowing every Exception around the lookup.
        embedding = word2vec_dict.get(word)
        if embedding is not None:
            word_embedding_2dlist[i] = embedding
            count_exist += 1
        else:
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, embed_size)
            count_not_exist += 1

    word_embedding_final = np.array(word_embedding_2dlist)  # to a 2-d array
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)
    t_assign_embedding = tf.assign(model.Embedding,word_embedding)
    sess.run(t_assign_embedding)
    print("word. exists embedding:", count_exist, " ;word not exist embedding:", count_not_exist)
    print("using pre-trained word emebedding.ended...")
# Evaluate on the validation set; report loss and accuracy.
Example #10
Source File: p6_fastTextB_train_multilabel.py From text_classification with MIT License | 5 votes |
def assign_pretrained_word_embedding(sess,vocabulary_index2word,vocab_size,fast_text,word2vec_model_path='zhihu-word2vec-multilabel.bin-100'):
    """Overwrite ``fast_text.Embedding`` with vectors from a binary word2vec model.

    Row 0 is all zeros (the 'PAD' entry). For each index ``i`` in
    ``1 .. vocab_size - 1`` the word ``vocabulary_index2word[i]`` is looked
    up in the word2vec model: a known word gets its pre-trained vector, any
    other word gets a vector drawn uniformly from ``[-bound, bound]`` with
    ``bound = sqrt(6) / sqrt(vocab_size)``. The matrix is then assigned to
    ``fast_text.Embedding`` by running a ``tf.assign`` op in ``sess``.

    :param sess: TensorFlow session used to run the assign op.
    :param vocabulary_index2word: mapping from word index to word.
    :param vocab_size: number of rows of the embedding matrix.
    :param fast_text: object exposing the ``Embedding`` variable to overwrite.
    :param word2vec_model_path: path passed to
        ``word2vec.load(..., kind='bin')``; the default is the file name
        that was previously hard-coded.
    :return: None
    """
    print("using pre-trained word emebedding.started...")
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    # word -> pre-trained vector
    word2vec_dict = dict(zip(word2vec_model.vocab, word2vec_model.vectors))

    embed_size = FLAGS.embed_size
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for the random vectors
    # Independent slots (``[[]] * n`` would alias one list n times).
    word_embedding_2dlist = [None] * vocab_size
    word_embedding_2dlist[0] = np.zeros(embed_size)  # first word: 'PAD'

    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):
        word = vocabulary_index2word[i]
        # A missing word is the only expected failure, so use dict.get
        # instead of swallowing every Exception around the lookup.
        embedding = word2vec_dict.get(word)
        if embedding is not None:
            word_embedding_2dlist[i] = embedding
            count_exist += 1
        else:
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, embed_size)
            count_not_exist += 1

    word_embedding_final = np.array(word_embedding_2dlist)  # to a 2-d array
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)
    t_assign_embedding = tf.assign(fast_text.Embedding, word_embedding)
    sess.run(t_assign_embedding)
    print("word. exists embedding:", count_exist, " ;word not exist embedding:", count_not_exist)
    print("using pre-trained word emebedding.ended...")
# Take the top five from the logits: get label using logits
Example #11
Source File: p6_fastTextB_train_multilabel.py From text_classification with MIT License | 5 votes |
def load_data(cache_file_h5py,cache_file_pickle):
    """Load the pre-processed data set and vocabularies from cache files.

    The cache files are generated by running pre-processing.ipynb step by
    step.

    :param cache_file_h5py: HDF5 file holding the entries ``train_X``,
        ``train_Y``, ``vaild_X``, ``valid_Y``, ``test_X`` and ``test_Y``.
    :param cache_file_pickle: pickle file holding ``(word2index, label2index)``.
    :return: ``(word2index, label2index, train_X, train_Y, vaild_X, valid_Y,
        test_X, test_Y)``.
    :raises RuntimeError: if either cache file does not exist.
    """
    if not (os.path.exists(cache_file_h5py) and os.path.exists(cache_file_pickle)):
        raise RuntimeError("############################ERROR##############################\n. "
                           "please download cache file, it include training data and vocabulary & labels. "
                           "link can be found in README.md\n download zip file, unzip it, then put cache files as FLAGS."
                           "cache_file_h5py and FLAGS.cache_file_pickle suggested location.")
    print("INFO. cache file exists. going to load cache file")

    h5_file = h5py.File(cache_file_h5py, 'r')
    print("f_data.keys:",list(h5_file.keys()))
    x_train = h5_file['train_X']
    print("train_X.shape:",x_train.shape)
    y_train = h5_file['train_Y']
    print("train_Y.shape:",y_train.shape,";")
    # The key really is spelled 'vaild_X' in the cache file.
    x_valid = h5_file['vaild_X']
    y_valid = h5_file['valid_Y']
    x_test = h5_file['test_X']
    y_test = h5_file['test_Y']

    with open(cache_file_pickle, 'rb') as pickle_file:
        vocabularies = pickle.load(pickle_file)
    word2index, label2index = vocabularies
    print("INFO. cache file load successful...")
    return word2index, label2index,x_train,y_train,x_valid,y_valid,x_test,y_test
Example #12
Source File: p5_fastTextB_train.py From text_classification with MIT License | 5 votes |
def assign_pretrained_word_embedding(sess,vocabulary_index2word,vocab_size,fast_text,word2vec_model_path='zhihu-word2vec-multilabel.bin-100'):
    """Overwrite ``fast_text.Embedding`` with vectors from a binary word2vec model.

    Row 0 is all zeros (the 'PAD' entry). For each index ``i`` in
    ``1 .. vocab_size - 1`` the word ``vocabulary_index2word[i]`` is looked
    up in the word2vec model: a known word gets its pre-trained vector, any
    other word gets a vector drawn uniformly from ``[-bound, bound]`` with
    ``bound = sqrt(6) / sqrt(vocab_size)``. The matrix is then assigned to
    ``fast_text.Embedding`` by running a ``tf.assign`` op in ``sess``.

    :param sess: TensorFlow session used to run the assign op.
    :param vocabulary_index2word: mapping from word index to word.
    :param vocab_size: number of rows of the embedding matrix.
    :param fast_text: object exposing the ``Embedding`` variable to overwrite.
    :param word2vec_model_path: path passed to
        ``word2vec.load(..., kind='bin')``; the default is the file name
        that was previously hard-coded.
    :return: None
    """
    print("using pre-trained word emebedding.started...")
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    # word -> pre-trained vector
    word2vec_dict = dict(zip(word2vec_model.vocab, word2vec_model.vectors))

    embed_size = FLAGS.embed_size
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for the random vectors
    # Independent slots (``[[]] * n`` would alias one list n times).
    word_embedding_2dlist = [None] * vocab_size
    word_embedding_2dlist[0] = np.zeros(embed_size)  # first word: 'PAD'

    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):
        word = vocabulary_index2word[i]
        # A missing word is the only expected failure, so use dict.get
        # instead of swallowing every Exception around the lookup.
        embedding = word2vec_dict.get(word)
        if embedding is not None:
            word_embedding_2dlist[i] = embedding
            count_exist += 1
        else:
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, embed_size)
            count_not_exist += 1

    word_embedding_final = np.array(word_embedding_2dlist)  # to a 2-d array
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)
    t_assign_embedding = tf.assign(fast_text.Embedding,word_embedding)
    sess.run(t_assign_embedding)
    print("word. exists embedding:", count_exist, " ;word not exist embedding:", count_not_exist)
    print("using pre-trained word emebedding.ended...")
# Evaluate on the validation set; report loss and accuracy.
Example #13
Source File: p5_fastTextB_train.py From text_classification with MIT License | 5 votes |
def load_data(cache_file_h5py,cache_file_pickle):
    """Load the pre-processed data set and vocabularies from cache files.

    The cache files are generated by running pre-processing.ipynb step by
    step.

    :param cache_file_h5py: HDF5 file holding the entries ``train_X``,
        ``train_Y``, ``vaild_X``, ``valid_Y``, ``test_X`` and ``test_Y``.
    :param cache_file_pickle: pickle file holding ``(word2index, label2index)``.
    :return: ``(word2index, label2index, train_X, train_Y, vaild_X, valid_Y,
        test_X, test_Y)``.
    :raises RuntimeError: if either cache file does not exist.
    """
    if not (os.path.exists(cache_file_h5py) and os.path.exists(cache_file_pickle)):
        raise RuntimeError("############################ERROR##############################\n. "
                           "please download cache file, it include training data and vocabulary & labels. "
                           "link can be found in README.md\n download zip file, unzip it, then put cache files as FLAGS."
                           "cache_file_h5py and FLAGS.cache_file_pickle suggested location.")
    print("INFO. cache file exists. going to load cache file")

    # NOTE(review): the HDF5 file is not closed here; presumably the returned
    # entries still read from it - confirm before adding a close().
    h5_file = h5py.File(cache_file_h5py, 'r')
    print("f_data.keys:",list(h5_file.keys()))
    x_train = h5_file['train_X']
    print("train_X.shape:",x_train.shape)
    y_train = h5_file['train_Y']
    print("train_Y.shape:",y_train.shape,";")
    # The key really is spelled 'vaild_X' in the cache file.
    x_valid = h5_file['vaild_X']
    y_valid = h5_file['valid_Y']
    x_test = h5_file['test_X']
    y_test = h5_file['test_Y']

    with open(cache_file_pickle, 'rb') as pickle_file:
        vocabularies = pickle.load(pickle_file)
    word2index, label2index = vocabularies
    print("INFO. cache file load successful...")
    return word2index, label2index,x_train,y_train,x_valid,y_valid,x_test,y_test
Example #14
Source File: p7_TextCNN_train_exp_512_0609.py From text_classification with MIT License | 5 votes |
def assign_pretrained_word_embedding(sess,vocabulary_index2word,vocab_size,textCNN,word2vec_model_path=None):
    """Overwrite ``textCNN.Embedding`` with vectors from a binary word2vec model.

    Row 0 is all zeros (the 'PAD' entry). For each index ``i`` in
    ``1 .. vocab_size - 1`` the word ``vocabulary_index2word[i]`` is looked
    up in the word2vec model: a known word gets its pre-trained vector, any
    other word gets a vector drawn uniformly from ``[-bound, bound]`` with
    ``bound = sqrt(6) / sqrt(vocab_size)``. The matrix is then assigned to
    ``textCNN.Embedding`` by running a ``tf.assign`` op in ``sess``.

    :param sess: TensorFlow session used to run the assign op.
    :param vocabulary_index2word: mapping from word index to word.
    :param vocab_size: number of rows of the embedding matrix.
    :param textCNN: object exposing the ``Embedding`` variable to overwrite.
    :param word2vec_model_path: path passed to ``word2vec.load(..., kind='bin')``.
    :return: None
    """
    print("using pre-trained word emebedding.started.word2vec_model_path:",word2vec_model_path)
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    # word -> pre-trained vector
    word2vec_dict = dict(zip(word2vec_model.vocab, word2vec_model.vectors))

    embed_size = FLAGS.embed_size
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for the random vectors
    # Independent slots (``[[]] * n`` would alias one list n times).
    word_embedding_2dlist = [None] * vocab_size
    word_embedding_2dlist[0] = np.zeros(embed_size)  # first word: 'PAD'

    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):
        word = vocabulary_index2word[i]
        # A missing word is the only expected failure, so use dict.get
        # instead of swallowing every Exception around the lookup.
        embedding = word2vec_dict.get(word)
        if embedding is not None:
            word_embedding_2dlist[i] = embedding
            count_exist += 1
        else:
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, embed_size)
            count_not_exist += 1

    word_embedding_final = np.array(word_embedding_2dlist)  # to a 2-d array
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)
    t_assign_embedding = tf.assign(textCNN.Embedding,word_embedding)
    sess.run(t_assign_embedding)
    print("word. exists embedding:", count_exist, " ;word not exist embedding:", count_not_exist)
    print("using pre-trained word emebedding.ended...")
# Evaluate on the validation set; report loss and accuracy.
Example #15
Source File: data_util_zhihu.py From text_classification with MIT License | 5 votes |
def create_voabulary_labelO(word2vec_model_path='zhihu-word2vec-multilabel.bin-100'):
    """Build label<->index vocabularies from the labels in a word2vec model.

    Every vocabulary entry containing ``'__label__'`` contributes the text
    after that marker as a label, e.g. ``'__label__-2051131023989903826'``
    yields ``'-2051131023989903826'``. Labels are numbered 0, 1, 2, ... in
    order of first appearance.

    :param word2vec_model_path: path passed to
        ``word2vec.load(..., kind='bin')``; the default is the file name
        that was previously hard-coded.
    :return: ``(vocabulary_word2index_label, vocabulary_index2word_label)``.
    """
    model = word2vec.load(word2vec_model_path, kind='bin')
    marker = '__label__'
    vocabulary_word2index_label = {}
    vocabulary_index2word_label = {}
    for vocab in model.vocab:
        if marker not in vocab:
            continue
        label = vocab[vocab.index(marker) + len(marker):]
        # Register each label only the first time it is seen.
        if label not in vocabulary_word2index_label:
            index = len(vocabulary_word2index_label)
            vocabulary_word2index_label[label] = index
            vocabulary_index2word_label[index] = label
    return vocabulary_word2index_label,vocabulary_index2word_label
Example #16
Source File: data_util_zhihu.py From text_classification with MIT License | 5 votes |
def load_data(vocabulary_word2index,vocabulary_word2index_label,valid_portion=0.05,max_training_data=1000000,training_data_path='train-zhihu4-only-title-all.txt'):
    """Load a labelled text file and split it into train and held-out parts.

    Each line has the form ``"<words> __label__<label>"``, for example
    ``"w305 w6651 w3974 __label__-400525901828896492"``. Tabs in the word
    part are replaced by an ``EOS`` token, the words are mapped to indices
    through ``vocabulary_word2index`` (unknown words map to 0, the PAD id)
    and the label through ``vocabulary_word2index_label``.

    :param vocabulary_word2index: mapping word -> index.
    :param vocabulary_word2index_label: mapping label -> index.
    :param valid_portion: fraction of the examples held out at the end.
    :param max_training_data: accepted for compatibility; not used here.
    :param training_data_path: UTF-8 text file to read.
    :return: ``(train, test, test)``; each is ``(X, Y)`` with ``X`` a list
        of index lists and ``Y`` a list of label indices. No separate
        validation split is made: the third element is the second one.
    :raises ValueError: if a line does not contain exactly one ``__label__``.
    :raises KeyError: if a label is missing from ``vocabulary_word2index_label``.
    """
    print("load_data.started...")
    # 1. read the whole file; the context manager closes the handle.
    with codecs.open(training_data_path, 'r', 'utf8') as zhihu_f:
        lines = zhihu_f.readlines()

    # 2. transform X to indices, 3. transform y to a scalar
    X = []
    Y = []
    for i, line in enumerate(lines):
        x, y = line.split('__label__')  # x='w17314 w5521 w7729 w767 w10147 w111'
        y = y.replace('\n','')
        x = x.replace("\t",' EOS ').strip()
        if i<5:
            print("x0:",x)  # raw x
        x = x.split(" ")
        # A word that cannot be found gets index 0 (PAD_ID).
        x = [vocabulary_word2index.get(e,0) for e in x]
        if i<5:
            print("x1:",x)  # words as indices
        y = vocabulary_word2index_label[y]
        X.append(x)
        Y.append(y)

    # 4. split into train and held-out data
    number_examples = len(X)
    print("number_examples:",number_examples)
    split = int((1 - valid_portion) * number_examples)
    train = (X[:split], Y[:split])
    # Start exactly at ``split``; starting at ``split + 1`` dropped one example.
    test = (X[split:], Y[split:])

    # 5. return
    print("load_data.ended...")
    return train, test, test
# Convert one sentence into its (unigram, bigram, trigram) string.
Example #17
Source File: data_util_zhihu.py From text_classification with MIT License | 5 votes |
def process_one_sentence_to_get_ui_bi_tri_gram(sentence,n_gram=3):
    """Expand a sentence into its unigrams, bigrams, ... up to ``n_gram``-grams.

    For each word the output contains the word itself followed by every
    n-gram (words concatenated without a separator) that starts at that word
    and fits inside the sentence, for n = 2 .. ``n_gram``.

    :param sentence: string of words separated by single spaces, e.g.
        ``'w17314 w5521 w7729 w767 w10147 w111'``.
    :param n_gram: largest n-gram size to generate. Any size is supported.
    :return: space-joined string, e.g. for the sentence above and
        ``n_gram=3``: ``'w17314 w17314w5521 w17314w5521w7729 w5521
        w5521w7729 w5521w7729w767 ... w10147 w10147w111 w111'``.
    """
    word_list = sentence.split(" ")
    length_sentence = len(word_list)
    result = []
    for i, word in enumerate(word_list):
        grams = [word]  # the unigram is always present
        for size in range(2, n_gram + 1):
            if i + size > length_sentence:
                # Larger n-grams cannot fit either.
                break
            grams.append("".join(word_list[i:i + size]))
        result.append(" ".join(grams))
    return " ".join(result)
# Load data whose labels contain multiple labels: load data with multi-labels
Example #18
Source File: p8_TextRNN_train.py From text_classification with MIT License | 5 votes |
def assign_pretrained_word_embedding(sess,vocabulary_index2word,vocab_size,textRNN,word2vec_model_path=None):
    """Overwrite ``textRNN.Embedding`` with vectors from a binary word2vec model.

    Row 0 is all zeros (the 'PAD' entry). For each index ``i`` in
    ``1 .. vocab_size - 1`` the word ``vocabulary_index2word[i]`` is looked
    up in the word2vec model: a known word gets its pre-trained vector, any
    other word gets a vector drawn uniformly from ``[-bound, bound]`` with
    ``bound = sqrt(6) / sqrt(vocab_size)``. The matrix is then assigned to
    ``textRNN.Embedding`` by running a ``tf.assign`` op in ``sess``.

    :param sess: TensorFlow session used to run the assign op.
    :param vocabulary_index2word: mapping from word index to word.
    :param vocab_size: number of rows of the embedding matrix.
    :param textRNN: object exposing the ``Embedding`` variable to overwrite.
    :param word2vec_model_path: path passed to ``word2vec.load(..., kind='bin')``.
    :return: None
    """
    print("using pre-trained word emebedding.started.word2vec_model_path:",word2vec_model_path)
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    # word -> pre-trained vector
    word2vec_dict = dict(zip(word2vec_model.vocab, word2vec_model.vectors))

    embed_size = FLAGS.embed_size
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for the random vectors
    # Independent slots (``[[]] * n`` would alias one list n times).
    word_embedding_2dlist = [None] * vocab_size
    word_embedding_2dlist[0] = np.zeros(embed_size)  # first word: 'PAD'

    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):
        word = vocabulary_index2word[i]
        # A missing word is the only expected failure, so use dict.get
        # instead of swallowing every Exception around the lookup.
        embedding = word2vec_dict.get(word)
        if embedding is not None:
            word_embedding_2dlist[i] = embedding
            count_exist += 1
        else:
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, embed_size)
            count_not_exist += 1

    word_embedding_final = np.array(word_embedding_2dlist)  # to a 2-d array
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)
    t_assign_embedding = tf.assign(textRNN.Embedding,word_embedding)
    sess.run(t_assign_embedding)
    print("word. exists embedding:", count_exist, " ;word not exist embedding:", count_not_exist)
    print("using pre-trained word emebedding.ended...")
# Evaluate on the validation set; report loss and accuracy.
Example #19
Source File: p9_BiLstmTextRelation_train.py From text_classification with MIT License | 5 votes |
def assign_pretrained_word_embedding(sess,vocabulary_index2word,vocab_size,textRNN,word2vec_model_path=None):
    """Overwrite ``textRNN.Embedding`` with vectors from a binary word2vec model.

    Row 0 is all zeros (the 'PAD' entry). For each index ``i`` in
    ``1 .. vocab_size - 1`` the word ``vocabulary_index2word[i]`` is looked
    up in the word2vec model: a known word gets its pre-trained vector, any
    other word gets a vector drawn uniformly from ``[-bound, bound]`` with
    ``bound = sqrt(6) / sqrt(vocab_size)``. The matrix is then assigned to
    ``textRNN.Embedding`` by running a ``tf.assign`` op in ``sess``.

    :param sess: TensorFlow session used to run the assign op.
    :param vocabulary_index2word: mapping from word index to word.
    :param vocab_size: number of rows of the embedding matrix.
    :param textRNN: object exposing the ``Embedding`` variable to overwrite.
    :param word2vec_model_path: path passed to ``word2vec.load(..., kind='bin')``.
    :return: None
    """
    print("using pre-trained word emebedding.started.word2vec_model_path:",word2vec_model_path)
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    # word -> pre-trained vector
    word2vec_dict = dict(zip(word2vec_model.vocab, word2vec_model.vectors))

    embed_size = FLAGS.embed_size
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for the random vectors
    # Independent slots (``[[]] * n`` would alias one list n times).
    word_embedding_2dlist = [None] * vocab_size
    word_embedding_2dlist[0] = np.zeros(embed_size)  # first word: 'PAD'

    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):
        word = vocabulary_index2word[i]
        # A missing word is the only expected failure, so use dict.get
        # instead of swallowing every Exception around the lookup.
        embedding = word2vec_dict.get(word)
        if embedding is not None:
            word_embedding_2dlist[i] = embedding
            count_exist += 1
        else:
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, embed_size)
            count_not_exist += 1

    word_embedding_final = np.array(word_embedding_2dlist)  # to a 2-d array
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)
    t_assign_embedding = tf.assign(textRNN.Embedding,word_embedding)
    sess.run(t_assign_embedding)
    print("word. exists embedding:", count_exist, " ;word not exist embedding:", count_not_exist)
    print("using pre-trained word emebedding.ended...")
# Evaluate on the validation set; report loss and accuracy.
Example #20
Source File: a1_seq2seq_attention_train.py From text_classification with MIT License | 5 votes |
def assign_pretrained_word_embedding(sess,vocabulary_index2word,vocab_size,model,word2vec_model_path=None):
    """Overwrite ``model.Embedding`` with vectors from a binary word2vec model.

    Row 0 is all zeros (the 'PAD' entry). For each index ``i`` in
    ``1 .. vocab_size - 1`` the word ``vocabulary_index2word[i]`` is looked
    up in the word2vec model: a known word gets its pre-trained vector, any
    other word gets a vector drawn uniformly from ``[-bound, bound]`` with
    ``bound = sqrt(6) / sqrt(vocab_size)``. The matrix is then assigned to
    ``model.Embedding`` by running a ``tf.assign`` op in ``sess``.

    :param sess: TensorFlow session used to run the assign op.
    :param vocabulary_index2word: mapping from word index to word.
    :param vocab_size: number of rows of the embedding matrix.
    :param model: object exposing the ``Embedding`` variable to overwrite.
    :param word2vec_model_path: path passed to ``word2vec.load(..., kind='bin')``.
    :return: None
    """
    print("using pre-trained word emebedding.started.word2vec_model_path:",word2vec_model_path)
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    # word -> pre-trained vector
    word2vec_dict = dict(zip(word2vec_model.vocab, word2vec_model.vectors))

    embed_size = FLAGS.embed_size
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for the random vectors
    # Independent slots (``[[]] * n`` would alias one list n times).
    word_embedding_2dlist = [None] * vocab_size
    word_embedding_2dlist[0] = np.zeros(embed_size)  # first word: 'PAD'

    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):
        word = vocabulary_index2word[i]
        # A missing word is the only expected failure, so use dict.get
        # instead of swallowing every Exception around the lookup.
        embedding = word2vec_dict.get(word)
        if embedding is not None:
            word_embedding_2dlist[i] = embedding
            count_exist += 1
        else:
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, embed_size)
            count_not_exist += 1

    word_embedding_final = np.array(word_embedding_2dlist)  # to a 2-d array
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)
    t_assign_embedding = tf.assign(model.Embedding,word_embedding)
    sess.run(t_assign_embedding)
    print("word. exists embedding:", count_exist, " ;word not exist embedding:", count_not_exist)
    print("using pre-trained word emebedding.ended...")
# Evaluate on the validation set; report loss and accuracy.
Example #21
Source File: p72_TextCNN_with_RCNN_train.py From text_classification with MIT License | 5 votes |
def assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, textCNN, word2vec_model_path=None):
    """Overwrite ``textCNN.Embedding`` with vectors from a binary word2vec file.

    Row 0 (the 'PAD' slot) is all zeros. Every other row ``i`` receives the
    pre-trained vector of ``vocabulary_index2word[i]`` when the word2vec
    vocabulary contains that word; otherwise it gets a uniform random vector
    of length ``FLAGS.embed_size`` drawn with
    ``bound = sqrt(6) / sqrt(vocab_size)``.

    Args:
        sess: object whose ``run`` executes the assign op (presumably a
            TensorFlow session -- confirm against the caller).
        vocabulary_index2word: mapping from row index to word; it is indexed
            for every ``i`` in ``1 .. vocab_size - 1``.
        vocab_size: number of rows of the embedding matrix.
        textCNN: object exposing the ``Embedding`` variable to overwrite.
        word2vec_model_path: path handed to ``word2vec.load(..., kind='bin')``.
    """
    print("using pre-trained word emebedding.started.word2vec_model_path:", word2vec_model_path)
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    # Map each pre-trained word to its vector.
    word2vec_dict = dict(zip(word2vec_model.vocab, word2vec_model.vectors))

    # One placeholder per row; every slot is filled below.
    word_embedding_2dlist = [None] * vocab_size
    word_embedding_2dlist[0] = np.zeros(FLAGS.embed_size)  # first word is 'PAD'
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # range of the random fallback vectors
    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):
        # dict.get reports a missing word as None without hiding unrelated
        # errors the way a blanket ``except Exception`` would.
        embedding = word2vec_dict.get(vocabulary_index2word[i])
        if embedding is not None:
            word_embedding_2dlist[i] = embedding
            count_exist += 1
        else:
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, FLAGS.embed_size)
            count_not_exist += 1

    word_embedding_final = np.array(word_embedding_2dlist)  # stack the rows into one array
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)
    t_assign_embedding = tf.assign(textCNN.Embedding, word_embedding)
    sess.run(t_assign_embedding)
    print("word. exists embedding:", count_exist, " ;word not exist embedding:", count_not_exist)
    print("using pre-trained word emebedding.ended...")
Example #22
Source File: a2_train.py From pynlp with MIT License | 5 votes |
def assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, model, word2vec_model_path=None):
    """Overwrite ``model.Embedding`` with vectors loaded through gensim.

    The vectors are read with
    ``gensim.models.KeyedVectors.load_word2vec_format(..., binary=True)``.
    Row 0 (the 'PAD' slot) is all zeros. Every other row ``i`` receives the
    pre-trained vector of ``vocabulary_index2word[i]`` when the loaded
    vocabulary contains that word; otherwise it gets a uniform random vector
    of length ``FLAGS.embed_size`` drawn with
    ``bound = sqrt(6) / sqrt(vocab_size)``.

    Args:
        sess: object whose ``run`` executes the assign op (presumably a
            TensorFlow session -- confirm against the caller).
        vocabulary_index2word: mapping from row index to word; it is indexed
            for every ``i`` in ``1 .. vocab_size - 1``.
        vocab_size: number of rows of the embedding matrix.
        model: object exposing the ``Embedding`` variable to overwrite.
        word2vec_model_path: path of the binary word2vec file.
    """
    print("using pre-trained word emebedding.started.word2vec_model_path:", word2vec_model_path)
    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)

    # Pair each word with its row of ``vectors`` using the ordered word list.
    # gensim >= 4 names that list ``index_to_key`` (``.vocab`` was removed
    # there); gensim 3.x names it ``index2word``.
    words = getattr(word2vec_model, "index_to_key", None)
    if words is None:
        words = word2vec_model.index2word
    word2vec_dict = dict(zip(words, word2vec_model.vectors))

    # One placeholder per row; every slot is filled below.
    word_embedding_2dlist = [None] * vocab_size
    word_embedding_2dlist[0] = np.zeros(FLAGS.embed_size)  # first word is 'PAD'
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # range of the random fallback vectors
    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):
        # dict.get reports a missing word as None without hiding unrelated
        # errors the way a blanket ``except Exception`` would.
        embedding = word2vec_dict.get(vocabulary_index2word[i])
        if embedding is not None:
            word_embedding_2dlist[i] = embedding
            count_exist += 1
        else:
            word_embedding_2dlist[i] = np.random.uniform(-bound, bound, FLAGS.embed_size)
            count_not_exist += 1

    word_embedding_final = np.array(word_embedding_2dlist)  # stack the rows into one array
    word_embedding = tf.constant(word_embedding_final, dtype=tf.float32)
    t_assign_embedding = tf.assign(model.Embedding, word_embedding)
    sess.run(t_assign_embedding)
    print("word. exists embedding:", count_exist, " ;word not exist embedding:", count_not_exist)
    print("using pre-trained word emebedding.ended...")
Example #23
Source File: data_input_helper.py From deep_learning with MIT License | 5 votes |
def __init__(self, file_path):
    """Load word2vec vectors from ``file_path`` and ensure an 'unknown' entry.

    When the loaded model has no 'unknown' key in ``vocab_hash``, a random
    vector drawn uniformly from [-0.1, 0.1) is appended to ``model.vectors``
    and 'unknown' is mapped to index ``len(model.vocab)``.

    Args:
        file_path: path handed to ``word2vec.load``.
    """
    self.model = word2vec.load(file_path)
    if 'unknown' not in self.model.vocab_hash:
        # Use the width of the loaded vectors rather than assuming 128
        # dimensions, so models of any embedding size can be stacked.
        embedding_dim = self.model.vectors.shape[-1]
        unknown_vec = np.random.uniform(-0.1, 0.1, size=embedding_dim)
        self.model.vocab_hash['unknown'] = len(self.model.vocab)
        # np.vstack is the supported name for the deprecated np.row_stack.
        self.model.vectors = np.vstack((self.model.vectors, unknown_vec))
Example #24
Source File: data_util_zhihu.py From text_classification with MIT License | 5 votes |
def load_data(vocabulary_word2index, vocabulary_word2index_label, valid_portion=0.05, max_training_data=1000000,
              training_data_path='train-zhihu4-only-title-all.txt'):
    """Read a ``__label__``-annotated text file and split it into train/test.

    Each line looks like ``"w305 w6651 w3974 __label__-400525901828896492"``.
    The text before ``__label__`` has tabs replaced by `` EOS ``, is split on
    single spaces and mapped to indices through ``vocabulary_word2index``
    (unknown tokens become 0, i.e. PAD_ID). The text after ``__label__`` is
    mapped through ``vocabulary_word2index_label``.

    Args:
        vocabulary_word2index: dict mapping a token to its index.
        vocabulary_word2index_label: dict mapping a label string to its index.
        valid_portion: fraction of the examples (taken from the end of the
            file) placed in the test split.
        max_training_data: accepted for interface compatibility; not used.
        training_data_path: path of the UTF-8 input file.

    Returns:
        ``(train, test, test)`` where each element is ``(X, Y)``: ``X`` is a
        list of index lists and ``Y`` a list of label indices. The third
        element (the "valid" slot) is the same object as ``test``.

    Raises:
        ValueError: if a line does not contain exactly one ``__label__``.
        KeyError: if a label is missing from ``vocabulary_word2index_label``.
    """
    # 1. load the raw lines; the context manager closes the file even on error
    print("load_data.started...")
    with codecs.open(training_data_path, 'r', 'utf8') as zhihu_f:
        lines = zhihu_f.readlines()

    # 2. transform X to indices, 3. transform y to a scalar
    X = []
    Y = []
    for i, line in enumerate(lines):
        x, y = line.split('__label__')  # x='w17314 w5521 w7729 w767 w10147 w111'
        y = y.replace('\n', '')
        x = x.replace("\t", ' EOS ').strip()
        if i < 5:
            print("x0:", x)  # raw text of the first few examples
        x = [vocabulary_word2index.get(e, 0) for e in x.split(" ")]  # unknown word -> 0 (PAD_ID)
        if i < 5:
            print("x1:", x)  # the same examples as indices
        X.append(x)
        Y.append(vocabulary_word2index_label[y])

    # 4. split into train and test
    number_examples = len(X)
    print("number_examples:", number_examples)
    split = int((1 - valid_portion) * number_examples)
    train = (X[:split], Y[:split])
    # Start exactly at ``split``: starting at ``split + 1`` would silently
    # drop the example sitting on the boundary.
    test = (X[split:], Y[split:])

    # 5. return
    print("load_data.ended...")
    return train, test, test
Example #25
Source File: pre_data.py From PJ_NLP with Apache License 2.0 | 5 votes |
def emb2npz(emb_file_path, emb_dict_path):
    """Convert a word2vec embedding file into a compressed ``.npz`` archive.

    The archive written to ``emb_dict_path`` contains ``vec`` (the loaded
    vectors plus one trailing all-zero row) and ``word2id`` (the loaded
    ``vocab_hash`` mapping extended with ``'<PAD>'``). Only ``<PAD>`` is
    added here; no ``<UNK>`` entry is created.

    Args:
        emb_file_path: path handed to ``word2vec.load``.
        emb_dict_path: destination handed to ``np.savez_compressed``.
    """
    emb = word2vec.load(emb_file_path)
    vec = emb.vectors
    word2id = emb.vocab_hash
    # <PAD> takes the next free id and gets the zero row appended below.
    word2id['<PAD>'] = len(word2id)
    pad_row = [0] * vec.shape[1]
    # np.vstack is the supported name for the deprecated np.row_stack.
    vec = np.vstack((vec, pad_row))
    np.savez_compressed(emb_dict_path, vec=vec, word2id=word2id)
    print('word size: {}'.format(len(word2id)))
    print('emb shape: {}'.format(vec.shape))
Example #26
Source File: pre_data.py From PJ_NLP with Apache License 2.0 | 5 votes |
def data2npz(src_path, dst_path):
    """Convert a tab-separated text dataset into a compressed ``.npz`` archive.

    Each line of ``src_path`` is ``label<TAB>title<TAB>content`` where every
    field is a comma-separated list, e.g.
    ``40,6<TAB>w6061,w26959,w109<TAB>w23255,w728,w12768,w58588``.
    Labels become a multi-hot ``int32`` vector of length ``conf.n_classes``.
    Title and content words are mapped through the ``word2id`` dict stored in
    ``conf.emb_path`` (words missing from it map to the id of ``'</s>'``) and
    then passed to ``padding`` with ``conf.title_seq_len`` /
    ``conf.content_seq_len`` and the ``'<PAD>'`` id.

    Args:
        src_path: UTF-8 text file to read.
        dst_path: destination handed to ``np.savez_compressed``; the archive
            holds ``label``, ``title`` and ``content``.
    """
    # ``word2id`` is stored as a pickled dict, which np.load refuses to read
    # unless allow_pickle=True (the default is False since NumPy 1.16.3).
    # NOTE(review): unpickling can execute arbitrary code -- only load
    # archives produced by this project.
    with np.load(conf.emb_path, allow_pickle=True) as data:
        word2id = data['word2id'].item()

    labels = []
    titles = []
    contents = []
    with open(src_path, 'r', encoding='utf-8') as f:
        for line in f:
            label, title, content = line.replace('\n', '').split('\t')
            label_ids = [int(lab) for lab in label.split(',')]
            label_mat = np.zeros(conf.n_classes, dtype='int32')
            label_mat[label_ids] = 1  # multi-hot encoding
            labels.append(label_mat)

            # word -> id; empty tokens are skipped, unknown words use '</s>'
            title_ids = [word2id[word if word in word2id else '</s>']
                         for word in title.split(',') if word.rstrip()]
            content_ids = [word2id[word if word in word2id else '</s>']
                           for word in content.split(',') if word.rstrip()]

            # padding
            titles.append(padding(title_ids, conf.title_seq_len, pad=word2id['<PAD>']))
            contents.append(padding(content_ids, conf.content_seq_len, pad=word2id['<PAD>']))

    print('data size: {}'.format(len(labels)))
    np.savez_compressed(dst_path, label=labels, title=titles, content=contents)
Example #27
Source File: model.py From TextLevelGCN with GNU General Public License v3.0 | 5 votes |
def load_word2vec(self, word2vec_file):
    """Build an embedding matrix for ``self.vocab`` from a word2vec file.

    Rows follow the iteration order of ``self.vocab``. A word whose lookup in
    the loaded model raises ``KeyError`` receives the vector of ``'the'``.

    Args:
        word2vec_file: path handed to ``word2vec.load``.

    Returns:
        ``np.array`` built from the list of looked-up vectors.
    """
    w2v = word2vec.load(word2vec_file)

    def lookup(token):
        # Out-of-vocabulary tokens fall back to the vector of 'the'.
        try:
            return w2v[token]
        except KeyError:
            return w2v['the']

    return np.array([lookup(token) for token in self.vocab])
Example #28
Source File: data_process.py From AIchallenger2018_MachineReadingComprehension with MIT License | 5 votes |
def transfer(model_path, embedding_size):
    """Build word->id and id->vector tables from a word2vec model file.

    Id 0 is reserved for an all-zero vector of length ``embedding_size``;
    the words of the loaded vocabulary get ids 1, 2, ... in vocabulary order.
    The elapsed time is printed when the tables are complete.

    Args:
        model_path: path handed to ``word2vec.load``.
        embedding_size: length of the zero vector stored at id 0.

    Returns:
        ``(word2id_dic, id2vec_dic)``: a dict mapping each word to its id and
        a list whose element ``k`` is the vector (as a plain list) of id ``k``.
    """
    started = time.time()
    w2v = word2vec.load(model_path)

    zero_vector = [0.0 for _ in range(embedding_size)]
    word2id_dic = {}
    id2vec_dic = [zero_vector]
    for word_id, word in enumerate(w2v.vocab, start=1):
        word2id_dic[word] = word_id
        id2vec_dic.append(w2v[word].tolist())

    finished = time.time()
    print('词转id,id转向量完成')
    print(finished - started)
    return word2id_dic, id2vec_dic
Example #29
Source File: data_process_addAnswer.py From AIchallenger2018_MachineReadingComprehension with MIT License | 5 votes |
def transfer(model_path, embedding_size):
    """Create the word->id mapping and the id->vector list for a word2vec model.

    The list starts with a zero vector of length ``embedding_size`` at id 0;
    each vocabulary word then receives the next id (starting at 1) and its
    vector is appended as a plain Python list. The time taken is printed.

    Args:
        model_path: path handed to ``word2vec.load``.
        embedding_size: length of the zero vector stored at id 0.

    Returns:
        ``(word2id_dic, id2vec_dic)``.
    """
    t_begin = time.time()
    vectors_model = word2vec.load(model_path)

    word2id_dic = {}
    id2vec_dic = [[0.0 for _ in range(embedding_size)]]
    for position, token in enumerate(vectors_model.vocab, start=1):
        word2id_dic[token] = position
        id2vec_dic.append(vectors_model[token].tolist())

    t_end = time.time()
    print('词转id,id转向量完成')
    print(t_end - t_begin)
    return word2id_dic, id2vec_dic
Example #30
Source File: word2vec_helper.py From AI_Poet_Totoro with MIT License | 5 votes |
def __init__(self, file_path):
    """Load the word2vec model at ``file_path`` and register special tokens.

    After loading, ``self.add_word`` is called for ``'<unknown>'`` and then
    for ``'<pad>'``.

    Args:
        file_path: path handed to ``word2vec.load``.
    """
    self.model = word2vec.load(file_path)
    # Order matters: '<unknown>' is added before '<pad>'.
    for special_token in ('<unknown>', '<pad>'):
        self.add_word(special_token)