Python nltk.tokenize.word_tokenize() Examples

The following are 30 code examples of nltk.tokenize.word_tokenize(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk.tokenize, or try the search function.
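Before the project examples, here is a minimal standalone sketch of how word_tokenize() is typically called (illustrative only, not taken from any of the projects below). It assumes NLTK is installed and that the Punkt tokenizer data has been downloaded:

# Minimal usage sketch for nltk.tokenize.word_tokenize (illustrative, not from the projects below).
import nltk
from nltk.tokenize import word_tokenize

# word_tokenize relies on the Punkt tokenizer models; download them once.
nltk.download('punkt')

tokens = word_tokenize("This is an example sentence, tokenized with NLTK.")
print(tokens)
# ['This', 'is', 'an', 'example', 'sentence', ',', 'tokenized', 'with', 'NLTK', '.']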
Example #1
Source File: summarize.py From Django-Bookworm with MIT License | 7 votes |
def get_summary(self, number_of_sentences=5):
    '''
    generates summary based on weighted word frequencies

    :param number_of_sentences: total number of sentences to return in summary
    :return: string of summary
    '''
    sentence_value = {}
    for sentence in self.__sentence:
        for word in self.__word_freq.keys():
            if word in word_tokenize(sentence.lower()):
                if sentence in sentence_value:
                    sentence_value[sentence] += self.__word_freq.get(word)
                else:
                    sentence_value[sentence] = self.__word_freq.get(word, 0)

    summary_sentences = heapq.nlargest(number_of_sentences, sentence_value, key=sentence_value.get)
    summary = ' '.join(summary_sentences)
    return summary
Example #2
Source File: pre_processing.py From TextLevelGCN with GNU General Public License v3.0 | 7 votes |
def clean_text(text):
    # stop_words = stopwords.words('english')
    stop_words = []
    stop_words.extend(['!', ',', '.', '?', '-s', '-ly', '</s> ', 's'])
    stemmer = WordNetLemmatizer()

    text = remove_short(text)
    text = clean_str(text)
    text = word_tokenize(text)
    text = [word for word in text if word not in stop_words]
    text = [stemmer.lemmatize(word) for word in text]
    return ' '.join(text)
Example #3
Source File: ded_detAttn.py From tf-var-attention with MIT License | 6 votes |
def validate(self, sess, x_val, y_val, true_val):
    # Calculate BLEU on validation data
    hypotheses_val = []
    references_val = []
    symbol = []
    if self.config['experiment'] == 'qgen':
        symbol.append('?')

    for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate(
            data_utils.get_batches(x_val, y_val, self.batch_size)):
        answer_logits = sess.run(self.inference_logits,
                                 feed_dict={self.input_data: input_batch,
                                            self.source_sentence_length: source_sent_lengths,
                                            self.keep_prob: 1.0})

        for k, pred in enumerate(answer_logits):
            hypotheses_val.append(
                word_tokenize(" ".join([self.decoder_idx_word[i] for i in pred
                                        if i not in [self.pad, -1, self.eos]])) + symbol)
            references_val.append([word_tokenize(true_val[batch_i * self.batch_size + k])])

    bleu_scores = eval_utils.calculate_bleu_scores(references_val, hypotheses_val)
    self.epoch_bleu_score_val['1'].append(bleu_scores[0])
    self.epoch_bleu_score_val['2'].append(bleu_scores[1])
    self.epoch_bleu_score_val['3'].append(bleu_scores[2])
    self.epoch_bleu_score_val['4'].append(bleu_scores[3])
Example #4
Source File: word2vec.py From Bidirectiona-LSTM-for-text-summarization- with MIT License | 6 votes |
def createCorpus(t):
    corpus = []
    all_sent = []
    for k in t:
        for p in t[k]:
            corpus.append(st(p))
    for sent in range(len(corpus)):
        for k in corpus[sent]:
            all_sent.append(k)
    for m in range(len(all_sent)):
        all_sent[m] = wt(all_sent[m])

    all_words = []
    for sent in all_sent:
        hold = []
        for word in sent:
            hold.append(word.lower())
        all_words.append(hold)
    return all_words
Example #5
Source File: vqa.py From visual_question_answering with MIT License | 6 votes |
def filter_by_ans_len(self, max_ans_len, min_freq=5):
    print("Filtering the answers by length...")
    keep_ques = {}
    for ann in tqdm(self.dataset['annotations']):
        if len(word_tokenize(ann['best_answer'])) <= max_ans_len \
                and ann['best_answer_count'] >= min_freq:
            keep_ques[ann['question_id']] = \
                keep_ques.get(ann['question_id'], 0) + 1

    self.dataset['annotations'] = \
        [ann for ann in self.dataset['annotations']
         if keep_ques.get(ann['question_id'], 0) > 0]
    self.questions['questions'] = \
        [ques for ques in self.questions['questions']
         if keep_ques.get(ques['question_id'], 0) > 0]

    self.createIndex()
Example #6
Source File: prepro.py From visDial.pytorch with MIT License | 6 votes |
def tokenize_data(data):
    '''
    Tokenize captions, questions and answers
    Also maintain word count if required
    '''
    ques_toks, ans_toks, caption_toks = [], [], []

    print(data['split'])
    print('Tokenizing captions...')
    for i in data['data']['dialogs']:
        caption = word_tokenize(i['caption'])
        caption_toks.append(caption)

    print('Tokenizing questions...')
    for i in data['data']['questions']:
        ques_tok = word_tokenize(i + '?')
        ques_toks.append(ques_tok)

    print('Tokenizing answers...')
    for i in data['data']['answers']:
        ans_tok = word_tokenize(i)
        ans_toks.append(ans_tok)

    return ques_toks, ans_toks, caption_toks
Example #7
Source File: vqa.py From visual_question_answering with MIT License | 6 votes |
def createIndex(self):
    # create index
    print('creating index...')
    imgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']}
    qa = {ann['question_id']: [] for ann in self.dataset['annotations']}
    qqa = {ann['question_id']: [] for ann in self.dataset['annotations']}
    max_ques_len = 0
    for ann in self.dataset['annotations']:
        imgToQA[ann['image_id']] += [ann]
        qa[ann['question_id']] = ann
    for ques in self.questions['questions']:
        qqa[ques['question_id']] = ques
        max_ques_len = max(max_ques_len,
                           len(word_tokenize(ques['question'])))
    print('index created!')

    # create class members
    self.qa = qa
    self.qqa = qqa
    self.imgToQA = imgToQA
    self.max_ques_len = max_ques_len
Example #8
Source File: vqa.py From visual_question_answering with MIT License | 6 votes |
def filter_by_ques_len(self, max_ques_len):
    print("Filtering the questions by length...")
    keep_ques = {}
    for ques in tqdm(self.questions['questions']):
        if len(word_tokenize(ques['question'])) <= max_ques_len:
            keep_ques[ques['question_id']] = \
                keep_ques.get(ques['question_id'], 0) + 1

    self.dataset['annotations'] = \
        [ann for ann in self.dataset['annotations']
         if keep_ques.get(ann['question_id'], 0) > 0]
    self.questions['questions'] = \
        [ques for ques in self.questions['questions']
         if keep_ques.get(ques['question_id'], 0) > 0]

    self.createIndex()
Example #9
Source File: data_utils.py From dgm_latent_bow with MIT License | 6 votes |
def quora_read(file_path, bleu_baseline=False):
    """Read the quora dataset"""
    print("Reading quora raw data .. ")
    print("  data path: %s" % file_path)

    with open(file_path) as fd:
        lines = fd.readlines()

    sentence_sets = []
    for l in tqdm(lines):
        p0, p1 = l[:-1].lower().split("\t")
        sentence_sets.append([word_tokenize(p0), word_tokenize(p1)])

    if(bleu_baseline):
        print("calculating bleu ... ")
        hypothesis = [s[0] for s in sentence_sets]
        references = [s[1:] for s in sentence_sets]
        bleu = corpus_bleu(references, hypothesis)
        print("bleu on the training set: %.4f" % bleu)

    return sentence_sets
Example #10
Source File: prepare_clc_fce_data.py From NLP_Toolkit with Apache License 2.0 | 6 votes |
def main():
    fce = convert_fce(args.fce_dataset_path)
    with open(args.output + "/fce-original.txt", 'w', encoding='utf-8') as out_original, \
            open(args.output + "/fce-applied.txt", 'w', encoding='utf-8') as out_applied:
        for doc in tqdm(fce, unit='doc'):
            sents = re.split(r"\n +\n", doc)
            for sent in sents:
                tokenized_sents = sent_tokenize(sent)
                for i in range(len(tokenized_sents)):
                    if re.search(r"[{>][.?!]$", tokenized_sents[i]):
                        tokenized_sents[i + 1] = tokenized_sents[i] + " " + tokenized_sents[i + 1]
                        tokenized_sents[i] = ""
                    regexp = r'{([^{}]*?)=>([^{}]*?)}'
                    original = re.sub(regexp, r"\1", tokenized_sents[i])
                    applied = re.sub(regexp, r"\2", tokenized_sents[i])
                    # filter out nested alerts
                    if original != "" and applied != "" and not re.search(r"[{}=]", original) \
                            and not re.search(r"[{}=]", applied):
                        out_original.write(" ".join(word_tokenize(original)) + "\n")
                        out_applied.write(" ".join(word_tokenize(applied)) + "\n")
Example #11
Source File: utils.py From text-summarization-tensorflow with MIT License | 6 votes |
def build_dataset(step, word_dict, article_max_len, summary_max_len, toy=False):
    if step == "train":
        article_list = get_text_list(train_article_path, toy)
        title_list = get_text_list(train_title_path, toy)
    elif step == "valid":
        article_list = get_text_list(valid_article_path, toy)
    else:
        raise NotImplementedError

    x = [word_tokenize(d) for d in article_list]
    x = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in x]
    x = [d[:article_max_len] for d in x]
    x = [d + (article_max_len - len(d)) * [word_dict["<padding>"]] for d in x]

    if step == "valid":
        return x
    else:
        y = [word_tokenize(d) for d in title_list]
        y = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in y]
        y = [d[:(summary_max_len - 1)] for d in y]
        return x, y
Example #12
Source File: dont_run_me_run_the_other_script_instead.py From punctuator2 with MIT License | 6 votes |
def process_line(line):
    tokens = word_tokenize(line)
    output_tokens = []
    for token in tokens:
        if token in INS_PUNCTS:
            output_tokens.append(INS_PUNCTS[token])
        elif token in EOS_PUNCTS:
            output_tokens.append(EOS_PUNCTS[token])
        elif is_number(token):
            output_tokens.append(NUM)
        else:
            output_tokens.append(token.lower())
    return untokenize(" ".join(output_tokens) + " ")
Example #13
Source File: sumbasic.py From ns with MIT License | 5 votes |
def sum_basic(lines, word_limit, update_non_redundency=True):

    def weight(sents, distribution):
        def _weight_sent(sent):
            tokens = preprocess(word_tokenize(sent))
            return reduce(lambda x, y: x + y, [distribution.get(x) for x in tokens]) / len(tokens)

        return [_weight_sent(sent) for sent in sents]

    def probability_distribution(tokens):
        N = len(tokens)
        distinct_words = set(tokens)
        probabilities = [tokens.count(w) / N for w in distinct_words]
        return dict(list(zip(distinct_words, probabilities)))

    sents = to_sents(lines)
    tokens = to_tokens(sents)
    tokens = preprocess(tokens)
    pd = probability_distribution(tokens)

    summary = ""
    while len(word_tokenize(summary)) < word_limit:
        weights = weight(sents, pd)
        highest_weight_sentence = max(list(zip(sents, weights)), key=itemgetter(1))[0]
        summary += " " + highest_weight_sentence
        if update_non_redundency:
            for token in preprocess(word_tokenize(highest_weight_sentence)):
                pd[token] = pd[token] * pd[token]
        else:
            sents.remove(highest_weight_sentence)

    return summary
Example #14
Source File: sumbasic.py From ns with MIT License | 5 votes |
def leading(lines, word_limit):
    sents = to_sents(lines)

    summary = ""
    while len(word_tokenize(summary)) < word_limit:
        summary += " " + sents.pop(0)

    return summary

# main methods
Example #15
Source File: reviews_data.py From company-reviews with MIT License | 5 votes |
def preprocess(reviews, stop, MIN_WORDS):
    docs = []
    doc_indexes = []
    for i, review in enumerate(reviews):
        rev_words = []
        words = [word for sent in sent_tokenize(review) for word in word_tokenize(sent.lower())]
        stripped_words = []
        for word in words:
            new_words = strip_and_split(word)  # some words aren't separated correctly or have numbers
            stripped_words += [nw for nw in new_words if nw not in stop]
        if len(stripped_words) < MIN_WORDS:
            continue
        docs.append(stripped_words)
        doc_indexes.append(i)
    return docs, doc_indexes
Example #16
Source File: reviews_data.py From company-reviews with MIT License | 5 votes |
def get_combined_lower(indeed_reviews_db, glassdoor_reviews_db):
    combined = get_combined_reviews(indeed_reviews_db, glassdoor_reviews_db)
    combined_lower = []
    for review in combined:
        combined_lower.append(review.lower())  # ' '.join([word for sent in sent_tokenize(review) for word in word_tokenize(sent.lower())]))
    return combined_lower

# Make the ratings keys standard
Example #17
Source File: reviews_data.py From company-reviews with MIT License | 5 votes |
def get_stemmed_separate(indeed_reviews_db, glassdoor_reviews_db):
    separate = get_separate_reviews(indeed_reviews_db, glassdoor_reviews_db)
    stemmer = PorterStemmer()
    stemmed_reviews = []
    for review in separate:
        stemmed_reviews.append(' '.join([stemmer.stem(word) for sent in sent_tokenize(review)
                                         for word in word_tokenize(sent.lower())]))
    return stemmed_reviews
Example #18
Source File: reviews_data.py From company-reviews with MIT License | 5 votes |
def get_stemmed_combined_reviews(indeed_reviews_db, glassdoor_reviews_db):
    combined = get_combined_reviews(indeed_reviews_db, glassdoor_reviews_db)
    stemmer = PorterStemmer()
    stemmed_reviews = []
    for review in combined:
        stemmed_reviews.append(' '.join([stemmer.stem(word) for sent in sent_tokenize(review)
                                         for word in word_tokenize(sent.lower())]))
    return stemmed_reviews
Example #19
Source File: utilities.py From KDD2018_MPCN with GNU General Public License v3.0 | 5 votes |
def tylib_tokenize(x, setting='split', lower=False, tweet_process=False):
    ''' All tokenizer in one. A convenient wrapper

    Supported - 'split','nltk_tweet'
    TODO: 'treebank','nltk_word'

    Args:
        x: `list`. list of words
        setting: `str` supports different tokenizers

    Returns:
        Tokenized output `list`
    '''
    if(setting=='split'):
        tokens = x.split(' ')
    elif(setting=='nltk_tweet'):
        tokens = tweet_tokenizer.tokenize(x)
    elif(setting=='nltk'):
        tokens = word_tokenize(x)
    if(lower):
        tokens = [x.lower() for x in tokens]
    if(tweet_process):
        tokens = [tweet_processer(x) for x in tokens]
    return tokens
Example #20
Source File: preprocess_trec.py From sentence_classification with MIT License | 5 votes |
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X
Example #21
Source File: evaluator.py From NLP_Toolkit with Apache License 2.0 | 5 votes |
def yelp_ppl(self, texts_transfered):
    texts_transfered = [' '.join(word_tokenize(itm.lower().strip())) for itm in texts_transfered]
    sum = 0
    words = []
    length = 0
    for i, line in enumerate(texts_transfered):
        words += [word for word in line.split()]
        length += len(line.split())
        score = self.yelp_ppl_model.score(line)
        sum += score
    return math.pow(10, -sum / length)
Example #22
Source File: evaluator.py From NLP_Toolkit with Apache License 2.0 | 5 votes |
def nltk_bleu(self, texts_origin, text_transfered):
    texts_origin = [word_tokenize(text_origin.lower().strip()) for text_origin in texts_origin]
    text_transfered = word_tokenize(text_transfered.lower().strip())
    return sentence_bleu(texts_origin, text_transfered) * 100
Example #23
Source File: evaluator.py From NLP_Toolkit with Apache License 2.0 | 5 votes |
def yelp_style_check(self, text_transfered, style_origin):
    text_transfered = ' '.join(word_tokenize(text_transfered.lower().strip()))
    if text_transfered == '':
        return False
    label = self.classifier_yelp.predict([text_transfered])
    style_transfered = label[0][0] == '__label__positive'
    return (style_transfered != style_origin)
Example #24
Source File: reader.py From SEDST with MIT License | 5 votes |
def _get_tokenized_data(self, raw_data, db_data, construct_vocab):
    tokenized_data = []
    vk_map = self._value_key_map(db_data)
    for dial_id, dial in enumerate(raw_data):
        tokenized_dial = []
        for turn in dial['dial']:
            turn_num = turn['turn']
            constraint = []
            for slot in turn['usr']['slu']:
                if slot['act'] == 'inform':
                    s = slot['slots'][0][1]
                    if s not in ['dontcare', 'none']:
                        constraint.extend(word_tokenize(s))
            degree = len(self.db_search(constraint))
            constraint.append('EOS_Z1')
            user = word_tokenize(turn['usr']['transcript']) + ['EOS_U']
            response = word_tokenize(self._replace_entity(turn['sys']['sent'], vk_map, constraint)) + ['EOS_M']
            tokenized_dial.append({
                'dial_id': dial_id,
                'turn_num': turn_num,
                'user': user,
                'response': response,
                'constraint': constraint,
                'degree': degree,
            })
            if construct_vocab:
                for word in user + response + constraint:
                    self.vocab.add_item(word)
        tokenized_data.append(tokenized_dial)
    return tokenized_data
Example #25
Source File: ngrams-example.py From python-examples with MIT License | 5 votes |
def get_ngrams(text, n):
    ngramnums = word_tokenize(text)
    ll = [x for x in ngramnums if not re.fullmatch('[' + string.punctuation + ']+', x)]
    ll = ngrams(ll, n)
    return [' '.join(grams) for grams in ll]
Example #26
Source File: disintegrator.py From quantified-self with MIT License | 5 votes |
def convert2simple(self, sentence=""):
    tokenized = word_tokenize(sentence)
    tokenized = self.__filter_punctuation(tokenized)
    tokenized = self.__filter_stopwords(tokenized)
    return " ".join(self.__lemmatize(tokenized))
Example #27
Source File: word_counter.py From wordcounter with Apache License 2.0 | 5 votes |
def append_ext(words):
    new_words = []
    for item in words:
        word, count = item
        tag = nltk.pos_tag(word_tokenize(word))[0][1]  # tag is like [('bigger', 'JJR')]
        new_words.append((word, count, tag))
    return new_words
Example #28
Source File: word_counter.py From wordcounter with Apache License 2.0 | 5 votes |
def merge(words):
    new_words = []
    for word in words:
        if word:
            tag = nltk.pos_tag(word_tokenize(word))  # tag is like [('bigger', 'JJR')]
            pos = get_wordnet_pos(tag[0][1])
            if pos:
                lemmatized_word = lmtzr.lemmatize(word, pos)
                new_words.append(lemmatized_word)
            else:
                new_words.append(word)
    return new_words
Example #29
Source File: data_utils.py From dgm_latent_bow with MIT License | 5 votes |
def mscoco_read_json(file_path, bleu_baseline=False):
    """Read the mscoco dataset

    Args:
        file_path: path to the raw data, a string

    Returns:
        sentence_sets: the sentence sets, a list of paraphrase lists
    """
    print("Reading mscoco raw data .. ")
    print("  data path: %s" % file_path)

    with open(file_path, "r") as fd:
        data = json.load(fd)

    print("%d sentences in total" % len(data["annotations"]))

    # aggregate all sentences of the same images
    image_idx = set([d["image_id"] for d in data["annotations"]])
    paraphrases = {}
    for im in image_idx:
        paraphrases[im] = []
    for d in tqdm(data["annotations"]):
        im = d["image_id"]
        sent = d["caption"]
        paraphrases[im].append(word_tokenize(sent))

    sentence_sets = [paraphrases[im] for im in paraphrases]

    # bleu on the training set, a baseline / upper bound
    if(bleu_baseline):
        print("calculating bleu ... ")
        hypothesis = [s[0] for s in sentence_sets]
        references = [s[1:] for s in sentence_sets]
        bleu = corpus_bleu(references, hypothesis)
        print("bleu on the training set: %.4f" % bleu)

    return sentence_sets
Example #30
Source File: reader.py From ConvLab with MIT License | 5 votes |
def _tokenize(self, sent):
    return ' '.join(word_tokenize(sent))