Python nltk.word_tokenize() Examples
The following are 30 code examples of nltk.word_tokenize(), drawn from open-source projects. The original project and source file for each example are noted above it. You may also want to check out the other available functions and classes of the nltk module.
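Before the project examples, here is a minimal usage sketch of the function itself (the sample sentence below is made up). nltk.word_tokenize() relies on the Punkt tokenizer models, so they need to be downloaded once:

import nltk

nltk.download('punkt')  # one-time download; newer NLTK releases may also ask for 'punkt_tab'

text = "NLTK makes tokenization easy. It splits punctuation, too!"
tokens = nltk.word_tokenize(text)
print(tokens)
# ['NLTK', 'makes', 'tokenization', 'easy', '.', 'It', 'splits', 'punctuation', ',', 'too', '!']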
Example #1
Source File: kaggle.py From dl-models-for-qa with Apache License 2.0 | 8 votes |
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples
Example #2
Source File: math_expression_calculator.py From JARVIS with Apache License 2.0 | 8 votes |
def text_to_num(text):
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    for subtree in chunkedData.subtrees(filter=lambda t: t.label() in "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
        print("text2num -> ", text)
    return text
Example #3
Source File: utils.py From lisc with Apache License 2.0 | 6 votes |
def convert_string(text):
    """Convert a str of text into tokenized and selected list of words.

    Parameters
    ----------
    text : str
        Text as one long string.

    Returns
    -------
    words_cleaned : list of str
        List of tokenized words, after processing.

    Notes
    -----
    This function sets text to lower case, and removes stopwords and punctuation.
    """

    words = word_tokenize(text)
    words_cleaned = [word.lower() for word in words if (
        (not word.lower() in stopwords.words('english')) & word.isalnum())]

    return words_cleaned
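For reference, a hypothetical call of the function above (it assumes the NLTK 'punkt' and 'stopwords' data are available and the imports used by the snippet):

print(convert_string("The quick brown fox jumps over the lazy dog!"))
# ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']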
Example #4
Source File: nltk_plugin.py From self-attentive-parser with MIT License | 6 votes |
def _nltk_process_sents(self, sents):
    for sentence in sents:
        if isinstance(sentence, STRING_TYPES):
            if self._tokenizer_lang is None:
                raise ValueError(
                    "No word tokenizer available for this language. "
                    "Please tokenize before calling the parser."
                )
            sentence = nltk.word_tokenize(sentence, self._tokenizer_lang)

        if IS_PY2:
            sentence = [
                word.decode('utf-8', 'ignore') if isinstance(word, str) else word
                for word in sentence
            ]

        if not self._provides_tags:
            sentence = nltk.pos_tag(sentence)
            yield [word for word, tag in sentence], sentence
        else:
            yield sentence, sentence
Example #5
Source File: utils.py From Text-Classification-Models-Pytorch with MIT License | 6 votes |
def encode_text(text, word_embeddings, max_sen_len):
    '''
    Encode a sequence of words into corresponding vector representation
    Input:
        text (string) : text (space separated words, etc..)
        word_embeddings (dict) : dictionary mapping from words to their representation
        max_sen_len (int) : maximum sentence length (in words)
    Returns:
        X (np.matrix) : matrix of shape (max_sen_len, embedding_size) after zero padding
    '''
    default_embed = np.zeros(300)
    words = word_tokenize(text)[:max_sen_len]
    embeds = [word_embeddings.get(x, default_embed) for x in words]
    embeds += [default_embed] * (max_sen_len - len(embeds))
    return np.array(embeds, dtype=np.float32)
Example #6
Source File: utils.py From Text-Classification-Models-Pytorch with MIT License | 6 votes |
def encode_text(text, word_embeddings):
    '''
    Encode a sequence of words into corresponding vector representation
    Input:
        text (string) : text (space separated words, etc..)
        word_embeddings (dict) : dictionary mapping from words to their representation
    Returns:
        X (np.array) : array of shape (embedding_size,) averaging all word vectors of text
    '''
    embed = np.zeros(300)
    count = 0
    words = word_tokenize(text)
    for word in words:
        if word in word_embeddings:
            embed += word_embeddings[word]
            count += 1
    return embed / count
Example #7
Source File: VectorSpaceModel.py From Snowball with GNU General Public License v3.0 | 6 votes |
def __init__(self, sentences_file, stopwords):
    self.dictionary = None
    self.corpus = None
    f_sentences = codecs.open(sentences_file, encoding='utf-8')
    documents = list()
    count = 0
    print("Gathering sentences and removing stopwords")
    for line in f_sentences:
        line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

        # remove stop words and tokenize
        document = [word for word in nltk.word_tokenize(line.lower()) if word not in stopwords]
        documents.append(document)
        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")

    f_sentences.close()

    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)

    print(len(documents), "documents read")
    print(len(self.dictionary), "unique tokens")
Example #8
Source File: combined.py From Projects with MIT License | 6 votes |
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text)
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos + tuple[0].lower()][0]
            neg_score += sentiwordnet[pos + tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score
Example #9
Source File: sentiwordnet.py From Projects with MIT License | 6 votes |
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text.decode('utf8', 'ignore').replace('<br />', ' '))
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos + tuple[0].lower()][0]
            neg_score += sentiwordnet[pos + tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score
Example #10
Source File: evaluate.py From BERT with Apache License 2.0 | 6 votes |
def mixed_segmentation(in_str, rm_punc=False):
    in_str = str(in_str).decode('utf-8').lower().strip()
    segs_out = []
    temp_str = ""
    sp_char = ['-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=',
               ',', '。', ':', '?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、',
               '「', '」', '(', ')', '-', '~', '『', '』']
    for char in in_str:
        if rm_punc and char in sp_char:
            continue
        if re.search(ur'[\u4e00-\u9fa5]', char) or char in sp_char:
            if temp_str != "":
                ss = nltk.word_tokenize(temp_str)
                segs_out.extend(ss)
                temp_str = ""
            segs_out.append(char)
        else:
            temp_str += char

    # handling last part
Example #11
Source File: nlp-6.4-tfidf-svm.py From Hands-on-NLP-with-NLTK-and-scikit-learn- with MIT License | 6 votes |
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    stop_words = nltk.corpus.stopwords.words("english")

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear more than once
        ngram_range=(1, 2),
        stop_words=stop_words)
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus
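A rough usage sketch of the function above (the toy corpus is made up; it assumes `from sklearn import feature_extraction`, `import nltk`, and the NLTK stopword and Punkt data already downloaded):

corpus = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "Cats and dogs sat together on the mat.",
]
tfidf_matrix = extract_features(corpus)
# One row per document; columns are the unigrams/bigrams kept by min_df=2.
print(tfidf_matrix.shape)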
Example #12
Source File: analyze_data.py From Machine-Translation with Apache License 2.0 | 6 votes |
def analyze_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []

    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        seg_list = list(jieba.cut(sentence.strip()))
        # Update word frequency
        sent_lengths.append(len(seg_list))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show()
Example #13
Source File: nlp-5-document-classification.py From Hands-on-NLP-with-NLTK-and-scikit-learn- with MIT License | 6 votes |
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    stop_words = nltk.corpus.stopwords.words("english")

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear more than once
        ngram_range=(1, 2),
        stop_words=stop_words)
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus
Example #14
Source File: data_prep.py From Document-Classifier-LSTM with MIT License | 6 votes |
def preprocess(text):
    min_length = 3
    text = re.sub('\d+', '#', text)
    text = re.sub('\.', ' eos ', text)
    # Tokenize
    words = map(lambda word: word.lower(), word_tokenize(text))
    tokens = words
    # Remove non characters
    p = re.compile('[a-zA-Z#]+')
    # Filter tokens (we do not remove stopwords)
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token) >= min_length and (token not in english_stopwords), tokens))
    # Encode to ascii
    filtered_tokens = [token.encode('ascii', 'ignore') for token in filtered_tokens]

    return filtered_tokens


# Modify this path
Example #15
Source File: Mem2Seq.py From ConvLab with MIT License | 6 votes |
def predict(self, query):
    usr = query
    print('Mem2Seq usr:', usr)  # example input: 'please find a restaurant called nusha .'
    self.t += 1
    print('Mem2Seq turn:', self.t)
    usr = ' '.join(word_tokenize(usr.lower()))
    self.memory += generate_memory(usr, '$u', self.t)
    src_plain = (self.memory + [['$$$$'] * MEM_TOKEN_SIZE],)
    src_seqs = plain2tensor(self.lang.word2index, src_plain[0])
    words = self.model.evaluate_batch(1, src_seqs, [len(src_plain[0])], None, None, None, None, src_plain)
    row = np.transpose(words)[0].tolist()
    if '<EOS>' in row:
        row = row[:row.index('<EOS>')]
    sys = ' '.join(row)
    sys = denormalize(sys)
    print('Mem2Seq sys:', sys)
    self.memory += generate_memory(sys, '$s', self.t)
    return sys
Example #16
Source File: phrasemachine.py From scattertext with Apache License 2.0 | 6 votes |
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    # return pkgutil.get_data('scattertext',
    #                         'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    # tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    # tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))

    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)


# http://www.nltk.org/book/ch05.html
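The comments in the snippet above note that nltk.word_tokenize() wraps the Treebank word tokenizer. A small sketch of the two call styles (the sample sentence is made up):

import nltk
from nltk.tokenize import TreebankWordTokenizer

sentence = "This parser doesn't need the downloader."

# nltk.word_tokenize first runs Punkt sentence splitting (downloaded data),
# then applies the Treebank word tokenizer to each sentence.
print(nltk.word_tokenize(sentence))

# Calling the Treebank tokenizer directly is pure regex work, so no download is needed.
print(TreebankWordTokenizer().tokenize(sentence))
# Both print something like: ['This', 'parser', 'does', "n't", 'need', 'the', 'downloader', '.']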
Example #17
Source File: classify.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License | 5 votes |
def prepare_sent_features():
    for pid, text in fetch_posts(chosen, with_index=True):
        if not text:
            meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0
        else:
            sent_lens = [len(nltk.word_tokenize(sent))
                         for sent in nltk.sent_tokenize(text)]
            meta[pid]['AvgSentLen'] = np.mean(sent_lens)
            meta[pid]['AvgWordLen'] = np.mean(
                [len(w) for w in nltk.word_tokenize(text)])

        meta[pid]['NumAllCaps'] = np.sum(
            [word.isupper() for word in nltk.word_tokenize(text)])
        meta[pid]['NumExclams'] = text.count('!')
Example #18
Source File: tokenizers.py From atis with MIT License | 5 votes |
def nl_tokenize(string):
    """Tokenizes a natural language string into tokens.

    Inputs:
        string: the string to tokenize.
    Outputs:
        a list of tokens.

    Assumes data is space-separated (this is true of ZC07 data in ATIS2/3).
    """
    return nltk.word_tokenize(string)
Example #19
Source File: EvaluateTruecaser.py From truecaser with Apache License 2.0 | 5 votes |
def evaluateTrueCaser(testSentences, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    correctTokens = 0
    totalTokens = 0

    for sentence in testSentences:
        tokensCorrect = nltk.word_tokenize(sentence)
        tokens = [token.lower() for token in tokensCorrect]
        tokensTrueCase = getTrueCase(tokens, 'title', wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)

        perfectMatch = True

        for idx in xrange(len(tokensCorrect)):
            totalTokens += 1
            if tokensCorrect[idx] == tokensTrueCase[idx]:
                correctTokens += 1
            else:
                perfectMatch = False

        if not perfectMatch:
            print tokensCorrect
            print tokensTrueCase
            print "-------------------"

    print "Accuracy: %.2f%%" % (correctTokens / float(totalTokens) * 100)
Example #20
Source File: yelp.py From Point-Then-Operate with Apache License 2.0 | 5 votes |
def get_references(self):
    assert self.mode == 'test', 'Only test mode support get_references().'
    path0 = os.path.join(self.root, 'reference.0')
    path1 = os.path.join(self.root, 'reference.1')
    ref0 = []
    ori0 = []
    ref1 = []
    ori1 = []
    with open(path0) as f:
        for i, line in enumerate(f):
            if i in self.remove0:
                continue
            ori, ref = line.split('\t')
            ori = ori.split()
            ref = word_tokenize(ref.lower())
            ori0.append(ori)
            ref0.append(ref)
    with open(path1) as f:
        for i, line in enumerate(f):
            if i in self.remove1:
                continue
            ori, ref = line.split('\t')
            ori = ori.split()
            ref = word_tokenize(ref.lower())
            ori1.append(ori)
            ref1.append(ref)
    ori0 = [[w if w in self.vocab.word2id else self.vocab.id2word[self.unk] for w in sent] for sent in ori0]
    ref0 = [[w if w in self.vocab.word2id else self.vocab.id2word[self.unk] for w in sent] for sent in ref0]
    ori1 = [[w if w in self.vocab.word2id else self.vocab.id2word[self.unk] for w in sent] for sent in ori1]
    ref1 = [[w if w in self.vocab.word2id else self.vocab.id2word[self.unk] for w in sent] for sent in ref1]
    return ori0, ref0, ori1, ref1
Example #21
Source File: amazon.py From Point-Then-Operate with Apache License 2.0 | 5 votes |
def create_resort(self):
    """The file of human references is not originally aligned with the test split."""
    _ids = []
    ref_gt = []
    with open(os.path.join(self.root, 'reference.0'), 'r') as f:
        for line in tqdm(f):
            ori, ref = line.split('\t')
            ori = ori.split()
            ref = word_tokenize(ref.lower())
            ref_gt.append(ori)
    for sent in self.data0:
        _ids.append(ref_gt.index(sent))
    with open(os.path.join(self.root, 'resort_0.txt'), 'w') as f:
        for _id in _ids:
            f.write(str(_id) + '\n')

    _ids = []
    ref_gt = []
    with open(os.path.join(self.root, 'reference.1'), 'r') as f:
        for line in tqdm(f):
            ori, ref = line.split('\t')
            ori = ori.split()
            ref = word_tokenize(ref.lower())
            ref_gt.append(ori)
    for sent in self.data1:
        _ids.append(ref_gt.index(sent))
    with open(os.path.join(self.root, 'resort_1.txt'), 'w') as f:
        for _id in _ids:
            f.write(str(_id) + '\n')
Example #22
Source File: tokens.py From chimera with MIT License | 5 votes |
def tokenize(s):
    return word_tokenize(s.replace("|", " | "))
Example #23
Source File: flashcards-embedding.py From dl-models-for-qa with Apache License 2.0 | 5 votes |
def __iter__(self):
    for line in open(self.filename, "rb"):
        line = line.strip()
        line = line.decode("utf8").encode("ascii", "ignore")
        _, question, answer = line.split("\t")
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        yield qwords + awords


# build model from sentences (CBOW w/negative sampling)
Example #24
Source File: textpro.py From comparable-text-miner with Apache License 2.0 | 5 votes |
def getLemma(text, contextFlag=False):
    lemmatizer = WordNetLemmatizer()
    # 'NN':wordnet.NOUN, 'JJ':wordnet.ADJ, 'VB':wordnet.VERB, 'RB':wordnet.ADV
    wordnet_tag = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
    result = None
    if len(text.split()) == 1:  # one word
        tokenized = word_tokenize(text)
        tagged = pos_tag(tokenized)[0]
        lemma = ''
        try:
            lemma = lemmatizer.lemmatize(tagged[0], wordnet_tag[tagged[1][:2]])
        except:
            lemma = lemmatizer.lemmatize(tagged[0])
        result = lemma
    elif len(text.split()) > 1 and contextFlag == True:  # multiple words, i.e. text, without considering the context
        resultList = []
        for t in text.split():
            tokenized = word_tokenize(t)
            tagged = pos_tag(tokenized)[0]
            lemma = ''
            try:
                lemma = lemmatizer.lemmatize(tagged[0], wordnet_tag[tagged[1][:2]])
            except:
                lemma = lemmatizer.lemmatize(tagged[0])
            resultList.append(lemma)
        result = ' '.join(resultList)
    else:  # multiple words, i.e. text, considering the context
        resultList = []
        tokens = word_tokenize(text)
        tagged = pos_tag(tokens)
        for t in tagged:
            try:
                resultList.append(lemmatizer.lemmatize(t[0], wordnet_tag[t[1][:2]]))
            except:
                resultList.append(lemmatizer.lemmatize(t[0]))
        result = ' '.join(resultList)
    return result

###################################################################################

# Given a Naive Bayes classifier, classify a text with a given certainty
Example #25
Source File: rd_ft.py From DeepLearn with MIT License | 5 votes |
def nouns(text):
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)]
    return nouns


# Average edit distance value for two strings, and the average edit distance between the nouns present in them (returns float)
Example #26
Source File: hnd_ft.py From DeepLearn with MIT License | 5 votes |
def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]
Example #27
Source File: data.py From fever-naacl-2018 with Apache License 2.0 | 5 votes |
def nltk_tokenizer(self, text):
    return " ".join(word_tokenize(text))
Example #28
Source File: chatbot.py From facebook-chatbot-python with MIT License | 5 votes |
def __init__(self, text):
    tokens = nltk.word_tokenize(text)
    print "tokenized"
    self._model = nltk.model.ngram.NgramModel(3, tokens)
    print "modelized"
Example #29
Source File: feature_engineering.py From DeepLearn with MIT License | 5 votes |
def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]
Example #30
Source File: preprocessors.py From philo2vec with MIT License | 5 votes |
def get_words(text_stream):
    """
    Tokenize and transform a stream of text.

    :param text_stream: list of sentences.
    :return: return the tokenized sentences after stemming and lower casing
    """
    return [StemmingLookup.stem(word.lower())
            for line in text_stream
            for word in word_tokenize(line)
            if word not in string.punctuation]