Python nltk.tokenize.wordpunct_tokenize() Examples
The following are 30 code examples of nltk.tokenize.wordpunct_tokenize(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk.tokenize, or try the search function.
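For reference, wordpunct_tokenize() is NLTK's regex-based tokenizer built on the pattern \w+|[^\w\s]+, so runs of alphanumeric characters and runs of punctuation become separate tokens:

from nltk.tokenize import wordpunct_tokenize

print(wordpunct_tokenize("Good muffins cost $3.88\nin New York."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']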
Example #1
Source File: data2tensor.py From deep-summarization with MIT License | 6 votes |
def generate_vocabulary(self, review_summary_file):
    """
    :param review_summary_file:
    :return:
    """
    self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values

    for review, summary in self.rev_sum_pair:
        rev_lst = wordpunct_tokenize(review)
        sum_lst = wordpunct_tokenize(summary)
        self.__add_list_to_dict(rev_lst)
        self.__add_list_to_dict(sum_lst)

    # Now store the "" empty string as the last word of the vocabulary
    self.map[""] = len(self.map)
    self.revmap[len(self.map)] = ""
Example #2
Source File: preprocessing.py From KATE with BSD 3-Clause "New" or "Revised" License | 6 votes |
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(
            re.sub('[%s]' % re.escape(string.punctuation), ' ',
                   text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and not token in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)
    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore')))
    #     if not token.isdigit() and not token in stop_words]
Example #3
Source File: utils.py From dl4ir-webnav with BSD 3-Clause "New" or "Revised" License | 6 votes |
def text2idx2(texts, vocab, dim):
    '''
    Convert a list of texts to their corresponding vocabulary indexes.
    '''
    out = -np.ones((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)

    for i, text in enumerate(texts):
        j = 0
        for word in wordpunct_tokenize(text):
            if word in vocab:
                out[i, j] = vocab[word]
                mask[i, j] = 1.
                j += 1
                if j == dim:
                    break

    return out, mask
Example #4
Source File: utils.py From dl4ir-webnav with BSD 3-Clause "New" or "Revised" License | 6 votes |
def augment(texts, dic_thes):
    if prm.aug < 2:
        return texts

    out = []
    for text in texts:
        words_orig = wordpunct_tokenize(text)
        # define how many words will be replaced. For now, leave the
        # maximum number as 10% of the words
        maxrep = max(2, int(0.1 * len(words_orig)))

        for j in range(prm.aug):
            words = list(words_orig)  # copy

            for k in range(randint(1, maxrep)):
                idx = randint(0, len(words) - 1)
                word = words[idx]
                if word in dic_thes:
                    # choose the synonym based on a geometric distribution
                    synonym = min(np.random.geometric(0.5), len(dic_thes[word]) - 1)
                    # print 'fp', fp, "word", word, "synonym", dic_thes[word][synonym]
                    words[idx] = dic_thes[word][synonym]

            out.append(" ".join(words))

    return out
Example #5
Source File: utils.py From stochasticLDA with GNU General Public License v3.0 | 6 votes |
def parseDocument(doc, vocab):
    wordslist = list()
    countslist = list()
    doc = doc.lower()
    tokens = wordpunct_tokenize(doc)

    dictionary = dict()
    for word in tokens:
        if word in vocab:
            wordtk = vocab[word]
            if wordtk not in dictionary:
                dictionary[wordtk] = 1
            else:
                dictionary[wordtk] += 1

    wordslist.append(dictionary.keys())
    countslist.append(dictionary.values())
    return (wordslist[0], countslist[0])
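A quick sanity check of parseDocument() above; the vocabulary mapping and the document are made up for illustration only:

from nltk.tokenize import wordpunct_tokenize

vocab = {"cat": 0, "sat": 1, "mat": 2}   # toy word -> id mapping
ids, counts = parseDocument("The cat sat on the mat. The cat!", vocab)
print(list(ids))     # [0, 1, 2]  -- token ids found in the document
print(list(counts))  # [2, 1, 1]  -- matching occurrence counts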
Example #6
Source File: util.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def read_wordpunct_block(stream):
    toks = []
    for i in range(20):  # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
Example #7
Source File: lemma.py From cltk with MIT License | 5 votes |
def lemmatize(self, text, best_guess=True, return_frequencies=False):
    """Lemmatize all tokens in a string or a list. A string is first tokenized using punkt.
    Throw a type error if the input is neither a string nor a list.
    """
    if isinstance(text, str):
        tokens = wordpunct_tokenize(text)
    elif isinstance(text, list):
        tokens = text
    else:
        raise TypeError("lemmatize only works with strings or lists of string tokens.")

    return [self._lemmatize_token(token, best_guess, return_frequencies) for token in tokens]
Example #8
Source File: preprocessing.py From idea_relations with MIT License | 5 votes |
def tokenize(text, filter_stopwords=False, lowercase=True):
    words = wordpunct_tokenize(text)
    if filter_stopwords:
        words = [w for w in words if w not in STOPWORDS]
    return words
Example #9
Source File: preprocessing.py From KATE with BSD 3-Clause "New" or "Revised" License | 5 votes |
def tiny_tokenize_xml(text, stem=False, stop_words=[]):
    return [EnglishStemmer().stem(token) if stem else token
            for token in wordpunct_tokenize(
                re.sub('[%s]' % re.escape(string.punctuation), ' ',
                       text.encode(encoding='ascii', errors='ignore')))
            if not token.isdigit() and not token in stop_words]
Example #10
Source File: strUtil.py From NNED with MIT License | 5 votes |
def locateWord(word, wordsArr):
    if word in wordsArr:
        return wordsArr.index(word)
    else:
        idxs = [wordsArr.index(w) for w in wordsArr if word in wordpunct_tokenize(w)]
        return idxs[0]
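A small illustration of the fallback branch above, which searches for the word inside multi-token entries (the inputs are made up for illustration only):

from nltk.tokenize import wordpunct_tokenize

words = ["New", "York-based", "company"]
print(locateWord("New", words))   # 0 -- exact match, first branch
print(locateWord("York", words))  # 1 -- found inside wordpunct_tokenize("York-based")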
Example #11
Source File: prepareDataSet_joint.py From NNED with MIT License | 5 votes |
def negSent2JointTrain(negSents, posSentNum):
    neg_training_data = []
    for sentId, (sent_id, sent) in enumerate(negSents):
        wordsIn = wordpunct_tokenize(sent)
        sent = " ".join(wordsIn)
        eventTypeSequence = ["O" for i in range(len(wordsIn))]
        neg_training_data.append((str(sentId + posSentNum), sent, eventTypeSequence))
    return neg_training_data
Example #12
Source File: pos.py From cltk with MIT License | 5 votes |
def tag_perceptron(self, untagged_string: str):
    """Tag POS with Perceptron tagger.

    :type untagged_string: str
    :param untagged_string: An untagged, untokenized string of text.
    :rtype tagged_text: str
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    pickle_path = self.available_taggers['perceptron']
    tagger = open_pickle(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text
Example #13
Source File: utils.py From dl4ir-query-reformulator with BSD 3-Clause "New" or "Revised" License | 5 votes |
def BOW2(texts, vocab, dim):
    '''
    Convert a list of texts to the BoW dense representation.
    '''
    out = np.zeros((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)

    for i, text in enumerate(texts):
        bow = BOW(wordpunct_tokenize(text), vocab)
        out[i, :len(bow[0])] = bow[0]
        mask[i, :len(bow[1])] = bow[1]

    return out, mask
Example #14
Source File: utils.py From dl4ir-query-reformulator with BSD 3-Clause "New" or "Revised" License | 5 votes |
def Word2Vec_encode(texts, wemb):
    out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)
        n = 0.
        for word in words:
            if word in wemb:
                out[i, :] += wemb[word]
                n += 1.
        out[i, :] /= max(1., n)
    return out
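A minimal usage sketch for the averaging encoder above. The prm stand-in and the toy wemb lookup are assumptions; the real project loads them from a config module and pretrained embeddings. It assumes the function is pasted into the same script:

import numpy as np
from nltk.tokenize import wordpunct_tokenize

class prm:                      # stand-in for the project's config module (assumption)
    dim_emb = 3

wemb = {                        # toy word-embedding lookup (assumption)
    "hello": np.array([1.0, 0.0, 0.0], dtype=np.float32),
    "world": np.array([0.0, 1.0, 0.0], dtype=np.float32),
}

vecs = Word2Vec_encode(["hello world !", "no known tokens here"], wemb)
print(vecs[0])  # mean of the vectors for "hello" and "world"; "!" is skipped
print(vecs[1])  # all zeros: nothing matched, and the divisor is clamped to 1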
Example #15
Source File: utils.py From dl4ir-query-reformulator with BSD 3-Clause "New" or "Revised" License | 5 votes |
def text2idx2(texts, vocab, dim, use_mask=False):
    '''
    Convert a list of texts to their corresponding vocabulary indexes.
    '''
    if use_mask:
        out = -np.ones((len(texts), dim), dtype=np.int32)
        mask = np.zeros((len(texts), dim), dtype=np.float32)
    else:
        out = -2 * np.ones((len(texts), dim), dtype=np.int32)

    out_lst = []
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)[:dim]
        for j, word in enumerate(words):
            if word in vocab:
                out[i, j] = vocab[word]
            else:
                out[i, j] = -1  # Unknown words
        out_lst.append(words)

        if use_mask:
            mask[i, :j] = 1.

    if use_mask:
        return out, mask, out_lst
    else:
        return out, out_lst
Example #16
Source File: rake.py From rake-nltk with MIT License | 5 votes |
def _generate_phrases(self, sentences):
    """Method to generate contender phrases given the sentences of the text
    document.

    :param sentences: List of strings where each string represents a
                      sentence which forms the text.
    :return: Set of string tuples where each tuple is a collection of words
             forming a contender phrase.
    """
    phrase_list = set()
    # Create contender phrases from sentences.
    for sentence in sentences:
        word_list = [word.lower() for word in wordpunct_tokenize(sentence)]
        phrase_list.update(self._get_phrase_list_from_words(word_list))
    return phrase_list
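_generate_phrases() is an internal helper of the Rake class; for context, the public rake-nltk interface that ends up calling it looks roughly like this (a sketch; it needs the rake-nltk package plus NLTK's stopwords and punkt data):

from rake_nltk import Rake

r = Rake()  # defaults to NLTK English stopwords and wordpunct tokenization
r.extract_keywords_from_text("Keyword extraction is not that hard after all.")
print(r.get_ranked_phrases())  # candidate phrases, highest-scoring first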
Example #17
Source File: znltk.py From csirtg-smrt-v1 with Mozilla Public License 2.0 | 5 votes |
def top_tokens(text):
    freq_dict = defaultdict(int)

    tokens = wordpunct_tokenize(text)
    for token in tokens:
        freq_dict[token] += 1

    return sorted(freq_dict, key=freq_dict.get, reverse=True)
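A minimal run of top_tokens() above (the input string is made up for illustration):

from collections import defaultdict
from nltk.tokenize import wordpunct_tokenize

print(top_tokens("spam, spam, spam, eggs and spam"))
# ['spam', ',', 'eggs', 'and']  -- unique tokens, most frequent first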
Example #18
Source File: util.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def read_wordpunct_block(stream):
    toks = []
    for i in range(20):  # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks
Example #19
Source File: transcription.py From cltk with MIT License | 5 votes |
def transcribe(self, text, accentuate=True, syllabify=True):
    # input is word-tokenized, stripped of non-diacritic punctuation,
    # and diphthongs and diacritics are handled
    inp = [self._prep_text(w) for w in wordpunct_tokenize(text) if w not in self.punc]
    words = []
    for w in inp:
        out = ""
        for c in w:
            ipa = self.table.get(c[0], c[0])
            # if there are macrons in the diacritics, adds the ipa
            # notation for length (if it isn't there already)
            if chars.LONG in c[2]:
                if "ː" not in ipa:
                    ipa = ipa[0] + "ː" + ipa[1:]
            if accentuate:
                # adds proper IPA notation for accents
                # if circumflex accent, adds appropriate
                # ipa tone contour notation
                if chars.CIRCUMFLEX in c[1]:
                    ipa = ipa[0] + "̂" + ipa[1:]
                # if acute accent, adds appropriate
                # ipa tone contour notation
                if chars.ACUTE in c[1]:
                    if len(ipa) > 1:
                        ipa = ipa[0] + "́" + ipa[1:]
                    else:
                        ipa += "́"
            out += ipa
        transcription = Word(out, self.root)
        transcription._alternate()
        words.append(transcription)
    # Encloses output in brackets, proper notation for surface form.
    return "[" + " ".join([w._print_ipa(syllabify) for w in words]) + "]"
Example #20
Source File: transcription.py From cltk with MIT License | 5 votes |
def transcribe(self, text, macronize=True, syllabify=True, accentuate=True):
    # if macronize, will first use the tagger to macronize input
    # otherwise, input will be the raw input string
    if macronize:
        text = self.macronizer.macronize_text(text)
    # input is word-tokenized, stripped of non-diacritic punctuation,
    # and diphthongs and diacritics are handled
    inp = [self._prep_text(w) for w in wordpunct_tokenize(text) if w not in self.punc]
    words = []
    for w in inp:
        out = ""
        for c in w:
            if "̄" in c[1]:
                macron_added = c[0] + '̄'
                ipa = self.table.get(macron_added, macron_added)
            else:
                ipa = self.table.get(c[0], c[0])
            out += ipa
        transcription = Word(out, self.root)
        transcription._alternate()
        words.append(transcription)
    # Encloses output in brackets, proper notation for surface form.
    return "[" + " ".join([w._print_ipa(syllabify, accentuate) for w in words]) + "]"
Example #21
Source File: pos.py From cltk with MIT License | 5 votes |
def tag_crf(self, untagged_string: str):
    """Tag POS with CRF tagger.

    :type untagged_string: str
    :param untagged_string: An untagged, untokenized string of text.
    :rtype tagged_text: str
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    pickle_path = self.available_taggers['crf']
    tagger = CRFTagger()
    tagger.set_model_file(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text
Example #22
Source File: pos.py From cltk with MIT License | 5 votes |
def tag_ngram_12_backoff(self, untagged_string: str):
    """Tag POS with 1-, 2-gram tagger.

    :type untagged_string: str
    :param untagged_string: An untagged, untokenized string of text.
    :rtype tagged_text: str
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    pickle_path = self.available_taggers['ngram_12_backoff']
    tagger = open_pickle(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text
Example #23
Source File: pos.py From cltk with MIT License | 5 votes |
def tag_ngram_123_backoff(self, untagged_string: str):
    """Tag POS with 1-, 2-, 3-gram tagger.

    :type untagged_string: str
    :param untagged_string: An untagged, untokenized string of text.
    :rtype tagged_text: str
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    pickle_path = self.available_taggers['ngram_123_backoff']
    tagger = open_pickle(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text
Example #24
Source File: pos.py From cltk with MIT License | 5 votes |
def tag_trigram(self, untagged_string: str):
    """Tag POS with trigram tagger.

    :type untagged_string: str
    :param untagged_string: An untagged, untokenized string of text.
    :rtype tagged_text: str
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    pickle_path = self.available_taggers['trigram']
    tagger = open_pickle(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text
Example #25
Source File: pos.py From cltk with MIT License | 5 votes |
def tag_bigram(self, untagged_string: str):
    """Tag POS with bigram tagger.

    :type untagged_string: str
    :param untagged_string: An untagged, untokenized string of text.
    :rtype tagged_text: str
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    pickle_path = self.available_taggers['bigram']
    tagger = open_pickle(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text
Example #26
Source File: pos.py From cltk with MIT License | 5 votes |
def tag_unigram(self, untagged_string: str):
    """Tag POS with unigram tagger.

    :type untagged_string: str
    :param untagged_string: An untagged, untokenized string of text.
    :rtype tagged_text: str
    """
    untagged_tokens = wordpunct_tokenize(untagged_string)
    pickle_path = self.available_taggers['unigram']
    tagger = open_pickle(pickle_path)
    tagged_text = tagger.tag(untagged_tokens)
    return tagged_text
Example #27
Source File: utils.py From dl4ir-webnav with BSD 3-Clause "New" or "Revised" License | 5 votes |
def Word2Vec_encode(texts, wemb):
    out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)
        n = 0.
        for word in words:
            if word in wemb:
                out[i, :] += wemb[word]
                n += 1.
        out[i, :] /= max(1., n)
    return out
Example #28
Source File: utils.py From dl4ir-webnav with BSD 3-Clause "New" or "Revised" License | 5 votes |
def BOW2(texts, vocab, dim):
    '''
    Convert a list of texts to the BoW dense representation.
    '''
    out = np.zeros((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)

    for i, text in enumerate(texts):
        bow = BOW(wordpunct_tokenize(text), vocab)
        out[i, :len(bow[0])] = bow[0]
        mask[i, :len(bow[1])] = bow[1]

    return out, mask
Example #29
Source File: transform.py From RecNet with MIT License | 5 votes |
def __call__(self, sentence):
    return wordpunct_tokenize(sentence)
Example #30
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def read_wordpunct_block(stream):
    toks = []
    for i in range(20):  # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks