Python nltk.wordpunct_tokenize() Examples
The following are 19 code examples of nltk.wordpunct_tokenize(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk, or try the search function.
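All of the examples below lean on the same behaviour: wordpunct_tokenize() splits text on whitespace and keeps runs of punctuation as separate tokens (it is a thin wrapper around a regexp tokenizer using the pattern \w+|[^\w\s]+). A quick illustration of that behaviour:

from nltk import wordpunct_tokenize

# Alphanumeric runs and punctuation runs become separate tokens.
print(wordpunct_tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']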
Example #1
Source File: preprocess.py From atap with Apache License 2.0 | 5 votes |
def tokenize(self, fileid):
    """
    Segments, tokenizes, and tags a document in the corpus. Returns a
    generator of paragraphs, which are lists of sentences, which in turn
    are lists of part of speech tagged words.
    """
    for paragraph in self.corpus.paras(fileids=fileid):
        yield [
            pos_tag(wordpunct_tokenize(sent))
            for sent in sent_tokenize(paragraph)
        ]
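The same segment-tokenize-tag pipeline can be sketched outside the corpus reader class above; the sample paragraph below is illustrative, and it assumes the NLTK 'punkt' sentence tokenizer and the default POS tagger models have been downloaded:

from nltk import pos_tag, sent_tokenize, wordpunct_tokenize

paragraph = "NLTK ships a sentence segmenter. It also ships a default POS tagger."

# One list of (token, tag) pairs per sentence, mirroring the generator above.
tagged = [pos_tag(wordpunct_tokenize(sent)) for sent in sent_tokenize(paragraph)]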
Example #2
Source File: helper.py From transferable_sent2vec with MIT License | 5 votes |
def tokenize_and_normalize(s):
    """Tokenize and normalize string."""
    token_list = []
    tokens = wordpunct_tokenize(s.lower())
    token_list.extend([
        x for x in tokens
        if not re.fullmatch('[' + string.punctuation + ']+', x)
    ])
    return token_list
Example #3
Source File: helper.py From transferable_sent2vec with MIT License | 5 votes |
def tokenize_and_normalize(s):
    """Tokenize and normalize string."""
    token_list = []
    tokens = wordpunct_tokenize(s.lower())
    token_list.extend([
        x for x in tokens
        if not re.fullmatch('[' + string.punctuation + ']+', x)
    ])
    return token_list
Example #4
Source File: utils.py From dialog with Apache License 2.0 | 5 votes |
def tokenize(sent):
    tokens = tokenizer.tokenize(sent)
    ret = []
    for t in tokens:
        if '<' not in t:
            ret.extend(wordpunct_tokenize(t))
        else:
            ret.append(t)
    return ret
Example #5
Source File: cluster.py From dialog with Apache License 2.0 | 5 votes |
def normalize(sent):
    return wordpunct_tokenize(sent.lower())
Example #6
Source File: natural_language.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def calculate_language_scores(text):
    """
    Calculate probability of given text to be written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    :param text: Text to analyze.
    :type text: str

    :return: Dictionary with languages and unique stopwords seen in analyzed text.
    :rtype: dict(str -> int)

    :raises: TypeError
    """
    if not isinstance(text, basestring):
        raise TypeError("Expected basestring, got '%s' instead" % type(text))
    if not text:
        return {}

    languages_ratios = {}

    # Split the text into separate tokens, using natural language punctuation signs.
    tokens = wordpunct_tokenize(text)
    tokenized_words = [word.lower() for word in tokens]

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(tokenized_words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios


#------------------------------------------------------------------------------
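The function above targets Python 2 (it type-checks against basestring). A self-contained Python 3 sketch of the same stopword-overlap idea, assuming the NLTK 'stopwords' corpus has been downloaded; the guess_language() helper is a made-up name that simply returns the highest-scoring language:

from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

def guess_language(text):
    # Lowercased token set of the input text.
    words = {word.lower() for word in wordpunct_tokenize(text)}
    # Score each language by how many of its stopwords appear in the text.
    scores = {lang: len(words & set(stopwords.words(lang)))
              for lang in stopwords.fileids()}
    # The language sharing the most stopwords with the text wins.
    return max(scores, key=scores.get)

print(guess_language("Le chat est sur la table et il dort."))  # most likely 'french'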
Example #7
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def tagged_tokens(self):
    for sent in self.sents():
        for word in nltk.wordpunct_tokenize(sent):
            yield nltk.pos_tag(word)
Example #8
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def words(self):
    """
    Returns a generator of words.
    """
    for sent in self.sents():
        for word in nltk.wordpunct_tokenize(sent):
            yield word
Example #9
Source File: am_reader.py From atap with Apache License 2.0 | 5 votes |
def words(self):
    """
    Returns a generator of words.
    """
    for sent in self.sents():
        for word in nltk.wordpunct_tokenize(sent):
            yield word
Example #10
Source File: features.py From product-classifier with MIT License | 5 votes |
def tokenize(self, text):
    """
    Returns a list of individual tokens from the text utilizing NLTK's
    tokenize built in utility (far better than split on space). It also
    removes any stopwords and punctuation from the text, as well as
    ensure that every token is normalized.

    For now, token = word as in bag of words (the feature we're using).
    """
    for token in wordpunct_tokenize(text):
        token = self.normalize(token)
        if token in self.punctuation:
            continue
        if token in self.stopwords:
            continue
        yield token
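The normalize(), punctuation and stopwords attributes above belong to the surrounding class. A self-contained sketch of the same filtering idea, where lowercasing as the normalization step and the English stopword list are assumptions (the NLTK 'stopwords' corpus must be downloaded):

import string

from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))
PUNCT = set(string.punctuation)

def tokenize(text):
    for token in wordpunct_tokenize(text):
        token = token.lower()                     # minimal normalization
        if token in PUNCT or token in STOPWORDS:  # drop punctuation and stopwords
            continue
        yield token

print(list(tokenize("The quick, brown fox jumps over the lazy dog!")))
# ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'] with the default English stopword list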
Example #11
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    started = time.time()

    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        counts['paras'] += 1

        for sent in sent_tokenize(para):
            counts['sents'] += 1

            for word in wordpunct_tokenize(sent):
                counts['words'] += 1
                tokens[word] += 1

    # Compute the number of files and categories in the corpus
    n_fileids = len(self.resolve(fileids, categories) or self.fileids())
    n_topics = len(self.categories(self.resolve(fileids, categories)))

    # Return data structure with information
    return {
        'files': n_fileids,
        'topics': n_topics,
        'paras': counts['paras'],
        'sents': counts['sents'],
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
        'ppdoc': float(counts['paras']) / float(n_fileids),
        'sppar': float(counts['sents']) / float(counts['paras']),
        'secs': time.time() - started,
    }
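The 'lexdiv' figure computed above is plain lexical diversity: the total word count divided by the vocabulary size. The same counting can be reproduced on a single raw string without the corpus reader; the sample text is illustrative and the 'punkt' model is assumed to be downloaded:

import nltk
from nltk import sent_tokenize, wordpunct_tokenize

text = ("Tokenizers split text into tokens. "
        "Counting those tokens gives word, sentence and vocabulary statistics.")

tokens = nltk.FreqDist()
n_sents = n_words = 0
for sent in sent_tokenize(text):
    n_sents += 1
    for word in wordpunct_tokenize(sent):
        n_words += 1
        tokens[word] += 1

lexdiv = n_words / len(tokens)  # words per unique token, the 'lexdiv' metric above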
Example #12
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def tokenize(self, fileids=None, categories=None):
    """
    Segments, tokenizes, and tags a document in the corpus.
    """
    for paragraph in self.paras(fileids=fileids):
        yield [
            pos_tag(wordpunct_tokenize(sent))
            for sent in sent_tokenize(paragraph)
        ]
Example #13
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def words(self, fileids=None, categories=None):
    """
    Uses the built in word tokenizer to extract tokens from sentences.
    Note that this method uses BeautifulSoup to parse HTML content.
    """
    for sentence in self.sents(fileids, categories):
        for token in wordpunct_tokenize(sentence):
            yield token
Example #14
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def tokenize(self, fileids=None, categories=None):
    """
    Segments, tokenizes, and tags a document in the corpus.
    """
    for paragraph in self.corpus.paras(fileids=fileids):
        yield [
            pos_tag(nltk.wordpunct_tokenize(sent))
            for sent in nltk.sent_tokenize(paragraph)
        ]
Example #15
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def words(self, fileids=None, categories=None):
    """
    Uses the built in word tokenizer to extract tokens from sentences.
    Note that this method uses BeautifulSoup to parse HTML content.
    """
    for sentence in self.sents(fileids, categories):
        for token in wordpunct_tokenize(sentence):
            yield token
Example #16
Source File: recommender.py From atap with Apache License 2.0 | 5 votes |
def query(self, terms):
    """
    Given input list of ingredient terms, return the k closest matching
    recipes.

    :param terms: list of strings
    :return: list of document indices of documents
    """
    vect_doc = self.transformer.named_steps['transform'].fit_transform(
        wordpunct_tokenize(terms)
    )
    dists, inds = self.tree.query(vect_doc, k=self.k)
    return inds[0]
Example #17
Source File: recommender.py From atap with Apache License 2.0 | 5 votes |
def recommend(self, terms):
    """
    Given input list of ingredient terms, return the k closest matching
    recipes.

    :param terms: list of strings
    :return: list of document indices of documents
    """
    vect_doc = self.vect.transform(wordpunct_tokenize(terms))
    distance_matches = self.knn.transform(vect_doc)
    # the result is a list with a 2-tuple of arrays
    matches = distance_matches[0][1][0]
    # the matches are the indices of documents
    return matches
Example #18
Source File: parse.py From atap with Apache License 2.0 | 5 votes |
def parse(sent):
    parser = nltk.ChartParser(grammar)
    tokens = nltk.wordpunct_tokenize(sent)
    return parser.parse(tokens)
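The grammar name above is defined elsewhere in parse.py; a toy grammar is enough to make the snippet runnable. The grammar and sentence below are illustrative assumptions, not the ones used in the atap project:

import nltk

# A deliberately tiny context-free grammar, just to exercise the parser.
grammar = nltk.CFG.fromstring("""
    S  -> NP VP
    NP -> DT NN
    VP -> VB NP
    DT -> 'the'
    NN -> 'dog' | 'ball'
    VB -> 'chased'
""")

def parse(sent):
    parser = nltk.ChartParser(grammar)
    tokens = nltk.wordpunct_tokenize(sent)
    return parser.parse(tokens)

for tree in parse("the dog chased the ball"):
    print(tree)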
Example #19
Source File: natural_language.py From luscan-devel with GNU General Public License v2.0 | 4 votes |
def get_words(text, min_length=None, max_length=None):
    """
    Parse the given text as natural language and extract words from it.
    Optionally filter the words by minimum and/or maximum length.

    :param text: Text to parse.
    :type text: str

    :param min_length: Minimum length required by each token. Use None for no limit.
    :type min_length: int | None

    :param max_length: Maximum length allowed by each token. Use None for no limit.
    :type max_length: int | None

    :return: Set of unique words extracted from the text.
    :rtype: set(str)

    :raises: TypeError, ValueError
    """
    if min_length is not None:
        if not isinstance(min_length, int):
            raise TypeError("Expected int, got '%s' instead" % type(min_length))
        elif min_length < 0:
            raise ValueError("Min length must be greater than 0, got %s." % min_length)
    if max_length is not None:
        if not isinstance(max_length, int):
            raise TypeError("Expected int, got '%s' instead" % type(max_length))
        elif max_length < 0:
            raise ValueError("Max length must be greater than 0, got %s" % max_length)

    # Split the text into separate tokens, using natural language
    # punctuation signs. Then filter out by min/max length, and tokens
    # that aren't strictly alphabetic. Finally, convert the words to
    # lowercase form.
    return {
        word.lower()
        for word in wordpunct_tokenize(text)
        if (
            word.isalpha() and
            (min_length is None or len(word) >= min_length) and
            (max_length is None or len(word) <= max_length)
        )
    }


#------------------------------------------------------------------------------
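A brief usage sketch for get_words(); the sample sentence is illustrative:

words = get_words("Punctuation, numbers like 42, and short bits go away.", min_length=4)
# e.g. {'punctuation', 'numbers', 'like', 'short', 'bits', 'away'}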