Python nltk.tokenize.treebank.TreebankWordTokenizer() Examples
The following are 12 code examples of nltk.tokenize.treebank.TreebankWordTokenizer().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module nltk.tokenize.treebank, or try the search function.
Example #1
Source File: treebank_encoder.py From PyTorch-NLP with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __init__(self, *args, **kwargs):
    """Build a TreebankEncoder wired to NLTK's Treebank tokenizer pair.

    Forwards ``*args``/``**kwargs`` to the parent encoder, forcing
    ``tokenize``/``detokenize`` to the NLTK Treebank implementations.

    :raises TypeError: if the caller passes ``tokenize`` or ``detokenize``
        (those slots are hard-wired here and must not be overridden).
    :raises ImportError: if NLTK is not installed (re-raised after a hint).
    """
    # Reject caller-supplied tokenizers up front — this encoder's contract
    # is specifically "Treebank tokenization".
    if 'tokenize' in kwargs:
        raise TypeError('``TreebankEncoder`` does not take keyword argument ``tokenize``.')
    if 'detokenize' in kwargs:
        raise TypeError('``TreebankEncoder`` does not take keyword argument ``detokenize``.')
    try:
        import nltk

        # Required for moses
        # NOTE(review): these downloads hit the network at construction
        # time — presumably needed by the detokenizer; confirm.
        nltk.download('perluniprops')
        nltk.download('nonbreaking_prefixes')

        from nltk.tokenize.treebank import TreebankWordTokenizer
        from nltk.tokenize.treebank import TreebankWordDetokenizer
    except ImportError:
        # Give an actionable message, then propagate the original error.
        print("Please install NLTK. "
              "See the docs at http://nltk.org for more information.")
        raise
    # Delegate to the parent with the Treebank pair bound as callables.
    super().__init__(
        *args,
        tokenize=TreebankWordTokenizer().tokenize,
        detokenize=TreebankWordDetokenizer().detokenize,
        **kwargs)
Example #2
Source File: __init__.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def word_tokenize(text, language='english', preserve_line=False):
    """
    Return a tokenized copy of *text*, using NLTK's recommended word
    tokenizer (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer` for the specified
    language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: An option to keep the preserve the sentence
        and not sentence tokenize it.
    :type preserve_line: bool
    """
    # Either treat the whole input as one "sentence" or run Punkt first.
    if preserve_line:
        sentences = [text]
    else:
        sentences = sent_tokenize(text, language)
    # Flatten per-sentence token lists into a single list.
    tokens = []
    for sentence in sentences:
        tokens.extend(_treebank_word_tokenizer.tokenize(sentence))
    return tokens
Example #3
Source File: __init__.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def word_tokenize(text, language='english'):
    """
    Return a tokenized copy of *text*, using NLTK's recommended word
    tokenizer (currently :class:`.TreebankWordTokenizer` along with
    :class:`.PunktSentenceTokenizer` for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
    """
    # Sentence-split with Punkt, then word-tokenize each sentence and
    # flatten the result into one list.
    tokens = []
    for sentence in sent_tokenize(text, language):
        tokens.extend(_treebank_word_tokenize(sentence))
    return tokens
Example #4
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral
    words in the sentence and classifies it depending on which polarity is
    more represented. Words that do not appear in the lexicon are considered
    as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence
        polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    # PERF FIX: the original called opinion_lexicon.positive()/negative()
    # inside the loop for every token, paying a corpus read plus an O(n)
    # list-membership test each time. Materialize each lexicon once as a
    # set so every lookup is O(1).
    positive_words = set(opinion_lexicon.positive())
    negative_words = set(opinion_lexicon.negative())

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in positive_words:
            pos_words += 1
            y.append(1)  # positive
        elif word in negative_words:
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    elif pos_words == neg_words:
        print('Neutral')

    if plot:  # idiomatic truthiness test instead of `== True`
        _show_plot(x, y, x_labels=tokenized_sent,
                   y_labels=['Negative', 'Neutral', 'Positive'])
Example #5
Source File: tokenizer.py From gutenberg with GNU General Public License v3.0 | 5 votes |
def tokenize_text(text, language="english"):
    '''Tokenize a string into a list of tokens.

    Uses NLTK's TreebankWordTokenizer after first splitting the text into
    sentences with NLTK's sent_tokenize, then passes the flat token list
    through a filtering function to remove unwanted tokens.

    IN:
    - text, str
    OUT:
    - list of strings
    '''
    # Sentence-split first: the Treebank tokenizer works best on
    # one sentence at a time.
    sentence_list = sent_tokenize(text, language=language)
    word_tokenizer = TreebankWordTokenizer()

    # Tokenize each sentence and accumulate into a single flat list.
    # (Note: tokens are NOT lowercased here, despite what an earlier
    # comment implied — casing is preserved.)
    all_tokens = []
    for sentence in sentence_list:
        all_tokens.extend(word_tokenizer.tokenize(sentence))

    # Drop unwanted tokens before returning.
    return filter_tokens(all_tokens)
Example #6
Source File: word.py From cltk with MIT License | 5 votes |
def tokenize(self, text: str):
    """
    :rtype: list
    :param text: text to be tokenized into sentences
    :type text: str
    """
    # Sentence-split with this instance's sentence tokenizer, then run
    # the Treebank word tokenizer over all sentences at once and flatten
    # the per-sentence lists into one token list.
    sentences = self.sent_tokenizer.tokenize(text)
    word_toker = TreebankWordTokenizer()
    flat_tokens = []
    for sentence_tokens in word_toker.tokenize_sents(sentences):
        flat_tokens.extend(sentence_tokens)
    return flat_tokens
Example #7
Source File: word.py From cltk with MIT License | 5 votes |
def tokenize(self, text: str):
    """
    :rtype: list
    :param text: text to be tokenized into sentences
    :type text: str
    :param model: tokenizer object to used # Should be in init?
    :type model: object
    """
    # First pass: split the raw text into sentences.
    sentence_spans = self.sent_tokenizer.tokenize(text)
    # Second pass: Treebank word tokenization, sentence by sentence,
    # flattened into a single list of tokens.
    tb_tokenizer = TreebankWordTokenizer()
    result = []
    for per_sentence in tb_tokenizer.tokenize_sents(sentence_spans):
        result.extend(per_sentence)
    return result
Example #8
Source File: __init__.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def word_tokenize(text):
    """
    Return a tokenized copy of *text*, using NLTK's recommended word
    tokenizer (currently :class:`.TreebankWordTokenizer`).

    This tokenizer is designed to work on a sentence at a time.
    """
    # Thin wrapper: all work is delegated to the module-level tokenizer.
    return _word_tokenize(text)
Example #9
Source File: utils.py From neural-tweet-search with Apache License 2.0 | 5 votes |
def word_tokenize(text, language='english'):
    """
    Return a tokenized copy of *text*, using NLTK's recommended word
    tokenizer (currently :class:`.TreebankWordTokenizer` along with
    :class:`.PunktSentenceTokenizer` for the specified language).

    :param text: text to split into tokens; ``str`` or UTF-8 ``bytes``
    :param language: the model name in the Punkt corpus (accepted for
        API compatibility; not used by this function)
    """
    # BUG FIX: the original checked `sys.version_info[0] < 3` and called
    # `text.decode("UTF-8")` on the Python 3 branch — but Py3 `str` has no
    # `.decode()`, so that branch raised AttributeError, while the Py2
    # branch skipped decoding. The intent (decode raw bytes to text before
    # tokenizing) is expressed here by the value's actual type instead of
    # the interpreter version.
    if isinstance(text, bytes):
        text = text.decode("UTF-8")
    return [token for token in _treebank_word_tokenize(text)]
Example #10
Source File: nltk_processors.py From forte with Apache License 2.0 | 5 votes |
def __init__(self):
    # Initialize the parent processor, then attach the NLTK Treebank
    # word tokenizer this processor delegates tokenization to.
    super().__init__()
    self.tokenizer = TreebankWordTokenizer()
Example #11
Source File: word.py From cltk with MIT License | 4 votes |
def __init__(self, language):
    """Take language as argument to the class. Check availability and
    setup class variables.

    :param language: one of the entries in ``self.available_languages``;
        'french' is remapped to 'old_french' with a warning.
    :raises AssertionError: if *language* is not supported (kept as an
        assert for backward compatibility with existing callers).
    """
    self.language = language
    self.available_languages = ['akkadian',
                                'arabic',
                                'french',  # defaults to old_french
                                'greek',
                                'latin',
                                'middle_english',
                                'middle_french',
                                'middle_high_german',
                                'old_french',
                                'old_norse',
                                'sanskrit',
                                'multilingual']
    assert self.language in self.available_languages, \
        "Specific tokenizer not available for '{0}'. Only available for: '{1}'.".format(
            self.language,
            self.available_languages)
    # raise languages-specific warnings
    if self.language == 'french':
        self.language = 'old_french'
        LOG.warning("'french' defaults to 'old_french'. 'middle_french' also available.")  # pylint: disable=line-too-long
    # Dispatch to the language-specific tokenizer implementation.
    if self.language == 'arabic':
        self.toker = BaseArabyWordTokenizer('arabic')
    # BUG FIX: the original chain had an unreachable
    # ``elif self.language == 'french'`` branch here — 'french' is always
    # remapped to 'old_french' above, so it could never fire. It has been
    # removed; 'french' input is served by the 'old_french' case below.
    elif self.language == 'greek':
        self.toker = BasePunktWordTokenizer('greek', GreekRegexSentenceTokenizer)
    elif self.language == 'latin':
        self.toker = LatinWordTokenizer()
    elif self.language == 'old_norse':
        self.toker = BaseRegexWordTokenizer('old_norse', OldNorseTokenizerPatterns)
    elif self.language == 'middle_english':
        self.toker = BaseRegexWordTokenizer('middle_english', MiddleEnglishTokenizerPatterns)
    elif self.language == 'middle_french':
        # middle_french deliberately reuses the old_french patterns.
        self.toker = BaseRegexWordTokenizer('old_french', OldFrenchTokenizerPatterns)
    elif self.language == 'middle_high_german':
        self.toker = BaseRegexWordTokenizer('middle_high_german', MiddleHighGermanTokenizerPatterns)
    elif self.language == 'old_french':
        self.toker = BaseRegexWordTokenizer('old_french', OldFrenchTokenizerPatterns)
    else:
        # Covers 'akkadian', 'sanskrit', 'multilingual', and any future
        # additions without a dedicated tokenizer.
        LOG.warning("Falling back to default tokenizer, the NLTK's `TreebankWordTokenizer()`.")
        self.toker = TreebankWordTokenizer()
Example #12
Source File: util.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 4 votes |
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral
    words in the sentence and classifies it depending on which polarity is
    more represented. Words that do not appear in the lexicon are considered
    as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence
        polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    # PERF FIX: the original queried opinion_lexicon.positive()/negative()
    # for every token — each call re-reads the corpus and membership on the
    # returned list is O(n). Build each lexicon as a set once so every
    # per-token lookup is O(1).
    positive_words = set(opinion_lexicon.positive())
    negative_words = set(opinion_lexicon.negative())

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in positive_words:
            pos_words += 1
            y.append(1)  # positive
        elif word in negative_words:
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    elif pos_words == neg_words:
        print('Neutral')

    if plot:  # idiomatic truthiness test instead of `== True`
        _show_plot(
            x, y, x_labels=tokenized_sent,
            y_labels=['Negative', 'Neutral', 'Positive']
        )