Python nltk.tokenize.treebank.TreebankWordTokenizer() Examples
The following are 12 code examples of nltk.tokenize.treebank.TreebankWordTokenizer().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module nltk.tokenize.treebank, or try the search function.
Example #1
Source File: treebank_encoder.py From PyTorch-NLP with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __init__(self, *args, **kwargs):
    """Build a TreebankEncoder wired to NLTK's Treebank tokenizer pair.

    Forwards ``*args``/``**kwargs`` to the parent encoder, forcing
    ``tokenize``/``detokenize`` to the NLTK Treebank implementations.

    :raises TypeError: if the caller passes ``tokenize`` or ``detokenize``
        (those slots are hard-wired here and must not be overridden).
    :raises ImportError: if NLTK is not installed (re-raised after a hint).
    """
    # Reject caller-supplied tokenizers up front — this encoder's contract
    # is specifically "Treebank tokenization".
    if 'tokenize' in kwargs:
        raise TypeError('``TreebankEncoder`` does not take keyword argument ``tokenize``.')
    if 'detokenize' in kwargs:
        raise TypeError('``TreebankEncoder`` does not take keyword argument ``detokenize``.')
    try:
        import nltk

        # Required for moses
        # NOTE(review): these downloads hit the network at construction
        # time — presumably needed by the detokenizer; confirm.
        nltk.download('perluniprops')
        nltk.download('nonbreaking_prefixes')

        from nltk.tokenize.treebank import TreebankWordTokenizer
        from nltk.tokenize.treebank import TreebankWordDetokenizer
    except ImportError:
        # Give an actionable message, then propagate the original error.
        print("Please install NLTK. "
              "See the docs at http://nltk.org for more information.")
        raise
    # Delegate to the parent with the Treebank pair bound as callables.
    super().__init__(
        *args,
        tokenize=TreebankWordTokenizer().tokenize,
        detokenize=TreebankWordDetokenizer().detokenize,
        **kwargs)
Example #2
Source File: __init__.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def word_tokenize(text, language='english', preserve_line=False):
    """
    Return a tokenized copy of *text*, using NLTK's recommended word
    tokenizer (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer` for the specified
    language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: An option to keep the preserve the sentence
        and not sentence tokenize it.
    :type preserve_line: bool
    """
    # Either treat the whole input as one "sentence" or run Punkt first.
    if preserve_line:
        sentences = [text]
    else:
        sentences = sent_tokenize(text, language)
    # Flatten per-sentence token lists into a single list.
    tokens = []
    for sentence in sentences:
        tokens.extend(_treebank_word_tokenizer.tokenize(sentence))
    return tokens
Example #3
Source File: __init__.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def word_tokenize(text, language='english'):
    """
    Return a tokenized copy of *text*, using NLTK's recommended word
    tokenizer (currently :class:`.TreebankWordTokenizer` along with
    :class:`.PunktSentenceTokenizer` for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
    """
    # Sentence-split with Punkt, then word-tokenize each sentence and
    # flatten the result into one list.
    tokens = []
    for sentence in sent_tokenize(text, language):
        tokens.extend(_treebank_word_tokenize(sentence))
    return tokens
Example #4
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral
    words in the sentence and classifies it depending on which polarity is
    more represented. Words that do not appear in the lexicon are considered
    as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence
        polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    # PERF FIX: the original called opinion_lexicon.positive()/negative()
    # inside the loop for every token, paying a corpus read plus an O(n)
    # list-membership test each time. Materialize each lexicon once as a
    # set so every lookup is O(1).
    positive_words = set(opinion_lexicon.positive())
    negative_words = set(opinion_lexicon.negative())

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in positive_words:
            pos_words += 1
            y.append(1)  # positive
        elif word in negative_words:
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    elif pos_words == neg_words:
        print('Neutral')

    if plot:  # idiomatic truthiness test instead of `== True`
        _show_plot(x, y, x_labels=tokenized_sent,
                   y_labels=['Negative', 'Neutral', 'Positive'])
Example #5
Source File: tokenizer.py From gutenberg with GNU General Public License v3.0 | 5 votes |
def tokenize_text(text, language="english"):
    '''Tokenize a string into a list of tokens.

    Uses NLTK's TreebankWordTokenizer after first splitting the text into
    sentences with NLTK's sent_tokenize, then passes the flat token list
    through a filtering function to remove unwanted tokens.

    IN:
    - text, str
    OUT:
    - list of strings
    '''
    # Sentence-split first: the Treebank tokenizer works best on
    # one sentence at a time.
    sentence_list = sent_tokenize(text, language=language)
    word_tokenizer = TreebankWordTokenizer()

    # Tokenize each sentence and accumulate into a single flat list.
    # (Note: tokens are NOT lowercased here, despite what an earlier
    # comment implied — casing is preserved.)
    all_tokens = []
    for sentence in sentence_list:
        all_tokens.extend(word_tokenizer.tokenize(sentence))

    # Drop unwanted tokens before returning.
    return filter_tokens(all_tokens)
Example #6
Source File: word.py From cltk with MIT License | 5 votes |
def tokenize(self, text: str):
    """
    :rtype: list
    :param text: text to be tokenized into sentences
    :type text: str
    """
    # Sentence-split with this instance's sentence tokenizer, then run
    # the Treebank word tokenizer over all sentences at once and flatten
    # the per-sentence lists into one token list.
    sentences = self.sent_tokenizer.tokenize(text)
    word_toker = TreebankWordTokenizer()
    flat_tokens = []
    for sentence_tokens in word_toker.tokenize_sents(sentences):
        flat_tokens.extend(sentence_tokens)
    return flat_tokens
Example #7
Source File: word.py From cltk with MIT License | 5 votes |
def tokenize(self, text: str):
    """
    :rtype: list
    :param text: text to be tokenized into sentences
    :type text: str
    :param model: tokenizer object to used # Should be in init?
    :type model: object
    """
    # First pass: split the raw text into sentences.
    sentence_spans = self.sent_tokenizer.tokenize(text)
    # Second pass: Treebank word tokenization, sentence by sentence,
    # flattened into a single list of tokens.
    tb_tokenizer = TreebankWordTokenizer()
    result = []
    for per_sentence in tb_tokenizer.tokenize_sents(sentence_spans):
        result.extend(per_sentence)
    return result
Example #8
Source File: __init__.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def word_tokenize(text):
    """
    Return a tokenized copy of *text*, using NLTK's recommended word
    tokenizer (currently :class:`.TreebankWordTokenizer`).

    This tokenizer is designed to work on a sentence at a time.
    """
    # Thin wrapper: all work is delegated to the module-level tokenizer.
    return _word_tokenize(text)
Example #9
Source File: utils.py From neural-tweet-search with Apache License 2.0 | 5 votes |
def word_tokenize(text, language='english'):
    """
    Return a tokenized copy of *text*, using NLTK's recommended word
    tokenizer (currently :class:`.TreebankWordTokenizer` along with
    :class:`.PunktSentenceTokenizer` for the specified language).

    :param text: text to split into tokens; ``str`` or UTF-8 ``bytes``
    :param language: the model name in the Punkt corpus (accepted for
        API compatibility; not used by this function)
    """
    # BUG FIX: the original checked `sys.version_info[0] < 3` and called
    # `text.decode("UTF-8")` on the Python 3 branch — but Py3 `str` has no
    # `.decode()`, so that branch raised AttributeError, while the Py2
    # branch skipped decoding. The intent (decode raw bytes to text before
    # tokenizing) is expressed here by the value's actual type instead of
    # the interpreter version.
    if isinstance(text, bytes):
        text = text.decode("UTF-8")
    return [token for token in _treebank_word_tokenize(text)]
Example #10
Source File: nltk_processors.py From forte with Apache License 2.0 | 5 votes |
def __init__(self):
    # Initialize the parent processor, then attach the NLTK Treebank
    # word tokenizer this processor delegates tokenization to.
    super().__init__()
    self.tokenizer = TreebankWordTokenizer()
Example #11
Source File: word.py From cltk with MIT License | 4 votes |
def __init__(self, language):
    """Take language as argument to the class. Check availability and
    setup class variables.

    :param language: one of the entries in ``self.available_languages``;
        'french' is remapped to 'old_french' with a warning.
    :raises AssertionError: if *language* is not supported (kept as an
        assert for backward compatibility with existing callers).
    """
    self.language = language
    self.available_languages = ['akkadian',
                                'arabic',
                                'french',  # defaults to old_french
                                'greek',
                                'latin',
                                'middle_english',
                                'middle_french',
                                'middle_high_german',
                                'old_french',
                                'old_norse',
                                'sanskrit',
                                'multilingual']
    assert self.language in self.available_languages, \
        "Specific tokenizer not available for '{0}'. Only available for: '{1}'.".format(
            self.language,
            self.available_languages)
    # raise languages-specific warnings
    if self.language == 'french':
        self.language = 'old_french'
        LOG.warning("'french' defaults to 'old_french'. 'middle_french' also available.")  # pylint: disable=line-too-long
    # Dispatch to the language-specific tokenizer implementation.
    if self.language == 'arabic':
        self.toker = BaseArabyWordTokenizer('arabic')
    # BUG FIX: the original chain had an unreachable
    # ``elif self.language == 'french'`` branch here — 'french' is always
    # remapped to 'old_french' above, so it could never fire. It has been
    # removed; 'french' input is served by the 'old_french' case below.
    elif self.language == 'greek':
        self.toker = BasePunktWordTokenizer('greek', GreekRegexSentenceTokenizer)
    elif self.language == 'latin':
        self.toker = LatinWordTokenizer()
    elif self.language == 'old_norse':
        self.toker = BaseRegexWordTokenizer('old_norse', OldNorseTokenizerPatterns)
    elif self.language == 'middle_english':
        self.toker = BaseRegexWordTokenizer('middle_english', MiddleEnglishTokenizerPatterns)
    elif self.language == 'middle_french':
        # middle_french deliberately reuses the old_french patterns.
        self.toker = BaseRegexWordTokenizer('old_french', OldFrenchTokenizerPatterns)
    elif self.language == 'middle_high_german':
        self.toker = BaseRegexWordTokenizer('middle_high_german', MiddleHighGermanTokenizerPatterns)
    elif self.language == 'old_french':
        self.toker = BaseRegexWordTokenizer('old_french', OldFrenchTokenizerPatterns)
    else:
        # Covers 'akkadian', 'sanskrit', 'multilingual', and any future
        # additions without a dedicated tokenizer.
        LOG.warning("Falling back to default tokenizer, the NLTK's `TreebankWordTokenizer()`.")
        self.toker = TreebankWordTokenizer()
Example #12
Source File: util.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 4 votes |
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral
    words in the sentence and classifies it depending on which polarity is
    more represented. Words that do not appear in the lexicon are considered
    as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence
        polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    # PERF FIX: the original queried opinion_lexicon.positive()/negative()
    # for every token — each call re-reads the corpus and membership on the
    # returned list is O(n). Build each lexicon as a set once so every
    # per-token lookup is O(1).
    positive_words = set(opinion_lexicon.positive())
    negative_words = set(opinion_lexicon.negative())

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in positive_words:
            pos_words += 1
            y.append(1)  # positive
        elif word in negative_words:
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)  # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    elif pos_words == neg_words:
        print('Neutral')

    if plot:  # idiomatic truthiness test instead of `== True`
        _show_plot(
            x, y, x_labels=tokenized_sent,
            y_labels=['Negative', 'Neutral', 'Positive']
        )