Python nltk.wordpunct_tokenize() Examples
The following are 19 code examples of nltk.wordpunct_tokenize(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk, or try the search function.
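All of the examples below lean on the same behaviour: wordpunct_tokenize() splits text on whitespace and keeps runs of punctuation as separate tokens (it is a thin wrapper around a regexp tokenizer using the pattern \w+|[^\w\s]+). A quick illustration of that behaviour:

from nltk import wordpunct_tokenize

# Alphanumeric runs and punctuation runs become separate tokens.
print(wordpunct_tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']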
Example #1
Source File: preprocess.py From atap with Apache License 2.0 | 5 votes |
def tokenize(self, fileid):
    """
    Segments, tokenizes, and tags a document in the corpus. Returns a
    generator of paragraphs, which are lists of sentences, which in turn
    are lists of part of speech tagged words.
    """
    for paragraph in self.corpus.paras(fileids=fileid):
        yield [
            pos_tag(wordpunct_tokenize(sent))
            for sent in sent_tokenize(paragraph)
        ]
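The same segment-tokenize-tag pipeline can be sketched outside the corpus reader class above; the sample paragraph below is illustrative, and it assumes the NLTK 'punkt' sentence tokenizer and the default POS tagger models have been downloaded:

from nltk import pos_tag, sent_tokenize, wordpunct_tokenize

paragraph = "NLTK ships a sentence segmenter. It also ships a default POS tagger."

# One list of (token, tag) pairs per sentence, mirroring the generator above.
tagged = [pos_tag(wordpunct_tokenize(sent)) for sent in sent_tokenize(paragraph)]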
Example #2
Source File: helper.py From transferable_sent2vec with MIT License | 5 votes |
def tokenize_and_normalize(s):
    """Tokenize and normalize string."""
    token_list = []
    tokens = wordpunct_tokenize(s.lower())
    token_list.extend([
        x for x in tokens
        if not re.fullmatch('[' + string.punctuation + ']+', x)
    ])
    return token_list
Example #3
Source File: helper.py From transferable_sent2vec with MIT License | 5 votes |
def tokenize_and_normalize(s):
    """Tokenize and normalize string."""
    token_list = []
    tokens = wordpunct_tokenize(s.lower())
    token_list.extend([
        x for x in tokens
        if not re.fullmatch('[' + string.punctuation + ']+', x)
    ])
    return token_list
Example #4
Source File: utils.py From dialog with Apache License 2.0 | 5 votes |
def tokenize(sent):
    tokens = tokenizer.tokenize(sent)
    ret = []
    for t in tokens:
        if '<' not in t:
            ret.extend(wordpunct_tokenize(t))
        else:
            ret.append(t)
    return ret
Example #5
Source File: cluster.py From dialog with Apache License 2.0 | 5 votes |
def normalize(sent):
    return wordpunct_tokenize(sent.lower())
Example #6
Source File: natural_language.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def calculate_language_scores(text):
    """
    Calculate probability of given text to be written in several languages and
    return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    :param text: Text to analyze.
    :type text: str

    :return: Dictionary with languages and unique stopwords seen in analyzed text.
    :rtype: dict(str -> int)

    :raises: TypeError
    """
    if not isinstance(text, basestring):
        raise TypeError("Expected basestring, got '%s' instead" % type(text))
    if not text:
        return {}

    languages_ratios = {}

    # Split the text into separate tokens, using natural language punctuation signs.
    tokens = wordpunct_tokenize(text)
    tokenized_words = [word.lower() for word in tokens]

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(tokenized_words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios


#------------------------------------------------------------------------------
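The function above targets Python 2 (it type-checks against basestring). A self-contained Python 3 sketch of the same stopword-overlap idea, assuming the NLTK 'stopwords' corpus has been downloaded; the guess_language() helper is a made-up name that simply returns the highest-scoring language:

from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

def guess_language(text):
    # Lowercased token set of the input text.
    words = {word.lower() for word in wordpunct_tokenize(text)}
    # Score each language by how many of its stopwords appear in the text.
    scores = {lang: len(words & set(stopwords.words(lang)))
              for lang in stopwords.fileids()}
    # The language sharing the most stopwords with the text wins.
    return max(scores, key=scores.get)

print(guess_language("Le chat est sur la table et il dort."))  # most likely 'french'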
Example #7
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def tagged_tokens(self):
    for sent in self.sents():
        for word in nltk.wordpunct_tokenize(sent):
            yield nltk.pos_tag(word)
Example #8
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def words(self):
    """
    Returns a generator of words.
    """
    for sent in self.sents():
        for word in nltk.wordpunct_tokenize(sent):
            yield word
Example #9
Source File: am_reader.py From atap with Apache License 2.0 | 5 votes |
def words(self):
    """
    Returns a generator of words.
    """
    for sent in self.sents():
        for word in nltk.wordpunct_tokenize(sent):
            yield word
Example #10
Source File: features.py From product-classifier with MIT License | 5 votes |
def tokenize(self, text):
    """
    Returns a list of individual tokens from the text utilizing NLTK's
    tokenize built in utility (far better than split on space). It also
    removes any stopwords and punctuation from the text, as well as
    ensure that every token is normalized.

    For now, token = word as in bag of words (the feature we're using).
    """
    for token in wordpunct_tokenize(text):
        token = self.normalize(token)
        if token in self.punctuation:
            continue
        if token in self.stopwords:
            continue
        yield token
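The normalize(), punctuation and stopwords attributes above belong to the surrounding class. A self-contained sketch of the same filtering idea, where lowercasing as the normalization step and the English stopword list are assumptions (the NLTK 'stopwords' corpus must be downloaded):

import string

from nltk import wordpunct_tokenize
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))
PUNCT = set(string.punctuation)

def tokenize(text):
    for token in wordpunct_tokenize(text):
        token = token.lower()                     # minimal normalization
        if token in PUNCT or token in STOPWORDS:  # drop punctuation and stopwords
            continue
        yield token

print(list(tokenize("The quick, brown fox jumps over the lazy dog!")))
# ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'] with the default English stopword list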
Example #11
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    started = time.time()

    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        counts['paras'] += 1

        for sent in sent_tokenize(para):
            counts['sents'] += 1

            for word in wordpunct_tokenize(sent):
                counts['words'] += 1
                tokens[word] += 1

    # Compute the number of files and categories in the corpus
    n_fileids = len(self.resolve(fileids, categories) or self.fileids())
    n_topics = len(self.categories(self.resolve(fileids, categories)))

    # Return data structure with information
    return {
        'files': n_fileids,
        'topics': n_topics,
        'paras': counts['paras'],
        'sents': counts['sents'],
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
        'ppdoc': float(counts['paras']) / float(n_fileids),
        'sppar': float(counts['sents']) / float(counts['paras']),
        'secs': time.time() - started,
    }
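The 'lexdiv' figure computed above is plain lexical diversity: the total word count divided by the vocabulary size. The same counting can be reproduced on a single raw string without the corpus reader; the sample text is illustrative and the 'punkt' model is assumed to be downloaded:

import nltk
from nltk import sent_tokenize, wordpunct_tokenize

text = ("Tokenizers split text into tokens. "
        "Counting those tokens gives word, sentence and vocabulary statistics.")

tokens = nltk.FreqDist()
n_sents = n_words = 0
for sent in sent_tokenize(text):
    n_sents += 1
    for word in wordpunct_tokenize(sent):
        n_words += 1
        tokens[word] += 1

lexdiv = n_words / len(tokens)  # words per unique token, the 'lexdiv' metric above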
Example #12
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def tokenize(self, fileids=None, categories=None):
    """
    Segments, tokenizes, and tags a document in the corpus.
    """
    for paragraph in self.paras(fileids=fileids):
        yield [
            pos_tag(wordpunct_tokenize(sent))
            for sent in sent_tokenize(paragraph)
        ]
Example #13
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def words(self, fileids=None, categories=None):
    """
    Uses the built in word tokenizer to extract tokens from sentences.
    Note that this method uses BeautifulSoup to parse HTML content.
    """
    for sentence in self.sents(fileids, categories):
        for token in wordpunct_tokenize(sentence):
            yield token
Example #14
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def tokenize(self, fileids=None, categories=None):
    """
    Segments, tokenizes, and tags a document in the corpus.
    """
    for paragraph in self.corpus.paras(fileids=fileids):
        yield [
            pos_tag(nltk.wordpunct_tokenize(sent))
            for sent in nltk.sent_tokenize(paragraph)
        ]
Example #15
Source File: reader.py From atap with Apache License 2.0 | 5 votes |
def words(self, fileids=None, categories=None):
    """
    Uses the built in word tokenizer to extract tokens from sentences.
    Note that this method uses BeautifulSoup to parse HTML content.
    """
    for sentence in self.sents(fileids, categories):
        for token in wordpunct_tokenize(sentence):
            yield token
Example #16
Source File: recommender.py From atap with Apache License 2.0 | 5 votes |
def query(self, terms):
    """
    Given input list of ingredient terms, return the k closest matching
    recipes.

    :param terms: list of strings
    :return: list of document indices of documents
    """
    vect_doc = self.transformer.named_steps['transform'].fit_transform(
        wordpunct_tokenize(terms)
    )
    dists, inds = self.tree.query(vect_doc, k=self.k)
    return inds[0]
Example #17
Source File: recommender.py From atap with Apache License 2.0 | 5 votes |
def recommend(self, terms):
    """
    Given input list of ingredient terms, return the k closest matching
    recipes.

    :param terms: list of strings
    :return: list of document indices of documents
    """
    vect_doc = self.vect.transform(wordpunct_tokenize(terms))
    distance_matches = self.knn.transform(vect_doc)
    # the result is a list with a 2-tuple of arrays
    matches = distance_matches[0][1][0]
    # the matches are the indices of documents
    return matches
Example #18
Source File: parse.py From atap with Apache License 2.0 | 5 votes |
def parse(sent):
    parser = nltk.ChartParser(grammar)
    tokens = nltk.wordpunct_tokenize(sent)
    return parser.parse(tokens)
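The grammar name above is defined elsewhere in parse.py; a toy grammar is enough to make the snippet runnable. The grammar and sentence below are illustrative assumptions, not the ones used in the atap project:

import nltk

# A deliberately tiny context-free grammar, just to exercise the parser.
grammar = nltk.CFG.fromstring("""
    S  -> NP VP
    NP -> DT NN
    VP -> VB NP
    DT -> 'the'
    NN -> 'dog' | 'ball'
    VB -> 'chased'
""")

def parse(sent):
    parser = nltk.ChartParser(grammar)
    tokens = nltk.wordpunct_tokenize(sent)
    return parser.parse(tokens)

for tree in parse("the dog chased the ball"):
    print(tree)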
Example #19
Source File: natural_language.py From luscan-devel with GNU General Public License v2.0 | 4 votes |
def get_words(text, min_length=None, max_length=None):
    """
    Parse the given text as natural language and extract words from it.
    Optionally filter the words by minimum and/or maximum length.

    :param text: Text to parse.
    :type text: str

    :param min_length: Minimum length required by each token. Use None for no limit.
    :type min_length: int | None

    :param max_length: Maximum length allowed by each token. Use None for no limit.
    :type max_length: int | None

    :return: Set of unique words extracted from the text.
    :rtype: set(str)

    :raises: TypeError, ValueError
    """
    if min_length is not None:
        if not isinstance(min_length, int):
            raise TypeError("Expected int, got '%s' instead" % type(min_length))
        elif min_length < 0:
            raise ValueError("Min length must be greater than 0, got %s." % min_length)
    if max_length is not None:
        if not isinstance(max_length, int):
            raise TypeError("Expected int, got '%s' instead" % type(max_length))
        elif max_length < 0:
            raise ValueError("Max length must be greater than 0, got %s" % max_length)

    # Split the text into separate tokens, using natural language
    # punctuation signs. Then filter out by min/max length, and tokens
    # that aren't strictly alphabetic. Finally, convert the words to
    # lowercase form.
    return {
        word.lower()
        for word in wordpunct_tokenize(text)
        if (
            word.isalpha() and
            (min_length is None or len(word) >= min_length) and
            (max_length is None or len(word) <= max_length)
        )
    }


#------------------------------------------------------------------------------
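A brief usage sketch for get_words(); the sample sentence is illustrative:

words = get_words("Punctuation, numbers like 42, and short bits go away.", min_length=4)
# e.g. {'punctuation', 'numbers', 'like', 'short', 'bits', 'away'}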