Python nltk.tokenize.WordPunctTokenizer() Examples

The following are 25 code examples of nltk.tokenize.WordPunctTokenizer(), drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module nltk.tokenize, or try the search function.
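WordPunctTokenizer splits text into sequences of alphabetic and non-alphabetic characters (roughly the regular expression \w+|[^\w\s]+), so punctuation ends up as separate tokens. A minimal sketch of the basic call (the sample sentence is illustrative only):

from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
tokens = tokenizer.tokenize("Good muffins cost $3.88 in New York. Don't they?")
# Expected result, given the \w+|[^\w\s]+ pattern:
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
#  'Don', "'", 't', 'they', '?']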
Example #1
Source File: reader.py    From atap with Apache License 2.0
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

        self._word_tokenizer = WordPunctTokenizer()
        self._sent_tokenizer = nltk.data.LazyLoader(
            'tokenizers/punkt/english.pickle') 
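The excerpt above only constructs and stores the tokenizers. A hypothetical sketch of how the reader might use them elsewhere; the docs() helper and the method names below are assumptions, not part of the excerpt:

def sents(self, fileids=None, categories=None):
    # Assumed: docs() yields the raw text of each document.
    for doc in self.docs(fileids, categories):
        for sent in self._sent_tokenizer.tokenize(doc):
            yield sent

def words(self, fileids=None, categories=None):
    # Split each sentence into word and punctuation tokens.
    for sent in self.sents(fileids, categories):
        for token in self._word_tokenizer.tokenize(sent):
            yield token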
Example #2
Source File: xmldocs.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """

        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        iterator = elt.getiterator()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out 
Example #3
Source File: WGGraph.py    From AbTextSumm with Mozilla Public License 2.0
def getredundantComponents(sentences):
    window_size=4
    introList=[]
    midlist=[]
    endlist=[]
    
    for sent in sentences:
        words = WordPunctTokenizer().tokenize(sent)
        length_sent=len(words)
        
        f_point = (length_sent)//3
        m_point=(length_sent)//2
        index_span=window_size//2
        intro=' '.join(word for word in words[0:window_size])
        mid=' '.join(word for word in words[m_point-index_span:m_point+index_span])
        end=' '.join(word for word in words[-window_size:])
        introList.append(intro)
        midlist.append(mid)
        endlist.append(end)
    return introList, midlist, endlist 
Example #4
Source File: sw_loader.py    From texar with Apache License 2.0
def __getitem__(self, idx):
        idx, start, end = self.lst[idx]
        dialog = self.raw[idx][start:end]
        source, target = dialog[:-1], dialog[-1]

        spks, utts = list(zip(*[(speaker, WordPunctTokenizer().tokenize(uttr)) for speaker, uttr, _ in source]))

        spks = list(spks)

        while len(spks) < 10:
            spks.append(0)

        source = '|||'.join([' '.join(uttr) for uttr in utts])
        target_test = ' '.join(WordPunctTokenizer().tokenize(target[1]))

        return spks, source, target_test, target[0] 
Example #5
Source File: xmldocs.py    From luscan-devel with GNU General Public License v2.0
def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """

        elt = self.xml(fileid)
        word_tokenizer=WordPunctTokenizer()
        iterator = elt.getiterator()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out 
Example #6
Source File: reader.py    From atap with Apache License 2.0
def __init__(self, root, fileids=DOC_PATTERN,
                 word_tokenizer=WordPunctTokenizer(),
                 sent_tokenizer=nltk.data.LazyLoader(
                     'tokenizers/punkt/english.pickle'),
                 encoding='latin-1', **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining
        arguments are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._tags = TAGS 
Example #7
Source File: reader.py    From atap with Apache License 2.0
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

        self._word_tokenizer = WordPunctTokenizer()
        self._sent_tokenizer = nltk.data.LazyLoader(
            'tokenizers/punkt/english.pickle') 
Example #8
Source File: xmldocs.py    From razzy-spinner with GNU General Public License v3.0
def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """

        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer=WordPunctTokenizer()
        iterator = elt.getiterator()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out 
Example #9
Source File: reader.py    From atap with Apache License 2.0
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

        self._word_tokenizer = WordPunctTokenizer()
        self._sent_tokenizer = nltk.data.LazyLoader(
            'tokenizers/punkt/english.pickle') 
Example #10
Source File: parse_utils.py    From deep-mlsa with Apache License 2.0
def __init__(self):
        self.tokenizers = {
            'en': TweetTokenizer(),
            'de': WordPunctTokenizer(),
            'it': WordPunctTokenizer(),
            'fr': WordPunctTokenizer(),
            'default': WordPunctTokenizer()
        }

        self.tokenizer = TweetTokenizer() 
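A hedged illustration of how such a language-keyed dictionary is typically consumed; the method name and arguments below are assumptions, not part of the excerpt:

def tokenize(self, text, language='default'):
    # Fall back to the default WordPunctTokenizer for unknown language codes.
    tokenizer = self.tokenizers.get(language, self.tokenizers['default'])
    return tokenizer.tokenize(text)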
Example #11
Source File: document.py    From magpie with MIT License
def compute_wordset(self):
        tokens = WordPunctTokenizer().tokenize(self.text)
        lowercase = [t.lower() for t in tokens]
        return set(lowercase) - {',', '.', '!', ';', ':', '-', '', None} 
Example #12
Source File: sent_analysis.py    From kryptoflow with GNU General Public License v3.0
def __init__(self):

        self._sent_analyzer = SIA()
        self._word_tokenizer = WordPunctTokenizer().tokenize
        self._sent_tokenizer = nltk.data.LazyLoader(
            'tokenizers/punkt/english.pickle'
        ).tokenize
        self._ids = [] 
Example #13
Source File: pipeline.py    From RE-NLG-Dataset with MIT License
def __get_words_boundaries(self):
        """
        function to tokenize words in the document and return words
        boundaries of each sentence using a tokenizer.
        :return:
        """
        tokenizer = WordPunctTokenizer()
        words = list(tokenizer.span_tokenize(self.text))
        return words 
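Unlike tokenize(), span_tokenize() yields (start, end) character offsets into the original string rather than the token strings themselves. A minimal sketch (the sample text is illustrative only):

from nltk.tokenize import WordPunctTokenizer

text = "Hello, world!"
spans = list(WordPunctTokenizer().span_tokenize(text))
# spans should be [(0, 5), (5, 6), (7, 12), (12, 13)],
# and text[start:end] recovers each token: 'Hello', ',', 'world', '!'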
Example #14
Source File: TfidfRetriever.py    From SOQAL with MIT License
def __init__(self, docs, k, ngrams, vectorizer=None, tfidf_matrix=None):
        self.k = k  # number of documents to return
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.docs = docs
        self.vectorizer = TfidfVectorizer(ngram_range=(1, ngrams), norm=None)
        if tfidf_matrix is None or vectorizer is None:
            self.tfidf_matrix = self.vectorizer.fit_transform(docs)
        else:
            self.vectorizer = vectorizer
            self.tfidf_matrix = tfidf_matrix 
Example #15
Source File: TfidfRetriever.py    From SOQAL with MIT License
def __init__(self, docs, k, ngrams, vectorizer=None, tfidf_matrix=None):
        self.k = k  # number of documents to return
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.docs = docs
        self.stopwords = stopwords.words('arabic')
        self.vectorizer = TfidfVectorizer(ngram_range=(1, ngrams), norm=None, stop_words=self.stopwords)
        if tfidf_matrix is None or vectorizer is None:
            docs_stemmed = self.docs_stem()
            self.tfidf_matrix = self.vectorizer.fit_transform(docs_stemmed)
        else:
            self.vectorizer = vectorizer
            self.tfidf_matrix = tfidf_matrix 
Example #16
Source File: slidingwindow_distance.py    From SOQAL with MIT License
def __init__(self):
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"' 
Example #17
Source File: tokenizer.py    From cotk with Apache License 2.0
def __init__(self, method: str, special_tokens: List[str] = None):
		self.method = method
		self.special_tokens = special_tokens

		if method == "nltk":
			self._callable_tokenizer = WordPunctTokenizer().tokenize
		elif method == "space":
			self._callable_tokenizer = str.split
		else:
			raise ValueError('`method` is invalid value {}, should be "nltk" or "space" '.format(method))
		self._setting_hash = hashlib.sha256(dumps(["adapter", method, special_tokens])).hexdigest() 
Example #18
Source File: _preprocessor.py    From sumpy with Apache License 2.0
def build(self):

        if not hasattr(self, "_word_tokenizer"):
            self._word_tokenizer = None

        if self._word_tokenizer is None:
            self._word_tokenizer = WordPunctTokenizer().tokenize 
Example #19
Source File: preprocessor.py    From sumpy with Apache License 2.0
def build_word_tokenizer(self):
        """Return a function that splits a string into a sequence of words."""
        if self._word_tokenizer is not None:
            tokenize = self._word_tokenizer
        else:
            tokenize = WordPunctTokenizer().tokenize
        return tokenize 
Example #20
Source File: preprocessor.py    From sumpy with Apache License 2.0
def _build_word_tokenizer(self):
        """Return a function that splits a string into a sequence of words."""
        if self._word_tokenizer is not None:
            tokenize = self._word_tokenizer
        else:
            tokenize = WordPunctTokenizer().tokenize

        return tokenize 
Example #21
Source File: tfidf_reader.py    From SOQAL with MIT License
def __init__(self, P):
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.docs = self.get_answer_canditates(P)
        docs_stem = []
        for doc in self.docs:
            docs_stem.append(self.stem_string(doc))
        self.stopwords = stopwords.words('arabic')
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 4), norm=None)  # , stop_words=self.stopwords)
        self.tfidf_matrix = self.vectorizer.fit_transform(docs_stem) 
Example #22
Source File: fasttext_embedding.py    From SOQAL with MIT License
def __init__(self, model_path):
        self.model_path = model_path
        print("loading fastText model ...")
        #self.model = pickle.load(open(self.model_path,"rb"))
        self.model = KeyedVectors.load_word2vec_format(self.model_path, encoding='utf-8', unicode_errors='ignore')
        print("done fastText loading model")
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
        self.vocab = self.model.vocab 
Example #23
Source File: preprocess_aclImdb_v1.py    From lambda-deep-learning-demo with Apache License 2.0
def data_tockenize(text):
  tokenizer = WordPunctTokenizer()
  tokens = tokenizer.tokenize(text)
  return (" ".join(tokens)).strip() 
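This tokenize-then-rejoin round trip is a common normalization step: it pads punctuation with spaces and collapses runs of whitespace. A hedged illustration of the effect on a made-up input:

# Illustrative only:
data_tockenize("It's 5 o'clock, isn't it?")
# -> "It ' s 5 o ' clock , isn ' t it ?"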
Example #24
Source File: sw_loader.py    From texar with Apache License 2.0
def __init__(self, jsonl_path, mode=None):
        self.mode = mode
        self.raw = []
        self.lst = []
        self.refs = []
        if mode == 'test':
            lst = json.load(open(jsonl_path, 'r'))
            for item in lst:
                context = item['context']
                dialog = []
                for utts in context:
                    p = utts.find(':')
                    dialog.append((
                        (utts[p - 1] == 'A') * 2 - 1, utts[p + 2:-1], 0))

                if dialog[0][1][-1] == '>':
                    dialog = dialog[1:]

                if len(dialog) == 0:
                    continue

                responses = []
                for resp in item['responses']:
                    responses.append(resp)

                spk = (item['speaker'] == 'A') * 2 - 1
                dialog.append((spk, responses[0], 0))
                responses = responses[1:]
                responses = [' '.join(WordPunctTokenizer().tokenize(resp))
                             for resp in responses]

                if len(responses) == 0:
                    continue

                self.raw.append(dialog)
                self.lst.append((len(self.raw) - 1, 0, len(dialog)))
                self.refs.append(responses)

            return

        from collections import Counter
        self.ct = Counter()
        self.topics = []
        with open(jsonl_path, 'r') as f:
            for idx, item in enumerate(reader(f)):
                utts = item['utts']
                self.topics.append(item['topic'])
                self.raw.append([(int(speaker == 'A') * 2 - 1, sentence, _)
                                 for speaker, sentence, _ in utts])

                lst = [(idx, start, start + wnd_sz)
                       for start in range(0, len(utts) - wnd_sz)] + \
                      [(idx, 0, end)
                       for end in range(2, min(wnd_sz + 1, len(utts)))]

                self.lst += lst

        self.refs = [['none']] * len(self.lst) 
Example #25
Source File: sw_loader.py    From texar with Apache License 2.0
def generate_reference_for_test_dialog(dataset, data_root):
    vocab = {}
    with open(os.path.join(data_root, 'vocab.txt'), 'r') as f:
        p = f.read().splitlines()
        for i, x in enumerate(p):
            vocab[x] = i

    dts_train = dataset['train']
    dts_val = dataset['val']
    dts_test = dataset['test']

    vectorizer = TfidfVectorizer(tokenizer=WordPunctTokenizer().tokenize,
                                 vocabulary=vocab)

    saved = []
    meta = []
    data = []
    tidx = {}
    for i in range(len(dts_test)):
        topic, cct, source, target = dts_test.get(i)
        meta.append((topic, cct, target))
        data.append(source)

    for i in range(len(dts_train)):
        topic, cct, source, target = dts_train.get(i)
        saved.append((topic, cct, target))
        data.append(source)

        if topic not in tidx:
            tidx[topic] = []
        tidx[topic].append(i)

    result = vectorizer.fit_transform(data)
    x = result[:len(dts_test)]
    y = result[len(dts_test):]

    from tqdm import tqdm
    from sklearn.preprocessing import normalize

    y = normalize(y)
    x = normalize(x)

    dts_test.refs = []
    for i in tqdm(range(len(dts_test))):
        c = tidx[meta[i][0]]
        p = (y * x[i].T).toarray().reshape(-1)[c]
        d = p.argsort()

        cnt = 0
        refs = []
        for a in d[::-1]:
            if saved[a][1] == meta[i][1]:
                refs.append(' '.join(
                    WordPunctTokenizer().tokenize(saved[a][2][1])))
                cnt += 1
                if cnt == 10:
                    break

        dts_test.refs.append(refs)