Python nltk.tokenize.WordPunctTokenizer() Examples

The following are 25 code examples of nltk.tokenize.WordPunctTokenizer(), drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module nltk.tokenize, or try the search function.
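WordPunctTokenizer splits text into sequences of alphabetic and non-alphabetic characters (roughly the regular expression \w+|[^\w\s]+), so punctuation ends up as separate tokens. A minimal sketch of the basic call (the sample sentence is illustrative only):

from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
tokens = tokenizer.tokenize("Good muffins cost $3.88 in New York. Don't they?")
# Expected result, given the \w+|[^\w\s]+ pattern:
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
#  'Don', "'", 't', 'they', '?']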
Example #1
Source File: reader.py    From atap with Apache License 2.0
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

        self._word_tokenizer = WordPunctTokenizer()
        self._sent_tokenizer = nltk.data.LazyLoader(
            'tokenizers/punkt/english.pickle') 
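The excerpt above only constructs and stores the tokenizers. A hypothetical sketch of how the reader might use them elsewhere; the docs() helper and the method names below are assumptions, not part of the excerpt:

def sents(self, fileids=None, categories=None):
    # Assumed: docs() yields the raw text of each document.
    for doc in self.docs(fileids, categories):
        for sent in self._sent_tokenizer.tokenize(doc):
            yield sent

def words(self, fileids=None, categories=None):
    # Split each sentence into word and punctuation tokens.
    for sent in self.sents(fileids, categories):
        for token in self._word_tokenizer.tokenize(sent):
            yield token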
Example #2
Source File: xmldocs.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """

        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        iterator = elt.getiterator()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out 
Example #3
Source File: WGGraph.py    From AbTextSumm with Mozilla Public License 2.0
def getredundantComponents(sentences):
    window_size=4
    introList=[]
    midlist=[]
    endlist=[]
    
    for sent in sentences:
        words = WordPunctTokenizer().tokenize(sent)
        length_sent=len(words)
        
        f_point = (length_sent)//3
        m_point=(length_sent)//2
        index_span=window_size//2
        intro=' '.join(word for word in words[0:window_size])
        mid=' '.join(word for word in words[m_point-index_span:m_point+index_span])
        end=' '.join(word for word in words[-window_size:])
        introList.append(intro)
        midlist.append(mid)
        endlist.append(end)
    return introList, midlist, endlist 
Example #4
Source File: sw_loader.py    From texar with Apache License 2.0
def __getitem__(self, idx):
        idx, start, end = self.lst[idx]
        dialog = self.raw[idx][start:end]
        source, target = dialog[:-1], dialog[-1]

        spks, utts = list(zip(*[(speaker, WordPunctTokenizer().tokenize(uttr)) for speaker, uttr, _ in source]))

        spks = list(spks)

        while len(spks) < 10:
            spks.append(0)

        source = '|||'.join([' '.join(uttr) for uttr in utts])
        target_test = ' '.join(WordPunctTokenizer().tokenize(target[1]))

        return spks, source, target_test, target[0] 
Example #5
Source File: xmldocs.py    From luscan-devel with GNU General Public License v2.0
def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """

        elt = self.xml(fileid)
        word_tokenizer=WordPunctTokenizer()
        iterator = elt.getiterator()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out 
Example #6
Source File: reader.py    From atap with Apache License 2.0
def __init__(self, root, fileids=DOC_PATTERN,
                 word_tokenizer=WordPunctTokenizer(),
                 sent_tokenizer=nltk.data.LazyLoader(
                     'tokenizers/punkt/english.pickle'),
                 encoding='latin-1', **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining
        arguments are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._tags = TAGS 
Example #7
Source File: reader.py    From atap with Apache License 2.0
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

        self._word_tokenizer = WordPunctTokenizer()
        self._sent_tokenizer = nltk.data.LazyLoader(
            'tokenizers/punkt/english.pickle') 
Example #8
Source File: xmldocs.py    From razzy-spinner with GNU General Public License v3.0
def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """

        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer=WordPunctTokenizer()
        iterator = elt.getiterator()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out 
Example #9
Source File: reader.py    From atap with Apache License 2.0
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

        self._word_tokenizer = WordPunctTokenizer()
        self._sent_tokenizer = nltk.data.LazyLoader(
            'tokenizers/punkt/english.pickle') 
Example #10
Source File: parse_utils.py    From deep-mlsa with Apache License 2.0
def __init__(self):
        self.tokenizers = {
            'en': TweetTokenizer(),
            'de': WordPunctTokenizer(),
            'it': WordPunctTokenizer(),
            'fr': WordPunctTokenizer(),
            'default': WordPunctTokenizer()
        }

        self.tokenizer = TweetTokenizer() 
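A hedged illustration of how such a language-keyed dictionary is typically consumed; the method name and arguments below are assumptions, not part of the excerpt:

def tokenize(self, text, language='default'):
    # Fall back to the default WordPunctTokenizer for unknown language codes.
    tokenizer = self.tokenizers.get(language, self.tokenizers['default'])
    return tokenizer.tokenize(text)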
Example #11
Source File: document.py    From magpie with MIT License
def compute_wordset(self):
        tokens = WordPunctTokenizer().tokenize(self.text)
        lowercase = [t.lower() for t in tokens]
        return set(lowercase) - {',', '.', '!', ';', ':', '-', '', None} 
Example #12
Source File: sent_analysis.py    From kryptoflow with GNU General Public License v3.0
def __init__(self):

        self._sent_analyzer = SIA()
        self._word_tokenizer = WordPunctTokenizer().tokenize
        self._sent_tokenizer = nltk.data.LazyLoader(
            'tokenizers/punkt/english.pickle'
        ).tokenize
        self._ids = [] 
Example #13
Source File: pipeline.py    From RE-NLG-Dataset with MIT License
def __get_words_boundaries(self):
        """
        function to tokenize words in the document and return words
        boundaries of each sentence using a tokenizer.
        :return:
        """
        tokenizer = WordPunctTokenizer()
        words = list(tokenizer.span_tokenize(self.text))
        return words 
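Unlike tokenize(), span_tokenize() yields (start, end) character offsets into the original string rather than the token strings themselves. A minimal sketch (the sample text is illustrative only):

from nltk.tokenize import WordPunctTokenizer

text = "Hello, world!"
spans = list(WordPunctTokenizer().span_tokenize(text))
# spans should be [(0, 5), (5, 6), (7, 12), (12, 13)],
# and text[start:end] recovers each token: 'Hello', ',', 'world', '!'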
Example #14
Source File: TfidfRetriever.py    From SOQAL with MIT License
def __init__(self, docs, k, ngrams, vectorizer=None, tfidf_matrix=None):
        self.k = k  # number of documents to return
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.docs = docs
        self.vectorizer = TfidfVectorizer(ngram_range=(1, ngrams), norm=None)
        if tfidf_matrix is None or vectorizer is None:
            self.tfidf_matrix = self.vectorizer.fit_transform(docs)
        else:
            self.vectorizer = vectorizer
            self.tfidf_matrix = tfidf_matrix 
Example #15
Source File: TfidfRetriever.py    From SOQAL with MIT License
def __init__(self, docs, k, ngrams, vectorizer=None, tfidf_matrix=None):
        self.k = k  # number of documents to return
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.docs = docs
        self.stopwords = stopwords.words('arabic')
        self.vectorizer = TfidfVectorizer(ngram_range=(1, ngrams), norm=None, stop_words=self.stopwords)
        if tfidf_matrix is None or vectorizer is None:
            docs_stemmed = self.docs_stem()
            self.tfidf_matrix = self.vectorizer.fit_transform(docs_stemmed)
        else:
            self.vectorizer = vectorizer
            self.tfidf_matrix = tfidf_matrix 
Example #16
Source File: slidingwindow_distance.py    From SOQAL with MIT License
def __init__(self):
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"' 
Example #17
Source File: tokenizer.py    From cotk with Apache License 2.0
def __init__(self, method: str, special_tokens: List[str] = None):
		self.method = method
		self.special_tokens = special_tokens

		if method == "nltk":
			self._callable_tokenizer = WordPunctTokenizer().tokenize
		elif method == "space":
			self._callable_tokenizer = str.split
		else:
			raise ValueError('`method` is invalid value {}, should be "nltk" or "space" '.format(method))
		self._setting_hash = hashlib.sha256(dumps(["adapter", method, special_tokens])).hexdigest() 
Example #18
Source File: _preprocessor.py    From sumpy with Apache License 2.0
def build(self):

        if not hasattr(self, "_word_tokenizer"):
            self._word_tokenizer = None

        if self._word_tokenizer is None:
            self._word_tokenizer = WordPunctTokenizer().tokenize 
Example #19
Source File: preprocessor.py    From sumpy with Apache License 2.0
def build_word_tokenizer(self):
        """Return a function that splits a string into a sequence of words."""
        if self._word_tokenizer is not None:
            tokenize = self._word_tokenizer
        else:
            tokenize = WordPunctTokenizer().tokenize
        return tokenize 
Example #20
Source File: preprocessor.py    From sumpy with Apache License 2.0
def _build_word_tokenizer(self):
        """Return a function that splits a string into a sequence of words."""
        if self._word_tokenizer is not None:
            tokenize = self._word_tokenizer
        else:
            tokenize = WordPunctTokenizer().tokenize

        return tokenize 
Example #21
Source File: tfidf_reader.py    From SOQAL with MIT License
def __init__(self, P):
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.docs = self.get_answer_canditates(P)
        docs_stem = []
        for doc in self.docs:
            docs_stem.append(self.stem_string(doc))
        self.stopwords = stopwords.words('arabic')
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 4), norm=None)  # , stop_words=self.stopwords)
        self.tfidf_matrix = self.vectorizer.fit_transform(docs_stem) 
Example #22
Source File: fasttext_embedding.py    From SOQAL with MIT License
def __init__(self, model_path):
        self.model_path = model_path
        print("loading fastText model ...")
        #self.model = pickle.load(open(self.model_path,"rb"))
        self.model = KeyedVectors.load_word2vec_format(self.model_path, encoding='utf-8', unicode_errors='ignore')
        print("done fastText loading model")
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
        self.vocab = self.model.vocab 
Example #23
Source File: preprocess_aclImdb_v1.py    From lambda-deep-learning-demo with Apache License 2.0
def data_tockenize(text):
  tokenizer = WordPunctTokenizer()
  tokens = tokenizer.tokenize(text)
  return (" ".join(tokens)).strip() 
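This tokenize-then-rejoin round trip is a common normalization step: it pads punctuation with spaces and collapses runs of whitespace. A hedged illustration of the effect on a made-up input:

# Illustrative only:
data_tockenize("It's 5 o'clock, isn't it?")
# -> "It ' s 5 o ' clock , isn ' t it ?"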
Example #24
Source File: sw_loader.py    From texar with Apache License 2.0
def __init__(self, jsonl_path, mode=None):
        self.mode = mode
        self.raw = []
        self.lst = []
        self.refs = []
        if mode == 'test':
            lst = json.load(open(jsonl_path, 'r'))
            for item in lst:
                context = item['context']
                dialog = []
                for utts in context:
                    p = utts.find(':')
                    dialog.append((
                        (utts[p - 1] == 'A') * 2 - 1, utts[p + 2:-1], 0))

                if dialog[0][1][-1] == '>':
                    dialog = dialog[1:]

                if len(dialog) == 0:
                    continue

                responses = []
                for resp in item['responses']:
                    responses.append(resp)

                spk = (item['speaker'] == 'A') * 2 - 1
                dialog.append((spk, responses[0], 0))
                responses = responses[1:]
                responses = [' '.join(WordPunctTokenizer().tokenize(resp))
                             for resp in responses]

                if len(responses) == 0:
                    continue

                self.raw.append(dialog)
                self.lst.append((len(self.raw) - 1, 0, len(dialog)))
                self.refs.append(responses)

            return

        from collections import Counter
        self.ct = Counter()
        self.topics = []
        with open(jsonl_path, 'r') as f:
            for idx, item in enumerate(reader(f)):
                utts = item['utts']
                self.topics.append(item['topic'])
                self.raw.append([(int(speaker == 'A') * 2 - 1, sentence, _)
                                 for speaker, sentence, _ in utts])

                lst = [(idx, start, start + wnd_sz)
                       for start in range(0, len(utts) - wnd_sz)] + \
                      [(idx, 0, end)
                       for end in range(2, min(wnd_sz + 1, len(utts)))]

                self.lst += lst

        self.refs = [['none']] * len(self.lst) 
Example #25
Source File: sw_loader.py    From texar with Apache License 2.0
def generate_reference_for_test_dialog(dataset, data_root):
    vocab = {}
    with open(os.path.join(data_root, 'vocab.txt'), 'r') as f:
        p = f.read().splitlines()
        for i, x in enumerate(p):
            vocab[x] = i

    dts_train = dataset['train']
    dts_val = dataset['val']
    dts_test = dataset['test']

    vectorizer = TfidfVectorizer(tokenizer=WordPunctTokenizer().tokenize,
                                 vocabulary=vocab)

    saved = []
    meta = []
    data = []
    tidx = {}
    for i in range(len(dts_test)):
        topic, cct, source, target = dts_test.get(i)
        meta.append((topic, cct, target))
        data.append(source)

    for i in range(len(dts_train)):
        topic, cct, source, target = dts_train.get(i)
        saved.append((topic, cct, target))
        data.append(source)

        if topic not in tidx:
            tidx[topic] = []
        tidx[topic].append(i)

    result = vectorizer.fit_transform(data)
    x = result[:len(dts_test)]
    y = result[len(dts_test):]

    from tqdm import tqdm
    from sklearn.preprocessing import normalize

    y = normalize(y)
    x = normalize(x)

    dts_test.refs = []
    for i in tqdm(range(len(dts_test))):
        c = tidx[meta[i][0]]
        p = (y * x[i].T).toarray().reshape(-1)[c]
        d = p.argsort()

        cnt = 0
        refs = []
        for a in d[::-1]:
            if saved[a][1] == meta[i][1]:
                refs.append(' '.join(
                    WordPunctTokenizer().tokenize(saved[a][2][1])))
                cnt += 1
                if cnt == 10:
                    break

        dts_test.refs.append(refs)