Python nltk.tokenize.wordpunct_tokenize() Examples

The following are 30 code examples of nltk.tokenize.wordpunct_tokenize(), drawn from open-source projects. The project and source file for each example are noted above it. You may also want to check out the other functions and classes available in the nltk.tokenize module.
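Before the project code, here is a minimal, self-contained sketch of the tokenizer itself: wordpunct_tokenize splits text into alphanumeric and non-alphanumeric runs (roughly the pattern \w+|[^\w\s]+), so punctuation comes back as separate tokens. The sample sentence is arbitrary.

from nltk.tokenize import wordpunct_tokenize

print(wordpunct_tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']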
Example #1
Source File: data2tensor.py    From deep-summarization with MIT License
def generate_vocabulary(self, review_summary_file):
        """

        :param review_summary_file:
        :return:
        """
        self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values

        for review, summary in self.rev_sum_pair:
            rev_lst = wordpunct_tokenize(review)
            sum_lst = wordpunct_tokenize(summary)
            self.__add_list_to_dict(rev_lst)
            self.__add_list_to_dict(sum_lst)

        # Now store the "" empty string as the last word of the vocabulary
        self.map[""] = len(self.map)
        self.revmap[len(self.map)] = "" 
Example #2
Source File: preprocessing.py    From KATE with BSD 3-Clause "New" or "Revised" License
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ', \
            text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and token not in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)

    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #                     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
    #                     not token.isdigit() and not token in stop_words] 
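The snippet above leaves its imports implicit and expects text to arrive as bytes (it calls .decode on it). A hedged usage sketch, assuming the function has been pasted into a module together with the imports it appears to rely on:

import re
import string

from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import wordpunct_tokenize

# Punctuation is replaced by spaces, digits and stop words are dropped,
# and the remaining tokens are stemmed when stem=True.
print(tiny_tokenize(b"Running 3 quick tests, isn't it?", stem=True, stop_words=["it"]))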
Example #3
Source File: utils.py    From dl4ir-webnav with BSD 3-Clause "New" or "Revised" License
def text2idx2(texts, vocab, dim):
    '''
    Convert a list of texts to their corresponding vocabulary indexes.
    '''
    out = -np.ones((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        j = 0
        for word in wordpunct_tokenize(text):
            if word in vocab:
                out[i,j] = vocab[word]
                mask[i,j] = 1.
                j += 1

                if j == dim:
                    break

    return out, mask 
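A hedged usage sketch for text2idx2 with a made-up vocabulary, assuming the function and its numpy/NLTK imports are in scope: out holds the vocabulary index of each in-vocabulary token (padded with -1) and mask marks the filled positions.

import numpy as np
from nltk.tokenize import wordpunct_tokenize

vocab = {"hello": 0, ",": 1, "world": 2}        # hypothetical word -> index map
out, mask = text2idx2(["hello , world"], vocab, dim=5)
print(out)   # [[ 0  1  2 -1 -1]]
print(mask)  # [[1. 1. 1. 0. 0.]]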
Example #4
Source File: utils.py    From dl4ir-webnav with BSD 3-Clause "New" or "Revised" License
def augment(texts, dic_thes):
    if prm.aug<2:
        return texts

    out = []
    for text in texts:

        words_orig = wordpunct_tokenize(text)
        maxrep = max(2,int(0.1*len(words_orig))) #define how many words will be replaced. For now, leave the maximum number as 10% of the words
        
        for j in range(prm.aug):
            words = list(words_orig) #copy
            for k in range(randint(1,maxrep)):
                idx = randint(0,len(words)-1)
                word = words[idx]
                if word in dic_thes:
                    
                    synonym = min(np.random.geometric(0.5), len(dic_thes[word])-1) # choose the synonym based on a geometric distribution
                    #print 'fp',fp,"word", word,"synonym",dic_thes[word][synonym]
                    words[idx] = dic_thes[word][synonym]

            out.append(" ".join(words))

    return out 
Example #5
Source File: utils.py    From stochasticLDA with GNU General Public License v3.0
def parseDocument(doc, vocab):
	wordslist = list()
	countslist = list()
	doc = doc.lower()
	tokens = wordpunct_tokenize(doc)

	dictionary = dict()
	for word in tokens:
		if word in vocab:
			wordtk = vocab[word]
			if wordtk not in dictionary:
				dictionary[wordtk] = 1
			else:
				dictionary[wordtk] += 1

	wordslist.append(dictionary.keys())
	countslist.append(dictionary.values())
	return (wordslist[0], countslist[0]) 
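A hedged usage sketch with a made-up vocabulary, assuming the function and its wordpunct_tokenize import are in scope: parseDocument lowercases the document, keeps only in-vocabulary tokens, and returns parallel collections of word ids and counts.

from nltk.tokenize import wordpunct_tokenize

vocab = {"topic": 0, "model": 1, "data": 2}     # hypothetical word -> id map
ids, counts = parseDocument("Topic models model data, data, data.", vocab)
print(list(ids), list(counts))   # e.g. [0, 1, 2] [1, 1, 3]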
Example #6
Source File: util.py    From luscan-devel with GNU General Public License v2.0
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks 
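This helper reads up to 20 lines from an open stream per call and tokenizes them, the block-reader shape that NLTK's stream-backed corpus views consume. A small sketch with an in-memory stream, assuming the function above is in scope:

import io
from nltk.tokenize import wordpunct_tokenize

stream = io.StringIO("First line, with punctuation!\nSecond line.\n")
print(read_wordpunct_block(stream))
# ['First', 'line', ',', 'with', 'punctuation', '!', 'Second', 'line', '.']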
Example #7
Source File: lemma.py    From cltk with MIT License
def lemmatize(self, text, best_guess=True, return_frequencies=False):
		"""Lemmatize all tokens in a string or a list.  A string is first tokenized using punkt.
		Throw a type error if the input is neither a string nor a list.
		"""
		if isinstance(text, str):
			tokens = wordpunct_tokenize(text)
		elif isinstance(text, list):
			tokens = text
		else:
			raise TypeError("lemmatize only works with strings or lists of string tokens.")

		return [self._lemmatize_token(token, best_guess, return_frequencies) for token in tokens] 
Example #8
Source File: preprocessing.py    From idea_relations with MIT License
def tokenize(text, filter_stopwords=False, lowercase=True):
    words = wordpunct_tokenize(text)
    if filter_stopwords:
        words = [w for w in words if w not in STOPWORDS]
    return words 
Example #9
Source File: preprocessing.py    From KATE with BSD 3-Clause "New" or "Revised" License
def tiny_tokenize_xml(text, stem=False, stop_words=[]):
    return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
                        re.sub('[%s]' % re.escape(string.punctuation), ' ', text.encode(encoding='ascii', errors='ignore'))) if
                        not token.isdigit() and token not in stop_words]
Example #10
Source File: strUtil.py    From NNED with MIT License
def locateWord(word, wordsArr):
    if word in wordsArr:
        return wordsArr.index(word)
    else:
        idxs = [wordsArr.index(w) for w in wordsArr if word in wordpunct_tokenize(w)]
        return idxs[0] 
Example #11
Source File: prepareDataSet_joint.py    From NNED with MIT License
def negSent2JointTrain(negSents, posSentNum):
    neg_training_data = []
    for sentId, (sent_id, sent) in enumerate(negSents):
        wordsIn = wordpunct_tokenize(sent)
        sent = " ".join(wordsIn)
        eventTypeSequence = ["O" for i in range(len(wordsIn))]
        neg_training_data.append((str(sentId + posSentNum), sent, eventTypeSequence))
    return neg_training_data 
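A hedged usage sketch with made-up sentences and offset, assuming the function and its wordpunct_tokenize import are in scope: each negative sentence is retokenized, rejoined with single spaces, and paired with an all-"O" label sequence, with ids continuing after the posSentNum positive sentences.

from nltk.tokenize import wordpunct_tokenize

neg_sents = [("n1", "Nothing happened here."), ("n2", "No event, nothing at all.")]
for sent_id, sent, labels in negSent2JointTrain(neg_sents, posSentNum=100):
    print(sent_id, sent, labels)
# 100 Nothing happened here . ['O', 'O', 'O', 'O']
# 101 No event , nothing at all . ['O', 'O', 'O', 'O', 'O', 'O', 'O']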
Example #12
Source File: pos.py    From cltk with MIT License
def tag_perceptron(self, untagged_string: str):
        """Tag POS with Perceptron tagger.
        :type untagged_string: str
        :param : An untagged, untokenized string of text.
        :rtype tagged_text: str
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['perceptron']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Example #13
Source File: utils.py    From dl4ir-query-reformulator with BSD 3-Clause "New" or "Revised" License
def BOW2(texts, vocab, dim):
    '''
    Convert a list of texts to the BoW dense representation.
    '''
    out = np.zeros((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        bow = BOW(wordpunct_tokenize(text), vocab)
        out[i,:len(bow[0])] = bow[0]
        mask[i,:len(bow[1])] = bow[1]

    return out, mask 
Example #14
Source File: utils.py    From dl4ir-query-reformulator with BSD 3-Clause "New" or "Revised" License
def Word2Vec_encode(texts, wemb):
    
    out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)
        n = 0.
        for word in words:
            if word in wemb:
                out[i,:] += wemb[word]
                n += 1.
        out[i,:] /= max(1.,n)

    return out 
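Word2Vec_encode averages the embeddings of in-vocabulary tokens per text and returns a (len(texts), prm.dim_emb) matrix. A hedged sketch with toy embeddings, assuming the function above is pasted into the same module; prm is the project's config module, so a stand-in with the same attribute is defined here.

import numpy as np
from nltk.tokenize import wordpunct_tokenize

class prm:                      # stand-in for the project's config module
    dim_emb = 3

wemb = {"cat": np.ones(3), "dog": np.full(3, 2.0)}   # toy word embeddings
print(Word2Vec_encode(["the cat and the dog", "no known words"], wemb))
# row 0 is the mean of the 'cat' and 'dog' vectors; row 1 stays at zero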
Example #15
Source File: utils.py    From dl4ir-query-reformulator with BSD 3-Clause "New" or "Revised" License
def text2idx2(texts, vocab, dim, use_mask=False):
    '''
    Convert a list of texts to their corresponding vocabulary indexes.
    '''
    
    if use_mask:
        out = -np.ones((len(texts), dim), dtype=np.int32)
        mask = np.zeros((len(texts), dim), dtype=np.float32)
    else:
        out = -2 * np.ones((len(texts), dim), dtype=np.int32)

    out_lst = []
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)[:dim]

        for j, word in enumerate(words):
            if word in vocab:
                out[i,j] = vocab[word]
            else:
                out[i,j] = -1 # Unknown words

        out_lst.append(words)

        if use_mask:
            mask[i,:j] = 1.

    if use_mask:
        return out, mask, out_lst
    else:
        return out, out_lst 
Example #16
Source File: rake.py    From rake-nltk with MIT License
def _generate_phrases(self, sentences):
        """Method to generate contender phrases given the sentences of the text
        document.

        :param sentences: List of strings where each string represents a
                          sentence which forms the text.
        :return: Set of string tuples where each tuple is a collection
                 of words forming a contender phrase.
        """
        phrase_list = set()
        # Create contender phrases from sentences.
        for sentence in sentences:
            word_list = [word.lower() for word in wordpunct_tokenize(sentence)]
            phrase_list.update(self._get_phrase_list_from_words(word_list))
        return phrase_list 
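_generate_phrases is an internal step of rake-nltk's Rake class; phrases are the maximal runs of words between stopwords and punctuation. A sketch of how this step is normally reached through the public API (method names as documented by the project; treat them as an assumption if your version differs). The NLTK stopwords and punkt data must be available.

from rake_nltk import Rake

r = Rake()   # defaults to NLTK English stopwords and standard punctuation
r.extract_keywords_from_text("Compatibility of systems of linear constraints over the set of natural numbers.")
print(r.get_ranked_phrases())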
Example #17
Source File: znltk.py    From csirtg-smrt-v1 with Mozilla Public License 2.0
def top_tokens(text):
    freq_dict = defaultdict(int)
    tokens = wordpunct_tokenize(text)

    for token in tokens:
        freq_dict[token] += 1

    return sorted(freq_dict, key=freq_dict.get, reverse=True) 
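A small usage sketch, assuming the function above is pasted into the same module as these imports: tokens are counted with a defaultdict and returned most-frequent first.

from collections import defaultdict
from nltk.tokenize import wordpunct_tokenize

print(top_tokens("the cat sat on the mat, the end."))
# 'the' (3 occurrences) comes first; the remaining tokens each appear once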
Example #18
Source File: util.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def read_wordpunct_block(stream):
    toks = []
    for i in range(20):  # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks 
Example #19
Source File: transcription.py    From cltk with MIT License
def transcribe(self, text, accentuate=True, syllabify=True):
        # input is word-tokenized, stripped of non-diacritic punctuation, 
        # and diphthongs and diacritics are handled
        inp = [self._prep_text(w) for w in wordpunct_tokenize(text) 
            if w not in self.punc]
        words = []
        for w in inp:
            out = ""
            for c in w:
                ipa = self.table.get(c[0], c[0])
                # if there are macrons in the diacritics, adds the ipa 
                # notation for length (if it isn't there already)
                if chars.LONG in c[2]:
                    if "ː" not in ipa:
                        ipa = ipa[0] + "ː" + ipa[1:]
                if accentuate:
                    # adds proper IPA notation for accents
                    # if circumflex accent, adds appropriate 
                    # ipa tone contour notation
                    if chars.CIRCUMFLEX in c[1]:
                        ipa = ipa[0] + "̂" + ipa[1:]
                    # if acute accent, adds appropriate 
                    # ipa tone contour notation
                    if chars.ACUTE in c[1]:
                        if len(ipa) > 1:
                            ipa = ipa[0] + "́" + ipa[1:]
                        else:
                            ipa += "́"
                out += ipa
            transcription = Word(out, self.root)
            transcription._alternate()
            words.append(transcription)
        # Encloses output in brackets, proper notation for surface form.
        return "[" + " ".join([w._print_ipa(syllabify) for w in words]) + "]" 
Example #20
Source File: transcription.py    From cltk with MIT License
def transcribe(
            self, text, macronize=True, syllabify=True, accentuate=True
            ):
        # if macronize, will first use the tagger to macronize input
        # otherwise, input will be the raw input string
        if macronize:
            text = self.macronizer.macronize_text(text)
        # input is word-tokenized, stripped of non-diacritic punctuation, 
        # and diphthongs and diacritics are handled
        inp = [self._prep_text(w) for w in wordpunct_tokenize(text) 
            if w not in self.punc]
        words = []
        for w in inp:
            out = ""
            for c in w:
                if "̄" in c[1]:
                    macron_added = c[0]+'̄'
                    ipa = self.table.get(macron_added, macron_added)
                else:
                    ipa = self.table.get(c[0], c[0])
                out += ipa
            transcription = Word(out, self.root)
            transcription._alternate()
            words.append(transcription)
        # Encloses output in brackets, proper notation for surface form.
        return "[" + " ".join([w._print_ipa(syllabify, accentuate) 
            for w in words]) + "]" 
Example #21
Source File: pos.py    From cltk with MIT License
def tag_crf(self, untagged_string: str):
        """Tag POS with CRF tagger.
        :type untagged_string: str
        :param : An untagged, untokenized string of text.
        :rtype tagged_text: str
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['crf']
        tagger = CRFTagger()
        tagger.set_model_file(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Example #22
Source File: pos.py    From cltk with MIT License
def tag_ngram_12_backoff(self, untagged_string: str):
        """Tag POS with 1-, 2-gram tagger.
        :type untagged_string: str
        :param : An untagged, untokenized string of text.
        :rtype tagged_text: str
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['ngram_12_backoff']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Example #23
Source File: pos.py    From cltk with MIT License
def tag_ngram_123_backoff(self, untagged_string: str):
        """Tag POS with 1-, 2-, 3-gram tagger.
        :type untagged_string: str
        :param : An untagged, untokenized string of text.
        :rtype tagged_text: str
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['ngram_123_backoff']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Example #24
Source File: pos.py    From cltk with MIT License
def tag_trigram(self, untagged_string: str):
        """Tag POS with trigram tagger.
        :type untagged_string: str
        :param : An untagged, untokenized string of text.
        :rtype tagged_text: str
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['trigram']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Example #25
Source File: pos.py    From cltk with MIT License
def tag_bigram(self, untagged_string: str):
        """Tag POS with bigram tagger.
        :type untagged_string: str
        :param : An untagged, untokenized string of text.
        :rtype tagged_text: str
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['bigram']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Example #26
Source File: pos.py    From cltk with MIT License
def tag_unigram(self, untagged_string: str):
        """Tag POS with unigram tagger.
        :type untagged_string: str
        :param : An untagged, untokenized string of text.
        :rtype tagged_text: str
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['unigram']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Example #27
Source File: utils.py    From dl4ir-webnav with BSD 3-Clause "New" or "Revised" License
def Word2Vec_encode(texts, wemb):
    
    out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)
        n = 0.
        for word in words:
            if word in wemb:
                out[i,:] += wemb[word]
                n += 1.
        out[i,:] /= max(1.,n)

    return out 
Example #28
Source File: utils.py    From dl4ir-webnav with BSD 3-Clause "New" or "Revised" License
def BOW2(texts, vocab, dim):
    '''
    Convert a list of texts to the BoW dense representation.
    '''
    out = np.zeros((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        bow = BOW(wordpunct_tokenize(text), vocab)
        out[i,:len(bow[0])] = bow[0]
        mask[i,:len(bow[1])] = bow[1]

    return out, mask 
Example #29
Source File: transform.py    From RecNet with MIT License
def __call__(self, sentence):
        return wordpunct_tokenize(sentence) 
Example #30
Source File: util.py    From razzy-spinner with GNU General Public License v3.0
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks