Python nltk.tokenize.wordpunct_tokenize() Examples

The following are 30 code examples of nltk.tokenize.wordpunct_tokenize(), drawn from open-source projects. The project and source file for each example are noted above it. You may also want to check out the other functions and classes available in the nltk.tokenize module.
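Before the project code, here is a minimal, self-contained sketch of the tokenizer itself: wordpunct_tokenize splits text into alphanumeric and non-alphanumeric runs (roughly the pattern \w+|[^\w\s]+), so punctuation comes back as separate tokens. The sample sentence is arbitrary.

from nltk.tokenize import wordpunct_tokenize

print(wordpunct_tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']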
Example #1
Source File: data2tensor.py    From deep-summarization with MIT License
def generate_vocabulary(self, review_summary_file):
        """

        :param review_summary_file:
        :return:
        """
        self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values

        for review, summary in self.rev_sum_pair:
            rev_lst = wordpunct_tokenize(review)
            sum_lst = wordpunct_tokenize(summary)
            self.__add_list_to_dict(rev_lst)
            self.__add_list_to_dict(sum_lst)

        # Now store the "" empty string as the last word of the vocabulary
        self.map[""] = len(self.map)
        self.revmap[len(self.map)] = "" 
Example #2
Source File: preprocessing.py    From KATE with BSD 3-Clause "New" or "Revised" License
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ', \
            text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and token not in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)

    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #                     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
    #                     not token.isdigit() and not token in stop_words] 
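The snippet above leaves its imports implicit and expects text to arrive as bytes (it calls .decode on it). A hedged usage sketch, assuming the function has been pasted into a module together with the imports it appears to rely on:

import re
import string

from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import wordpunct_tokenize

# Punctuation is replaced by spaces, digits and stop words are dropped,
# and the remaining tokens are stemmed when stem=True.
print(tiny_tokenize(b"Running 3 quick tests, isn't it?", stem=True, stop_words=["it"]))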
Example #3
Source File: utils.py    From dl4ir-webnav with BSD 3-Clause "New" or "Revised" License
def text2idx2(texts, vocab, dim):
    '''
    Convert a list of texts to their corresponding vocabulary indexes.
    '''
    out = -np.ones((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        j = 0
        for word in wordpunct_tokenize(text):
            if word in vocab:
                out[i,j] = vocab[word]
                mask[i,j] = 1.
                j += 1

                if j == dim:
                    break

    return out, mask 
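A hedged usage sketch for text2idx2 with a made-up vocabulary, assuming the function and its numpy/NLTK imports are in scope: out holds the vocabulary index of each in-vocabulary token (padded with -1) and mask marks the filled positions.

import numpy as np
from nltk.tokenize import wordpunct_tokenize

vocab = {"hello": 0, ",": 1, "world": 2}        # hypothetical word -> index map
out, mask = text2idx2(["hello , world"], vocab, dim=5)
print(out)   # [[ 0  1  2 -1 -1]]
print(mask)  # [[1. 1. 1. 0. 0.]]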
Example #4
Source File: utils.py    From dl4ir-webnav with BSD 3-Clause "New" or "Revised" License
def augment(texts, dic_thes):
    if prm.aug<2:
        return texts

    out = []
    for text in texts:

        words_orig = wordpunct_tokenize(text)
        maxrep = max(2,int(0.1*len(words_orig))) #define how many words will be replaced. For now, leave the maximum number as 10% of the words
        
        for j in range(prm.aug):
            words = list(words_orig) #copy
            for k in range(randint(1,maxrep)):
                idx = randint(0,len(words)-1)
                word = words[idx]
                if word in dic_thes:
                    
                    synonym = min(np.random.geometric(0.5), len(dic_thes[word])-1) # choose the synonym based on a geometric distribution
                    #print 'fp',fp,"word", word,"synonym",dic_thes[word][synonym]
                    words[idx] = dic_thes[word][synonym]

            out.append(" ".join(words))

    return out 
Example #5
Source File: utils.py    From stochasticLDA with GNU General Public License v3.0
def parseDocument(doc, vocab):
	wordslist = list()
	countslist = list()
	doc = doc.lower()
	tokens = wordpunct_tokenize(doc)

	dictionary = dict()
	for word in tokens:
		if word in vocab:
			wordtk = vocab[word]
			if wordtk not in dictionary:
				dictionary[wordtk] = 1
			else:
				dictionary[wordtk] += 1

	wordslist.append(dictionary.keys())
	countslist.append(dictionary.values())
	return (wordslist[0], countslist[0]) 
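A hedged usage sketch with a made-up vocabulary, assuming the function and its wordpunct_tokenize import are in scope: parseDocument lowercases the document, keeps only in-vocabulary tokens, and returns parallel collections of word ids and counts.

from nltk.tokenize import wordpunct_tokenize

vocab = {"topic": 0, "model": 1, "data": 2}     # hypothetical word -> id map
ids, counts = parseDocument("Topic models model data, data, data.", vocab)
print(list(ids), list(counts))   # e.g. [0, 1, 2] [1, 1, 3]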
Example #6
Source File: util.py    From luscan-devel with GNU General Public License v2.0
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks 
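This helper reads up to 20 lines from an open stream per call and tokenizes them, the block-reader shape that NLTK's stream-backed corpus views consume. A small sketch with an in-memory stream, assuming the function above is in scope:

import io
from nltk.tokenize import wordpunct_tokenize

stream = io.StringIO("First line, with punctuation!\nSecond line.\n")
print(read_wordpunct_block(stream))
# ['First', 'line', ',', 'with', 'punctuation', '!', 'Second', 'line', '.']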
Example #7
Source File: lemma.py    From cltk with MIT License
def lemmatize(self, text, best_guess=True, return_frequencies=False):
		"""Lemmatize all tokens in a string or a list.  A string is first tokenized using punkt.
		Throw a type error if the input is neither a string nor a list.
		"""
		if isinstance(text, str):
			tokens = wordpunct_tokenize(text)
		elif isinstance(text, list):
			tokens = text
		else:
			raise TypeError("lemmatize only works with strings or lists of string tokens.")

		return [self._lemmatize_token(token, best_guess, return_frequencies) for token in tokens] 
Example #8
Source File: preprocessing.py    From idea_relations with MIT License
def tokenize(text, filter_stopwords=False, lowercase=True):
    words = wordpunct_tokenize(text)
    if filter_stopwords:
        words = [w for w in words if w not in STOPWORDS]
    return words 
Example #9
Source File: preprocessing.py    From KATE with BSD 3-Clause "New" or "Revised" License
def tiny_tokenize_xml(text, stem=False, stop_words=[]):
    return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
                        re.sub('[%s]' % re.escape(string.punctuation), ' ', text.encode(encoding='ascii', errors='ignore'))) if
                        not token.isdigit() and token not in stop_words]
Example #10
Source File: strUtil.py    From NNED with MIT License
def locateWord(word, wordsArr):
    if word in wordsArr:
        return wordsArr.index(word)
    else:
        idxs = [wordsArr.index(w) for w in wordsArr if word in wordpunct_tokenize(w)]
        return idxs[0] 
Example #11
Source File: prepareDataSet_joint.py    From NNED with MIT License
def negSent2JointTrain(negSents, posSentNum):
    neg_training_data = []
    for sentId, (sent_id, sent) in enumerate(negSents):
        wordsIn = wordpunct_tokenize(sent)
        sent = " ".join(wordsIn)
        eventTypeSequence = ["O" for i in range(len(wordsIn))]
        neg_training_data.append((str(sentId + posSentNum), sent, eventTypeSequence))
    return neg_training_data 
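A hedged usage sketch with made-up sentences and offset, assuming the function and its wordpunct_tokenize import are in scope: each negative sentence is retokenized, rejoined with single spaces, and paired with an all-"O" label sequence, with ids continuing after the posSentNum positive sentences.

from nltk.tokenize import wordpunct_tokenize

neg_sents = [("n1", "Nothing happened here."), ("n2", "No event, nothing at all.")]
for sent_id, sent, labels in negSent2JointTrain(neg_sents, posSentNum=100):
    print(sent_id, sent, labels)
# 100 Nothing happened here . ['O', 'O', 'O', 'O']
# 101 No event , nothing at all . ['O', 'O', 'O', 'O', 'O', 'O', 'O']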
Example #12
Source File: pos.py    From cltk with MIT License
def tag_perceptron(self, untagged_string: str):
        """Tag POS with Perceptron tagger.
        :type untagged_string: str
        :param : An untagged, untokenized string of text.
        :rtype tagged_text: str
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['perceptron']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Example #13
Source File: utils.py    From dl4ir-query-reformulator with BSD 3-Clause "New" or "Revised" License
def BOW2(texts, vocab, dim):
    '''
    Convert a list of texts to the BoW dense representation.
    '''
    out = np.zeros((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        bow = BOW(wordpunct_tokenize(text), vocab)
        out[i,:len(bow[0])] = bow[0]
        mask[i,:len(bow[1])] = bow[1]

    return out, mask 
Example #14
Source File: utils.py    From dl4ir-query-reformulator with BSD 3-Clause "New" or "Revised" License
def Word2Vec_encode(texts, wemb):
    
    out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)
        n = 0.
        for word in words:
            if word in wemb:
                out[i,:] += wemb[word]
                n += 1.
        out[i,:] /= max(1.,n)

    return out 
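Word2Vec_encode averages the embeddings of in-vocabulary tokens per text and returns a (len(texts), prm.dim_emb) matrix. A hedged sketch with toy embeddings, assuming the function above is pasted into the same module; prm is the project's config module, so a stand-in with the same attribute is defined here.

import numpy as np
from nltk.tokenize import wordpunct_tokenize

class prm:                      # stand-in for the project's config module
    dim_emb = 3

wemb = {"cat": np.ones(3), "dog": np.full(3, 2.0)}   # toy word embeddings
print(Word2Vec_encode(["the cat and the dog", "no known words"], wemb))
# row 0 is the mean of the 'cat' and 'dog' vectors; row 1 stays at zero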
Example #15
Source File: utils.py    From dl4ir-query-reformulator with BSD 3-Clause "New" or "Revised" License
def text2idx2(texts, vocab, dim, use_mask=False):
    '''
    Convert a list of texts to their corresponding vocabulary indexes.
    '''
    
    if use_mask:
        out = -np.ones((len(texts), dim), dtype=np.int32)
        mask = np.zeros((len(texts), dim), dtype=np.float32)
    else:
        out = -2 * np.ones((len(texts), dim), dtype=np.int32)

    out_lst = []
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)[:dim]

        for j, word in enumerate(words):
            if word in vocab:
                out[i,j] = vocab[word]
            else:
                out[i,j] = -1 # Unknown words

        out_lst.append(words)

        if use_mask:
            mask[i,:j] = 1.

    if use_mask:
        return out, mask, out_lst
    else:
        return out, out_lst 
Example #16
Source File: rake.py    From rake-nltk with MIT License
def _generate_phrases(self, sentences):
        """Method to generate contender phrases given the sentences of the text
        document.

        :param sentences: List of strings where each string represents a
                          sentence which forms the text.
        :return: Set of string tuples where each tuple is a collection
                 of words forming a contender phrase.
        """
        phrase_list = set()
        # Create contender phrases from sentences.
        for sentence in sentences:
            word_list = [word.lower() for word in wordpunct_tokenize(sentence)]
            phrase_list.update(self._get_phrase_list_from_words(word_list))
        return phrase_list 
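_generate_phrases is an internal step of rake-nltk's Rake class; phrases are the maximal runs of words between stopwords and punctuation. A sketch of how this step is normally reached through the public API (method names as documented by the project; treat them as an assumption if your version differs). The NLTK stopwords and punkt data must be available.

from rake_nltk import Rake

r = Rake()   # defaults to NLTK English stopwords and standard punctuation
r.extract_keywords_from_text("Compatibility of systems of linear constraints over the set of natural numbers.")
print(r.get_ranked_phrases())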
Example #17
Source File: znltk.py    From csirtg-smrt-v1 with Mozilla Public License 2.0
def top_tokens(text):
    freq_dict = defaultdict(int)
    tokens = wordpunct_tokenize(text)

    for token in tokens:
        freq_dict[token] += 1

    return sorted(freq_dict, key=freq_dict.get, reverse=True) 
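A small usage sketch, assuming the function above is pasted into the same module as these imports: tokens are counted with a defaultdict and returned most-frequent first.

from collections import defaultdict
from nltk.tokenize import wordpunct_tokenize

print(top_tokens("the cat sat on the mat, the end."))
# 'the' (3 occurrences) comes first; the remaining tokens each appear once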
Example #18
Source File: util.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def read_wordpunct_block(stream):
    toks = []
    for i in range(20):  # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks 
Example #19
Source File: transcription.py    From cltk with MIT License
def transcribe(self, text, accentuate=True, syllabify=True):
        # input is word-tokenized, stripped of non-diacritic punctuation, 
        # and diphthongs and diacritics are handled
        inp = [self._prep_text(w) for w in wordpunct_tokenize(text) 
            if w not in self.punc]
        words = []
        for w in inp:
            out = ""
            for c in w:
                ipa = self.table.get(c[0], c[0])
                # if there are macrons in the diacritics, adds the ipa 
                # notation for length (if it isn't there already)
                if chars.LONG in c[2]:
                    if "ː" not in ipa:
                        ipa = ipa[0] + "ː" + ipa[1:]
                if accentuate:
                    # adds proper IPA notation for accents
                    # if circumflex accent, adds appropriate 
                    # ipa tone contour notation
                    if chars.CIRCUMFLEX in c[1]:
                        ipa = ipa[0] + "̂" + ipa[1:]
                    # if acute accent, adds appropriate 
                    # ipa tone contour notation
                    if chars.ACUTE in c[1]:
                        if len(ipa) > 1:
                            ipa = ipa[0] + "́" + ipa[1:]
                        else:
                            ipa += "́"
                out += ipa
            transcription = Word(out, self.root)
            transcription._alternate()
            words.append(transcription)
        # Encloses output in brackets, proper notation for surface form.
        return "[" + " ".join([w._print_ipa(syllabify) for w in words]) + "]" 
Example #20
Source File: transcription.py    From cltk with MIT License
def transcribe(
            self, text, macronize=True, syllabify=True, accentuate=True
            ):
        # if macronize, will first use the tagger to macronize input
        # otherwise, input will be the raw input string
        if macronize:
            text = self.macronizer.macronize_text(text)
        # input is word-tokenized, stripped of non-diacritic punctuation, 
        # and diphthongs and diacritics are handled
        inp = [self._prep_text(w) for w in wordpunct_tokenize(text) 
            if w not in self.punc]
        words = []
        for w in inp:
            out = ""
            for c in w:
                if "̄" in c[1]:
                    macron_added = c[0]+'̄'
                    ipa = self.table.get(macron_added, macron_added)
                else:
                    ipa = self.table.get(c[0], c[0])
                out += ipa
            transcription = Word(out, self.root)
            transcription._alternate()
            words.append(transcription)
        # Encloses output in brackets, proper notation for surface form.
        return "[" + " ".join([w._print_ipa(syllabify, accentuate) 
            for w in words]) + "]" 
Example #21
Source File: pos.py    From cltk with MIT License
def tag_crf(self, untagged_string: str):
        """Tag POS with CRF tagger.
        :type untagged_string: str
        :param : An untagged, untokenized string of text.
        :rtype tagged_text: str
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['crf']
        tagger = CRFTagger()
        tagger.set_model_file(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Example #22
Source File: pos.py    From cltk with MIT License
def tag_ngram_12_backoff(self, untagged_string: str):
        """Tag POS with 1-, 2-gram tagger.
        :type untagged_string: str
        :param : An untagged, untokenized string of text.
        :rtype tagged_text: str
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['ngram_12_backoff']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Example #23
Source File: pos.py    From cltk with MIT License
def tag_ngram_123_backoff(self, untagged_string: str):
        """Tag POS with 1-, 2-, 3-gram tagger.
        :type untagged_string: str
        :param : An untagged, untokenized string of text.
        :rtype tagged_text: str
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['ngram_123_backoff']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Example #24
Source File: pos.py    From cltk with MIT License
def tag_trigram(self, untagged_string: str):
        """Tag POS with trigram tagger.
        :type untagged_string: str
        :param : An untagged, untokenized string of text.
        :rtype tagged_text: str
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['trigram']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Example #25
Source File: pos.py    From cltk with MIT License
def tag_bigram(self, untagged_string: str):
        """Tag POS with bigram tagger.
        :type untagged_string: str
        :param : An untagged, untokenized string of text.
        :rtype tagged_text: str
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['bigram']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Example #26
Source File: pos.py    From cltk with MIT License
def tag_unigram(self, untagged_string: str):
        """Tag POS with unigram tagger.
        :type untagged_string: str
        :param : An untagged, untokenized string of text.
        :rtype tagged_text: str
        """
        untagged_tokens = wordpunct_tokenize(untagged_string)
        pickle_path = self.available_taggers['unigram']
        tagger = open_pickle(pickle_path)
        tagged_text = tagger.tag(untagged_tokens)
        return tagged_text 
Example #27
Source File: utils.py    From dl4ir-webnav with BSD 3-Clause "New" or "Revised" License
def Word2Vec_encode(texts, wemb):
    
    out = np.zeros((len(texts), prm.dim_emb), dtype=np.float32)
    for i, text in enumerate(texts):
        words = wordpunct_tokenize(text)
        n = 0.
        for word in words:
            if word in wemb:
                out[i,:] += wemb[word]
                n += 1.
        out[i,:] /= max(1.,n)

    return out 
Example #28
Source File: utils.py    From dl4ir-webnav with BSD 3-Clause "New" or "Revised" License
def BOW2(texts, vocab, dim):
    '''
    Convert a list of texts to the BoW dense representation.
    '''
    out = np.zeros((len(texts), dim), dtype=np.int32)
    mask = np.zeros((len(texts), dim), dtype=np.float32)
    for i, text in enumerate(texts):
        bow = BOW(wordpunct_tokenize(text), vocab)
        out[i,:len(bow[0])] = bow[0]
        mask[i,:len(bow[1])] = bow[1]

    return out, mask 
Example #29
Source File: transform.py    From RecNet with MIT License
def __call__(self, sentence):
        return wordpunct_tokenize(sentence) 
Example #30
Source File: util.py    From razzy-spinner with GNU General Public License v3.0
def read_wordpunct_block(stream):
    toks = []
    for i in range(20): # Read 20 lines at a time.
        toks.extend(wordpunct_tokenize(stream.readline()))
    return toks