Python nltk.word_tokenize() Examples
The following are 30 code examples of nltk.word_tokenize(), drawn from open-source projects. The original project and source file for each example are noted above it. You may also want to check out the other available functions and classes of the nltk module.
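Before the project examples, here is a minimal usage sketch of the function itself (the sample sentence below is made up). nltk.word_tokenize() relies on the Punkt tokenizer models, so they need to be downloaded once:

import nltk

nltk.download('punkt')  # one-time download; newer NLTK releases may also ask for 'punkt_tab'

text = "NLTK makes tokenization easy. It splits punctuation, too!"
tokens = nltk.word_tokenize(text)
print(tokens)
# ['NLTK', 'makes', 'tokenization', 'easy', '.', 'It', 'splits', 'punctuation', ',', 'too', '!']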
Example #1
Source File: kaggle.py From dl-models-for-qa with Apache License 2.0 | 8 votes |
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples
Example #2
Source File: math_expression_calculator.py From JARVIS with Apache License 2.0 | 8 votes |
def text_to_num(text):
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    for subtree in chunkedData.subtrees(filter=lambda t: t.label() in "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
        print("text2num -> ", text)
    return text
Example #3
Source File: utils.py From lisc with Apache License 2.0 | 6 votes |
def convert_string(text):
    """Convert a str of text into tokenized and selected list of words.

    Parameters
    ----------
    text : str
        Text as one long string.

    Returns
    -------
    words_cleaned : list of str
        List of tokenized words, after processing.

    Notes
    -----
    This function sets text to lower case, and removes stopwords and punctuation.
    """

    words = word_tokenize(text)
    words_cleaned = [word.lower() for word in words if (
        (not word.lower() in stopwords.words('english')) & word.isalnum())]

    return words_cleaned
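For reference, a hypothetical call of the function above (it assumes the NLTK 'punkt' and 'stopwords' data are available and the imports used by the snippet):

print(convert_string("The quick brown fox jumps over the lazy dog!"))
# ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']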
Example #4
Source File: nltk_plugin.py From self-attentive-parser with MIT License | 6 votes |
def _nltk_process_sents(self, sents):
    for sentence in sents:
        if isinstance(sentence, STRING_TYPES):
            if self._tokenizer_lang is None:
                raise ValueError(
                    "No word tokenizer available for this language. "
                    "Please tokenize before calling the parser."
                )
            sentence = nltk.word_tokenize(sentence, self._tokenizer_lang)

        if IS_PY2:
            sentence = [
                word.decode('utf-8', 'ignore') if isinstance(word, str) else word
                for word in sentence
            ]

        if not self._provides_tags:
            sentence = nltk.pos_tag(sentence)
            yield [word for word, tag in sentence], sentence
        else:
            yield sentence, sentence
Example #5
Source File: utils.py From Text-Classification-Models-Pytorch with MIT License | 6 votes |
def encode_text(text, word_embeddings, max_sen_len):
    '''
    Encode a sequence of words into corresponding vector representation
    Input:
        text (string) : text (space separated words, etc..)
        word_embeddings (dict) : dictionary mapping from words to their representation
        max_sen_len (int) : maximum sentence length (in words)
    Returns:
        X (np.matrix) : matrix of shape (max_sen_len, embedding_size) after zero padding
    '''
    default_embed = np.zeros(300)
    words = word_tokenize(text)[:max_sen_len]
    embeds = [word_embeddings.get(x, default_embed) for x in words]
    embeds += [default_embed] * (max_sen_len - len(embeds))
    return np.array(embeds, dtype=np.float32)
Example #6
Source File: utils.py From Text-Classification-Models-Pytorch with MIT License | 6 votes |
def encode_text(text, word_embeddings):
    '''
    Encode a sequence of words into corresponding vector representation
    Input:
        text (string) : text (space separated words, etc..)
        word_embeddings (dict) : dictionary mapping from words to their representation
    Returns:
        X (np.array) : array of shape (embedding_size,) averaging all word vectors of text
    '''
    embed = np.zeros(300)
    count = 0
    words = word_tokenize(text)
    for word in words:
        if word in word_embeddings:
            embed += word_embeddings[word]
            count += 1
    return embed / count
Example #7
Source File: VectorSpaceModel.py From Snowball with GNU General Public License v3.0 | 6 votes |
def __init__(self, sentences_file, stopwords):
    self.dictionary = None
    self.corpus = None
    f_sentences = codecs.open(sentences_file, encoding='utf-8')
    documents = list()
    count = 0
    print("Gathering sentences and removing stopwords")
    for line in f_sentences:
        line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

        # remove stop words and tokenize
        document = [word for word in nltk.word_tokenize(line.lower()) if word not in stopwords]
        documents.append(document)
        count += 1
        if count % 10000 == 0:
            sys.stdout.write(".")

    f_sentences.close()

    self.dictionary = corpora.Dictionary(documents)
    self.corpus = [self.dictionary.doc2bow(text) for text in documents]
    self.tf_idf_model = TfidfModel(self.corpus)

    print(len(documents), "documents read")
    print(len(self.dictionary), "unique tokens")
Example #8
Source File: combined.py From Projects with MIT License | 6 votes |
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text)
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos + tuple[0].lower()][0]
            neg_score += sentiwordnet[pos + tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score
Example #9
Source File: sentiwordnet.py From Projects with MIT License | 6 votes |
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text.decode('utf8', 'ignore').replace('<br />', ' '))
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos + tuple[0].lower()][0]
            neg_score += sentiwordnet[pos + tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score
Example #10
Source File: evaluate.py From BERT with Apache License 2.0 | 6 votes |
def mixed_segmentation(in_str, rm_punc=False):
    in_str = str(in_str).decode('utf-8').lower().strip()
    segs_out = []
    temp_str = ""
    sp_char = ['-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=',
               ',', '。', ':', '?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、',
               '「', '」', '(', ')', '-', '~', '『', '』']
    for char in in_str:
        if rm_punc and char in sp_char:
            continue
        if re.search(ur'[\u4e00-\u9fa5]', char) or char in sp_char:
            if temp_str != "":
                ss = nltk.word_tokenize(temp_str)
                segs_out.extend(ss)
                temp_str = ""
            segs_out.append(char)
        else:
            temp_str += char

    # handling last part
Example #11
Source File: nlp-6.4-tfidf-svm.py From Hands-on-NLP-with-NLTK-and-scikit-learn- with MIT License | 6 votes |
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    stop_words = nltk.corpus.stopwords.words("english")

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear more than once
        ngram_range=(1, 2),
        stop_words=stop_words)
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus
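A rough usage sketch of the function above (the toy corpus is made up; it assumes `from sklearn import feature_extraction`, `import nltk`, and the NLTK stopword and Punkt data already downloaded):

corpus = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "Cats and dogs sat together on the mat.",
]
tfidf_matrix = extract_features(corpus)
# One row per document; columns are the unigrams/bigrams kept by min_df=2.
print(tfidf_matrix.shape)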
Example #12
Source File: analyze_data.py From Machine-Translation with Apache License 2.0 | 6 votes |
def analyze_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []

    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        seg_list = list(jieba.cut(sentence.strip()))
        # Update word frequency
        sent_lengths.append(len(seg_list))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show()
Example #13
Source File: nlp-5-document-classification.py From Hands-on-NLP-with-NLTK-and-scikit-learn- with MIT License | 6 votes |
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    stop_words = nltk.corpus.stopwords.words("english")

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear more than once
        ngram_range=(1, 2),
        stop_words=stop_words)
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus
Example #14
Source File: data_prep.py From Document-Classifier-LSTM with MIT License | 6 votes |
def preprocess(text):
    min_length = 3
    text = re.sub('\d+', '#', text)
    text = re.sub('\.', ' eos ', text)
    # Tokenize
    words = map(lambda word: word.lower(), word_tokenize(text))
    tokens = words
    # Remove non characters
    p = re.compile('[a-zA-Z#]+')
    # Filter tokens (we do not remove stopwords)
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token) >= min_length and (token not in english_stopwords), tokens))
    # Encode to ascii
    filtered_tokens = [token.encode('ascii', 'ignore') for token in filtered_tokens]

    return filtered_tokens


# Modify this path
Example #15
Source File: Mem2Seq.py From ConvLab with MIT License | 6 votes |
def predict(self, query):
    usr = query
    print('Mem2Seq usr:', usr)  # example input: 'please find a restaurant called nusha .'
    self.t += 1
    print('Mem2Seq turn:', self.t)
    usr = ' '.join(word_tokenize(usr.lower()))
    self.memory += generate_memory(usr, '$u', self.t)
    src_plain = (self.memory + [['$$$$'] * MEM_TOKEN_SIZE],)
    src_seqs = plain2tensor(self.lang.word2index, src_plain[0])
    words = self.model.evaluate_batch(1, src_seqs, [len(src_plain[0])], None, None, None, None, src_plain)
    row = np.transpose(words)[0].tolist()
    if '<EOS>' in row:
        row = row[:row.index('<EOS>')]
    sys = ' '.join(row)
    sys = denormalize(sys)
    print('Mem2Seq sys:', sys)
    self.memory += generate_memory(sys, '$s', self.t)
    return sys
Example #16
Source File: phrasemachine.py From scattertext with Apache License 2.0 | 6 votes |
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    # return pkgutil.get_data('scattertext',
    #                         'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    # tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    # tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))

    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)


# http://www.nltk.org/book/ch05.html
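The comments in the snippet above note that nltk.word_tokenize() wraps the Treebank word tokenizer. A small sketch of the two call styles (the sample sentence is made up):

import nltk
from nltk.tokenize import TreebankWordTokenizer

sentence = "This parser doesn't need the downloader."

# nltk.word_tokenize first runs Punkt sentence splitting (downloaded data),
# then applies the Treebank word tokenizer to each sentence.
print(nltk.word_tokenize(sentence))

# Calling the Treebank tokenizer directly is pure regex work, so no download is needed.
print(TreebankWordTokenizer().tokenize(sentence))
# Both print something like: ['This', 'parser', 'does', "n't", 'need', 'the', 'downloader', '.']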
Example #17
Source File: classify.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License | 5 votes |
def prepare_sent_features():
    for pid, text in fetch_posts(chosen, with_index=True):
        if not text:
            meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0
        else:
            sent_lens = [len(nltk.word_tokenize(sent))
                         for sent in nltk.sent_tokenize(text)]
            meta[pid]['AvgSentLen'] = np.mean(sent_lens)
            meta[pid]['AvgWordLen'] = np.mean(
                [len(w) for w in nltk.word_tokenize(text)])

        meta[pid]['NumAllCaps'] = np.sum(
            [word.isupper() for word in nltk.word_tokenize(text)])
        meta[pid]['NumExclams'] = text.count('!')
Example #18
Source File: tokenizers.py From atis with MIT License | 5 votes |
def nl_tokenize(string):
    """Tokenizes a natural language string into tokens.

    Inputs:
        string: the string to tokenize.
    Outputs:
        a list of tokens.

    Assumes data is space-separated (this is true of ZC07 data in ATIS2/3).
    """
    return nltk.word_tokenize(string)
Example #19
Source File: EvaluateTruecaser.py From truecaser with Apache License 2.0 | 5 votes |
def evaluateTrueCaser(testSentences, wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist):
    correctTokens = 0
    totalTokens = 0

    for sentence in testSentences:
        tokensCorrect = nltk.word_tokenize(sentence)
        tokens = [token.lower() for token in tokensCorrect]
        tokensTrueCase = getTrueCase(tokens, 'title', wordCasingLookup, uniDist, backwardBiDist, forwardBiDist, trigramDist)

        perfectMatch = True

        for idx in xrange(len(tokensCorrect)):
            totalTokens += 1
            if tokensCorrect[idx] == tokensTrueCase[idx]:
                correctTokens += 1
            else:
                perfectMatch = False

        if not perfectMatch:
            print tokensCorrect
            print tokensTrueCase
            print "-------------------"

    print "Accuracy: %.2f%%" % (correctTokens / float(totalTokens) * 100)
Example #20
Source File: yelp.py From Point-Then-Operate with Apache License 2.0 | 5 votes |
def get_references(self):
    assert self.mode == 'test', 'Only test mode support get_references().'
    path0 = os.path.join(self.root, 'reference.0')
    path1 = os.path.join(self.root, 'reference.1')
    ref0 = []
    ori0 = []
    ref1 = []
    ori1 = []
    with open(path0) as f:
        for i, line in enumerate(f):
            if i in self.remove0:
                continue
            ori, ref = line.split('\t')
            ori = ori.split()
            ref = word_tokenize(ref.lower())
            ori0.append(ori)
            ref0.append(ref)
    with open(path1) as f:
        for i, line in enumerate(f):
            if i in self.remove1:
                continue
            ori, ref = line.split('\t')
            ori = ori.split()
            ref = word_tokenize(ref.lower())
            ori1.append(ori)
            ref1.append(ref)
    ori0 = [[w if w in self.vocab.word2id else self.vocab.id2word[self.unk] for w in sent] for sent in ori0]
    ref0 = [[w if w in self.vocab.word2id else self.vocab.id2word[self.unk] for w in sent] for sent in ref0]
    ori1 = [[w if w in self.vocab.word2id else self.vocab.id2word[self.unk] for w in sent] for sent in ori1]
    ref1 = [[w if w in self.vocab.word2id else self.vocab.id2word[self.unk] for w in sent] for sent in ref1]
    return ori0, ref0, ori1, ref1
Example #21
Source File: amazon.py From Point-Then-Operate with Apache License 2.0 | 5 votes |
def create_resort(self):
    """The file of human references is not originally aligned with the test split."""
    _ids = []
    ref_gt = []
    with open(os.path.join(self.root, 'reference.0'), 'r') as f:
        for line in tqdm(f):
            ori, ref = line.split('\t')
            ori = ori.split()
            ref = word_tokenize(ref.lower())
            ref_gt.append(ori)
    for sent in self.data0:
        _ids.append(ref_gt.index(sent))
    with open(os.path.join(self.root, 'resort_0.txt'), 'w') as f:
        for _id in _ids:
            f.write(str(_id) + '\n')

    _ids = []
    ref_gt = []
    with open(os.path.join(self.root, 'reference.1'), 'r') as f:
        for line in tqdm(f):
            ori, ref = line.split('\t')
            ori = ori.split()
            ref = word_tokenize(ref.lower())
            ref_gt.append(ori)
    for sent in self.data1:
        _ids.append(ref_gt.index(sent))
    with open(os.path.join(self.root, 'resort_1.txt'), 'w') as f:
        for _id in _ids:
            f.write(str(_id) + '\n')
Example #22
Source File: tokens.py From chimera with MIT License | 5 votes |
def tokenize(s):
    return word_tokenize(s.replace("|", " | "))
Example #23
Source File: flashcards-embedding.py From dl-models-for-qa with Apache License 2.0 | 5 votes |
def __iter__(self):
    for line in open(self.filename, "rb"):
        line = line.strip()
        line = line.decode("utf8").encode("ascii", "ignore")
        _, question, answer = line.split("\t")
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        yield qwords + awords


# build model from sentences (CBOW w/negative sampling)
Example #24
Source File: textpro.py From comparable-text-miner with Apache License 2.0 | 5 votes |
def getLemma(text, contextFlag=False):
    lemmatizer = WordNetLemmatizer()
    # 'NN':wordnet.NOUN, 'JJ':wordnet.ADJ, 'VB':wordnet.VERB, 'RB':wordnet.ADV
    wordnet_tag = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
    result = None
    if len(text.split()) == 1:  # one word
        tokenized = word_tokenize(text)
        tagged = pos_tag(tokenized)[0]
        lemma = ''
        try:
            lemma = lemmatizer.lemmatize(tagged[0], wordnet_tag[tagged[1][:2]])
        except:
            lemma = lemmatizer.lemmatize(tagged[0])
        result = lemma
    elif len(text.split()) > 1 and contextFlag == True:  # multiple words, i.e. text, without considering the context
        resultList = []
        for t in text.split():
            tokenized = word_tokenize(t)
            tagged = pos_tag(tokenized)[0]
            lemma = ''
            try:
                lemma = lemmatizer.lemmatize(tagged[0], wordnet_tag[tagged[1][:2]])
            except:
                lemma = lemmatizer.lemmatize(tagged[0])
            resultList.append(lemma)
        result = ' '.join(resultList)
    else:  # multiple words, i.e. text, considering the context
        resultList = []
        tokens = word_tokenize(text)
        tagged = pos_tag(tokens)
        for t in tagged:
            try:
                resultList.append(lemmatizer.lemmatize(t[0], wordnet_tag[t[1][:2]]))
            except:
                resultList.append(lemmatizer.lemmatize(t[0]))
        result = ' '.join(resultList)
    return result

###################################################################################

# Given a Naive Bayes classifier, classify a text with a given certainty
Example #25
Source File: rd_ft.py From DeepLearn with MIT License | 5 votes |
def nouns(text):
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)]
    return nouns


# Average edit distance value for two strings, and the average edit distance between the nouns present in them (returns float)
Example #26
Source File: hnd_ft.py From DeepLearn with MIT License | 5 votes |
def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]
Example #27
Source File: data.py From fever-naacl-2018 with Apache License 2.0 | 5 votes |
def nltk_tokenizer(self, text):
    return " ".join(word_tokenize(text))
Example #28
Source File: chatbot.py From facebook-chatbot-python with MIT License | 5 votes |
def __init__(self, text):
    tokens = nltk.word_tokenize(text)
    print "tokenized"
    self._model = nltk.model.ngram.NgramModel(3, tokens)
    print "modelized"
Example #29
Source File: feature_engineering.py From DeepLearn with MIT License | 5 votes |
def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]
Example #30
Source File: preprocessors.py From philo2vec with MIT License | 5 votes |
def get_words(text_stream):
    """
    Tokenize and transform a stream of text.

    :param text_stream: list of sentences.
    :return: return the tokenized sentences after stemming and lower casing
    """
    return [StemmingLookup.stem(word.lower())
            for line in text_stream
            for word in word_tokenize(line)
            if word not in string.punctuation]