Python nltk.stem() Examples
The following are 15 code examples of the nltk.stem module, collected from open-source projects. Each example notes its source file, the project it comes from, and that project's license. You may also want to check out all of the other functions and classes available in the nltk module.
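Before the project examples, here is a minimal self-contained sketch of the nltk.stem classes that the snippets below build on; the sample words are arbitrary and the printed stems are only indicative.

# A quick look at the stemmers and the lemmatizer exposed by nltk.stem.
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, WordNetLemmatizer

porter = PorterStemmer()
snowball = SnowballStemmer("english")    # Snowball needs an explicit language
lancaster = LancasterStemmer()
lemmatizer = WordNetLemmatizer()         # requires the 'wordnet' corpus (nltk.download('wordnet'))

for word in ["running", "flies", "better"]:
    print(word,
          porter.stem(word),                      # e.g. "running" -> "run"
          snowball.stem(word),
          lancaster.stem(word),
          lemmatizer.lemmatize(word, pos="v"))    # lemmatization needs a POS hint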
Example #1
Source File: syntax.py From Gap with Apache License 2.0 | 6 votes |
def _nltkStemmer(self, name):
    """ NLTK Stemmer """
    if name == 'porter':
        stemmer = PorterStemmer()
    elif name == 'snowball':
        stemmer = SnowballStemmer("english")
    elif name == "lancaster":
        stemmer = LancasterStemmer()
    else:
        return

    length = len(self._words)
    for i in range(length):
        word = self._words[i]['word']
        l = len(word)
        # Don't stem short words or words already categorized
        if l < 4 or self._words[i]['tag'] != Vocabulary.UNTAG:
            continue
        self._words[i]['word'] = stemmer.stem(self._words[i]['word'])
Example #2
Source File: TextRank.py From exsto with Apache License 2.0 | 6 votes |
def wrap_words (pair):
    """wrap each (word, tag) pair as an object with fully indexed metadata"""
    global STEMMER
    index = pair[0]
    result = []

    for word, tag in pair[1]:
        word = word.lower()
        stem = STEMMER.stem(word)
        if stem == "":
            stem = word

        keep = tag in ('JJ', 'NN', 'NNS', 'NNP',)
        result.append({"id": 0, "index": index, "stem": stem, "word": word, "tag": tag, "keep": keep})
        index += 1

    return result


######################################################################
## build a graph from raw text
Example #3
Source File: TagPreprocessing.py From RecSys2019_DeepLearning_Evaluation with GNU Affero General Public License v3.0 | 6 votes |
def tagFilterAndStemming(originalTag):
    # Remove non alphabetical character and split on spaces
    processedTag = re.sub("[^a-zA-Z0-9]", " ", originalTag)
    processedTag = re.sub(" +", " ", processedTag)
    processedTag = processedTag.split(" ")

    stopwords_set = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    result = []
    for tag in processedTag:
        tag_stemmed = stemmer.stem(tag)
        if tag_stemmed not in stopwords_set:
            result.append(tag_stemmed)

    return result
Example #4
Source File: combined.py From Projects with MIT License | 5 votes |
def build_analyzer(self):
    analyzer = super(TfidfVectorizer, self).build_analyzer()
    return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))
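This build_analyzer override only makes sense inside a TfidfVectorizer subclass, and english_stemmer is defined elsewhere in the project. A minimal sketch of what that surrounding context could look like; the class name StemmedTfidfVectorizer and the choice of SnowballStemmer are assumptions, not taken from combined.py.

# Hypothetical context for the build_analyzer override above; names are assumptions.
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

english_stemmer = SnowballStemmer("english")

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        # Reuse the stock analyzer, then stem every token it yields.
        analyzer = super(TfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

vectorizer = StemmedTfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(["runners like running", "the runner ran"])
print(sorted(vectorizer.vocabulary_))    # stemmed vocabulary, e.g. ['ran', 'run', 'runner']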
Example #5
Source File: 9.5 Skipgram_Keras.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])
    return pre_proc_text
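A quick call, assuming the imports the snippet relies on (string, nltk, stopwords, pos_tag, PorterStemmer, WordNetLemmatizer) are in place; the sample sentence is arbitrary and the output is only indicative.

# Example call (imports assumed); strips stop words, stems, then lemmatizes by coarse POS.
print(preprocessing("The runners were running quickly in 2019!"))
# roughly: "runner run quickli 2019"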
Example #6
Source File: 9.2 Email_Classification.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]
    except:
        tokens = tokens

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])
    return pre_proc_text
Example #7
Source File: stemming.py From nltk-server with MIT License | 5 votes |
def stemmer(method, data):
    """ Takes an array of words in JSON format. """
    data = parse_input(data)
    if data == False:
        return ret_failure(703)
    else:
        res = []
        if method == "lancaster":
            for word in data:
                try:
                    res.append([word, LancasterSt.stem(word)])
                except:
                    return ret_failure(702)
        elif method == "porter":
            for word in data:
                try:
                    res.append([word, PorterSt.stem(word)])
                except:
                    return ret_failure(702)
        elif method == 'snowball':
            for word in data:
                try:
                    res.append([word, SnowballSt.stem(word)])
                except:
                    return ret_failure(702)
        else:
            abort(404)
        return ret_success(res)
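The function above assumes that LancasterSt, PorterSt, and SnowballSt are stemmer instances created at module level, and that parse_input, ret_success, ret_failure, and abort are helpers defined elsewhere in nltk-server. A plausible sketch of the stemmer setup only; the helpers are not reproduced here.

# Assumed module-level stemmer instances for the dispatch code above.
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

LancasterSt = LancasterStemmer()
PorterSt = PorterStemmer()
SnowballSt = SnowballStemmer("english")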
Example #8
Source File: deploy.py From Election-Meddling with MIT License | 5 votes |
def data_preparation(tweet):
    #nltk.tag._POS_TAGGER
    #treebank tag set https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

    url_regex = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)'

    clean = re.sub(url_regex, '', tweet, flags=re.MULTILINE)              # strip out urls. urls, ew, nasty.
    clean = clean.replace('\n', ' ').replace("'", " ").replace('"', ' ')

    try:
        clean = clean.decode("utf-8-sig").replace(u"\ufffd", "?")         # strip out Byte Order Marks
        print("Detected BOS")
    except:
        pass

    clean = re.sub(r'[^a-zA-Z ]', '', clean, flags=re.MULTILINE)          # the "#" symbol is actually called octothorpe. bananas.

    tokens = splitter.split(clean)                                        # Tokenization
    lemma_pos_token = lemmatization_using_pos_tagger.pos_tag(tokens)      # Part of speech tagging.

    out = ' '.join([out[1] for out in lemma_pos_token[0]])
    return out


'''
#https://pypi.org/project/hunspell/
#Double tokenizing. hunspell for units, nltk for context.

import hunspell

hobj = hunspell.HunSpell('/usr/share/myspell/en_US.dic', '/usr/share/myspell/en_US.aff')
hobj.spell('spookie')
hobj.suggest('spookie')
hobj.spell('spooky')
hobj.analyze('linked')
hobj.stem('linked')
'''
Example #9
Source File: Auto_NLP.py From Auto_ViML with Apache License 2.0 | 5 votes |
def tokenize_and_stem(text):
    stemmer = SnowballStemmer("english")
    text = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", text)

    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)

    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
################################################################################
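A quick call, assuming the usual imports (import re, import nltk, and from nltk.stem.snowball import SnowballStemmer); the sentence is arbitrary.

# Example call (imports assumed); digits and punctuation are dropped before stemming.
print(tokenize_and_stem("The 3 runners were running quickly."))
# returns lower-cased Snowball stems such as 'runner' and 'run'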
Example #10
Source File: DataPrep.py From Fake_News_Detection with MIT License | 5 votes |
def stem_tokens(tokens, stemmer):
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed

#process the data
Example #11
Source File: DataPrep.py From Fake_News_Detection with MIT License | 5 votes |
def process_data(data, exclude_stopword=True, stem=True):
    tokens = [w.lower() for w in data]
    tokens_stemmed = tokens
    tokens_stemmed = stem_tokens(tokens, eng_stemmer)
    tokens_stemmed = [w for w in tokens_stemmed if w not in stopwords]
    return tokens_stemmed

#creating ngrams
#unigram
Example #12
Source File: DataPrep.py From Fake_News_Detection with MIT License | 5 votes |
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

#doc = ['runners like running and thus they run','this is a test for tokens']
#tokenizer([word for line in test_news.iloc[:,1] for word in line.lower().split()])

#show the distribution of labels in the train and test data
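Examples #10 through #12 all come from the same DataPrep.py and rely on module-level objects that are not shown here. A hedged sketch of how the three snippets could fit together; the specific stemmer choices and the stopword set are assumptions, not copied from the project.

# Assumed module-level setup tying Examples #10-#12 together.
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import SnowballStemmer, PorterStemmer

eng_stemmer = SnowballStemmer("english")           # used by process_data via stem_tokens
porter = PorterStemmer()                           # used by tokenizer_porter
stopwords = set(nltk_stopwords.words("english"))   # used by process_data

print(process_data(["Runners", "are", "running", "daily"]))   # e.g. ['runner', 'run', 'daili']
print(tokenizer_porter("runners like running"))               # ['runner', 'like', 'run']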
Example #13
Source File: Chapter 05_KNN n Naive Bayes.py From Statistics-for-Machine-Learning with MIT License | 5 votes |
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])
    return pre_proc_text
Example #14
Source File: prerank.py From nboost with Apache License 2.0 | 5 votes |
def tokenize(self, paragraph):
    words = [self.ps.stem(word) for word in word_tokenize(paragraph)]
    filtered_words = [word for word in words if word not in stopwords.words('english')]
    return filtered_words
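tokenize is a method, so self.ps must be a stemmer created in the enclosing class; the rest of the nboost code is not reproduced here. A minimal hedged sketch of an enclosing class with the same behavior; the class name Prerank is an assumption.

# Hypothetical enclosing class for the tokenize method above.
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

class Prerank:
    def __init__(self):
        self.ps = PorterStemmer()

    def tokenize(self, paragraph):
        words = [self.ps.stem(word) for word in word_tokenize(paragraph)]
        filtered_words = [word for word in words if word not in stopwords.words('english')]
        return filtered_words

print(Prerank().tokenize("Runners were running a relay"))   # e.g. ['runner', 'run', 'relay']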
Example #15
Source File: text_feature_extraction.py From RMDL with GNU General Public License v3.0 | 4 votes |
def text_cleaner(text, deep_clean=False, stem=True, stop_words=True, translite_rate=True):
    rules = [
        {r'>\s+': u'>'},                            # remove spaces after a tag opens or closes
        {r'\s+': u' '},                             # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},                # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},                 # newline after </p> and </div> and <h1/>...
        {r'</(p|h\d)\s*>\s*': u'\n\n'},             # newline after </p> and </div> and <h1/>...
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},   # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},# show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},                 # remove remaining tags
        {r'^\s+': u''}                              # remove spaces at the beginning
    ]

    if deep_clean:
        text = text.replace(".", "")
        text = text.replace("[", " ")
        text = text.replace(",", " ")
        text = text.replace("]", " ")
        text = text.replace("(", " ")
        text = text.replace(")", " ")
        text = text.replace("\"", "")
        text = text.replace("-", " ")
        text = text.replace("=", " ")
        text = text.replace("?", " ")
        text = text.replace("!", " ")

        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
        text = text.rstrip()
        text = text.strip()
        text = text.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
        text = re.sub("(^|\W)\d+($|\W)", " ", text)

        if translite_rate:
            text = transliterate(text)
        if stem:
            text = PorterStemmer().stem(text)
            text = WordNetLemmatizer().lemmatize(text)
        if stop_words:
            stop_words = set(stopwords.words('english'))
            word_tokens = word_tokenize(text)
            text = [w for w in word_tokens if not w in stop_words]
            text = ' '.join(str(e) for e in text)
    else:
        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
        text = text.rstrip()
        text = text.strip()

    return text.lower()
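A quick illustration of text_cleaner, assuming the module-level imports it needs (re, stopwords, word_tokenize, PorterStemmer, WordNetLemmatizer, and the transliterate function it calls) are in place; the HTML fragment is arbitrary.

# Example calls (imports assumed); both return a lower-cased string.
html = "<p>Runners   were running <br> in 2019!</p>"
print(text_cleaner(html))                    # tag and whitespace cleanup only
print(text_cleaner(html, deep_clean=True))   # also drops punctuation/digits, stems, and filters stop words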