Python nltk.stem.PorterStemmer() Examples
The following are 23 code examples of nltk.stem.PorterStemmer(), drawn from open-source projects.
You may also want to check out all available functions and classes of the nltk.stem module.
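For orientation before the project examples, here is a minimal usage sketch (the sample words are chosen purely for illustration):

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
for word in ['running', 'flies', 'denied']:
    # stem() reduces a single word to its Porter stem,
    # e.g. 'running' -> 'run', 'flies' -> 'fli', 'denied' -> 'deni'
    print(stemmer.stem(word))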
Example #1
Source File: porter.py From razzy-spinner with GNU General Public License v3.0
def __repr__(self):
    return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            with open(f, 'r') as infile:
#                while 1:
#                    w = infile.readline()
#                    if w == '':
#                        break
#                    w = w[:-1]
#                    print(p.stem(w))

##--NLTK--
## Added a demo() function
Example #2
Source File: porter.py From luscan-devel with GNU General Public License v2.0
def __repr__(self):
    return '<PorterStemmer>'

## --NLTK--
## This test procedure isn't applicable.
#if __name__ == '__main__':
#    p = PorterStemmer()
#    if len(sys.argv) > 1:
#        for f in sys.argv[1:]:
#            infile = open(f, 'r')
#            while 1:
#                w = infile.readline()
#                if w == '':
#                    break
#                w = w[:-1]
#                print p.stem(w)

##--NLTK--
## Added a demo() function
Example #3
Source File: text.py From textplot with MIT License
def plot_term_kdes(self, words, **kwargs):

    """
    Plot kernel density estimates for multiple words.

    Args:
        words (list): A list of unstemmed terms.
    """

    stem = PorterStemmer().stem

    for word in words:
        kde = self.kde(stem(word), **kwargs)
        plt.plot(kde)

    plt.show()
Example #4
Source File: utils.py From textplot with MIT License
def tokenize(text):

    """
    Yield tokens.

    Args:
        text (str): The original text.

    Yields:
        dict: The next token.
    """

    stem = PorterStemmer().stem
    tokens = re.finditer('[a-z]+', text.lower())

    for offset, match in enumerate(tokens):

        # Get the raw token.
        unstemmed = match.group(0)

        # Emit the token.
        yield {
            'stemmed': stem(unstemmed),
            'unstemmed': unstemmed,
            'offset': offset
        }
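Assuming `re` and `PorterStemmer` are imported at module level, as the function requires, the generator can be consumed directly; the output below is approximate:

for token in tokenize("Cats were running"):
    print(token)
# {'stemmed': 'cat',  'unstemmed': 'cats',    'offset': 0}
# {'stemmed': 'were', 'unstemmed': 'were',    'offset': 1}
# {'stemmed': 'run',  'unstemmed': 'running', 'offset': 2}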
Example #5
Source File: data_cleaning.py From Hands-On-Ensemble-Learning-with-Python with MIT License
def preprocess(string):
    stemmer = PorterStemmer()

    # Remove any punctuation character
    removed_punc = ''.join([char for char in string if char not in punctuation])

    cleaned = []
    # Remove any stopword
    for word in removed_punc.split(' '):
        if word not in stops:
            cleaned.append(stemmer.stem(word.lower()))

    return ' '.join(cleaned)

# Shuffle
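The helper relies on two names defined elsewhere in the script; a reasonable reading is that `punctuation` comes from the `string` module and `stops` is an NLTK stopword set. A hypothetical invocation under that assumption:

from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stops = set(stopwords.words('english'))  # assumed definition of `stops`

print(preprocess("The cats, running quickly, jumped!"))
# roughly: 'the cat run quickli jump' -- note the stopword check happens
# before lowercasing, so a capitalized 'The' survives the filter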
Example #6
Source File: Word_Frequency_Summarization.py From nlp-akash with MIT License
def _create_frequency_table(text_string) -> dict:
    """
    we create a dictionary for the word frequency table.
    For this, we should only use the words that are not part of the stopWords array.

    Removing stop words and making frequency table
    Stemmer - an algorithm to bring words to its root word.
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable
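A possible invocation, assuming the 'punkt' and 'stopwords' NLTK data packages are installed (nltk.download('punkt'), nltk.download('stopwords')):

table = _create_frequency_table("The dogs are barking loudly.")
print(table)
# counts keyed by stem, e.g. 'dog' and 'bark'; note the function stems
# *before* the stopword check, so a stopword whose stem differs from its
# surface form (and punctuation tokens) can still end up in the table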
Example #7
Source File: TagPreprocessing.py From RecSys2019_DeepLearning_Evaluation with GNU Affero General Public License v3.0
def tagFilterAndStemming(originalTag):

    # Remove non alphabetical character and split on spaces
    processedTag = re.sub("[^a-zA-Z0-9]", " ", originalTag)
    processedTag = re.sub(" +", " ", processedTag)

    processedTag = processedTag.split(" ")

    stopwords_set = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    result = []

    for tag in processedTag:
        tag_stemmed = stemmer.stem(tag)
        if tag_stemmed not in stopwords_set:
            result.append(tag_stemmed)

    return result
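A hypothetical call, assuming the 'stopwords' corpus is available:

tags = tagFilterAndStemming("Sci-Fi & Action Movies!")
# roughly ['sci', 'fi', 'action', 'movi', ''] -- because the regex turns
# '&' and '!' into spaces, the final split(" ") can also yield empty
# strings, which pass through the stemmer unchanged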
Example #8
Source File: prerank.py From nboost with Apache License 2.0
def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.ps = PorterStemmer()
Example #9
Source File: word_stemmer.py From stog with MIT License
def __init__(self):
    self.stemmer = NltkPorterStemmer()
Example #10
Source File: porter.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results + ' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original + ' ').rstrip()

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*' * 70)
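The demo depends on the Penn Treebank sample distributed with NLTK's data collection; if it is missing, a setup along these lines should fetch it first:

import nltk

# fetch the treebank sample if it is not already installed;
# the demo then prints treebank.fileids()[:3] alongside the
# stemmed text, word-wrapped at 70 columns
nltk.download('treebank')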
Example #11
Source File: porter.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def __repr__(self):
    return '<PorterStemmer>'
Example #12
Source File: porter.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def __init__(self, mode=NLTK_EXTENSIONS):
    if mode not in (
        self.NLTK_EXTENSIONS,
        self.MARTIN_EXTENSIONS,
        self.ORIGINAL_ALGORITHM,
    ):
        raise ValueError(
            "Mode must be one of PorterStemmer.NLTK_EXTENSIONS, "
            "PorterStemmer.MARTIN_EXTENSIONS, or "
            "PorterStemmer.ORIGINAL_ALGORITHM"
        )

    self.mode = mode

    if self.mode == self.NLTK_EXTENSIONS:
        # This is a table of irregular forms. It is quite short,
        # but still reflects the errors actually drawn to Martin
        # Porter's attention over a 20 year period!
        irregular_forms = {
            "sky": ["sky", "skies"],
            "die": ["dying"],
            "lie": ["lying"],
            "tie": ["tying"],
            "news": ["news"],
            "inning": ["innings", "inning"],
            "outing": ["outings", "outing"],
            "canning": ["cannings", "canning"],
            "howe": ["howe"],
            "proceed": ["proceed"],
            "exceed": ["exceed"],
            "succeed": ["succeed"],
        }

        self.pool = {}
        for key in irregular_forms:
            for val in irregular_forms[key]:
                self.pool[val] = key

    self.vowels = frozenset(['a', 'e', 'i', 'o', 'u'])
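The three mode constants select between NLTK's extended behaviour, Martin Porter's later extensions, and the original 1980 algorithm. A short sketch of how the mode can change output (the second comment describes the plain rules, not a guarantee for every NLTK version):

from nltk.stem.porter import PorterStemmer

nltk_mode = PorterStemmer(mode=PorterStemmer.NLTK_EXTENSIONS)
orig_mode = PorterStemmer(mode=PorterStemmer.ORIGINAL_ALGORITHM)

# 'dying' is in the NLTK irregular-forms pool shown above
print(nltk_mode.stem('dying'))  # 'die', via the pool
print(orig_mode.stem('dying'))  # the bare 1980 rules produce 'dy' instead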
Example #13
Source File: Chapter 05_KNN n Naive Bayes.py From Statistics-for-Machine-Learning with MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text
Example #14
Source File: input_representation.py From ai-research-keyphrase-extraction with Apache License 2.0
def __init__(self, pos_tagged, lang, stem=False, min_word_len=3):
    """
    :param pos_tagged: List of list : Text pos_tagged as a list of sentences
                       where each sentence is a list of tuple (word, TAG).
    :param stem: If we want to apply stemming on the text.
    """
    self.min_word_len = min_word_len
    self.considered_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ'}

    self.pos_tagged = []
    self.filtered_pos_tagged = []
    self.isStemmed = stem
    self.lang = lang

    if stem:
        stemmer = PorterStemmer()
        self.pos_tagged = [[(stemmer.stem(t[0]), t[1]) for t in sent] for sent in pos_tagged]
    else:
        self.pos_tagged = [[(t[0].lower(), t[1]) for t in sent] for sent in pos_tagged]

    temp = []
    for sent in self.pos_tagged:
        s = []
        for elem in sent:
            if len(elem[0]) < min_word_len:
                s.append((elem[0], 'LESS'))
            else:
                s.append(elem)
        temp.append(s)

    self.pos_tagged = temp

    # Convert some language-specific tags (NC, NE to NN; ADJA -> JJ), see convert method.
    if lang in ['fr', 'de']:
        self.pos_tagged = [[(tagged_token[0], convert(tagged_token[1])) for tagged_token in sentence]
                           for sentence in self.pos_tagged]

    self.filtered_pos_tagged = [[(t[0].lower(), t[1]) for t in sent if self.is_candidate(t)]
                                for sent in self.pos_tagged]
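Note the design choice in the length filter: words shorter than min_word_len keep their surface form but are retagged 'LESS', which preserves sentence alignment while (presumably) letting the later is_candidate() check exclude them from filtered_pos_tagged.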
Example #15
Source File: word_stemmer.py From deep_qa with Apache License 2.0
def __init__(self):
    self.stemmer = NltkPorterStemmer()
Example #16
Source File: porter.py From luscan-devel with GNU General Public License v2.0
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.files()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results + ' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original + ' ').rstrip()

    # Print the results.
    print '-Original-'.center(70).replace(' ', '*').replace('-', ' ')
    print original
    print '-Results-'.center(70).replace(' ', '*').replace('-', ' ')
    print results
    print '*' * 70

##--NLTK--
Example #17
Source File: word_stemmer.py From gtos with MIT License
def __init__(self):
    self.stemmer = NltkPorterStemmer()
Example #18
Source File: word_stemmer.py From magnitude with MIT License
def __init__(self):
    self.stemmer = NltkPorterStemmer()

#overrides
Example #19
Source File: 9.2 Email_Classification.py From Natural-Language-Processing-with-Python-Cookbook with MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]
    except:
        tokens = tokens

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text
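The only change from the otherwise identical preprocessing functions in Examples #13 and #20 is the bare try/except around the stemming step, apparently a guard against inputs that make the stemmer raise; since it silently falls back to the unstemmed tokens, catching a narrower exception type would make failures easier to spot.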
Example #20
Source File: 9.5 Skipgram_Keras.py From Natural-Language-Processing-with-Python-Cookbook with MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text
Example #21
Source File: porter.py From razzy-spinner with GNU General Public License v3.0
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.files()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results + ' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original + ' ').rstrip()

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*' * 70)

##--NLTK--
Example #22
Source File: text_feature_extraction.py From RMDL with GNU General Public License v3.0
def text_cleaner(text, deep_clean=False, stem=True, stop_words=True, translite_rate=True):
    rules = [
        {r'>\s+': u'>'},  # remove spaces after a tag opens or closes
        {r'\s+': u' '},  # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},  # newline after </p> and </div> and <h1/>...
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # newline after </p> and </div> and <h1/>...
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
        {r'^\s+': u''}  # remove spaces at the beginning
    ]

    if deep_clean:
        text = text.replace(".", "")
        text = text.replace("[", " ")
        text = text.replace(",", " ")
        text = text.replace("]", " ")
        text = text.replace("(", " ")
        text = text.replace(")", " ")
        text = text.replace("\"", "")
        text = text.replace("-", " ")
        text = text.replace("=", " ")
        text = text.replace("?", " ")
        text = text.replace("!", " ")

        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
        text = text.rstrip()
        text = text.strip()
        text = text.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
        text = re.sub("(^|\W)\d+($|\W)", " ", text)

        if translite_rate:
            text = transliterate(text)
        if stem:
            text = PorterStemmer().stem(text)
        text = WordNetLemmatizer().lemmatize(text)

        if stop_words:
            stop_words = set(stopwords.words('english'))
            word_tokens = word_tokenize(text)
            text = [w for w in word_tokens if not w in stop_words]
            text = ' '.join(str(e) for e in text)
    else:
        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
        text = text.rstrip()
        text = text.strip()

    return text.lower()
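One caveat worth flagging: PorterStemmer.stem() expects a single word, so applying it (and the lemmatizer) to the whole text, as the deep_clean branch does, only affects the trailing token of the string. A hedged sketch of the per-token version that was presumably intended:

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def stem_tokens(text):
    # stem each token individually, then rejoin -- unlike stemming
    # the full string in one call, this reaches every word
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(tok) for tok in word_tokenize(text))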
Example #23
Source File: test_text_filters.py From pliers with BSD 3-Clause "New" or "Revised" License
def test_word_stemming_filter():
    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert stems == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Handles StemmerI stemmer
    stemmer = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=stemmer)
    stemmed = filt.transform(stim)
    stems = [s.text for s in stemmed]
    assert stems == target

    # Try lemmatization filter
    try:
        nltk.find('taggers/universal_tagset')
    except LookupError:
        nltk.download('universal_tagset')
    try:
        nltk.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    stim = ComplexTextStim(text='These are tests for Stemming filters')
    filt = WordStemmingFilter(stemmer='wordnet')
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['these', 'be', 'test', 'for', 'stem', 'filter']
    assert lemmas == target

    # Try case sensitive
    filt = WordStemmingFilter(stemmer='wordnet', case_sensitive=True)
    lemmatized = filt.transform(stim)
    lemmas = [l.text for l in lemmatized]
    target = ['These', 'be', 'test', 'for', 'Stemming', 'filter']
    assert lemmas == target

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'
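Read as documentation, the test pins down the filter's contract: WordStemmingFilter defaults to NLTK's Porter stemmer, also accepts a stemmer name string, a ready-made StemmerI instance, or 'wordnet' for lemmatization, supports a case_sensitive flag, and raises ValueError for unknown stemmer names.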