Python nltk.stem.SnowballStemmer() Examples
The following are 6 code examples of nltk.stem.SnowballStemmer().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module nltk.stem, or try the search function.
Example #1
Source File: DataPrep.py From Fake_News_Detection with MIT License | 6 votes |
def data_qualityCheck():
    """Report null counts and basic dataframe info for the news datasets.

    Reads the module-level ``train_news``, ``test_news`` and ``valid_news``
    dataframes. Fixes: the original discarded the ``isnull().sum()`` results
    (computing them without printing), and misspelled "qualities".
    """
    print("Checking data qualities...")
    # Print the per-column null counts instead of silently discarding them.
    print(train_news.isnull().sum())
    train_news.info()
    # below datasets were used to
    print(test_news.isnull().sum())
    test_news.info()
    print(valid_news.isnull().sum())
    valid_news.info()
    print("check finished.")

#run the below function call to see the quality check results
#data_qualityCheck()
#eng_stemmer = SnowballStemmer('english')
#stopwords = set(nltk.corpus.stopwords.words('english'))
#Stemming
Example #2
Source File: ppdb.py From Interactive-Semantic-Parsing with Apache License 2.0 | 6 votes |
def collect_pairs_by_rel(filename, rel):
    """Collect phrase pairs from a PPDB dump that carry the given relation.

    :param filename: path to a PPDB file; fields are '|||'-separated, with
        the phrase in field 1, the paraphrase in field 2 and the entailment
        relation in the last field.
    :param rel: entailment label to keep (e.g. "Equivalence").
    :return: dict mapping each phrase to the set of its paraphrases,
        inserted symmetrically in both directions.
    """
    stemmer = SnowballStemmer("english")
    phrase2paraphrase = dict()
    with open(filename, "r") as f:
        # Stream the file line by line instead of materializing it all
        # with readlines(); split each line once instead of three times.
        for line in f:
            fields = line.strip().split('|||')
            phrase = fields[1].strip()
            paraphrase = fields[2].strip()
            # Skip pairs that only differ by inflection (same stem).
            if stemmer.stem(phrase) == stemmer.stem(paraphrase):
                continue
            entailment = fields[-1].strip()
            if entailment == rel:
                add_to_dict_of_set(phrase, paraphrase, phrase2paraphrase)
                add_to_dict_of_set(paraphrase, phrase, phrase2paraphrase)
    print("Size: %d" % len(phrase2paraphrase))
    return phrase2paraphrase
Example #3
Source File: tokenizer.py From PyTLDR with GNU General Public License v3.0 | 6 votes |
def __init__(self, language='english', stopwords=None, stemming=True):
    """Set up the stemmer and stopword list for the tokenizer.

    :param language: language passed to SnowballStemmer and used to select
        the built-in stopword file.
    :param stopwords: None (load the built-in list), a list of word
        strings, or a path to a .txt stopword file.
    :param stemming: when False, no stemmer is created.
    :raises IOError: if a stopwords path is given but cannot be read.
    """
    if stemming:
        self._stemmer = SnowballStemmer(language)
    else:
        self._stemmer = None
    # `unicode` only exists on Python 2; the original check raised
    # NameError on Python 3. Build the string-type tuple defensively so
    # a str path still works on both versions.
    try:
        string_types = (str, unicode)
    except NameError:  # Python 3: all text is str
        string_types = (str,)
    if isinstance(stopwords, list):
        self._stopwords = stopwords
    elif isinstance(stopwords, string_types):
        # stopwords argument is a path
        try:
            self._stopwords = self._load_stopwords(stopwords)
        except IOError:
            raise IOError('stopwords argument must be a path to a .txt file, a list of word strings '
                          'or None (which loads the default list)')
    else:
        # Load built-in stopwords
        stopwords_dir = 'stopwords/{0}.txt'.format(language.lower())
        application_root = os.path.dirname(__file__)
        stopwords_file = os.path.join(application_root, '..', stopwords_dir)
        self._stopwords = self._load_stopwords(stopwords_file)
Example #4
Source File: language.py From summarize with MIT License | 5 votes |
def __init__(self, language):
    """Cache the language name along with its NLTK stopword list and Snowball stemmer."""
    self.stemmer = stem.SnowballStemmer(language)
    self.stopwords = corpus.stopwords.words(language)
    self.language = language
Example #5
Source File: ppdb.py From Interactive-Semantic-Parsing with Apache License 2.0 | 5 votes |
def clean_paraphrase(paraphrase_dict):
    """Drop paraphrases that reduce to the same stem as their key phrase.

    :param paraphrase_dict: dict mapping a phrase to an iterable of its
        candidate paraphrases.
    :return: a new dict with same-stem paraphrases removed; phrases left
        with no surviving paraphrase are dropped entirely.
    """
    stemmer = SnowballStemmer("english")
    paraphrase_dict_clean = dict()
    print("Size: %d" % len(paraphrase_dict))
    for phrase, paraphrases in paraphrase_dict.items():
        # Hoist the loop-invariant stem of the key phrase out of the
        # inner loop (the original recomputed it for every paraphrase).
        phrase_stem = stemmer.stem(phrase)
        new_paraphrases = set()
        for paraphrase in paraphrases:
            if phrase_stem != stemmer.stem(paraphrase):
                new_paraphrases.add(paraphrase)
        if len(new_paraphrases):
            paraphrase_dict_clean[phrase] = new_paraphrases
    print("Size: %d" % len(paraphrase_dict_clean))
    return paraphrase_dict_clean
Example #6
Source File: test_text_filters.py From pliers with BSD 3-Clause "New" or "Revised" License | 4 votes |
def test_word_stemming_filter():
    """Exercise WordStemmingFilter with porter, snowball, StemmerI and wordnet stemmers."""

    def texts(transformed):
        # Pull the text attribute off every element of a transformed stim.
        return [elem.text for elem in transformed]

    stim = ComplexTextStim(join(TEXT_DIR, 'sample_text.txt'),
                           columns='to', default_duration=1)

    # With all defaults (porter stemmer)
    filt = WordStemmingFilter()
    assert isinstance(filt.stemmer, nls.PorterStemmer)
    target = ['some', 'sampl', 'text', 'for', 'test', 'annot']
    assert texts(filt.transform(stim)) == target

    # Try a different stemmer
    filt = WordStemmingFilter(stemmer='snowball', language='english')
    assert isinstance(filt.stemmer, nls.SnowballStemmer)
    assert texts(filt.transform(stim)) == target

    # Handles StemmerI stemmer
    snowball = nls.SnowballStemmer(language='english')
    filt = WordStemmingFilter(stemmer=snowball)
    assert texts(filt.transform(stim)) == target

    # Try lemmatization filter; fetch any missing NLTK data first.
    for resource, path in (('universal_tagset', 'taggers/universal_tagset'),
                           ('wordnet', 'corpora/wordnet')):
        try:
            nltk.find(path)
        except LookupError:
            nltk.download(resource)

    stim = ComplexTextStim(text='These are tests for Stemming filters')
    filt = WordStemmingFilter(stemmer='wordnet')
    assert texts(filt.transform(stim)) == \
        ['these', 'be', 'test', 'for', 'stem', 'filter']

    # Try case sensitive
    filt = WordStemmingFilter(stemmer='wordnet', case_sensitive=True)
    assert texts(filt.transform(stim)) == \
        ['These', 'be', 'test', 'for', 'Stemming', 'filter']

    # Fails on invalid values
    with pytest.raises(ValueError):
        filt = WordStemmingFilter(stemmer='nonexistent_stemmer')

    # Try a long text stim
    stim2 = TextStim(text='theres something happening here')
    filt = WordStemmingFilter()
    assert filt.transform(stim2).text == 'there someth happen here'