Python nltk.PorterStemmer() Examples
The following are 10 code examples of nltk.PorterStemmer().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module nltk, or try the search function.
Example #1
Source File: ngram_featurizer.py From metal with Apache License 2.0 | 6 votes |
def __init__(
    self,
    anonymize=True,
    trim_window=5,
    lowercase=True,
    drop_stopwords=True,
    stem=True,
    ngram_range=(1, 3),
    **vectorizer_kwargs,
):
    """Store the featurizer options and build the underlying vectorizer.

    Args:
        anonymize: Stored as ``self.anonymize``; not used in this method.
        trim_window: Stored as ``self.trim_window``; not used in this method.
        lowercase: Stored as ``self.lowercase``; not used in this method.
        drop_stopwords: When truthy, the NLTK ``stopwords`` corpus is
            downloaded and the English list is kept as ``self.stopwords``.
            When falsy, ``self.stopwords`` is never assigned.
        stem: When truthy, a ``nltk.PorterStemmer`` is created as
            ``self.porter``. When falsy, ``self.porter`` is never assigned.
        ngram_range: Forwarded to ``CountVectorizer``.
        **vectorizer_kwargs: Extra keyword arguments forwarded verbatim to
            ``CountVectorizer``.
    """
    # Plain option flags are recorded exactly as they were passed in.
    self.anonymize = anonymize
    self.trim_window = trim_window
    self.lowercase = lowercase
    self.drop_stopwords = drop_stopwords
    self.stem = stem

    # Optional resources are only materialised when their flag is enabled.
    if drop_stopwords:
        nltk.download("stopwords")
        english_stopwords = nltk.corpus.stopwords.words("english")
        self.stopwords = set(english_stopwords)
    if stem:
        self.porter = nltk.PorterStemmer()

    # Binary counts: each n-gram is marked present/absent per document.
    self.vectorizer = CountVectorizer(
        binary=True, ngram_range=ngram_range, **vectorizer_kwargs
    )
Example #2
Source File: TF_IDF_Summarization.py From nlp-akash with MIT License | 6 votes |
def _create_frequency_table(text_string) -> dict:
    """
    Build a word-frequency table for ``text_string``.

    The text is tokenized with ``word_tokenize``, every token is reduced to
    its Porter stem, and the stems are counted. Stop words are left out of
    the table.

    A token is treated as a stop word when either its lower-cased surface
    form or its stem appears in the English stop-word list. Checking only
    the stem is not enough: the list holds surface forms, and stemming can
    alter them (e.g. "this" -> "thi", "was" -> "wa"), which would let those
    stop words into the table.

    :param text_string: raw text to analyse
    :return: mapping of stemmed word -> number of occurrences
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    freqTable = dict()
    for token in word_tokenize(text_string):
        # Test the surface form first so stop words are caught before
        # stemming has a chance to change them.
        if token.lower() in stopWords:
            continue
        stemmed = ps.stem(token)
        # Also drop tokens whose stem is itself a stop word.
        if stemmed in stopWords:
            continue
        freqTable[stemmed] = freqTable.get(stemmed, 0) + 1

    return freqTable
Example #3
Source File: TF_IDF_Summarization.py From nlp-akash with MIT License | 6 votes |
def _create_frequency_matrix(sentences):
    """
    Build one stemmed word-frequency table per sentence.

    Each sentence is tokenized with ``word_tokenize``; tokens are
    lower-cased, stop words are dropped, and the Porter stems of the
    remaining tokens are counted.

    A token is treated as a stop word when either its lower-cased surface
    form or its stem appears in the English stop-word list. Checking only
    the stem would miss stop words that stemming alters
    (e.g. "this" -> "thi", "was" -> "wa").

    :param sentences: iterable of sentence strings
    :return: mapping of ``sentence[:15]`` -> {stem: count}
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    frequency_matrix = {}
    for sent in sentences:
        freq_table = {}
        for token in word_tokenize(sent):
            lowered = token.lower()
            # Test the surface form first so stop words are caught before
            # stemming has a chance to change them.
            if lowered in stopWords:
                continue
            stemmed = ps.stem(lowered)
            # Also drop tokens whose stem is itself a stop word.
            if stemmed in stopWords:
                continue
            freq_table[stemmed] = freq_table.get(stemmed, 0) + 1

        # NOTE(review): the key is only the first 15 characters of the
        # sentence, so two sentences sharing that prefix overwrite each
        # other. The key format is kept because callers may look entries
        # up by the same slice -- confirm before changing it.
        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix
Example #4
Source File: data_preparation_tools.py From corpus-to-graph-ml with MIT License | 5 votes |
def stem_text(sent, context=None):
    """Return ``sent`` with every token replaced by its Porter stem.

    Args:
        sent: Text to tokenize with ``nltk.word_tokenize``.
        context: Accepted but not used by this function.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    words = nltk.word_tokenize(sent)
    stemmer = nltk.PorterStemmer()
    return " ".join(stemmer.stem(word) for word in words)


# Split to train and test sample sets:
Example #5
Source File: document.py From gender-bias with MIT License | 5 votes |
def stemmed_words(self) -> List:
    """
    Return the Porter stem of each word produced by ``self.words()``.

    Uses nltk.PorterStemmer.

    Returns:
        List: one stem per word, in the order ``self.words()`` yields them.
    """
    all_words = self.words()
    to_stem = nltk.PorterStemmer().stem
    return list(map(to_stem, all_words))
Example #6
Source File: utils.py From freesound-datasets with GNU Affero General Public License v3.0 | 5 votes |
def stem(word):
    """Return the Porter stem of ``word`` using a freshly created stemmer."""
    return PorterStemmer().stem(word)
Example #7
Source File: eval.py From propara with Apache License 2.0 | 5 votes |
def stem(
    cls,
    w: str,
    leading_articles=(
        "a",
        "an",
        "the",
        "your",
        "his",
        "their",
        "my",
        "another",
        "other",
        "this",
        "that",
    ),
):
    """Lower-case a phrase, drop one leading article, and Porter-stem it.

    Args:
        w: The word or phrase to normalise.
        leading_articles: Lower-case words to strip from the start of the
            phrase when followed by a space (e.g. "the rays" => "rays").
            At most one is removed: the first entry that matches. The
            default reproduces the previously hard-coded list.

    Returns:
        The stemmed phrase with surrounding whitespace removed, or ``""``
        when ``w`` is empty, ``None``, or only whitespace.
    """
    if not w or not w.strip():
        return ""

    w_lower = w.lower()

    # Remove a single leading article from the phrase (e.g., the rays => rays).
    # The trailing space makes sure only whole words match, so "a" never
    # matches "an apple" and "the" never matches "their".
    for article in leading_articles:
        prefix = article + " "
        if w_lower.startswith(prefix):
            w_lower = w_lower[len(prefix):]
            break

    # Porter stemmer: rays => ray
    return PorterStemmer().stem(w_lower).strip()
Example #8
Source File: nlp.py From Quora with MIT License | 5 votes |
def stemming(tokens):
    """Return a list with the Porter stem of every item in ``tokens``."""
    stem_token = nltk.PorterStemmer().stem
    return list(map(stem_token, tokens))
Example #9
Source File: extract_statistical_features.py From Sarcasm-Detection with MIT License | 5 votes |
def get_ngrams(tokens, n, use_just_words=False, stem=False, for_semantics=False):
    """Count the n-grams of ``tokens`` for every size listed in ``n``.

    Args:
        tokens: Sequence of string tokens.
        n: Sized collection of n-gram lengths (e.g. ``[1, 2]``). An empty
            collection yields an empty result.
        use_just_words: When true (and ``for_semantics`` is false),
            lower-case the tokens and drop those starting with ``@`` or
            ``#`` and those found in ``string.punctuation``.
        stem: When true (and ``for_semantics`` is false), lower-case and
            Porter-stem every token first.
        for_semantics: When true, skip both preprocessing steps and use the
            tokens as given.

    Returns:
        dict mapping each feature string to its number of occurrences. A
        feature string is ``'gram '`` followed by every token of the n-gram,
        each with a trailing space (e.g. ``'gram not good '``).
    """
    # Function-scope import so the block does not depend on a file-level
    # import that may not exist.
    from collections import Counter

    if len(n) < 1:
        return {}

    if not for_semantics:
        if stem:
            porter = PorterStemmer()
            tokens = [porter.stem(t.lower()) for t in tokens]
        if use_just_words:
            tokens = [
                t.lower()
                for t in tokens
                if not t.startswith(('@', '#')) and t not in string.punctuation
            ]

    # A single Counter pass replaces calling list.count() once per distinct
    # n-gram, which was quadratic in the number of n-grams.
    ngram_counts = Counter()
    for size in n:
        for gram in ngrams(tokens, size):
            ngram_counts['gram ' + ''.join(tok + ' ' for tok in gram)] += 1

    return dict(ngram_counts)


# Get sentiment features -- a total of 18 features derived
# Emoji features: a count of the positive, negative and neutral emojis
# along with the ratio of positive to negative emojis and negative to neutral
# Using the MPQA subjectivity lexicon, we have to check words for their part of speech
# and obtain features: a count of positive, negative and neutral words, as well as
# a count of the strong and weak subjectives, along with their ratios and a total sentiment words.
# Also using VADER sentiment analyser to obtain a score of sentiments held in a tweet (4 features)
Example #10
Source File: text_utils.py From document-qa with Apache License 2.0 | 5 votes |
def __init__(self, lower: bool = True, stemmer="port"):
    """Select the word-normalisation backend and create an empty cache.

    Args:
        lower: Stored as ``self.lower``; not otherwise used here.
        stemmer: ``"port"`` selects ``PorterStemmer().stem``; ``"wordnet"``
            selects ``WordNetLemmatizer().lemmatize``.

    Raises:
        ValueError: If ``stemmer`` is neither ``"port"`` nor ``"wordnet"``.
            The offending value is the exception argument.
    """
    self.lower = lower
    self.stemmer = stemmer

    # Pick the backend object and the bound method that normalises a word.
    if stemmer == "port":
        backend = PorterStemmer()
        normalise = backend.stem
    elif stemmer == "wordnet":
        backend = WordNetLemmatizer()
        normalise = backend.lemmatize
    else:
        raise ValueError(stemmer)

    self._stemmer = backend
    self._stem = normalise

    # stemming is slow, so we cache words as we go
    self.normalize_cache = {}