Python nltk.stem.snowball.SnowballStemmer() Examples
The following are 30 code examples of nltk.stem.snowball.SnowballStemmer(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk.stem.snowball, or try the search function.
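Before the project examples, here is a minimal, self-contained sketch of the usage pattern they all share. It is illustrative only: the language choice and the sample words below are assumptions, not code taken from any of the listed projects.

from nltk.stem.snowball import SnowballStemmer

# SnowballStemmer.languages lists the languages the stemmer supports.
print(SnowballStemmer.languages)

# Build a stemmer for one language; ignore_stopwords=True leaves stop words unstemmed.
stemmer = SnowballStemmer("english", ignore_stopwords=True)

for word in ["running", "generously", "caresses"]:
    # e.g. "running" should stem to "run"
    print(word, "->", stemmer.stem(word))

The examples below follow the same pattern: create the stemmer once for the relevant language, then call stem() on each token.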
Example #1
Source File: topic_modeler.py From Artificial-Intelligence-with-Python with MIT License | 6 votes |
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer
    stemmer = SnowballStemmer('english')

    # Get the list of stop words
    stop_words = stopwords.words('english')

    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words
    tokens = [x for x in tokens if x not in stop_words]

    # Perform stemming on the tokenized words
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed
Example #2
Source File: texttools.py From spice-hate_speech_detection with MIT License | 6 votes |
def stemming_message_snowball(message, stemmings_to_words=dict()):
    from nltk.stem.snowball import SnowballStemmer
    from nltk.tokenize import casual_tokenize

    stemmer = SnowballStemmer('finnish')

    # Guard against a missing message
    if message is None:
        return '', stemmings_to_words

    # Remove hashtag markers; str.replace returns a new string, so reassign it
    message = message.replace('#', '')

    stemmed_message = []
    for word in casual_tokenize(message):
        stemmed_word = stemmer.stem(word.lower())
        stemmed_message.append(stemmed_word)
        stemmings_to_words[stemmed_word] = word

    stemmed_message = ' '.join(stemmed_message)
    return stemmed_message, stemmings_to_words
Example #3
Source File: test_stem.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def test_german(self):
    stemmer_german = SnowballStemmer("german")
    stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

    assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
    assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

    assert stemmer_german.stem("keinen") == 'kein'
    assert stemmer_german2.stem("keinen") == 'keinen'
Example #4
Source File: test_stem.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def test_russian(self):
    stemmer_russian = SnowballStemmer("russian")
    assert stemmer_russian.stem("авантненькая") == "авантненьк"
Example #5
Source File: test_stem.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def test_arabic(self):
    """
    Unit tests for the Snowball Arabic light stemmer,
    which handles prefixes and suffixes.
    """
    # Test with ignore_stopwords=True.
    ar_stemmer = SnowballStemmer("arabic", True)
    assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
    assert ar_stemmer.stem("العربية") == "عرب"
    assert ar_stemmer.stem("فقالوا") == "قال"
    assert ar_stemmer.stem("الطالبات") == "طالب"
    assert ar_stemmer.stem("فالطالبات") == "طالب"
    assert ar_stemmer.stem("والطالبات") == "طالب"
    assert ar_stemmer.stem("الطالبون") == "طالب"
    assert ar_stemmer.stem("اللذان") == "اللذان"
    assert ar_stemmer.stem("من") == "من"

    # Test with ignore_stopwords=False.
    ar_stemmer = SnowballStemmer("arabic", False)
    assert ar_stemmer.stem("اللذان") == "اللذ"  # this is a stop word
    assert ar_stemmer.stem("الطالبات") == "طالب"
    assert ar_stemmer.stem("الكلمات") == "كلم"

    # Test creating the Arabic stemmer without passing ignore_stopwords.
    ar_stemmer = SnowballStemmer("arabic")
    assert ar_stemmer.stem('الْعَرَبِــــــيَّة') == "عرب"
    assert ar_stemmer.stem("العربية") == "عرب"
    assert ar_stemmer.stem("فقالوا") == "قال"
    assert ar_stemmer.stem("الطالبات") == "طالب"
    assert ar_stemmer.stem("الكلمات") == "كلم"
Example #6
Source File: stemmer.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def nltk_stemmer(stemmer, token, i=None, tokens=None):
    """Wrapper around a NLTK SnowballStemmer, which includes stop words for each language.

    Args:
        stemmer (SnowballStemmer): Stemmer instance that performs the stemming.
        token (lunr.Token): The token to stem.
        i (int): The index of the token in a set.
        tokens (list): A list of tokens representing the set.
    """

    def wrapped_stem(token, metadata=None):
        return stemmer.stem(token)

    return token.update(wrapped_stem)
Example #7
Source File: stemmer.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def get_language_stemmer(language):
    """Retrieves the SnowballStemmer for a particular language.

    Args:
        language (str): ISO-639-1 code of the language.
    """
    from lunr.languages import SUPPORTED_LANGUAGES
    from nltk.stem.snowball import SnowballStemmer

    return SnowballStemmer(SUPPORTED_LANGUAGES[language])
Example #8
Source File: concept_based.py From acl2017-interactive_summarizer with Apache License 2.0 | 6 votes |
def __init__(self, input_directory, language):
    """
    Args:
        input_directory (str): the directory from which text documents
            to be summarized are loaded.

    @type language: str
    """
    self.input_directory = input_directory
    self.sentences = []
    self.weights = {}
    self.c2s = defaultdict(set)
    self.concept_sets = defaultdict(frozenset)
    self.LANGUAGE = language  # type: str

    self.stoplist = set(stopwords.words(self.LANGUAGE))
    self.stemmer = SnowballStemmer(self.LANGUAGE)

    self.word_frequencies = defaultdict(int)
    self.w2s = defaultdict(set)
Example #9
Source File: run.py From themarketingtechnologist with Apache License 2.0 | 6 votes |
def tokenize(text):
    """
    Tokenizes sequences of text and stems the tokens.
    :param text: String to tokenize
    :return: List with stemmed tokens
    """
    tokens = nltk.WhitespaceTokenizer().tokenize(text)
    tokens = list(set(re.sub("[^a-zA-Z\']", "", token) for token in tokens))
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    tokens = list(set(re.sub("[^a-zA-Z]", "", token) for token in tokens))

    stems = []
    stemmer = SnowballStemmer("english")
    for token in tokens:
        token = stemmer.stem(token)
        if token != "":
            stems.append(token)

    return stems
Example #10
Source File: language_parser.py From cvscan with MIT License | 6 votes |
def clean_resume(resume_text):
    cleaned_resume = []

    # replacing newlines and punctuation with spaces
    resume_text = resume_text.replace('\t', ' ').replace('\n', ' ')
    for punctuation in string.punctuation:
        resume_text = resume_text.replace(punctuation, ' ')
    resume_text = resume_text.split()

    # removing stop words and stemming the remaining words in the resume
    stemmer = SnowballStemmer("english")
    for word in resume_text:
        if word not in stopwords.words('english') and not word.isdigit():
            cleaned_resume.append(word.lower())  # stemmer.stem(word)

    cleaned_resume = ' '.join(cleaned_resume)
    return cleaned_resume
Example #11
Source File: converter.py From atap with Apache License 2.0 | 6 votes |
def conversion(source, dest):
    """
    :param source: the unit of measure you have
    :param dest: the unit of measure you need to convert to
    :return:
    """
    stemmer = SnowballStemmer('english')
    source = stemmer.stem(source)
    dest = stemmer.stem(dest)

    try:
        units = conv_dict.get(source).get('Units')[
            conv_dict.get(source).get('Destination').index(dest)
        ]
    except:
        units = None

    return units, source, dest
Example #12
Source File: upper_bound_ilp.py From acl2017-interactive_summarizer with Apache License 2.0 | 5 votes |
def __init__(self, language):
    self.sentences = []
    self.docs = []
    self.models = []
    self.doc_sent_dict = {}
    self.ref_ngrams = []
    self.LANGUAGE = language
    self.stemmer = SnowballStemmer(self.LANGUAGE)
    self.stoplist = set(stopwords.words(self.LANGUAGE))
Example #13
Source File: sume_wrap.py From acl2017-interactive_summarizer with Apache License 2.0 | 5 votes |
def __init__(self, language):
    self.s = sume.ConceptBasedILPSummarizer(" ", language)
    self.LANGUAGE = language
    self.stoplist = set(stopwords.words(self.LANGUAGE))
    self.stemmer = SnowballStemmer(self.LANGUAGE)
Example #14
Source File: simulated_feedback.py From acl2017-interactive_summarizer with Apache License 2.0 | 5 votes |
def __init__(self, language, rouge, embeddings={}, fvector=[], ngrams_size=2, top_n=100,
             dump_base_dir=tempfile.mkdtemp(prefix="simufee-")):
    ''' Initialize the docs and models structure '''
    self.Oracle = Oracle()  # oracle

    self.SumeWrap = SumeWrap(language)  # only used to load the sentences and push them into self.summarizer
    self.summarizer = sume.ConceptBasedILPSummarizer(" ", language)

    self.N = ngrams_size  # how many words the ngrams should consist of
    self.top_n = top_n  # currently unused

    self.ref_ngrams = set()  # set of ngrams that are in the reference summaries (for the feedback to peek)
    self.ref_phrases = set()  # set of phrases that are in the reference summaries (for the feedback to peek)

    self.flight_recorder = FlightRecorder()  # The flight-recorder stores all interactions wrt concepts (e.g. accepted and rejected)

    self.info_data = []  # stats for the pipeline. The only thing that leaves this class
    self.initial_weights = {}  # oracle reweighting

    self.language = language  # document language. relevant for stemmer, embeddings, stopwords, parsing
    #self.stemmer = SnowballStemmer(self.language)
    if self.language == "english":
        self.stemmer = SnowballStemmer(self.language)
        #self.stemmer = WordNetLemmatizer()
    else:
        self.stemmer = SnowballStemmer(self.language)

    self.stoplist = set(stopwords.words(self.language))
    self.rouge = rouge
    self.cluster_size = 0.0

    self.embeddings = embeddings  # word2vec embeddings
    self.fvector = fvector  # List of support vectors for active learning SVM
    self.pos_hash = {}  # active learning // SVM
    self.concept_vec_idx = {}  # active learning // SVM
    self.index_vec_concept = {}  # active learning // SVM

    ### previously uninitialized fields...
    self.data = None  # np.array(self.fvector) # active learning // SVM TODO rename self.data to something that contains svm...
    self.labels = None  # active learning // SVM
    self.MAX_WEIGHT = None  # int with # of documents (i.e. largest possible DF value)
    self.models = None  # reference summaries, only needed for rouge score (as they are converted/merged into one large summary)
    self.parse_type = None  # None or "parse"
    self.prev_score = None  # rouge scores of previous iteration.
    self.score = None  # rouge scores of current iteration.
    self.summary_length = None  # target summary length.
    self.ub_score = None  # rouge scores of upper bound
    self.uncertainity = {}  # active learning // SVM

    # graph based propagation settings
    self.graph = PageRankFeedbackGraph(self.stemmer, self.language)
    # self.graph = SimpleNgramFeedbackGraph(self.stemmer, self.language, N=5)

    self.debug_dump_target_dir = dump_base_dir
    self.allowed_number_of_feedback_per_iteration = 5
Example #15
Source File: preprocessing.py From TextRank with MIT License | 5 votes |
def __init__(self):
    self.STOPWORDS = TextProcessor.__load_stopwords(path="../stopwords.txt")
    self.LEMMATIZER = WordNetLemmatizer()
    self.STEMMER = SnowballStemmer("english")
    self.PUNCTUATION = re.compile('([%s])+' % re.escape(string.punctuation), re.UNICODE)
    self.NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
    self.PAT_ALPHABETIC = re.compile('(((?![\d])\w)+)', re.UNICODE)
Example #16
Source File: data.py From HOTT with MIT License | 5 votes |
def reduce_vocab(bow_data, vocab, embed_vocab, embed_aggregate='mean'):
    """Reduce vocabulary size by stemming and removing stop words.
    """
    vocab = np.array(vocab)
    short = np.array([len(w) > 2 for w in vocab])
    stop_words = set(stopwords.words('english'))
    stop = np.array([w not in stop_words for w in vocab])
    reduced_vocab = vocab[np.logical_and(short, stop)]
    reduced_bow_data = bow_data[:, np.logical_and(short, stop)]

    stemmer = SnowballStemmer("english")
    stemmed_dict = {}
    stemmed_idx_mapping = {}
    stemmed_vocab = []
    for i, w in enumerate(reduced_vocab):
        stem_w = stemmer.stem(w)
        if stem_w in stemmed_vocab:
            stemmed_dict[stem_w].append(w)
            stemmed_idx_mapping[stemmed_vocab.index(stem_w)].append(i)
        else:
            stemmed_dict[stem_w] = [w]
            stemmed_vocab.append(stem_w)
            stemmed_idx_mapping[stemmed_vocab.index(stem_w)] = [i]

    stemmed_bow_data = np.zeros((bow_data.shape[0], len(stemmed_vocab)), dtype=np.int)
    for i in range(len(stemmed_vocab)):
        stemmed_bow_data[:, i] = reduced_bow_data[:, stemmed_idx_mapping[i]].sum(axis=1).flatten()

    word_counts = stemmed_bow_data.sum(axis=0)
    stemmed_reduced_vocab = np.array(stemmed_vocab)[word_counts > 2].tolist()
    stemmed_reduced_bow_data = stemmed_bow_data[:, word_counts > 2]

    stemmed_reduced_embed_vocab = {}
    for w in stemmed_reduced_vocab:
        old_w_embed = [embed_vocab[w_old] for w_old in stemmed_dict[w]]
        if embed_aggregate == 'mean':
            new_w_embed = np.mean(old_w_embed, axis=0)
        elif embed_aggregate == 'first':
            new_w_embed = old_w_embed[0]
        else:
            print('Unknown embedding aggregation')
            break
        stemmed_reduced_embed_vocab[w] = new_w_embed

    return (stemmed_reduced_vocab,
            stemmed_reduced_embed_vocab,
            stemmed_reduced_bow_data)
Example #17
Source File: test_stem.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def test_short_strings_bug(self):
    stemmer = SnowballStemmer('english')
    assert stemmer.stem("y's") == 'y'
Example #18
Source File: parse_out_email_text.py From machine-learning with GNU General Public License v3.0 | 5 votes |
def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top
        (in Part 2, you will also add stemming capabilities)
        and return a string that contains all the words
        in the email (space-separated)

        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)
    """
    f.seek(0)  # go back to beginning of file (annoying)
    all_text = f.read()

    # split off metadata
    content = all_text.split("X-FileName:")
    words = ""
    if len(content) > 1:
        # remove punctuation
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

        # project part 2: comment out the line below
        #words = text_string

        # split the text string into individual words, stem each word,
        stemmer = SnowballStemmer('english')
        text_string = text_string.split()  # makes a list of words
        for i in range(len(text_string)):
            text_string[i] = stemmer.stem(text_string[i])

        # and append the stemmed word to words (make sure there's a single
        # space between each stemmed word)
        words = " ".join(text_string)  # this -> " " ensures space b/w words

    return words
Example #19
Source File: test_stem.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def test_russian(self):
    # Russian words both consisting of Cyrillic
    # and Roman letters can be stemmed.
    stemmer_russian = SnowballStemmer("russian")
    assert stemmer_russian.stem("авантненькая") == "авантненьк"
    assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k"
Example #20
Source File: topic_modeling.py From Raspberry-Pi-3-Cookbook-for-Python-Programmers-Third-Edition with MIT License | 5 votes |
def __init__(self):
    # Create a regular expression tokenizer
    self.tokenizer = RegexpTokenizer(r'\w+')

    # get the list of stop words
    self.english_stop_words = stopwords.words('english')

    # Create a Snowball stemmer
    self.snowball_stemmer = SnowballStemmer('english')

# Tokenizing, stop word removal, and stemming
Example #21
Source File: Auto_NLP.py From Auto_ViML with Apache License 2.0 | 5 votes |
def tokenize_and_stem(text):
    stemmer = SnowballStemmer("english")
    text = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", text)
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
################################################################################
Example #22
Source File: topic_modeling.py From Python-Machine-Learning-Cookbook-Second-Edition with MIT License | 5 votes |
def __init__(self):
    # Create a regular expression tokenizer
    self.tokenizer = RegexpTokenizer(r'\w+')

    # get the list of stop words
    self.stop_words_english = stopwords.words('english')

    # Create a Snowball stemmer
    self.stemmer = SnowballStemmer('english')

# Tokenizing, stop word removal, and stemming
Example #23
Source File: 2_train_and_eval_model.py From support-tickets-classification with MIT License | 5 votes |
def build_analyzer(self):
    analyzer = super(StemmedCountVectorizer, self).build_analyzer()
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
Example #24
Source File: helpers.py From chirp with MIT License | 5 votes |
def cleaned_tokens(tokens):
    """Clean the tokens by removing stop words and stemming."""
    # stemmer = SnowballStemmer("english")
    # stemmed = [stemmer.stem(token) for token in tokens]
    s = set(stopwords.words('english'))
    tokens = [x.lower() for x in tokens if not x.isdigit()]
    return filter(lambda w: not w.lower() in s, tokens)
Example #25
Source File: preprocessing.py From Projects with MIT License | 5 votes |
def __init__(self, bigrams=True, min_df=3, stemming=True, tfidf=True):
    self.regex = re.compile('[^a-zA-Z ]')
    self.stop = set(stopwords.words('english'))
    self.stemmer = SnowballStemmer("english")
    self.bigrams = bigrams
    self.min_df = min_df
    self.stemming = stemming
    self.tfidf = tfidf
Example #26
Source File: test.py From Quora-Question-Pairs with MIT License | 5 votes |
def preprocess(corpus, keep_list, cleaning=True, stemming=False, stem_type=None,
               lemmatization=True, remove_stopwords=True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization,
              stopwords removal etc.)

    Input : 'corpus' - Text corpus on which pre-processing tasks will be performed
            'keep_list' - List of words to be retained during cleaning process
            'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating
            whether a particular task should be performed or not
            'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None",
            which corresponds to Porter Stemmer. 'snowball' corresponds to Snowball Stemmer

    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together

    Output : Returns the processed text corpus
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)

    ''' All stopwords except the 'wh-' words are removed '''
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else:
        corpus = [[x for x in x.split()] for x in corpus]

    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos='v') for x in x] for x in corpus]

    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language='english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else:
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]

    return corpus
Example #27
Source File: MaLSTM_train.py From Quora-Question-Pairs with MIT License | 5 votes |
def preprocess(corpus, keep_list, cleaning=True, stemming=False, stem_type=None,
               lemmatization=True, remove_stopwords=True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization,
              stopwords removal etc.)

    Input : 'corpus' - Text corpus on which pre-processing tasks will be performed
            'keep_list' - List of words to be retained during cleaning process
            'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating
            whether a particular task should be performed or not
            'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None",
            which corresponds to Porter Stemmer. 'snowball' corresponds to Snowball Stemmer

    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together

    Output : Returns the processed text corpus
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)

    ''' All stopwords except the 'wh-' words are removed '''
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else:
        corpus = [[x for x in x.split()] for x in corpus]

    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos='v') for x in x] for x in corpus]

    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language='english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else:
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]

    return corpus
Example #28
Source File: test_stem.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def test_short_strings_bug(self):
    stemmer = SnowballStemmer('english')
    assert stemmer.stem("y's") == 'y'
Example #29
Source File: test_stem.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def test_spanish(self):
    stemmer = SnowballStemmer('spanish')
    assert stemmer.stem("Visionado") == 'vision'

    # The word 'algue' was raising an IndexError
    assert stemmer.stem("algue") == 'algu'
Example #30
Source File: test_stem.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def test_german(self):
    stemmer_german = SnowballStemmer("german")
    stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)

    assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
    assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'

    assert stemmer_german.stem("keinen") == 'kein'
    assert stemmer_german2.stem("keinen") == 'keinen'