Python nltk.stem.wordnet.WordNetLemmatizer() Examples
The following are 30 code examples of nltk.stem.wordnet.WordNetLemmatizer(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk.stem.wordnet, or try the search function.
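Before the project examples below, here is a minimal, self-contained usage sketch (not taken from any of the projects listed here). The download call is only needed once per environment, and the outputs shown in the comments assume a standard WordNet corpus:

import nltk
from nltk.stem.wordnet import WordNetLemmatizer

# one-time corpus download; newer NLTK releases may also require nltk.download('omw-1.4')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# the default part of speech is 'n' (noun)
print(lemmatizer.lemmatize('cars'))              # car
print(lemmatizer.lemmatize('running'))           # running (treated as a noun)
print(lemmatizer.lemmatize('running', pos='v'))  # run
print(lemmatizer.lemmatize('better', pos='a'))   # good

Note that most of the examples below pass an explicit part-of-speech tag ('v', 'n', etc.) because the noun default often leaves inflected verbs unchanged.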
Example #1
Source File: data_processing.py From Sarcasm-Detection with MIT License | 7 votes |
def ulterior_clean(tweets, filename):
    if not os.path.exists(filename):
        stopwords = get_stopwords_list()
        lemmatizer = WordNetLemmatizer()
        filtered_tweets = []
        for tw in tweets:
            filtered_tweet = []
            for t in tw.split():
                token = t.lower()
                if token in stopwords:
                    continue
                filtered_token = lemmatizer.lemmatize(token, 'v')
                filtered_token = lemmatizer.lemmatize(filtered_token)
                filtered_tweet.append(filtered_token)
            filtered_tweets.append(' '.join(filtered_tweet))
        utils.save_file(filtered_tweets, filename)
    # Load the filtered tokens
    filtered_tweets = utils.load_file(filename)
    return filtered_tweets
Example #2
Source File: preprocess_lst_test.py From lexsub with Apache License 2.0 | 6 votes |
def is_atomic_mwe(mwe, verb_lemma, complement_lemma, synsets):
    mwe_count = 0
    for synset in synsets:
        gloss_lemmas = set([WordNetLemmatizer().lemmatize(word) for word in synset.definition.split()])
        if verb_lemma in gloss_lemmas or complement_lemma in gloss_lemmas:
            return False
        for syn_lemma in synset.lemmas:
            if syn_lemma.name != mwe:
                tokens = syn_lemma.name.split('_')
                for token in tokens:
                    if token == verb_lemma:
                        return False
                if len(tokens) == 2 and tokens[1] == complement_lemma:
                    return False
            else:
                mwe_count += syn_lemma.count()
    return True
Example #3
Source File: predict.py From topics with Apache License 2.0 | 6 votes |
def extract_lemmatized_nouns(self, new_review):
    stopwords = self.load_stopwords()
    words = []

    sentences = nltk.sent_tokenize(new_review.lower())
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        text = [word for word in tokens if word not in stopwords]
        tagged_text = nltk.pos_tag(text)

        for word, tag in tagged_text:
            words.append({"word": word, "pos": tag})

    lem = WordNetLemmatizer()
    nouns = []
    for word in words:
        if word["pos"] in ["NN", "NNS"]:
            nouns.append(lem.lemmatize(word["word"]))

    return nouns
Example #4
Source File: tokenizing.py From convai-bot-1337 with GNU General Public License v3.0 | 6 votes |
def convert_to_vw(text):
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    lmtzr = WordNetLemmatizer()
    tokens = [t.lower() for t in tokenizer.tokenize(text)]
    id_ = 13371337
    processed = []
    for t in tokens:
        l = lmtzr.lemmatize(t)
        processed.append(l)
    counted = Counter(processed)
    res_str = str(id_)
    for k, v in counted.items():
        if v != 1:
            res_str = res_str + " {}:{}".format(k, v)
        else:
            res_str = res_str + " {}".format(k)
    return res_str
Example #5
Source File: imagenet.py From mmfeat with BSD 3-Clause "New" or "Revised" License | 6 votes |
def __init__(self, save_dir, config_path='./miner.yaml'):
    super(ImageNetMiner, self).__init__(save_dir, config_path)
    self.__engine__ = 'imagenet'
    self.format_url = 'http://www.image-net.org/api/text/imagenet.synset.geturls?wnid={}'

    # maximum number of synsets to retrieve - we don't need all images necessarily,
    # otherwise we get enormous amounts of synsets for words like 'entity' or 'animal'
    self.max_synsets = 10000

    self.wnl = WordNetLemmatizer()

    # url cache
    self.imgnet_url_cache = {}

    # whether we "level up" in hierarchy if no images found
    self.level_up_if_no_images = True
Example #6
Source File: features.py From product-classifier with MIT License | 5 votes |
def __init__(self, stoplist=None, punct=None, lemmatizer=None):
    # Load stopwords, punctuation, and lemmatizer
    # This takes a bit of work, so we only want to do it once!
    self.stopwords = stoplist or stopwords.words('english')
    self.punctuation = punct or string.punctuation
    self.lemmatizer = lemmatizer or WordNetLemmatizer()
Example #7
Source File: cs_inferrer.py From lexsub with Apache License 2.0 | 5 votes |
def filter_inferred(self, result_vec, candidates, pos):
    filtered_results = {}
    candidates_found = set()

    if result_vec is not None:
        for word, weight in result_vec:
            wn_pos = to_wordnet_pos[pos]
            lemma = WordNetLemmatizer().lemmatize(word, wn_pos)
            if lemma in candidates:
                self.add_inference_result(lemma, weight, filtered_results, candidates_found)
            if lemma.title() in candidates:
                self.add_inference_result(lemma.title(), weight, filtered_results, candidates_found)
            if word in candidates:  # there are some few cases where the candidates are not lemmatized
                self.add_inference_result(word, weight, filtered_results, candidates_found)
            if word.title() in candidates:  # there are some few cases where the candidates are not lemmatized
                self.add_inference_result(word.title(), weight, filtered_results, candidates_found)

    # assign negative weights for candidates with no score
    # they will appear last sorted according to their unigram count
    # candidates_left = candidates - candidates_found
    # for candidate in candidates_left:
    #     count = self.w2counts[candidate] if candidate in self.w2counts else 1
    #     score = -1 - (1.0/count)  # between (-1,-2]
    #     filtered_results[candidate] = score

    return filtered_results
Example #8
Source File: preprocess_lst_test.py From lexsub with Apache License 2.0 | 5 votes |
def lemmatize(pairs):
    triples = []
    for pair in pairs:
        word = pair[0]
        pos = pair[1]
        wordnet_pos = wordnet.NOUN
        if (len(pos) >= 2):
            pos_prefix = pos[:2]
            if (pos_prefix in to_wordnet_pos):
                wordnet_pos = to_wordnet_pos[pos_prefix]
        lemma = WordNetLemmatizer().lemmatize(word, wordnet_pos).lower()
        triples.append([word, wordnet_pos, lemma])
    return triples
Example #9
Source File: preprocess_lst_test.py From lexsub with Apache License 2.0 | 5 votes |
def detect_mwe(text_tokens, target_ind, wordnet_pos):
    if (target_ind < len(text_tokens) - 1):
        verb_lemma = WordNetLemmatizer().lemmatize(text_tokens[target_ind], wordnet_pos)
        complement_lemma = WordNetLemmatizer().lemmatize(text_tokens[target_ind+1])
        mwe = '_'.join([verb_lemma, complement_lemma])
        synsets = wordnet.synsets(mwe, wordnet.VERB)
        if len(synsets) > 0:
            if (target_ind + 1 < len(text_tokens) - 1):
                mwe_right = '_'.join([WordNetLemmatizer().lemmatize(text_tokens[target_ind+1]),
                                      WordNetLemmatizer().lemmatize(text_tokens[target_ind+2])])
                if len(wordnet.synsets(mwe_right)) > 0:
                    return
            if is_atomic_mwe(mwe, verb_lemma, complement_lemma, synsets) == True:
                mwe = '='.join([text_tokens[target_ind], text_tokens[target_ind+1]])
                text_tokens[target_ind] = mwe
                del text_tokens[target_ind+1]
Example #10
Source File: extract_baseline_features.py From Sarcasm-Detection with MIT License | 5 votes |
def get_features2(tweets, subj_dict):
    print("Getting features type 2...")
    features = []
    tknzr = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
    lemmatizer = WordNetLemmatizer()
    for tweet in tweets:
        feature_list = [0.0] * 5
        tokens = tknzr.tokenize(tweet)
        # Take the number of positive and negative words as features
        for word in tokens:
            stemmed = lemmatizer.lemmatize(word, 'v')
            stemmed = lemmatizer.lemmatize(stemmed)
            if stemmed in subj_dict:
                dictlist = []
                for word in subj_dict[stemmed]:
                    dictlist.extend(subj_dict[stemmed][word])
                if 'strongsubj' in dictlist:
                    value = 1.0
                else:
                    value = 0.5
                if 'positive' in dictlist:
                    feature_list[0] += value
                elif 'negative' in dictlist:
                    feature_list[1] += value
        # Take the ratio of positives to negatives as a feature
        if feature_list[0] != 0.0 and feature_list[1] != 0.0:
            feature_list[2] = feature_list[0] / feature_list[1]
        # Derive features from punctuation
        feature_list[2] += count_apparitions(tokens, helper.punctuation)
        # Take strong negations as a feature
        feature_list[3] += count_apparitions(tokens, helper.strong_negations)
        # Take strong affirmatives as a feature
        feature_list[4] += count_apparitions(tokens, helper.strong_affirmatives)
        features.append(feature_list)
    print("Done.")
    return features
Example #11
Source File: data_processing.py From Sarcasm-Detection with MIT License | 5 votes |
def extract_lemmatized_tweet(tokens, pos, use_verbs=True, use_nouns=True, use_all=False):
    lemmatizer = WordNetLemmatizer()
    clean_data = []
    for index in range(len(tokens)):
        if use_verbs and pos[index] == 'V':
            clean_data.append(lemmatizer.lemmatize(tokens[index].lower(), 'v'))
        if use_nouns and pos[index] == 'N':
            clean_data.append(lemmatizer.lemmatize(tokens[index].lower()))
        if use_all:
            lemmatized_word = lemmatizer.lemmatize(tokens[index].lower(), 'v')
            word = lemmatizer.lemmatize(lemmatized_word)
            if pos[index] not in ['^', ',', '$', '&', '!', '#', '@']:
                clean_data.append(word)
    return clean_data
Example #12
Source File: DepParser.py From python-zpar with MIT License | 5 votes |
def __init__(self, modelpath, libptr, zpar_session_obj):
    super(DepParser, self).__init__()

    # save the zpar session object
    self._zpar_session_obj = zpar_session_obj

    # set up a logger
    self.logger = logging.getLogger(__name__)

    # get the library method that loads the parser models
    self._load_depparser = libptr.load_depparser
    self._load_depparser.restype = c.c_int
    self._load_depparser.argtypes = [c.c_void_p, c.c_char_p]

    # get the library methods that parse sentences and files
    self._dep_parse_sentence = libptr.dep_parse_sentence
    self._dep_parse_sentence.restype = c.c_char_p
    self._dep_parse_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_bool]

    self._dep_parse_file = libptr.dep_parse_file
    self._dep_parse_file.restype = None
    self._dep_parse_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_bool]

    self._dep_parse_tagged_sentence = libptr.dep_parse_tagged_sentence
    self._dep_parse_tagged_sentence.restype = c.c_char_p
    self._dep_parse_tagged_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_char]

    self._dep_parse_tagged_file = libptr.dep_parse_tagged_file
    self._dep_parse_tagged_file.restype = None
    self._dep_parse_tagged_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_char]

    if self._load_depparser(self._zpar_session_obj, modelpath.encode('utf-8')):
        raise OSError('Cannot find dependency parser model at {}\n'.format(modelpath))

    # set up the wordnet lemmatizer if we have it
    if _HAS_LEMMATIZER:
        self.lemmatizer = WordNetLemmatizer()
    else:
        self.lemmatizer = None
Example #13
Source File: linguist.py From ECDICT with MIT License | 5 votes |
def lemmatize (self, word, pos = 'n'):
    word = word.lower()
    if self.__lemmatizer is None:
        from nltk.stem.wordnet import WordNetLemmatizer
        self.__lemmatizer = WordNetLemmatizer()
    return self.__lemmatizer.lemmatize(word, pos)


#----------------------------------------------------------------------
# global
#----------------------------------------------------------------------
Example #14
Source File: lemma.py From broca with MIT License | 5 votes |
def __init__(self, n_jobs=1):
    self.lemmr = WordNetLemmatizer()
    self.stops = stopwords.words('english')
    self.n_jobs = n_jobs
Example #15
Source File: overkill.py From broca with MIT License | 5 votes |
def tokenize(self, docs):
    # make sure `lem` is always defined for the pre_tokenize calls below
    lem = WordNetLemmatizer() if self.lemmatize else None

    #print('RAKE tokenizing...')
    pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs)

    for i, tdoc in enumerate(pre_tdocs):
        for t in tdoc:
            if t.startswith('one'):
                print(t)
                print(i)

    #print('Additional Tokenizing docs...')
    if self.n_jobs == 1:
        tdocs = [pre_tokenize(doc, tdoc, lem=lem) for doc, tdoc in zip(docs, pre_tdocs)]
    else:
        tdocs = parallel(partial(pre_tokenize, lem=lem), zip(docs, pre_tdocs), self.n_jobs, expand_args=True)

    #print('Training bigram...')
    if self.bigram is None:
        self.bigram = Phrases(tdocs,
                              min_count=self.min_count,
                              threshold=self.threshold,
                              delimiter=b' ')
    else:
        self.bigram.add_vocab(tdocs)

    #print('Training trigram...')
    if self.trigram is None:
        self.trigram = Phrases(self.bigram[tdocs],
                               min_count=self.min_count,
                               threshold=self.threshold,
                               delimiter=b' ')
    else:
        self.trigram.add_vocab(self.bigram[tdocs])

    return [tdoc for tdoc in self.trigram[self.bigram[tdocs]]]
Example #16
Source File: tree.py From props with MIT License | 5 votes |
def _VERBAL_PREDICATE_FEATURE_Lemma(self):
    from nltk.stem.wordnet import WordNetLemmatizer
    lmtzr = WordNetLemmatizer()
    if self.pos in pos_penn_to_wordnet:
        return lmtzr.lemmatize(self.word, pos_penn_to_wordnet[self.pos])
    else:
        return False

# TODO functions:
Example #17
Source File: reverb.py From BREDS with GNU Lesser General Public License v3.0 | 5 votes |
def __init__(self):
    self.lmtzr = WordNetLemmatizer()
    self.aux_verbs = ['be']
Example #18
Source File: preprocess.py From Unsupervised-Aspect-Extraction with Apache License 2.0 | 5 votes |
def parseSentence(line):
    lmtzr = WordNetLemmatizer()
    stop = stopwords.words('english')
    text_token = CountVectorizer().build_tokenizer()(line.lower())
    text_rmstop = [i for i in text_token if i not in stop]
    text_stem = [lmtzr.lemmatize(w) for w in text_rmstop]
    return text_stem
Example #19
Source File: normalize.py From atap with Apache License 2.0 | 5 votes |
def __init__(self, language='english'):
    self.stopwords = frozenset(nltk.corpus.stopwords.words(language))
    self.lemmatizer = WordNetLemmatizer()
Example #20
Source File: MaLSTM_train.py From Quora-Question-Pairs with MIT License | 5 votes |
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)

    Input : 'corpus' - Text corpus on which pre-processing tasks will be performed
            'keep_list' - List of words to be retained during cleaning process
            'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should be performed or not
            'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter Stemmer. 'snowball' corresponds to Snowball Stemmer

    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together

    Output : Returns the processed text corpus
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)

    ''' All stopwords except the 'wh-' words are removed '''
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else:
        corpus = [[x for x in x.split()] for x in corpus]

    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos='v') for x in x] for x in corpus]

    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language='english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else:
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]

    return corpus
Example #21
Source File: test.py From Quora-Question-Pairs with MIT License | 5 votes |
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)

    Input : 'corpus' - Text corpus on which pre-processing tasks will be performed
            'keep_list' - List of words to be retained during cleaning process
            'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should be performed or not
            'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter Stemmer. 'snowball' corresponds to Snowball Stemmer

    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together

    Output : Returns the processed text corpus
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)

    ''' All stopwords except the 'wh-' words are removed '''
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else:
        corpus = [[x for x in x.split()] for x in corpus]

    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos='v') for x in x] for x in corpus]

    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language='english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else:
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]

    return corpus
Example #22
Source File: ReVerb.py From Snowball with GNU General Public License v3.0 | 5 votes |
def __init__(self):
    self.lmtzr = WordNetLemmatizer()
    self.aux_verbs = ['be']
Example #23
Source File: textpro.py From comparable-text-miner with Apache License 2.0 | 5 votes |
def getLemma(text, contextFlag=False):
    lemmatizer = WordNetLemmatizer()
    # 'NN':wordnet.NOUN, 'JJ':wordnet.ADJ, 'VB':wordnet.VERB, 'RB':wordnet.ADV
    wordnet_tag = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
    result = None
    if len(text.split()) == 1:  # one word
        tokenized = word_tokenize(text)
        tagged = pos_tag(tokenized)[0]
        lemma = ''
        try:
            lemma = lemmatizer.lemmatize(tagged[0], wordnet_tag[tagged[1][:2]])
        except:
            lemma = lemmatizer.lemmatize(tagged[0])
        result = lemma
    elif len(text.split()) > 1 and contextFlag == True:  # multiple words i.e. text, without considering the context
        resultList = []
        for t in text.split():
            tokenized = word_tokenize(t)
            tagged = pos_tag(tokenized)[0]
            lemma = ''
            try:
                lemma = lemmatizer.lemmatize(tagged[0], wordnet_tag[tagged[1][:2]])
            except:
                lemma = lemmatizer.lemmatize(tagged[0])
            resultList.append(lemma)
        result = ' '.join(resultList)
    else:  # multiple words i.e. text, and consider the context
        resultList = []
        tokens = word_tokenize(text)
        tagged = pos_tag(tokens)
        for t in tagged:
            try:
                resultList.append(lemmatizer.lemmatize(t[0], wordnet_tag[t[1][:2]]))
            except:
                resultList.append(lemmatizer.lemmatize(t[0]))
        result = ' '.join(resultList)
    return result


###################################################################################
# Given a Naive Bayes classifier, classify a text with a given certainty
Example #24
Source File: disintegrator.py From quantified-self with MIT License | 5 votes |
def __init__(self):
    self.stopwords = set(stopwords.words("english"))
    self.lemmatizer = WordNetLemmatizer()
Example #25
Source File: auxiliary_word2vec.py From ZeroShotVideoClassification with Apache License 2.0 | 5 votes |
def verbs2basicform(words):
    ret = []
    for w in words:
        analysis = wn.synsets(w)
        if any([a.pos() == 'v' for a in analysis]):
            w = WordNetLemmatizer().lemmatize(w, 'v')
        ret.append(w)
    return ret
Example #26
Source File: preprocess.py From Attention-Based-Aspect-Extraction with Apache License 2.0 | 5 votes |
def parseSentence(line):
    lmtzr = WordNetLemmatizer()
    stop = stopwords.words('english')
    text_token = CountVectorizer().build_tokenizer()(line.lower())
    text_rmstop = [i for i in text_token if i not in stop]
    text_stem = [lmtzr.lemmatize(w) for w in text_rmstop]
    return text_stem
Example #27
Source File: cs_inferrer.py From lexsub with Apache License 2.0 | 5 votes |
def generate_inferred(self, result_vec, target_word, target_lemma, pos):
    generated_results = {}
    min_weight = None
    if result_vec is not None:
        for word, weight in result_vec:
            if generated_word_re.match(word) is not None:  # make sure this is not junk
                wn_pos = to_wordnet_pos[pos]
                lemma = WordNetLemmatizer().lemmatize(word, wn_pos)
                if word != target_word and lemma != target_lemma:
                    if lemma in generated_results:
                        weight = max(weight, generated_results[lemma])
                    generated_results[lemma] = weight
                    if min_weight is None:
                        min_weight = weight
                    else:
                        min_weight = min(min_weight, weight)
    if min_weight is None:
        min_weight = 0.0
    i = 0.0
    for lemma in default_generated_results:
        if len(generated_results) >= len(default_generated_results):
            break
        i -= 1.0
        generated_results[lemma] = min_weight + i
    return generated_results
Example #28
Source File: transformers.py From atap with Apache License 2.0 | 5 votes |
def __init__(self, language='english'):
    self.stopwords = set(nltk.corpus.stopwords.words(language))
    self.lemmatizer = WordNetLemmatizer()
Example #29
Source File: transformer.py From atap with Apache License 2.0 | 5 votes |
def __init__(self, language='english', minimum=2, maximum=200):
    self.min = minimum
    self.max = maximum
    self.stopwords = set(nltk.corpus.stopwords.words(language))
    self.lemmatizer = WordNetLemmatizer()
Example #30
Source File: transformers.py From atap with Apache License 2.0 | 5 votes |
def __init__(self, language='english'):
    self.stopwords = set(nltk.corpus.stopwords.words(language))
    self.lemmatizer = WordNetLemmatizer()