Python nltk.WordNetLemmatizer() Examples
The following are 15 code examples of nltk.WordNetLemmatizer().
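For orientation before the project examples, here is a minimal sketch of the underlying API (the words and outputs are illustrative and not taken from any project below): lemmatize() treats its input as a noun unless a WordNet part-of-speech tag such as 'v' is passed.

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
print(wnl.lemmatize('corpora'))           # 'corpus'  (default pos='n')
print(wnl.lemmatize('running'))           # 'running' (treated as a noun)
print(wnl.lemmatize('running', pos='v'))  # 'run'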
Example #1
Source File: nlp.py From Quora with MIT License
def lemmatize(tokens):
    """ lemmatize tokens """
    try:
        wnl = nltk.WordNetLemmatizer()
    except LookupError:
        nltk.download('wordnet')
        wnl = nltk.WordNetLemmatizer()
    return [wnl.lemmatize(t) for t in tokens]
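A hypothetical call of the helper above, assuming the tokens come from nltk.word_tokenize (the sentence and output are illustrative only):

tokens = nltk.word_tokenize('The cats were chasing mice')
print(lemmatize(tokens))  # e.g. ['The', 'cat', 'were', 'chasing', 'mouse']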
Example #2
Source File: detect.py From normalise with GNU General Public License v3.0
def cond2(w):
    """ Return True if the lemmatised form of the word is not in the wordlist."""
    # `wordlist` is defined elsewhere in detect.py.
    wnl = WordNetLemmatizer()
    return wnl.lemmatize(w.lower()) not in wordlist
Example #3
Source File: nlp-2-spam-classification.py From Hands-on-NLP-with-NLTK-and-scikit-learn- with MIT License
def preprocess_sentence(sentence):
    lemmatizer = nltk.WordNetLemmatizer()

    # clearly list out our preprocessing pipeline
    processed_tokens = nltk.word_tokenize(sentence)
    processed_tokens = [w.lower() for w in processed_tokens]

    # find least common elements
    # (most_common() yields (word, count) pairs; keep only the words so the
    # membership test below actually matches tokens)
    word_counts = collections.Counter(processed_tokens)
    uncommon_words = [w for w, _ in word_counts.most_common()[:-10:-1]]

    # remove these tokens (stop_words is a module-level set defined elsewhere in the script)
    processed_tokens = [w for w in processed_tokens if w not in stop_words]
    processed_tokens = [w for w in processed_tokens if w not in uncommon_words]

    # lemmatize
    processed_tokens = [lemmatizer.lemmatize(w) for w in processed_tokens]
    return processed_tokens
Example #4
Source File: lemmatization.py From MatchZoo-py with Apache License 2.0
def transform(self, input_: list) -> list:
    """
    Lemmatize a sequence of tokens.

    :param input_: list of tokens to be lemmatized.
    :return tokens: list of lemmatized tokens.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in input_]
Example #5
Source File: learn.py From partisan-discourse with Apache License 2.0
def __init__(self, stopwords=None):
    self.stopwords = set(stopwords or nltk.corpus.stopwords.words('english'))
    self.lemmatizer = nltk.WordNetLemmatizer()
Example #6
Source File: lemmatization.py From MatchZoo with Apache License 2.0
def transform(self, input_: list) -> list:
    """
    Lemmatize a sequence of tokens.

    :param input_: list of tokens to be lemmatized.
    :return tokens: list of lemmatized tokens.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in input_]
Example #7
Source File: nltk_normalization.py From vec4ir with MIT License
def __init__(self):
    self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
    self.lemmatizer = nltk.WordNetLemmatizer()
    self.lemmatizer.lemmatize('')  # Force nltk lazy corpus loader to do something.
    self.tokenizer = self.make_tokenizer()
    self.stopwords = nltk.corpus.stopwords.words('english')
    self.sent_tokenizer = None
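install_nltk_corpora and make_tokenizer are helpers defined elsewhere in that class; a rough sketch of what such a downloader might look like (an assumption for illustration, not the project's actual code):

@staticmethod
def install_nltk_corpora(*packages):
    # Fetch each NLTK data package; packages that are already up to date are skipped.
    for package in packages:
        nltk.download(package, quiet=True)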
Example #8
Source File: nltk_normalization.py From Quadflor with BSD 3-Clause "New" or "Revised" License
def __init__(self):
    self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
    self.lemmatizer = nltk.WordNetLemmatizer()
    self.lemmatizer.lemmatize('')  # Force nltk lazy corpus loader to do something.
    self.tokenizer = self.make_tokenizer()
    self.stopwords = nltk.corpus.stopwords.words('english')
    self.sent_tokenizer = None
Example #9
Source File: synset_analysis.py From Quadflor with BSD 3-Clause "New" or "Revised" License
def __init__(self):
    NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
    self.normalizer = NltkNormalizer()
    self.lem = nltk.WordNetLemmatizer()
    self.tagger = nltk.PerceptronTagger()
    self.translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
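A hedged sketch of how the tagger and translation_dict above are typically combined to pick the right lemma for each token (the method name is illustrative, not taken from the project):

def lemmatize_tagged(self, tokens):
    # Map the first letter of each Penn Treebank tag to a WordNet POS, defaulting to noun.
    return [self.lem.lemmatize(word, self.translation_dict.get(tag[0], wn.NOUN))
            for word, tag in self.tagger.tag(tokens)]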
Example #10
Source File: normalization.py From natural-language-preprocessings with MIT License
def lemmatize_term(term, pos=None):
    if pos is None:
        synsets = wordnet.synsets(term)
        if not synsets:
            return term
        pos = synsets[0].pos()
        if pos == wordnet.ADJ_SAT:
            pos = wordnet.ADJ
    return nltk.WordNetLemmatizer().lemmatize(term, pos=pos)
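Hypothetical calls to the function above, to illustrate the synset-based POS inference (the outputs are what WordNet typically returns, not taken from the project):

print(lemmatize_term('geese'))         # 'goose' -- pos inferred from the first synset
print(lemmatize_term('running', 'v'))  # 'run'   -- pos given explicitly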
Example #11
Source File: retrieval.py From cpae with MIT License
def add_from_lemma_definitions(self, vocab, try_lower=False):
    """Add lemma definitions for non-lemmas.

    This code covers the following scenario: suppose a dictionary is
    crawled, but only for word lemmas.

    """
    lemmatizer = nltk.WordNetLemmatizer()
    added = 0
    for word in vocab.words:
        word_list = [word, word.lower()] if try_lower else [word]
        for word_to_lemma in word_list:
            try:
                for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                    lemma = lemmatizer.lemmatize(word_to_lemma, part_of_speech)
                    lemma_defs = self._data.get(lemma)
                    if lemma != word and lemma_defs:
                        # This can be quite slow. But this code will not be used
                        # very often.
                        for def_ in lemma_defs:
                            if def_ not in self._data[word]:
                                added += 1
                                self._data[word].append(def_)
            except:
                logger.error("lemmatizer crashed on {}".format(word))
                logger.error(traceback.format_exc())
    logger.info("Added {} new defs in add_from_lemma_definitions".format(added))
    self.save()
Example #12
Source File: retrieval.py From cpae with MIT License
def crawl_lemmas(self, vocab):
    """Add WordNet lemmas as definitions."""
    lemmatizer = nltk.WordNetLemmatizer()
    for word in vocab.words:
        definitions = []
        try:
            for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                lemma = lemmatizer.lemmatize(word, part_of_speech)
                if lemma != word and [lemma] not in definitions:
                    definitions.append([lemma])
        except:
            logger.error("lemmatizer crashed on {}".format(word))
        if definitions:
            self._data[word] = definitions
    self.save()
Example #13
Source File: SpamDetection_NLTK.py From Mastering-Machine-Learning-for-Penetration-Testing with MIT License
def Process(data):
    lemmatizer = WordNetLemmatizer()
    # Tokenize the input text, lowercase each word and lemmatize it.
    return [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]
Example #14
Source File: text_features.py From document-qa with Apache License 2.0
def __init__(self, require_unique_match, lemmatizer="word_net",
             empty_question_features=False, stop_words=None):
    self.lemmatizer = lemmatizer
    self.stop_words = stop_words
    self.empty_question_features = empty_question_features
    if lemmatizer == "word_net":
        self._lemmatizer = WordNetLemmatizer()
    else:
        raise ValueError()
    self._cache = {}
    self.require_unique_match = require_unique_match
Example #15
Source File: text_utils.py From document-qa with Apache License 2.0
def __init__(self, lower: bool = True, stemmer="port"):
    self.lower = lower
    self.stemmer = stemmer
    if stemmer == "port":
        self._stemmer = PorterStemmer()
        self._stem = self._stemmer.stem
    elif stemmer == "wordnet":
        self._stemmer = WordNetLemmatizer()
        self._stem = self._stemmer.lemmatize
    else:
        raise ValueError(stemmer)
    # stemming is slow, so we cache words as we go
    self.normalize_cache = {}
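Since this class can wrap either backend, a hypothetical side-by-side of the two (the outputs are typical illustrative values): the Porter stemmer strips suffixes by rule, while the WordNet lemmatizer maps words to dictionary lemmas.

porter = PorterStemmer()
wnl = WordNetLemmatizer()
print(porter.stem('studies'))    # 'studi'
print(wnl.lemmatize('studies'))  # 'study'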