Python nltk.stem.wordnet.WordNetLemmatizer() Examples

The following are 30 code examples of nltk.stem.wordnet.WordNetLemmatizer(), collected from open-source projects. The originating project and source file are listed above each example. You may also want to check out the other functions and classes available in the nltk.stem.wordnet module.
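Before diving into the project examples, here is a minimal standalone sketch of the class itself (not taken from any of the projects below). It assumes the WordNet corpus has already been downloaded, for example via nltk.download('wordnet'); note that lemmatize() treats its input as a noun by default, so verbs need an explicit part-of-speech hint.

from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("cars"))          # 'car'     (default POS is noun)
print(lemmatizer.lemmatize("running"))       # 'running' (treated as a noun)
print(lemmatizer.lemmatize("running", "v"))  # 'run'     (POS hint: verb)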
Example #1
Source File: data_processing.py    From Sarcasm-Detection with MIT License
def ulterior_clean(tweets, filename):
    if not os.path.exists(filename):
        stopwords = get_stopwords_list()
        lemmatizer = WordNetLemmatizer()
        filtered_tweets = []
        for tw in tweets:
            filtered_tweet = []
            for t in tw.split():
                token = t.lower()
                if token in stopwords:
                    continue
                filtered_token = lemmatizer.lemmatize(token, 'v')
                filtered_token = lemmatizer.lemmatize(filtered_token)
                filtered_tweet.append(filtered_token)
            filtered_tweets.append(' '.join(filtered_tweet))
        utils.save_file(filtered_tweets, filename)
    # Load the filtered tokens
    filtered_tweets = utils.load_file(filename)
    return filtered_tweets 
Example #2
Source File: preprocess_lst_test.py    From lexsub with Apache License 2.0
def is_atomic_mwe(mwe, verb_lemma, complement_lemma, synsets):
    mwe_count = 0
    for synset in synsets:
        gloss_lemmas = set([WordNetLemmatizer().lemmatize(word) for word in synset.definition.split()])
        if verb_lemma in gloss_lemmas or complement_lemma in gloss_lemmas:
            return False
        for syn_lemma in synset.lemmas:
            if syn_lemma.name != mwe: 
                tokens = syn_lemma.name.split('_')
                for token in tokens:
                    if token == verb_lemma:
                        return False
                if len(tokens) == 2 and tokens[1] == complement_lemma:
                    return False
            else:
                mwe_count += syn_lemma.count()
    return True 
Example #3
Source File: predict.py    From topics with Apache License 2.0
def extract_lemmatized_nouns(self, new_review):
        stopwords = self.load_stopwords()
        words = []

        sentences = nltk.sent_tokenize(new_review.lower())
        for sentence in sentences:
            tokens = nltk.word_tokenize(sentence)
            text = [word for word in tokens if word not in stopwords]
            tagged_text = nltk.pos_tag(text)

            for word, tag in tagged_text:
                words.append({"word": word, "pos": tag})

        lem = WordNetLemmatizer()
        nouns = []
        for word in words:
            if word["pos"] in ["NN", "NNS"]:
                nouns.append(lem.lemmatize(word["word"]))

        return nouns 
Example #4
Source File: tokenizing.py    From convai-bot-1337 with GNU General Public License v3.0
def convert_to_vw(text):
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    lmtzr = WordNetLemmatizer()
    tokens = [t.lower() for t in tokenizer.tokenize(text)]
    id_ = 13371337
    processed = []
    for t in tokens:
        l = lmtzr.lemmatize(t)
        processed.append(l)
    counted = Counter(processed)
    res_str = str(id_)
    for k, v in counted.items():
        if v != 1:
            res_str = res_str + " {}:{}".format(k, v)
        else:
            res_str = res_str + " {}".format(k)
    return res_str 
Example #5
Source File: imagenet.py    From mmfeat with BSD 3-Clause "New" or "Revised" License
def __init__(self, save_dir, config_path='./miner.yaml'):
        super(ImageNetMiner, self).__init__(save_dir, config_path)
        self.__engine__ = 'imagenet'
        self.format_url = 'http://www.image-net.org/api/text/imagenet.synset.geturls?wnid={}'

        # maximum number of synsets to retrieve - we don't need all images necessarily, other-
        # wise we get enormous amounts of synsets for words like 'entity' or 'animal'
        self.max_synsets = 10000

        self.wnl = WordNetLemmatizer()

        # url cache
        self.imgnet_url_cache = {}

        # whether we "level up" in hierarchy if no images found
        self.level_up_if_no_images = True 
Example #6
Source File: features.py    From product-classifier with MIT License
def __init__(self, stoplist=None, punct=None, lemmatizer=None):
        # Load stopwords, punctuation, and lemmatizer
        # This takes a bit of work, so we only want to do it once!
        self.stopwords   = stoplist or stopwords.words('english')
        self.punctuation = punct or string.punctuation
        self.lemmatizer  = lemmatizer or WordNetLemmatizer() 
Example #7
Source File: cs_inferrer.py    From lexsub with Apache License 2.0
def filter_inferred(self, result_vec, candidates, pos):
    
        filtered_results = {}
        candidates_found = set()
        
        if result_vec is not None:
            for word, weight in result_vec:
                wn_pos = to_wordnet_pos[pos]
                lemma = WordNetLemmatizer().lemmatize(word, wn_pos)
                if lemma in candidates:
                    self.add_inference_result(lemma, weight, filtered_results, candidates_found)
                if lemma.title() in candidates:
                    self.add_inference_result(lemma.title(), weight, filtered_results, candidates_found)
                if word in candidates: # there are some few cases where the candidates are not lemmatized
                    self.add_inference_result(word, weight, filtered_results, candidates_found)                    
                if word.title() in candidates: # there are some few cases where the candidates are not lemmatized
                    self.add_inference_result(word.title(), weight, filtered_results, candidates_found)
                    
        # assign negative weights for candidates with no score
        # they will appear last sorted according to their unigram count        
#        candidates_left = candidates - candidates_found
#        for candidate in candidates_left:            
#            count = self.w2counts[candidate] if candidate in self.w2counts else 1
#            score = -1 - (1.0/count) # between (-1,-2] 
#            filtered_results[candidate] = score   
         
        return filtered_results 
Example #8
Source File: preprocess_lst_test.py    From lexsub with Apache License 2.0
def lemmatize(pairs):
    triples = []
    for pair in pairs:
        word = pair[0]
        pos = pair[1]
        wordnet_pos = wordnet.NOUN
        if (len(pos)>=2):
            pos_prefix = pos[:2]
            if (pos_prefix in to_wordnet_pos):
                wordnet_pos = to_wordnet_pos[pos_prefix]
        lemma = WordNetLemmatizer().lemmatize(word, wordnet_pos).lower();
        triples.append([word, wordnet_pos, lemma])
    return triples 
Example #9
Source File: preprocess_lst_test.py    From lexsub with Apache License 2.0
def detect_mwe(text_tokens, target_ind, wordnet_pos):
    if (target_ind < len(text_tokens)-1):
        verb_lemma = WordNetLemmatizer().lemmatize(text_tokens[target_ind], wordnet_pos)
        complement_lemma = WordNetLemmatizer().lemmatize(text_tokens[target_ind+1])
        mwe = '_'.join([verb_lemma, complement_lemma])
        synsets = wordnet.synsets(mwe, wordnet.VERB) 
        if len(synsets) > 0:
            if (target_ind+1 < len(text_tokens)-1):
                mwe_right = '_'.join([WordNetLemmatizer().lemmatize(text_tokens[target_ind+1]), WordNetLemmatizer().lemmatize(text_tokens[target_ind+2])])
                if len(wordnet.synsets(mwe_right)) > 0:
                    return
            if is_atomic_mwe(mwe, verb_lemma, complement_lemma, synsets) == True:
                mwe = '='.join([text_tokens[target_ind], text_tokens[target_ind+1]])
                text_tokens[target_ind] = mwe
                del text_tokens[target_ind+1] 
Example #10
Source File: extract_baseline_features.py    From Sarcasm-Detection with MIT License
def get_features2(tweets, subj_dict):
    print("Getting features type 2...")
    features = []
    tknzr = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
    lemmatizer = WordNetLemmatizer()
    for tweet in tweets:
        feature_list = [0.0] * 5
        tokens = tknzr.tokenize(tweet)
        # Take the number of positive and negative words as features
        for word in tokens:
            stemmed = lemmatizer.lemmatize(word, 'v')
            stemmed = lemmatizer.lemmatize(stemmed)
            if stemmed in subj_dict:
                dictlist = []
                for word in subj_dict[stemmed]:
                    dictlist.extend(subj_dict[stemmed][word])
                if 'strongsubj' in dictlist:
                    value = 1.0
                else:
                    value = 0.5
                if 'positive' in dictlist:
                    feature_list[0] += value
                elif 'negative' in dictlist:
                    feature_list[1] += value
        # Take the ratio of positives to negatives as a feature
        if feature_list[0] != 0.0 and feature_list[1] != 0.0:
            feature_list[2] = feature_list[0] / feature_list[1]
        # Derive features from punctuation
        feature_list[2] += count_apparitions(tokens, helper.punctuation)
        # Take strong negations as a feature
        feature_list[3] += count_apparitions(tokens, helper.strong_negations)
        # Take strong affirmatives as a feature
        feature_list[4] += count_apparitions(tokens, helper.strong_affirmatives)
        features.append(feature_list)
    print("Done.")
    return features 
Example #11
Source File: data_processing.py    From Sarcasm-Detection with MIT License
def extract_lemmatized_tweet(tokens, pos, use_verbs=True, use_nouns=True, use_all=False):
    lemmatizer = WordNetLemmatizer()
    clean_data = []
    for index in range(len(tokens)):
        if use_verbs and pos[index] == 'V':
            clean_data.append(lemmatizer.lemmatize(tokens[index].lower(), 'v'))
        if use_nouns and pos[index] == 'N':
            clean_data.append(lemmatizer.lemmatize(tokens[index].lower()))
        if use_all:
            lemmatized_word = lemmatizer.lemmatize(tokens[index].lower(), 'v')
            word = lemmatizer.lemmatize(lemmatized_word)
            if pos[index] not in ['^', ',', '$', '&', '!', '#', '@']:
                clean_data.append(word)
    return clean_data 
Example #12
Source File: DepParser.py    From python-zpar with MIT License
def __init__(self, modelpath, libptr, zpar_session_obj):
        super(DepParser, self).__init__()

        # save the zpar session object
        self._zpar_session_obj = zpar_session_obj

        # set up a logger
        self.logger = logging.getLogger(__name__)

        # get the library method that loads the parser models
        self._load_depparser = libptr.load_depparser
        self._load_depparser.restype = c.c_int
        self._load_depparser.argtypes = [c.c_void_p, c.c_char_p]

        # get the library methods that parse sentences and files
        self._dep_parse_sentence = libptr.dep_parse_sentence
        self._dep_parse_sentence.restype = c.c_char_p
        self._dep_parse_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_bool]

        self._dep_parse_file = libptr.dep_parse_file
        self._dep_parse_file.restype = None
        self._dep_parse_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_bool]

        self._dep_parse_tagged_sentence = libptr.dep_parse_tagged_sentence
        self._dep_parse_tagged_sentence.restype = c.c_char_p
        self._dep_parse_tagged_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_char]

        self._dep_parse_tagged_file = libptr.dep_parse_tagged_file
        self._dep_parse_tagged_file.restype = None
        self._dep_parse_tagged_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_char]

        if self._load_depparser(self._zpar_session_obj, modelpath.encode('utf-8')):
            raise OSError('Cannot find dependency parser model at {}\n'.format(modelpath))

        # set up the wordnet lemmatizer if we have it
        if _HAS_LEMMATIZER:
            self.lemmatizer = WordNetLemmatizer()
        else:
            self.lemmatizer = None 
Example #13
Source File: linguist.py    From ECDICT with MIT License
def lemmatize (self, word, pos = 'n'):
		word = word.lower()
		if self.__lemmatizer is None:
			from nltk.stem.wordnet import WordNetLemmatizer
			self.__lemmatizer = WordNetLemmatizer()
		return self.__lemmatizer.lemmatize(word, pos)


#----------------------------------------------------------------------
# global
#---------------------------------------------------------------------- 
Example #14
Source File: lemma.py    From broca with MIT License
def __init__(self, n_jobs=1):
        self.lemmr = WordNetLemmatizer()
        self.stops = stopwords.words('english')
        self.n_jobs = n_jobs 
Example #15
Source File: overkill.py    From broca with MIT License
def tokenize(self, docs):
        if self.lemmatize:
            lem = WordNetLemmatizer()

        #print('RAKE tokenizing...')
        pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs)

        for i, tdoc in enumerate(pre_tdocs):
            for t in tdoc:
                if t.startswith('one'):
                    print(t)
                    print(i)

        #print('Additional Tokenizing docs...')
        if self.n_jobs == 1:
            tdocs = [pre_tokenize(doc, tdoc, lem=lem) for doc, tdoc in zip(docs, pre_tdocs)]
        else:
            tdocs = parallel(partial(pre_tokenize, lem=lem), zip(docs, pre_tdocs), self.n_jobs, expand_args=True)

        #print('Training bigram...')
        if self.bigram is None:
            self.bigram = Phrases(tdocs,
                                  min_count=self.min_count,
                                  threshold=self.threshold,
                                  delimiter=b' ')
        else:
            self.bigram.add_vocab(tdocs)

        #print('Training trigram...')
        if self.trigram is None:
            self.trigram = Phrases(self.bigram[tdocs],
                                   min_count=self.min_count,
                                   threshold=self.threshold,
                                   delimiter=b' ')
        else:
            self.trigram.add_vocab(self.bigram[tdocs])

        return [tdoc for tdoc in self.trigram[self.bigram[tdocs]]] 
Example #16
Source File: tree.py    From props with MIT License
def _VERBAL_PREDICATE_FEATURE_Lemma(self):
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()
        if self.pos in pos_penn_to_wordnet:
            return lmtzr.lemmatize(self.word, pos_penn_to_wordnet[self.pos])
        else:
            return False

    # TODO functions: 
Example #17
Source File: reverb.py    From BREDS with GNU Lesser General Public License v3.0
def __init__(self):
        self.lmtzr = WordNetLemmatizer()
        self.aux_verbs = ['be'] 
Example #18
Source File: preprocess.py    From Unsupervised-Aspect-Extraction with Apache License 2.0
def parseSentence(line):
    lmtzr = WordNetLemmatizer()    
    stop = stopwords.words('english')
    text_token = CountVectorizer().build_tokenizer()(line.lower())
    text_rmstop = [i for i in text_token if i not in stop]
    text_stem = [lmtzr.lemmatize(w) for w in text_rmstop]
    return text_stem 
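A quick, hypothetical call to the parseSentence() helper above, assuming the module's own imports are in place (CountVectorizer from scikit-learn, plus stopwords and WordNetLemmatizer from NLTK). Because lemmatize() is called without a POS tag, only noun forms get normalized:

print(parseSentence("The cats are running quickly!"))
# roughly: ['cat', 'running', 'quickly']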
Example #19
Source File: normalize.py    From atap with Apache License 2.0
def __init__(self, language='english'):
        self.stopwords  = frozenset(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer() 
Example #20
Source File: MaLSTM_train.py    From Quora-Question-Pairs with MIT License
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = True):
    
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used; there is no benefit in using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    ''' All stopwords except the 'wh-' words are removed '''
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    
    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language = 'english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else :
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    
    
    return corpus 
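A minimal, hypothetical call to the preprocess() function above. Passing cleaning=False and stemming=False sidesteps the text_clean() helper and the stemmers, so only the NLTK imports already used in the snippet (stopwords and WordNetLemmatizer) are needed:

corpus = ["what cities have been visited by the president"]
processed = preprocess(corpus, keep_list=[], cleaning=False,
                       stemming=False, lemmatization=True,
                       remove_stopwords=True)
print(processed)  # roughly: [['what', 'cities', 'visit', 'president']]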
Example #21
Source File: test.py    From Quora-Question-Pairs with MIT License
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = True):
    
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used; there is no benefit in using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    ''' All stopwords except the 'wh-' words are removed '''
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    
    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language = 'english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else :
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    
        
    return corpus 
Example #22
Source File: ReVerb.py    From Snowball with GNU General Public License v3.0
def __init__(self):
        self.lmtzr = WordNetLemmatizer()
        self.aux_verbs = ['be'] 
Example #23
Source File: textpro.py    From comparable-text-miner with Apache License 2.0
def getLemma(text, contextFlag=False):
	lemmatizer = WordNetLemmatizer()
	#'NN':wordnet.NOUN,'JJ':wordnet.ADJ,'VB':wordnet.VERB,'RB':wordnet.ADV
	wordnet_tag ={'NN':'n','JJ':'a','VB':'v','RB':'r'}
	result = None
	if len(text.split()) == 1: # one word
		tokenized = word_tokenize(text)
		tagged = pos_tag(tokenized)[0]
		lemma = ''
		try: lemma = lemmatizer.lemmatize(tagged[0],wordnet_tag[tagged[1][:2]])
		except: lemma = lemmatizer.lemmatize(tagged[0])
		result = lemma
	elif len(text.split()) > 1 and contextFlag == True: # multiple words (i.e. a text), without considering the context
		resultList = []
		for t in text.split():
			tokenized = word_tokenize(t)
			tagged = pos_tag(tokenized)[0]
			lemma = ''
			try: lemma = lemmatizer.lemmatize(tagged[0],wordnet_tag[tagged[1][:2]])
			except: lemma = lemmatizer.lemmatize(tagged[0])
			resultList.append(lemma)
		result = ' '.join(resultList)
	else: # multiple words (i.e. a text), considering the context
		resultList = []
		tokens = word_tokenize(text)
		tagged = pos_tag(tokens)
		for t in tagged:
			try: resultList.append(lemmatizer.lemmatize(t[0],wordnet_tag[t[1][:2]]))
			except: resultList.append(lemmatizer.lemmatize(t[0]))
		result = ' '.join(resultList)
	return result
###################################################################################

# Given a Naive Bayes classifier, classify a text with a given certainty
Example #24
Source File: disintegrator.py    From quantified-self with MIT License
def __init__(self):
        self.stopwords = set(stopwords.words("english"))
        self.lemmatizer = WordNetLemmatizer() 
Example #25
Source File: auxiliary_word2vec.py    From ZeroShotVideoClassification with Apache License 2.0
def verbs2basicform(words):
    ret = []
    for w in words:
        analysis = wn.synsets(w)
        if any([a.pos() == 'v' for a in analysis]):
            w = WordNetLemmatizer().lemmatize(w, 'v')
        ret.append(w)
    return ret 
Example #26
Source File: preprocess.py    From Attention-Based-Aspect-Extraction with Apache License 2.0
def parseSentence(line):
    lmtzr = WordNetLemmatizer()
    stop = stopwords.words('english')
    text_token = CountVectorizer().build_tokenizer()(line.lower())
    text_rmstop = [i for i in text_token if i not in stop]
    text_stem = [lmtzr.lemmatize(w) for w in text_rmstop]
    return text_stem 
Example #27
Source File: cs_inferrer.py    From lexsub with Apache License 2.0
def generate_inferred(self, result_vec, target_word, target_lemma, pos):
    
        generated_results = {}
        min_weight = None
        if result_vec is not None:
            for word, weight in result_vec:
                if generated_word_re.match(word) is not None: # make sure this is not junk
                    wn_pos = to_wordnet_pos[pos]
                    lemma = WordNetLemmatizer().lemmatize(word, wn_pos)
                    if word != target_word and lemma != target_lemma:
                        if lemma in generated_results:
                            weight = max(weight, generated_results[lemma])
                        generated_results[lemma] = weight
                        if min_weight is None:
                            min_weight = weight
                        else:
                            min_weight = min(min_weight, weight)
                            
        if min_weight is None:
            min_weight = 0.0
        i = 0.0                
        for lemma in default_generated_results:
            if len(generated_results) >= len(default_generated_results):
                break;
            i -= 1.0
            generated_results[lemma] = min_weight + i
            
                
        return generated_results 
Example #28
Source File: transformers.py    From atap with Apache License 2.0
def __init__(self, language='english'):
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer() 
Example #29
Source File: transformer.py    From atap with Apache License 2.0
def __init__(self, language='english', minimum=2, maximum=200):
        self.min = minimum
        self.max = maximum
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer() 
Example #30
Source File: transformers.py    From atap with Apache License 2.0
def __init__(self, language='english'):
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()