Python nltk.stem.wordnet.WordNetLemmatizer() Examples

The following are 30 code examples of nltk.stem.wordnet.WordNetLemmatizer(), collected from open-source projects. The originating project and source file are listed above each example. You may also want to check out the other functions and classes available in the nltk.stem.wordnet module.
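Before diving into the project examples, here is a minimal standalone sketch of the class itself (not taken from any of the projects below). It assumes the WordNet corpus has already been downloaded, for example via nltk.download('wordnet'); note that lemmatize() treats its input as a noun by default, so verbs need an explicit part-of-speech hint.

from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("cars"))          # 'car'     (default POS is noun)
print(lemmatizer.lemmatize("running"))       # 'running' (treated as a noun)
print(lemmatizer.lemmatize("running", "v"))  # 'run'     (POS hint: verb)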
Example #1
Source File: data_processing.py    From Sarcasm-Detection with MIT License
def ulterior_clean(tweets, filename):
    if not os.path.exists(filename):
        stopwords = get_stopwords_list()
        lemmatizer = WordNetLemmatizer()
        filtered_tweets = []
        for tw in tweets:
            filtered_tweet = []
            for t in tw.split():
                token = t.lower()
                if token in stopwords:
                    continue
                filtered_token = lemmatizer.lemmatize(token, 'v')
                filtered_token = lemmatizer.lemmatize(filtered_token)
                filtered_tweet.append(filtered_token)
            filtered_tweets.append(' '.join(filtered_tweet))
        utils.save_file(filtered_tweets, filename)
    # Load the filtered tokens
    filtered_tweets = utils.load_file(filename)
    return filtered_tweets 
Example #2
Source File: preprocess_lst_test.py    From lexsub with Apache License 2.0
def is_atomic_mwe(mwe, verb_lemma, complement_lemma, synsets):
    mwe_count = 0
    for synset in synsets:
        gloss_lemmas = set([WordNetLemmatizer().lemmatize(word) for word in synset.definition.split()])
        if verb_lemma in gloss_lemmas or complement_lemma in gloss_lemmas:
            return False
        for syn_lemma in synset.lemmas:
            if syn_lemma.name != mwe: 
                tokens = syn_lemma.name.split('_')
                for token in tokens:
                    if token == verb_lemma:
                        return False
                if len(tokens) == 2 and tokens[1] == complement_lemma:
                    return False
            else:
                mwe_count += syn_lemma.count()
    return True 
Example #3
Source File: predict.py    From topics with Apache License 2.0
def extract_lemmatized_nouns(self, new_review):
        stopwords = self.load_stopwords()
        words = []

        sentences = nltk.sent_tokenize(new_review.lower())
        for sentence in sentences:
            tokens = nltk.word_tokenize(sentence)
            text = [word for word in tokens if word not in stopwords]
            tagged_text = nltk.pos_tag(text)

            for word, tag in tagged_text:
                words.append({"word": word, "pos": tag})

        lem = WordNetLemmatizer()
        nouns = []
        for word in words:
            if word["pos"] in ["NN", "NNS"]:
                nouns.append(lem.lemmatize(word["word"]))

        return nouns 
Example #4
Source File: tokenizing.py    From convai-bot-1337 with GNU General Public License v3.0
def convert_to_vw(text):
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    lmtzr = WordNetLemmatizer()
    tokens = [t.lower() for t in tokenizer.tokenize(text)]
    id_ = 13371337
    processed = []
    for t in tokens:
        l = lmtzr.lemmatize(t)
        processed.append(l)
    counted = Counter(processed)
    res_str = str(id_)
    for k, v in counted.items():
        if v != 1:
            res_str = res_str + " {}:{}".format(k, v)
        else:
            res_str = res_str + " {}".format(k)
    return res_str 
Example #5
Source File: imagenet.py    From mmfeat with BSD 3-Clause "New" or "Revised" License
def __init__(self, save_dir, config_path='./miner.yaml'):
        super(ImageNetMiner, self).__init__(save_dir, config_path)
        self.__engine__ = 'imagenet'
        self.format_url = 'http://www.image-net.org/api/text/imagenet.synset.geturls?wnid={}'

        # maximum number of synsets to retrieve - we don't need all images necessarily, other-
        # wise we get enormous amounts of synsets for words like 'entity' or 'animal'
        self.max_synsets = 10000

        self.wnl = WordNetLemmatizer()

        # url cache
        self.imgnet_url_cache = {}

        # whether we "level up" in hierarchy if no images found
        self.level_up_if_no_images = True 
Example #6
Source File: features.py    From product-classifier with MIT License
def __init__(self, stoplist=None, punct=None, lemmatizer=None):
        # Load stopwords, punctuation, and lemmatizer
        # This takes a bit of work, so we only want to do it once!
        self.stopwords   = stoplist or stopwords.words('english')
        self.punctuation = punct or string.punctuation
        self.lemmatizer  = lemmatizer or WordNetLemmatizer() 
Example #7
Source File: cs_inferrer.py    From lexsub with Apache License 2.0
def filter_inferred(self, result_vec, candidates, pos):
    
        filtered_results = {}
        candidates_found = set()
        
        if result_vec is not None:
            for word, weight in result_vec:
                wn_pos = to_wordnet_pos[pos]
                lemma = WordNetLemmatizer().lemmatize(word, wn_pos)
                if lemma in candidates:
                    self.add_inference_result(lemma, weight, filtered_results, candidates_found)
                if lemma.title() in candidates:
                    self.add_inference_result(lemma.title(), weight, filtered_results, candidates_found)
                if word in candidates: # there are some few cases where the candidates are not lemmatized
                    self.add_inference_result(word, weight, filtered_results, candidates_found)                    
                if word.title() in candidates: # there are some few cases where the candidates are not lemmatized
                    self.add_inference_result(word.title(), weight, filtered_results, candidates_found)
                    
        # assign negative weights for candidates with no score
        # they will appear last sorted according to their unigram count        
#        candidates_left = candidates - candidates_found
#        for candidate in candidates_left:            
#            count = self.w2counts[candidate] if candidate in self.w2counts else 1
#            score = -1 - (1.0/count) # between (-1,-2] 
#            filtered_results[candidate] = score   
         
        return filtered_results 
Example #8
Source File: preprocess_lst_test.py    From lexsub with Apache License 2.0
def lemmatize(pairs):
    triples = []
    for pair in pairs:
        word = pair[0]
        pos = pair[1]
        wordnet_pos = wordnet.NOUN
        if (len(pos)>=2):
            pos_prefix = pos[:2]
            if (pos_prefix in to_wordnet_pos):
                wordnet_pos = to_wordnet_pos[pos_prefix]
        lemma = WordNetLemmatizer().lemmatize(word, wordnet_pos).lower();
        triples.append([word, wordnet_pos, lemma])
    return triples 
Example #9
Source File: preprocess_lst_test.py    From lexsub with Apache License 2.0
def detect_mwe(text_tokens, target_ind, wordnet_pos):
    if (target_ind < len(text_tokens)-1):
        verb_lemma = WordNetLemmatizer().lemmatize(text_tokens[target_ind], wordnet_pos)
        complement_lemma = WordNetLemmatizer().lemmatize(text_tokens[target_ind+1])
        mwe = '_'.join([verb_lemma, complement_lemma])
        synsets = wordnet.synsets(mwe, wordnet.VERB) 
        if len(synsets) > 0:
            if (target_ind+1 < len(text_tokens)-1):
                mwe_right = '_'.join([WordNetLemmatizer().lemmatize(text_tokens[target_ind+1]), WordNetLemmatizer().lemmatize(text_tokens[target_ind+2])])
                if len(wordnet.synsets(mwe_right)) > 0:
                    return
            if is_atomic_mwe(mwe, verb_lemma, complement_lemma, synsets) == True:
                mwe = '='.join([text_tokens[target_ind], text_tokens[target_ind+1]])
                text_tokens[target_ind] = mwe
                del text_tokens[target_ind+1] 
Example #10
Source File: extract_baseline_features.py    From Sarcasm-Detection with MIT License
def get_features2(tweets, subj_dict):
    print("Getting features type 2...")
    features = []
    tknzr = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
    lemmatizer = WordNetLemmatizer()
    for tweet in tweets:
        feature_list = [0.0] * 5
        tokens = tknzr.tokenize(tweet)
        # Take the number of positive and negative words as features
        for word in tokens:
            stemmed = lemmatizer.lemmatize(word, 'v')
            stemmed = lemmatizer.lemmatize(stemmed)
            if stemmed in subj_dict:
                dictlist = []
                for word in subj_dict[stemmed]:
                    dictlist.extend(subj_dict[stemmed][word])
                if 'strongsubj' in dictlist:
                    value = 1.0
                else:
                    value = 0.5
                if 'positive' in dictlist:
                    feature_list[0] += value
                elif 'negative' in dictlist:
                    feature_list[1] += value
        # Take the ratio of positives to negatives as a feature
        if feature_list[0] != 0.0 and feature_list[1] != 0.0:
            feature_list[2] = feature_list[0] / feature_list[1]
        # Derive features from punctuation
        feature_list[2] += count_apparitions(tokens, helper.punctuation)
        # Take strong negations as a feature
        feature_list[3] += count_apparitions(tokens, helper.strong_negations)
        # Take strong affirmatives as a feature
        feature_list[4] += count_apparitions(tokens, helper.strong_affirmatives)
        features.append(feature_list)
    print("Done.")
    return features 
Example #11
Source File: data_processing.py    From Sarcasm-Detection with MIT License
def extract_lemmatized_tweet(tokens, pos, use_verbs=True, use_nouns=True, use_all=False):
    lemmatizer = WordNetLemmatizer()
    clean_data = []
    for index in range(len(tokens)):
        if use_verbs and pos[index] == 'V':
            clean_data.append(lemmatizer.lemmatize(tokens[index].lower(), 'v'))
        if use_nouns and pos[index] == 'N':
            clean_data.append(lemmatizer.lemmatize(tokens[index].lower()))
        if use_all:
            lemmatized_word = lemmatizer.lemmatize(tokens[index].lower(), 'v')
            word = lemmatizer.lemmatize(lemmatized_word)
            if pos[index] not in ['^', ',', '$', '&', '!', '#', '@']:
                clean_data.append(word)
    return clean_data 
Example #12
Source File: DepParser.py    From python-zpar with MIT License
def __init__(self, modelpath, libptr, zpar_session_obj):
        super(DepParser, self).__init__()

        # save the zpar session object
        self._zpar_session_obj = zpar_session_obj

        # set up a logger
        self.logger = logging.getLogger(__name__)

        # get the library method that loads the parser models
        self._load_depparser = libptr.load_depparser
        self._load_depparser.restype = c.c_int
        self._load_depparser.argtypes = [c.c_void_p, c.c_char_p]

        # get the library methods that parse sentences and files
        self._dep_parse_sentence = libptr.dep_parse_sentence
        self._dep_parse_sentence.restype = c.c_char_p
        self._dep_parse_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_bool]

        self._dep_parse_file = libptr.dep_parse_file
        self._dep_parse_file.restype = None
        self._dep_parse_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_bool]

        self._dep_parse_tagged_sentence = libptr.dep_parse_tagged_sentence
        self._dep_parse_tagged_sentence.restype = c.c_char_p
        self._dep_parse_tagged_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_char]

        self._dep_parse_tagged_file = libptr.dep_parse_tagged_file
        self._dep_parse_tagged_file.restype = None
        self._dep_parse_tagged_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_char]

        if self._load_depparser(self._zpar_session_obj, modelpath.encode('utf-8')):
            raise OSError('Cannot find dependency parser model at {}\n'.format(modelpath))

        # set up the wordnet lemmatizer if we have it
        if _HAS_LEMMATIZER:
            self.lemmatizer = WordNetLemmatizer()
        else:
            self.lemmatizer = None 
Example #13
Source File: linguist.py    From ECDICT with MIT License
def lemmatize (self, word, pos = 'n'):
		word = word.lower()
		if self.__lemmatizer is None:
			from nltk.stem.wordnet import WordNetLemmatizer
			self.__lemmatizer = WordNetLemmatizer()
		return self.__lemmatizer.lemmatize(word, pos)


#----------------------------------------------------------------------
# global
#---------------------------------------------------------------------- 
Example #14
Source File: lemma.py    From broca with MIT License
def __init__(self, n_jobs=1):
        self.lemmr = WordNetLemmatizer()
        self.stops = stopwords.words('english')
        self.n_jobs = n_jobs 
Example #15
Source File: overkill.py    From broca with MIT License
def tokenize(self, docs):
        if self.lemmatize:
            lem = WordNetLemmatizer()

        #print('RAKE tokenizing...')
        pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs)

        for i, tdoc in enumerate(pre_tdocs):
            for t in tdoc:
                if t.startswith('one'):
                    print(t)
                    print(i)

        #print('Additional Tokenizing docs...')
        if self.n_jobs == 1:
            tdocs = [pre_tokenize(doc, tdoc, lem=lem) for doc, tdoc in zip(docs, pre_tdocs)]
        else:
            tdocs = parallel(partial(pre_tokenize, lem=lem), zip(docs, pre_tdocs), self.n_jobs, expand_args=True)

        #print('Training bigram...')
        if self.bigram is None:
            self.bigram = Phrases(tdocs,
                                  min_count=self.min_count,
                                  threshold=self.threshold,
                                  delimiter=b' ')
        else:
            self.bigram.add_vocab(tdocs)

        #print('Training trigram...')
        if self.trigram is None:
            self.trigram = Phrases(self.bigram[tdocs],
                                   min_count=self.min_count,
                                   threshold=self.threshold,
                                   delimiter=b' ')
        else:
            self.trigram.add_vocab(self.bigram[tdocs])

        return [tdoc for tdoc in self.trigram[self.bigram[tdocs]]] 
Example #16
Source File: tree.py    From props with MIT License
def _VERBAL_PREDICATE_FEATURE_Lemma(self):
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()
        if self.pos in pos_penn_to_wordnet:
            return lmtzr.lemmatize(self.word, pos_penn_to_wordnet[self.pos])
        else:
            return False

    # TODO functions: 
Example #17
Source File: reverb.py    From BREDS with GNU Lesser General Public License v3.0
def __init__(self):
        self.lmtzr = WordNetLemmatizer()
        self.aux_verbs = ['be'] 
Example #18
Source File: preprocess.py    From Unsupervised-Aspect-Extraction with Apache License 2.0
def parseSentence(line):
    lmtzr = WordNetLemmatizer()    
    stop = stopwords.words('english')
    text_token = CountVectorizer().build_tokenizer()(line.lower())
    text_rmstop = [i for i in text_token if i not in stop]
    text_stem = [lmtzr.lemmatize(w) for w in text_rmstop]
    return text_stem 
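A quick, hypothetical call to the parseSentence() helper above, assuming the module's own imports are in place (CountVectorizer from scikit-learn, plus stopwords and WordNetLemmatizer from NLTK). Because lemmatize() is called without a POS tag, only noun forms get normalized:

print(parseSentence("The cats are running quickly!"))
# roughly: ['cat', 'running', 'quickly']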
Example #19
Source File: normalize.py    From atap with Apache License 2.0
def __init__(self, language='english'):
        self.stopwords  = frozenset(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer() 
Example #20
Source File: MaLSTM_train.py    From Quora-Question-Pairs with MIT License
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = True):
    
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used; there is no benefit in using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    ''' All stopwords except the 'wh-' words are removed '''
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    
    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language = 'english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else :
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    
    
    return corpus 
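A minimal, hypothetical call to the preprocess() function above. Passing cleaning=False and stemming=False sidesteps the text_clean() helper and the stemmers, so only the NLTK imports already used in the snippet (stopwords and WordNetLemmatizer) are needed:

corpus = ["what cities have been visited by the president"]
processed = preprocess(corpus, keep_list=[], cleaning=False,
                       stemming=False, lemmatization=True,
                       remove_stopwords=True)
print(processed)  # roughly: [['what', 'cities', 'visit', 'president']]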
Example #21
Source File: test.py    From Quora-Question-Pairs with MIT License
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = True):
    
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)
    
    Input : 
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should 
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer
    
    Note : Either stemming or lemmatization should be used; there is no benefit in using both of them together
    
    Output : Returns the processed text corpus
    
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)
    
    ''' All stopwords except the 'wh-' words are removed '''
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else :
        corpus = [[x for x in x.split()] for x in corpus]
    
    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    
    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language = 'english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else :
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    
        
    return corpus 
Example #22
Source File: ReVerb.py    From Snowball with GNU General Public License v3.0
def __init__(self):
        self.lmtzr = WordNetLemmatizer()
        self.aux_verbs = ['be'] 
Example #23
Source File: textpro.py    From comparable-text-miner with Apache License 2.0
def getLemma(text, contextFlag=False):
	lemmatizer = WordNetLemmatizer()
	#'NN':wordnet.NOUN,'JJ':wordnet.ADJ,'VB':wordnet.VERB,'RB':wordnet.ADV
	wordnet_tag ={'NN':'n','JJ':'a','VB':'v','RB':'r'}
	result = None
	if len(text.split()) == 1: # one word
		tokenized = word_tokenize(text)
		tagged = pos_tag(tokenized)[0]
		lemma = ''
		try: lemma = lemmatizer.lemmatize(tagged[0],wordnet_tag[tagged[1][:2]])
		except: lemma = lemmatizer.lemmatize(tagged[0])
		result = lemma
	elif len(text.split()) > 1 and contextFlag == True: # multiple words (i.e. a text), without considering the context
		resultList = []
		for t in text.split():
			tokenized = word_tokenize(t)
			tagged = pos_tag(tokenized)[0]
			lemma = ''
			try: lemma = lemmatizer.lemmatize(tagged[0],wordnet_tag[tagged[1][:2]])
			except: lemma = lemmatizer.lemmatize(tagged[0])
			resultList.append(lemma)
		result = ' '.join(resultList)
	else: # multiple words (i.e. a text), considering the context
		resultList = []
		tokens = word_tokenize(text)
		tagged = pos_tag(tokens)
		for t in tagged:
			try: resultList.append(lemmatizer.lemmatize(t[0],wordnet_tag[t[1][:2]]))
			except: resultList.append(lemmatizer.lemmatize(t[0]))
		result = ' '.join(resultList)
	return result
###################################################################################

# Given a Naive Bayes classifier, classify a text with a given certainty
Example #24
Source File: disintegrator.py    From quantified-self with MIT License
def __init__(self):
        self.stopwords = set(stopwords.words("english"))
        self.lemmatizer = WordNetLemmatizer() 
Example #25
Source File: auxiliary_word2vec.py    From ZeroShotVideoClassification with Apache License 2.0
def verbs2basicform(words):
    ret = []
    for w in words:
        analysis = wn.synsets(w)
        if any([a.pos() == 'v' for a in analysis]):
            w = WordNetLemmatizer().lemmatize(w, 'v')
        ret.append(w)
    return ret 
Example #26
Source File: preprocess.py    From Attention-Based-Aspect-Extraction with Apache License 2.0
def parseSentence(line):
    lmtzr = WordNetLemmatizer()
    stop = stopwords.words('english')
    text_token = CountVectorizer().build_tokenizer()(line.lower())
    text_rmstop = [i for i in text_token if i not in stop]
    text_stem = [lmtzr.lemmatize(w) for w in text_rmstop]
    return text_stem 
Example #27
Source File: cs_inferrer.py    From lexsub with Apache License 2.0
def generate_inferred(self, result_vec, target_word, target_lemma, pos):
    
        generated_results = {}
        min_weight = None
        if result_vec is not None:
            for word, weight in result_vec:
                if generated_word_re.match(word) is not None: # make sure this is not junk
                    wn_pos = to_wordnet_pos[pos]
                    lemma = WordNetLemmatizer().lemmatize(word, wn_pos)
                    if word != target_word and lemma != target_lemma:
                        if lemma in generated_results:
                            weight = max(weight, generated_results[lemma])
                        generated_results[lemma] = weight
                        if min_weight is None:
                            min_weight = weight
                        else:
                            min_weight = min(min_weight, weight)
                            
        if min_weight is None:
            min_weight = 0.0
        i = 0.0                
        for lemma in default_generated_results:
            if len(generated_results) >= len(default_generated_results):
                break;
            i -= 1.0
            generated_results[lemma] = min_weight + i
            
                
        return generated_results 
Example #28
Source File: transformers.py    From atap with Apache License 2.0
def __init__(self, language='english'):
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer() 
Example #29
Source File: transformer.py    From atap with Apache License 2.0
def __init__(self, language='english', minimum=2, maximum=200):
        self.min = minimum
        self.max = maximum
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer() 
Example #30
Source File: transformers.py    From atap with Apache License 2.0
def __init__(self, language='english'):
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()