Python nltk.corpus.wordnet.NOUN Examples
The following are 26 code examples of nltk.corpus.wordnet.NOUN, collected from open-source projects. The original project, source file, and license for each example are noted above it. You may also want to check out all available functions and classes of the nltk.corpus.wordnet module.
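For orientation: wn.NOUN is not a function but a module-level constant, the one-character string 'n' that WordNet uses to mark the noun part of speech (its siblings are wn.VERB = 'v', wn.ADJ = 'a', wn.ADJ_SAT = 's', and wn.ADV = 'r'). A minimal sketch of the typical use:

from nltk.corpus import wordnet as wn

print(wn.NOUN)                             # 'n'
print(wn.synsets('dog', pos=wn.NOUN)[:3])  # noun senses only
print(wn.synsets('dog', pos=wn.VERB))      # verb senses instead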
Example #1
Source File: laplacian_tags.py From jingwei with MIT License | 7 votes |
def tag_semantic_similarity(x, y, ic):
    mx = wn.morphy(x)
    my = wn.morphy(y)
    if mx is None or my is None:
        return 0
    synX = wn.synsets(mx, pos=wn.NOUN)
    synY = wn.synsets(my, pos=wn.NOUN)
    if len(synX) > 0 and len(synY) > 0:
        maxSim = synX[0].lin_similarity(synY[0], ic)
    else:
        maxSim = 0
    return maxSim
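Example #1 compares the first noun sense of two tags using Lin similarity, which requires an information-content dictionary as its third argument. A sketch of how such a dictionary is typically loaded and the function called (assuming the corpus has been fetched with nltk.download('wordnet_ic')):

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')   # information content from the Brown corpus
print(tag_semantic_similarity('cats', 'dog', brown_ic))  # morphy folds 'cats' to 'cat'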
Example #2
Source File: _common.py From tmtoolkit with Apache License 2.0 | 6 votes |
def pos_tag_convert_penn_to_wn(tag):
    """
    Convert POS tag from Penn tagset to WordNet tagset.

    :param tag: a tag from Penn tagset
    :return: a tag from WordNet tagset or None if no corresponding tag could be found
    """
    from nltk.corpus import wordnet as wn

    if tag in ['JJ', 'JJR', 'JJS']:
        return wn.ADJ
    elif tag in ['RB', 'RBR', 'RBS']:
        return wn.ADV
    elif tag in ['NN', 'NNS', 'NNP', 'NNPS']:
        return wn.NOUN
    elif tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        return wn.VERB
    return None
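A short usage sketch: because the function returns None for unmapped tags, callers usually fall back to a default POS before lemmatizing (here wn.NOUN, which is also WordNetLemmatizer's own default):

from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wn_tag = pos_tag_convert_penn_to_wn('VBD') or wn.NOUN  # 'VBD' maps to wn.VERB
print(lemmatizer.lemmatize('ran', wn_tag))             # 'run'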
Example #3
Source File: syngraph.py From atap with Apache License 2.0 | 6 votes |
def graph_synsets(terms, pos=wn.NOUN, depth=2):
    """
    Create a networkx graph of the given terms to the given depth.
    """
    G = nx.Graph(
        name="WordNet Synsets Graph for {}".format(", ".join(terms)),
        depth=depth,
    )

    def add_term_links(G, term, current_depth):
        for syn in wn.synsets(term):
            for name in syn.lemma_names():
                G.add_edge(term, name)
                if current_depth < depth:
                    add_term_links(G, name, current_depth + 1)

    for term in terms:
        add_term_links(G, term, 0)

    return G
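A possible way to call Example #3 (a sketch; it assumes networkx is installed and bound to nx in the function's module, as the code implies). The recursion fans out quickly, so small depths are advisable:

import networkx as nx
from nltk.corpus import wordnet as wn

G = graph_synsets(['tree'], depth=1)
print(G.name)               # WordNet Synsets Graph for tree
print(G.number_of_nodes())  # the term plus the lemma names it links to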
Example #4
Source File: deploy.py From Election-Meddling with MIT License | 6 votes |
def get_wordnet_pos(self, treebank_tag):
    """
    Return the WordNet POS (a, n, r, v) corresponding to the given
    Penn Treebank tag, for use in WordNet lemmatization.
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
Example #5
Source File: wn_persistent_api.py From combine-FEVER-NSMN with MIT License | 6 votes |
def convert_to_wn_pos(pos):
    if pos.startswith("J"):
        return wn.ADJ
    elif pos.startswith("V"):
        return wn.VERB
    elif pos.startswith("N"):
        return wn.NOUN
    elif pos.startswith("R"):
        return wn.ADV
    else:
        return ""
Example #6
Source File: normalization.py From text-analytics-with-python with Apache License 2.0 | 6 votes |
def pos_tag_text(text):

    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None

    tagged_text = tag(text)
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in tagged_text]
    return tagged_lower_text

# lemmatize text based on POS tags
Example #7
Source File: dcs.py From broca with MIT License | 5 votes |
def _disambiguate_doc(self, tagged_tokens):
    """
    Takes a list of tagged tokens, representing a document, in the form:

        [(token, tag), ...]

    and returns a mapping of terms to their disambiguated concepts (synsets).
    """
    # Group tokens by PoS
    pos_groups = {pos: [] for pos in [wn.NOUN, wn.VERB, wn.ADJ, wn.ADV]}
    for tok, tag in tagged_tokens:
        if tag in pos_groups:
            pos_groups[tag].append(tok)
    #print(pos_groups)

    # Map of final term -> concept mappings
    map = {}
    for tag, toks in pos_groups.items():
        map.update(self._disambiguate_pos(toks, tag))

    #nice_map = {k: map[k].lemma_names() for k in map.keys()}
    #print(json.dumps(nice_map, indent=4, sort_keys=True))
    return map
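Note that this method expects tags already converted to WordNet form: the pos_groups keys are wn.NOUN ('n'), wn.VERB ('v'), wn.ADJ ('a'), and wn.ADV ('r'), so raw Penn Treebank tags would be silently skipped. A sketch of the expected input shape:

tagged_tokens = [('bank', 'n'), ('deposit', 'v'), ('river', 'n')]
# concept_map = dcs._disambiguate_doc(tagged_tokens)  # term -> Synset (dcs: an instance of the class)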
Example #8
Source File: wordnet_app.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def _pos_tuples():
    return [
        (wn.NOUN, 'N', 'noun'),
        (wn.VERB, 'V', 'verb'),
        (wn.ADJ, 'J', 'adj'),
        (wn.ADV, 'R', 'adv')]
Example #9
Source File: normalize.py From atap with Apache License 2.0 | 5 votes |
def lemmatize(self, token, pos_tag):
    tag = {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
    }.get(pos_tag[0], wn.NOUN)

    return self.lemmatizer.lemmatize(token, tag)
Example #10
Source File: preprocess_lst_test.py From lexsub with Apache License 2.0 | 5 votes |
def lemmatize(pairs):
    triples = []
    for pair in pairs:
        word = pair[0]
        pos = pair[1]
        wordnet_pos = wordnet.NOUN
        if len(pos) >= 2:
            pos_prefix = pos[:2]
            if pos_prefix in to_wordnet_pos:
                wordnet_pos = to_wordnet_pos[pos_prefix]
        lemma = WordNetLemmatizer().lemmatize(word, wordnet_pos).lower()
        triples.append([word, wordnet_pos, lemma])
    return triples
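This function references a to_wordnet_pos dictionary defined elsewhere in its module. A plausible, purely hypothetical reconstruction, keyed by two-character Penn tag prefixes as the pos[:2] lookup implies:

from nltk.corpus import wordnet

# Hypothetical stand-in; the original module defines its own mapping.
to_wordnet_pos = {'NN': wordnet.NOUN, 'JJ': wordnet.ADJ,
                  'VB': wordnet.VERB, 'RB': wordnet.ADV}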
Example #11
Source File: __init__.py From wordai with MIT License | 5 votes |
def _sentence_to_mongo(typ, items):
    import nltk
    from nltk.corpus import wordnet

    def wordnet_pos(tag):
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    # nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('punkt')

    stop_words = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.stem.WordNetLemmatizer()
    sentences = []
    for trans in items:
        eng, chn = trans.getsource(), trans.gettarget()
        tokens = nltk.word_tokenize(eng)
        pos_tag = [pos[1] for pos in nltk.pos_tag(tokens)]
        roots = [stemmer.lemmatize(word, wordnet_pos(pos_tag[idx]))
                 for idx, word in enumerate(tokens)]
        cleanword = [token for token in roots
                     if token.isalpha() and token not in stop_words and len(token) >= 3]
        # remove duplicates
        clean_word = list(dict.fromkeys(cleanword))
        if len(clean_word) > 0:
            score = Word.search_words(*clean_word).sum('star') / len(clean_word)
        else:
            score = -1
        sentence = Sentence(eng=eng, chn=chn, words=tokens, pos_tag=pos_tag,
                            roots=roots, score=score, typ=typ)
        sentences.append(sentence)
        if len(sentences) > 50:
            Sentence.objects.insert(sentences)
            sentences = []
Example #12
Source File: transformer.py From atap with Apache License 2.0 | 5 votes |
def lemmatize(self, token, pos_tag):
    tag = {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
    }.get(pos_tag[0], wn.NOUN)

    return self.lemmatizer.lemmatize(token, tag)
Example #13
Source File: build.py From atap with Apache License 2.0 | 5 votes |
def lemmatize(self, token, pos_tag):
    tag = {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
    }.get(pos_tag[0], wn.NOUN)

    return self.lemmatizer.lemmatize(token, tag)
Example #14
Source File: synset_analysis.py From Quadflor with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self):
    NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
    self.normalizer = NltkNormalizer()
    self.lem = nltk.WordNetLemmatizer()
    self.tagger = nltk.PerceptronTagger()
    self.translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
Example #15
Source File: agglomerative.py From atap with Apache License 2.0 | 5 votes |
def wnpos(tag):
    # Return the WordNet POS tag from the Penn Treebank tag
    return {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
    }.get(tag[0], wn.NOUN)
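A quick end-to-end sketch pairing this helper with NLTK's tagger and lemmatizer (assumes the punkt and averaged_perceptron_tagger resources are downloaded):

import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
for word, tag in nltk.pos_tag(nltk.word_tokenize("The geese were flying south")):
    print(word, '->', lemmatizer.lemmatize(word.lower(), wnpos(tag)))
# e.g. geese -> goose, were -> be, flying -> fly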
Example #16
Source File: preprocessing.py From TextRank with MIT License | 5 votes |
def __get_wordnet_pos(treebank_tag):
    """Maps the treebank tags to WordNet part of speech names"""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
Example #17
Source File: kmeans.py From atap with Apache License 2.0 | 5 votes |
def wnpos(tag):
    # Return the WordNet POS tag from the Penn Treebank tag
    return {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
    }.get(tag[0], wn.NOUN)
Example #18
Source File: transformers.py From atap with Apache License 2.0 | 5 votes |
def lemmatize(self, token, pos_tag):
    tag = {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
    }.get(pos_tag[0], wn.NOUN)

    return self.lemmatizer.lemmatize(token, tag)
Example #19
Source File: sentiwordnet.py From yenlp with GNU General Public License v3.0 | 5 votes |
def wordnet_pos_code(tag):
    '''Translation from nltk tags to WordNet codes'''
    if tag.startswith('NN'):
        return wordnet.NOUN
    elif tag.startswith('VB'):
        return wordnet.VERB
    elif tag.startswith('JJ'):
        return wordnet.ADJ
    elif tag.startswith('RB'):
        return wordnet.ADV
    else:
        return ''
Example #20
Source File: test_preprocess_func.py From tmtoolkit with Apache License 2.0 | 5 votes |
def test_pos_tag_convert_penn_to_wn():
    assert pos_tag_convert_penn_to_wn('JJ') == wn.ADJ
    assert pos_tag_convert_penn_to_wn('RB') == wn.ADV
    assert pos_tag_convert_penn_to_wn('NN') == wn.NOUN
    assert pos_tag_convert_penn_to_wn('VB') == wn.VERB

    for tag in ('', 'invalid', None):
        assert pos_tag_convert_penn_to_wn(tag) is None
Example #21
Source File: wordnet_app.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def _pos_tuples():
    return [
        (wn.NOUN, 'N', 'noun'),
        (wn.VERB, 'V', 'verb'),
        (wn.ADJ, 'J', 'adj'),
        (wn.ADV, 'R', 'adv')]
Example #22
Source File: main.py From tensorflow-XNN with MIT License | 5 votes |
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
Example #23
Source File: main.py From tensorflow-XNN with MIT License | 5 votes |
def lemmatize_word(word, pos=wordnet.NOUN):
    return LEMMATIZER.lemmatize(word, pos)
Example #24
Source File: main.py From tensorflow-XNN with MIT License | 5 votes |
def lemmatize_sentence(sentence):
    res = []
    sentence_ = get_valid_words(sentence)
    for word, pos in pos_tag(sentence_):
        wordnet_pos = get_wordnet_pos(pos) or wordnet.NOUN
        res.append(lemmatize_word(word, pos=wordnet_pos))
    return res
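The three main.py functions above form a small pipeline: tag, map the tag, lemmatize. A sketch of how they fit together, with hypothetical stand-ins for the module-level LEMMATIZER and the get_valid_words helper the code assumes:

import nltk
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

LEMMATIZER = WordNetLemmatizer()   # assumed module-level instance

def get_valid_words(sentence):     # hypothetical stand-in for the project's helper
    return [w for w in nltk.word_tokenize(sentence) if w.isalpha()]

print(lemmatize_sentence("The cats were chasing mice"))
# ['The', 'cat', 'be', 'chase', 'mouse']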
Example #25
Source File: check_availability.py From jingwei with MIT License | 5 votes |
def check_robustpca(trainCollection, testCollection, feature):
    ready = True

    # check matlab
    if not check_matlab():
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature),
                  'Matlab is not available or incorrectly configured.')
        ready = False

    # check if knn is available
    if not check_knn(trainCollection, testCollection, feature):
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature),
                  'KNN is not available.')
        ready = False

    # check data files
    datafiles = [os.path.join(ROOT_PATH, trainCollection, 'TextData', 'id.userid.lemmtags.txt'),
                 os.path.join(ROOT_PATH, trainCollection, 'FeatureData', feature)]
    res = find_missing_files(datafiles)
    if res:
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature),
                  'the following files or folders are missing:\n%s' % res)
        return False

    # check external dependencies
    try:
        import h5py
        import numpy
        import scipy.io
        import scipy.sparse
        from nltk.corpus import wordnet as wn
        from nltk.corpus import wordnet_ic
        brown_ic = wordnet_ic.ic('ic-brown.dat')
        wn.morphy('cat')
        wn.synsets('cat', pos=wn.NOUN)
    except Exception:
        try:
            import nltk
            nltk.download('brown')
            nltk.download('wordnet')
            nltk.download('wordnet_ic')
        except Exception as e:
            print(e)
            ready = False
Example #26
Source File: wordnet_similarity.py From jingwei with MIT License | 5 votes |
def wup_similarity(tagx, tagy):
    scores = []
    for pos in [wn.NOUN, wn.VERB, wn.ADJ, wn.ADJ_SAT, wn.ADV]:
        try:
            synsetx = wn.synset('%s.%s.01' % (tagx, pos))
            synsety = wn.synset('%s.%s.01' % (tagy, pos))
            score = synsetx.wup_similarity(synsety)
            if score is None:
                score = 0
        except Exception:
            score = 0
        scores.append(score)