Python nltk.pos_tag() Examples

The following are 30 code examples of nltk.pos_tag(), drawn from open-source projects. Each example notes its source file, originating project, and license. You may also want to check out the other available functions and classes of the nltk module.
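Before the project examples, here is a minimal, self-contained sketch of what nltk.pos_tag() expects and returns. The sentence and the exact tags shown are illustrative; output can vary slightly between NLTK versions and tagger models.

import nltk

# pos_tag() relies on a tokenizer and a tagger model; download them once if missing.
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog")
print(nltk.pos_tag(tokens))
# Roughly: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'),
#           ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]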
Example #1
Source File: math_expression_calculator.py    From JARVIS with Apache License 2.0
def text_to_num(text):
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    for subtree in chunkedData.subtrees(filter=lambda t: t.label() in "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
        print("text2num -> ", text)
    return text 
Example #2
Source File: nlp.py    From partisan-discourse with Apache License 2.0
def preprocess(html):
    """
    Returns a preprocessed document: a list of paragraphs, each of which is a
    list of sentences, each of which is a list of (token, part of speech)
    tuples.
    """
    try:
        return [
            [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ]
            for paragraph in para_tokenize(html)
        ]
    except Exception as e:
        raise NLTKError("could not preprocess text: {}".format(str(e))) 
Example #3
Source File: nltkmgr.py    From sia-cog with MIT License
def tokenize(data, language="english", filterStopWords = False, tagging = False):
    result = {}
    tags = []
    filterChars = [",", ".", "?", ";", ":", "'", "!", "@", "#", "$", "%", "&", "*", "(", ")", "+", "{", "}", "[", "]", "\\", "|"]
    sent_token = nltk.tokenize.sent_tokenize(data, language)
    word_token = nltk.tokenize.word_tokenize(data, language)
    word_token = [w for w in word_token if not w in filterChars]
    if filterStopWords is True:
        stop_words = set(stopwords.words(language))
        word_token = [w for w in word_token if not w in stop_words]

    if tagging is True:
        tags = nltk.pos_tag(word_token)

    result = {"sent_token": sent_token, "word_token": word_token, "pos_tag": tags}
    return json.loads(jsonpickle.encode(result, unpicklable=False)) 
Example #4
Source File: language_util.py    From talk-generator with MIT License
def get_last_noun_and_article(sentence):
    tokens = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(tokens)

    noun = None
    for tag in reversed(tags):
        if "NN" in tag[1]:
            if noun:
                noun = (tag[0] + " " + noun).strip()
            else:
                noun = tag[0]

        # Once a noun has been found, stop at the first non-noun tag:
        # prepend it if it is an article/possessive, otherwise return the noun as-is
        elif bool(noun):
            if "DT" in tag[1] or "PRP$" in tag[1]:
                return tag[0] + " " + noun
            return noun

    return None 
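A short usage sketch (not from the talk-generator repository) showing what get_last_noun_and_article is expected to return; the exact result depends on the tagger's output.

# Hypothetical call, assuming nltk and the function above are available.
print(get_last_noun_and_article("I saw the dog"))
# With typical Penn Treebank tags, "dog"/NN is the last noun and "the"/DT
# directly precedes it, so this returns "the dog".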
Example #5
Source File: document.py    From gender-bias with MIT License
def words_by_part_of_speech(self) -> dict:
        """
        Compute the parts of speech for each word in the document.

        Uses nltk.pos_tag.

        Returns:
            dict

        """
        words = self.words()
        tagged = nltk.pos_tag(words)
        categories = {}
        for _type in {t[1] for t in tagged}:
            categories[_type] = [t[0] for t in tagged if t[1] == _type]
        return categories 
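The grouping logic above can be tried outside the Document class with a standalone sketch; the sentence and the resulting grouping are illustrative.

import nltk

tagged = nltk.pos_tag(nltk.word_tokenize("She quickly read two long reports"))
categories = {}
for _type in {t[1] for t in tagged}:
    categories[_type] = [t[0] for t in tagged if t[1] == _type]
print(categories)
# e.g. {'PRP': ['She'], 'RB': ['quickly'], 'VBD': ['read'],
#       'CD': ['two'], 'JJ': ['long'], 'NNS': ['reports']}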
Example #6
Source File: IDAMagicStrings.py    From idamagicstrings with GNU Affero General Public License v3.0
def nltk_preprocess(strings):
  if not has_nltk:
    return

  strings = "\n".join(map(str, list(strings)))
  tokens = re.findall(FUNCTION_NAMES_REGEXP, strings)
  l = []
  for token in tokens:
    l.append(token[0])
  word_tags = nltk.pos_tag(l)
  for word, tag in word_tags:
    try:
      FOUND_TOKENS[word.lower()].add(tag)
    except KeyError:
      FOUND_TOKENS[word.lower()] = set([tag])

#------------------------------------------------------------------------------- 
Example #7
Source File: nltk_plugin.py    From self-attentive-parser with MIT License
def _nltk_process_sents(self, sents):
        for sentence in sents:
            if isinstance(sentence, STRING_TYPES):
                if self._tokenizer_lang is None:
                    raise ValueError(
                        "No word tokenizer available for this language. "
                        "Please tokenize before calling the parser."
                        )
                sentence = nltk.word_tokenize(sentence, self._tokenizer_lang)

            if IS_PY2:
                sentence = [
                    word.decode('utf-8', 'ignore') if isinstance(word, str) else word
                    for word in sentence
                    ]

            if not self._provides_tags:
                sentence = nltk.pos_tag(sentence)
                yield [word for word, tag in sentence], sentence
            else:
                yield sentence, sentence 
Example #8
Source File: combined.py    From Projects with MIT License
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text)
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos+tuple[0].lower()][0]
            neg_score += sentiwordnet[pos+tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score 
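A hedged usage sketch for evaluate_sentiment; the sentiwordnet lookup below is a made-up stand-in for the module's real table, which is keyed by a POS prefix plus the lowercased word and maps to (positive, negative) scores.

# Hypothetical scores; the real module builds this table from SentiWordNet data.
sentiwordnet = {'a/good': (0.75, 0.0), 'n/movie': (0.0, 0.0)}
print(evaluate_sentiment("A good movie"))
# Likely output: (0.75, 0.0) -- "good"/JJ maps to 'a/good', "movie"/NN to 'n/movie'.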
Example #9
Source File: sentiwordnet.py    From Projects with MIT License
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text.decode('utf8', 'ignore').replace('<br />',' '))
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos+tuple[0].lower()][0]
            neg_score += sentiwordnet[pos+tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score 
Example #10
Source File: Auto_NLP.py    From Auto_ViML with Apache License 2.0
def process_text(text):
    soup = BeautifulSoup(text, "lxml")
    tags_del = soup.get_text()
    no_html = re.sub('<[^>]*>', '', tags_del)
    tokenized = casual_tokenizer(no_html)
    lower = [item.lower() for item in tokenized]
    decontract = [expandContractions(item, c_re=c_re) for item in lower]
    tagged = nltk.pos_tag(decontract)
    lemma = lemma_wordnet(tagged)
    #no_num = [re.sub('[0-9]+', '', each) for each in lemma]
    no_punc = [w for w in lemma if w not in punc]
    no_stop = [w for w in no_punc if w not in stop_words]
    return no_stop
################################################################################################################################################################
####   THE ABOVE Process_Text section Re-used with Permission from:
####  R O B   S A L G A D O    robert.salgado@gmail.com Thank YOU!
################################################################################ 
Example #11
Source File: custom.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def extract_nnp_phrases(text):
    """
    NNP extractor convenience method.
    :param text:
    :return:
    """
    phrase_list = []

    for sentence in nltk.sent_tokenize(text):
        # Get POS
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        # Get POS
        phrase = []

        for t, p in pos:
            if p in ["NNP", "NNPS"] or t in [",", "&"]:
                phrase.append(t)
            else:
                if len(phrase) > 1:
                    phrase_list.append(clean_nnp_phrase(phrase))
                phrase = []

    return phrase_list 
Example #12
Source File: annotate.py    From serapis with MIT License
def annotate_sentence(sentence_dict, term):
    """
    Annotates a sentence object from a message with Penn Treebank POS tags.

    Args:
        sentence_dict: dict -- Must contain 's' and 's_clean', which is the
                       sentence with all occurrences of the search term
                       replaced with '_TERM_'
    Returns:
        dict -- updated sentence_dict with 'pos_tags' field.

    """
    tags = pos_tag(word_tokenize(sentence_dict['s_clean']))
    pos_tags = ['/'.join(b) for b in tags]
    sentence_dict['pos_tags'] = " ".join(pos_tags)
    sentence_dict['features'] = {}
    return sentence_dict 
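A hedged usage sketch (the input values are invented) illustrating the shape of sentence_dict before and after annotation; pos_tag and word_tokenize are assumed to be imported from nltk in the original module.

# Hypothetical input: 's_clean' already has the search term replaced.
sentence = {
    's': "A quokka is a small marsupial.",
    's_clean': "A _TERM_ is a small marsupial.",
}
annotated = annotate_sentence(sentence, "quokka")
print(annotated['pos_tags'])
# Something like: "A/DT _TERM_/NN is/VBZ a/DT small/JJ marsupial/NN ./."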
Example #13
Source File: annotate.py    From serapis with MIT License
def annotate_pos_with_term(sentence, term):
    """POS-tag single sentence while preserving _TERM_ using the original term"""
    try:
        pos_term = []

        # replace term if necessary
        if '_term_' not in sentence.lower():
            sentence_term = sentence.lower().replace(term.lower(), '_TERM_')
        else:
            sentence_term = sentence.lower()

        tok = word_tokenize(sentence_term)
        tags = pos_tag(tok)

        for tag in tags:
            if '_TERM_' in tag[0].upper():
                pos_term.append('_TERM_')
            else:
                pos_term.append(tag[1])

        return ' '.join(pos_term)
    except Exception as e:
        log.error('POS annotation error: %s', e)
        return None 
Example #14
Source File: raw_data.py    From open-sesame with Apache License 2.0
def make_data_instance(text, index):
    """
    Takes a line of text and creates a CoNLL09Example instance from it.
    """
    tokenized = nltk.tokenize.word_tokenize(text.lstrip().rstrip())
    pos_tagged = [p[1] for p in nltk.pos_tag(tokenized)]

    lemmatized = [lemmatizer.lemmatize(tokenized[i]) 
                    if not pos_tagged[i].startswith("V") else lemmatizer.lemmatize(tokenized[i], pos='v') 
                    for i in range(len(tokenized))]

    conll_lines = ["{}\t{}\t_\t{}\t_\t{}\t{}\t_\t_\t_\t_\t_\t_\t_\tO\n".format(
        i+1, tokenized[i], lemmatized[i], pos_tagged[i], index) for i in range(len(tokenized))]
    elements = [CoNLL09Element(conll_line) for conll_line in conll_lines]

    sentence = Sentence(syn_type=None, elements=elements)
    instance = CoNLL09Example(sentence, elements)

    return instance 
Example #15
Source File: sick_extender.py    From Sentence-similarity-classifier-for-pyTorch with MIT License
def line_prep(self, line):
        """ Tokenizes and POS-tags a line from the SICK corpus to be compatible with WordNet synset lookup. """
        # Split line into sentences + score
        s1, s2, sim_score = line.split('\t')
        # Tokenize
        s1_tokens = word_tokenize(s1)
        s2_tokens = word_tokenize(s2)
        # Assign part of speech tags
        s1_penn_pos = nltk.pos_tag(s1_tokens)
        s2_penn_pos = nltk.pos_tag(s2_tokens)
        # Convert to WordNet POS tags and store word position in sentence for replacement
        # Each tuple contains (word, WordNet_POS_tag, position)
        s1_wn_pos = list()
        s2_wn_pos = list()
        for idx, item in enumerate(s1_penn_pos):
            if self.get_wordnet_pos(item[1]) != 'OTHER':
                s1_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), idx))
        for idx, item in enumerate(s2_penn_pos):
            if self.get_wordnet_pos(item[1]) != 'OTHER':
                s2_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), idx))

        # Each tuple contains (word, WordNet_POS_tag, position); Source sentence provided for use in disambiguation
        return [(s1_wn_pos, s1_tokens), (s2_wn_pos, s2_tokens)], sim_score 
Example #16
Source File: preprocessing.py    From TBBTCorpus with Apache License 2.0
def __init__(self, speaker, words, scene, act_tag):
        self.speaker = speaker
        self.addresse = []
        self.topic = []
        self.words = []
        self.scene = scene
        self.act_tag = act_tag
        for token,pos  in nltk.pos_tag(words):
            self.words.append((token,pos))
        self.speaker_attribute = None 
Example #17
Source File: PipelineQ.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def extractPOS():
    while True:
        if queues[0].empty():
            break
        else:
            data = queues[0].get()
            words = data['input']
            postags = nltk.pos_tag(words)
            queues[0].task_done()
            queues[1].put({'uuid': data['uuid'], 'input': postags}, True) 
Example #18
Source File: 9.2 Email_Classification.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]

    except:
        tokens = tokens
        
    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example #19
Source File: Train3.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def buildDictionary():
    dictionary = {}
    for sent in sampleData():
        partsOfSpeechTags = nltk.pos_tag(nltk.word_tokenize(sent))
        for tag in partsOfSpeechTags:
            value = tag[0]
            pos = tag[1]
            dictionary[value] = pos
    return dictionary 
Example #20
Source File: Dictionary.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def __init__(self, sentence):
        self.words = nltk.word_tokenize(sentence)
        self.tagged = nltk.pos_tag(self.words)
        self.buildDictionary()
        self.buildReverseDictionary() 
Example #21
Source File: 9.5 Skipgram_Keras.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example #22
Source File: ContextTagger.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def getSentenceWords():
    sentwords = []
    for sentence in sentences:
        words = nltk.pos_tag(nltk.word_tokenize(sentence))
        sentwords.append(words)
    return sentwords 
Example #23
Source File: Training.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def buildIOBTags(text):
    chunkparser = myParser()
    words = nltk.word_tokenize(text)
    postags = nltk.pos_tag(words)
    tree = chunkparser.parse(postags)
    # nltk.chunk.tree2conlltags(tree) converts the chunk tree into
    # (word, POS tag, IOB tag) triples.
    return nltk.chunk.tree2conlltags(tree) 
Example #24
Source File: __init__.py    From nltk-server with MIT License
def pos_tag(data):
	data = parse_input(data)
	if data == False:
		return ret_failure(703)
	else:
		try:
			res = nltk.pos_tag(data)
			return ret_success(res)
		except LookupError: 
			return ret_failure(704)
		except:
			return ret_failure(702) 
Example #25
Source File: keywords.py    From cornerwise with MIT License
def keywords(text):
    """
    :param text: a text string to be evaluated

    :returns: An iterable of strings containing the recognized noun phrases
    """
    tokenized = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokenized)
    return list(map(join_words, noun_phrases(tagged))) 
Example #26
Source File: data.py    From augmented_seq2seq with GNU General Public License v3.0
def split_and_tag(line):
    wtags = nltk.pos_tag(nltk.word_tokenize(line.strip()))
    words = []
    for w,t in wtags:
        if t == 'CD' or t == 'FW':
            w = t
        words.append(w)
    return words 
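A small usage sketch (not part of the repository): numbers (CD) and foreign words (FW) are replaced by their POS tag, so rare tokens collapse onto a single vocabulary symbol.

# Hypothetical call, assuming nltk and the function above are importable.
print(split_and_tag("I bought 3 apples in 2019\n"))
# Likely output: ['I', 'bought', 'CD', 'apples', 'in', 'CD']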
Example #27
Source File: data.py    From augmented_seq2seq with GNU General Public License v3.0
def encode_seq(seq, lookup):
    indices = []
    for word in seq:
        if word in lookup:
            indices.append(lookup[word])
        else:
            tag = nltk.pos_tag([word])[-1][-1]
            if tag in lookup:
                indices.append(lookup[tag])
            else:
                indices.append(lookup[UNK])
    return indices 
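A hedged sketch of how encode_seq might be driven; the lookup table and UNK marker below are hypothetical stand-ins for the module's real vocabulary and constant.

# Hypothetical vocabulary: word ids, POS-tag fallbacks, and an unknown marker.
UNK = '<unk>'  # must match the UNK constant the module actually uses
lookup = {'i': 1, 'bought': 2, 'apples': 3, 'CD': 4, UNK: 0}
print(encode_seq(['i', 'bought', '7', 'apples', 'yesterday'], lookup))
# '7' falls back to its POS tag 'CD' -> 4; 'yesterday' is unknown -> 0.
# Likely output: [1, 2, 4, 3, 0]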
Example #28
Source File: featurizer.py    From combine-FEVER-NSMN with MIT License
def wn_pos_tag(sent):
    sent_with_pos = nltk.pos_tag(sent)

    output = [(w, convert_to_wn_pos(p)) for (w, p) in sent_with_pos]

    return output 
Example #29
Source File: custom.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def get_entity_noun_phrase(text, entity_type):
    """
    NLTK noun phrase extractor convenience method.
    :param text:
    :param entity_type:
    :return:
    """
    p0 = 0
    p1 = None

    pos_list = nltk.pos_tag(nltk.word_tokenize(text))
    for i, _ in enumerate(pos_list):
        if i > 0:
            if pos_list[i][1] == "NNP" and pos_list[i - 1][1] != "NNP":
                p0 = i
            elif pos_list[i][1] != "NNP" and pos_list[i - 1][1] == "NNP":
                p0 = None
        if pos_list[i][0].lower() == entity_type.lower() and pos_list[i][1] == 'NNP':
            p1 = i + 1
            break

    if p0 is not None and p1 is not None:
        entity_noun_phrase = " ".join([x[0] for x in pos_list[p0:p1]])
        return entity_noun_phrase
    else:
        return None 
Example #30
Source File: word.py    From Valx with GNU General Public License v3.0
def word_pos_tagging(words):
	pos = pos_tag (words)
	return pos


# count the number of words in a sentence