Python nltk.pos_tag() Examples

The following are 30 code examples of nltk.pos_tag(), drawn from open-source projects. Each example notes its source file, originating project, and license. You may also want to check out the other available functions and classes of the nltk module.
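Before the project examples, here is a minimal, self-contained sketch of what nltk.pos_tag() expects and returns. The sentence and the exact tags shown are illustrative; output can vary slightly between NLTK versions and tagger models.

import nltk

# pos_tag() relies on a tokenizer and a tagger model; download them once if missing.
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog")
print(nltk.pos_tag(tokens))
# Roughly: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'),
#           ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]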
Example #1
Source File: math_expression_calculator.py    From JARVIS with Apache License 2.0
def text_to_num(text):
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    for subtree in chunkedData.subtrees(filter=lambda t: t.label() in "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
        print("text2num -> ", text)
    return text 
Example #2
Source File: nlp.py    From partisan-discourse with Apache License 2.0
def preprocess(html):
    """
    Returns a preprocessed document: a list of paragraphs, each of which is a
    list of sentences, each of which is a list of (token, part of speech)
    tuples.
    """
    try:
        return [
            [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ]
            for paragraph in para_tokenize(html)
        ]
    except Exception as e:
        raise NLTKError("could not preprocess text: {}".format(str(e))) 
Example #3
Source File: nltkmgr.py    From sia-cog with MIT License
def tokenize(data, language="english", filterStopWords = False, tagging = False):
    result = {}
    tags = []
    filterChars = [",", ".", "?", ";", ":", "'", "!", "@", "#", "$", "%", "&", "*", "(", ")", "+", "{", "}", "[", "]", "\\", "|"]
    sent_token = nltk.tokenize.sent_tokenize(data, language)
    word_token = nltk.tokenize.word_tokenize(data, language)
    word_token = [w for w in word_token if not w in filterChars]
    if filterStopWords is True:
        stop_words = set(stopwords.words(language))
        word_token = [w for w in word_token if not w in stop_words]

    if tagging is True:
        tags = nltk.pos_tag(word_token)

    result = {"sent_token": sent_token, "word_token": word_token, "pos_tag": tags}
    return json.loads(jsonpickle.encode(result, unpicklable=False)) 
Example #4
Source File: language_util.py    From talk-generator with MIT License
def get_last_noun_and_article(sentence):
    tokens = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(tokens)

    noun = None
    for tag in reversed(tags):
        if "NN" in tag[1]:
            if noun:
                noun = (tag[0] + " " + noun).strip()
            else:
                noun = tag[0]

        # Once a noun has been found, stop at the first non-noun tag:
        # prepend it if it is an article/possessive, otherwise return the noun as-is
        elif bool(noun):
            if "DT" in tag[1] or "PRP$" in tag[1]:
                return tag[0] + " " + noun
            return noun

    return None 
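A short usage sketch (not from the talk-generator repository) showing what get_last_noun_and_article is expected to return; the exact result depends on the tagger's output.

# Hypothetical call, assuming nltk and the function above are available.
print(get_last_noun_and_article("I saw the dog"))
# With typical Penn Treebank tags, "dog"/NN is the last noun and "the"/DT
# directly precedes it, so this returns "the dog".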
Example #5
Source File: document.py    From gender-bias with MIT License
def words_by_part_of_speech(self) -> dict:
        """
        Compute the parts of speech for each word in the document.

        Uses nltk.pos_tag.

        Returns:
            dict

        """
        words = self.words()
        tagged = nltk.pos_tag(words)
        categories = {}
        for _type in {t[1] for t in tagged}:
            categories[_type] = [t[0] for t in tagged if t[1] == _type]
        return categories 
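The grouping logic above can be tried outside the Document class with a standalone sketch; the sentence and the resulting grouping are illustrative.

import nltk

tagged = nltk.pos_tag(nltk.word_tokenize("She quickly read two long reports"))
categories = {}
for _type in {t[1] for t in tagged}:
    categories[_type] = [t[0] for t in tagged if t[1] == _type]
print(categories)
# e.g. {'PRP': ['She'], 'RB': ['quickly'], 'VBD': ['read'],
#       'CD': ['two'], 'JJ': ['long'], 'NNS': ['reports']}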
Example #6
Source File: IDAMagicStrings.py    From idamagicstrings with GNU Affero General Public License v3.0
def nltk_preprocess(strings):
  if not has_nltk:
    return

  strings = "\n".join(map(str, list(strings)))
  tokens = re.findall(FUNCTION_NAMES_REGEXP, strings)
  l = []
  for token in tokens:
    l.append(token[0])
  word_tags = nltk.pos_tag(l)
  for word, tag in word_tags:
    try:
      FOUND_TOKENS[word.lower()].add(tag)
    except KeyError:
      FOUND_TOKENS[word.lower()] = set([tag])

#------------------------------------------------------------------------------- 
Example #7
Source File: nltk_plugin.py    From self-attentive-parser with MIT License
def _nltk_process_sents(self, sents):
        for sentence in sents:
            if isinstance(sentence, STRING_TYPES):
                if self._tokenizer_lang is None:
                    raise ValueError(
                        "No word tokenizer available for this language. "
                        "Please tokenize before calling the parser."
                        )
                sentence = nltk.word_tokenize(sentence, self._tokenizer_lang)

            if IS_PY2:
                sentence = [
                    word.decode('utf-8', 'ignore') if isinstance(word, str) else word
                    for word in sentence
                    ]

            if not self._provides_tags:
                sentence = nltk.pos_tag(sentence)
                yield [word for word, tag in sentence], sentence
            else:
                yield sentence, sentence 
Example #8
Source File: combined.py    From Projects with MIT License
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text)
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos+tuple[0].lower()][0]
            neg_score += sentiwordnet[pos+tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score 
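A hedged usage sketch for evaluate_sentiment; the sentiwordnet lookup below is a made-up stand-in for the module's real table, which is keyed by a POS prefix plus the lowercased word and maps to (positive, negative) scores.

# Hypothetical scores; the real module builds this table from SentiWordNet data.
sentiwordnet = {'a/good': (0.75, 0.0), 'n/movie': (0.0, 0.0)}
print(evaluate_sentiment("A good movie"))
# Likely output: (0.75, 0.0) -- "good"/JJ maps to 'a/good', "movie"/NN to 'n/movie'.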
Example #9
Source File: sentiwordnet.py    From Projects with MIT License
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text.decode('utf8', 'ignore').replace('<br />',' '))
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos+tuple[0].lower()][0]
            neg_score += sentiwordnet[pos+tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score 
Example #10
Source File: Auto_NLP.py    From Auto_ViML with Apache License 2.0
def process_text(text):
    soup = BeautifulSoup(text, "lxml")
    tags_del = soup.get_text()
    no_html = re.sub('<[^>]*>', '', tags_del)
    tokenized = casual_tokenizer(no_html)
    lower = [item.lower() for item in tokenized]
    decontract = [expandContractions(item, c_re=c_re) for item in lower]
    tagged = nltk.pos_tag(decontract)
    lemma = lemma_wordnet(tagged)
    #no_num = [re.sub('[0-9]+', '', each) for each in lemma]
    no_punc = [w for w in lemma if w not in punc]
    no_stop = [w for w in no_punc if w not in stop_words]
    return no_stop
################################################################################################################################################################
####   THE ABOVE Process_Text section Re-used with Permission from:
####  R O B   S A L G A D O    robert.salgado@gmail.com Thank YOU!
################################################################################ 
Example #11
Source File: custom.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def extract_nnp_phrases(text):
    """
    NNP extractor convenience method.
    :param text:
    :return:
    """
    phrase_list = []

    for sentence in nltk.sent_tokenize(text):
        # Get POS
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        # Get POS
        phrase = []

        for t, p in pos:
            if p in ["NNP", "NNPS"] or t in [",", "&"]:
                phrase.append(t)
            else:
                if len(phrase) > 1:
                    phrase_list.append(clean_nnp_phrase(phrase))
                phrase = []

    return phrase_list 
Example #12
Source File: annotate.py    From serapis with MIT License
def annotate_sentence(sentence_dict, term):
    """
    Annotates a sentence object from a message with Penn Treebank POS tags.

    Args:
        sentence_dict: dict -- Must contain 's' and 's_clean', which is the
                       sentence with all occurrences of the search term
                       replaced with '_TERM_'
    Returns:
        dict -- updated sentence_dict with 'pos_tags' field.

    """
    tags = pos_tag(word_tokenize(sentence_dict['s_clean']))
    pos_tags = ['/'.join(b) for b in tags]
    sentence_dict['pos_tags'] = " ".join(pos_tags)
    sentence_dict['features'] = {}
    return sentence_dict 
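A hedged usage sketch (the input values are invented) illustrating the shape of sentence_dict before and after annotation; pos_tag and word_tokenize are assumed to be imported from nltk in the original module.

# Hypothetical input: 's_clean' already has the search term replaced.
sentence = {
    's': "A quokka is a small marsupial.",
    's_clean': "A _TERM_ is a small marsupial.",
}
annotated = annotate_sentence(sentence, "quokka")
print(annotated['pos_tags'])
# Something like: "A/DT _TERM_/NN is/VBZ a/DT small/JJ marsupial/NN ./."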
Example #13
Source File: annotate.py    From serapis with MIT License
def annotate_pos_with_term(sentence, term):
    """POS-tag single sentence while preserving _TERM_ using the original term"""
    try:
        pos_term = []

        # replace term if necessary
        if '_term_' not in sentence.lower():
            sentence_term = sentence.lower().replace(term.lower(), '_TERM_')
        else:
            sentence_term = sentence.lower()

        tok = word_tokenize(sentence_term)
        tags = pos_tag(tok)

        for tag in tags:
            if '_TERM_' in tag[0].upper():
                pos_term.append('_TERM_')
            else:
                pos_term.append(tag[1])

        return ' '.join(pos_term)
    except Exception as e:
        log.error('POS annotation error: %s', e)
        return None 
Example #14
Source File: raw_data.py    From open-sesame with Apache License 2.0
def make_data_instance(text, index):
    """
    Takes a line of text and creates a CoNLL09Example instance from it.
    """
    tokenized = nltk.tokenize.word_tokenize(text.lstrip().rstrip())
    pos_tagged = [p[1] for p in nltk.pos_tag(tokenized)]

    lemmatized = [lemmatizer.lemmatize(tokenized[i]) 
                    if not pos_tagged[i].startswith("V") else lemmatizer.lemmatize(tokenized[i], pos='v') 
                    for i in range(len(tokenized))]

    conll_lines = ["{}\t{}\t_\t{}\t_\t{}\t{}\t_\t_\t_\t_\t_\t_\t_\tO\n".format(
        i+1, tokenized[i], lemmatized[i], pos_tagged[i], index) for i in range(len(tokenized))]
    elements = [CoNLL09Element(conll_line) for conll_line in conll_lines]

    sentence = Sentence(syn_type=None, elements=elements)
    instance = CoNLL09Example(sentence, elements)

    return instance 
Example #15
Source File: sick_extender.py    From Sentence-similarity-classifier-for-pyTorch with MIT License
def line_prep(self, line):
        """ Tokenizes and POS-tags a line from the SICK corpus to be compatible with WordNet synset lookup. """
        # Split line into sentences + score
        s1, s2, sim_score = line.split('\t')
        # Tokenize
        s1_tokens = word_tokenize(s1)
        s2_tokens = word_tokenize(s2)
        # Assign part of speech tags
        s1_penn_pos = nltk.pos_tag(s1_tokens)
        s2_penn_pos = nltk.pos_tag(s2_tokens)
        # Convert to WordNet POS tags and store word position in sentence for replacement
        # Each tuple contains (word, WordNet_POS_tag, position)
        s1_wn_pos = list()
        s2_wn_pos = list()
        for idx, item in enumerate(s1_penn_pos):
            if self.get_wordnet_pos(item[1]) != 'OTHER':
                s1_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), idx))
        for idx, item in enumerate(s2_penn_pos):
            if self.get_wordnet_pos(item[1]) != 'OTHER':
                s2_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), idx))

        # Each tuple contains (word, WordNet_POS_tag, position); Source sentence provided for use in disambiguation
        return [(s1_wn_pos, s1_tokens), (s2_wn_pos, s2_tokens)], sim_score 
Example #16
Source File: preprocessing.py    From TBBTCorpus with Apache License 2.0
def __init__(self, speaker, words, scene, act_tag):
        self.speaker = speaker
        self.addresse = []
        self.topic = []
        self.words = []
        self.scene = scene
        self.act_tag = act_tag
        for token,pos  in nltk.pos_tag(words):
            self.words.append((token,pos))
        self.speaker_attribute = None 
Example #17
Source File: PipelineQ.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def extractPOS():
    while True:
        if queues[0].empty():
            break
        else:
            data = queues[0].get()
            words = data['input']
            postags = nltk.pos_tag(words)
            queues[0].task_done()
            queues[1].put({'uuid': data['uuid'], 'input': postags}, True) 
Example #18
Source File: 9.2 Email_Classification.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]

    except:
        tokens = tokens
        
    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example #19
Source File: Train3.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def buildDictionary():
    dictionary = {}
    for sent in sampleData():
        partsOfSpeechTags = nltk.pos_tag(nltk.word_tokenize(sent))
        for tag in partsOfSpeechTags:
            value = tag[0]
            pos = tag[1]
            dictionary[value] = pos
    return dictionary 
Example #20
Source File: Dictionary.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def __init__(self, sentence):
        self.words = nltk.word_tokenize(sentence)
        self.tagged = nltk.pos_tag(self.words)
        self.buildDictionary()
        self.buildReverseDictionary() 
Example #21
Source File: 9.5 Skipgram_Keras.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in
              nltk.word_tokenize(sent)]
    
    tokens = [word.lower() for word in tokens]
    
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    
    tokens = [word for word in tokens if len(word)>=3]
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)    
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    pre_proc_text =  " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])             

    return pre_proc_text 
Example #22
Source File: ContextTagger.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def getSentenceWords():
    sentwords = []
    for sentence in sentences:
        words = nltk.pos_tag(nltk.word_tokenize(sentence))
        sentwords.append(words)
    return sentwords 
Example #23
Source File: Training.py    From Natural-Language-Processing-with-Python-Cookbook with MIT License
def buildIOBTags(text):
    chunkparser = myParser()
    words = nltk.word_tokenize(text)
    postags = nltk.pos_tag(words)
    tree = chunkparser.parse(postags)
    # nltk.chunk.tree2conlltags(tree) converts the chunk tree into
    # (word, POS tag, IOB tag) triples.
    return nltk.chunk.tree2conlltags(tree) 
Example #24
Source File: __init__.py    From nltk-server with MIT License
def pos_tag(data):
	data = parse_input(data)
	if data == False:
		return ret_failure(703)
	else:
		try:
			res = nltk.pos_tag(data)
			return ret_success(res)
		except LookupError: 
			return ret_failure(704)
		except:
			return ret_failure(702) 
Example #25
Source File: keywords.py    From cornerwise with MIT License
def keywords(text):
    """
    :param text: a text string to be evaluated

    :returns: An iterable of strings containing the recognized noun phrases
    """
    tokenized = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokenized)
    return list(map(join_words, noun_phrases(tagged))) 
Example #26
Source File: data.py    From augmented_seq2seq with GNU General Public License v3.0
def split_and_tag(line):
    wtags = nltk.pos_tag(nltk.word_tokenize(line.strip()))
    words = []
    for w,t in wtags:
        if t == 'CD' or t == 'FW':
            w = t
        words.append(w)
    return words 
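A small usage sketch (not part of the repository): numbers (CD) and foreign words (FW) are replaced by their POS tag, so rare tokens collapse onto a single vocabulary symbol.

# Hypothetical call, assuming nltk and the function above are importable.
print(split_and_tag("I bought 3 apples in 2019\n"))
# Likely output: ['I', 'bought', 'CD', 'apples', 'in', 'CD']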
Example #27
Source File: data.py    From augmented_seq2seq with GNU General Public License v3.0
def encode_seq(seq, lookup):
    indices = []
    for word in seq:
        if word in lookup:
            indices.append(lookup[word])
        else:
            tag = nltk.pos_tag([word])[-1][-1]
            if tag in lookup:
                indices.append(lookup[tag])
            else:
                indices.append(lookup[UNK])
    return indices 
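A hedged sketch of how encode_seq might be driven; the lookup table and UNK marker below are hypothetical stand-ins for the module's real vocabulary and constant.

# Hypothetical vocabulary: word ids, POS-tag fallbacks, and an unknown marker.
UNK = '<unk>'  # must match the UNK constant the module actually uses
lookup = {'i': 1, 'bought': 2, 'apples': 3, 'CD': 4, UNK: 0}
print(encode_seq(['i', 'bought', '7', 'apples', 'yesterday'], lookup))
# '7' falls back to its POS tag 'CD' -> 4; 'yesterday' is unknown -> 0.
# Likely output: [1, 2, 4, 3, 0]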
Example #28
Source File: featurizer.py    From combine-FEVER-NSMN with MIT License
def wn_pos_tag(sent):
    sent_with_pos = nltk.pos_tag(sent)

    output = [(w, convert_to_wn_pos(p)) for (w, p) in sent_with_pos]

    return output 
Example #29
Source File: custom.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def get_entity_noun_phrase(text, entity_type):
    """
    NLTK noun phrase extractor convenience method.
    :param text:
    :param entity_type:
    :return:
    """
    p0 = 0
    p1 = None

    pos_list = nltk.pos_tag(nltk.word_tokenize(text))
    for i, _ in enumerate(pos_list):
        if i > 0:
            if pos_list[i][1] == "NNP" and pos_list[i - 1][1] != "NNP":
                p0 = i
            elif pos_list[i][1] != "NNP" and pos_list[i - 1][1] == "NNP":
                p0 = None
        if pos_list[i][0].lower() == entity_type.lower() and pos_list[i][1] == 'NNP':
            p1 = i + 1
            break

    if p0 is not None and p1 is not None:
        entity_noun_phrase = " ".join([x[0] for x in pos_list[p0:p1]])
        return entity_noun_phrase
    else:
        return None 
Example #30
Source File: word.py    From Valx with GNU General Public License v3.0
def word_pos_tagging(words):
	pos = pos_tag (words)
	return pos


# count the number of words in a sentence