Python nltk.pos_tag() Examples

The following are 30 code examples of nltk.pos_tag(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk, or try the search function.
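Before the project examples, here is a minimal sketch of the basic nltk.pos_tag() call on a tokenized sentence. The sample sentence and the explicit nltk.download() calls are illustrative assumptions, not taken from any of the projects below.

import nltk

# The tokenizer and tagger models are separate resources; these downloads are
# assumed here and are no-ops if the data is already installed.
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

tokens = nltk.word_tokenize("NLTK assigns a part-of-speech tag to every token.")
tagged = nltk.pos_tag(tokens)  # list of (token, Penn Treebank tag) pairs
print(tagged)
# e.g. [('NLTK', 'NNP'), ('assigns', 'VBZ'), ('a', 'DT'), ...]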
Example #1
Source File: math_expression_calculator.py From JARVIS with Apache License 2.0 | 8 votes |
def text_to_num(text):
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r"""
        Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>}
    """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    for subtree in chunkedData.subtrees(filter=lambda t: t.label() in "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
    print("text2num -> ", text)
    return text
Example #2
Source File: nlp.py From partisan-discourse with Apache License 2.0 | 6 votes |
def preprocess(html):
    """
    Returns a preprocessed document consisting of a list of paragraphs, which is a
    list of sentences, which is a list of tuples, where each tuple is a
    (token, part of speech) pair.
    """
    try:
        return [
            [
                nltk.pos_tag(nltk.wordpunct_tokenize(sent))
                for sent in nltk.sent_tokenize(paragraph)
            ]
            for paragraph in para_tokenize(html)
        ]
    except Exception as e:
        raise NLTKError("could not preprocess text: {}".format(str(e)))
Example #3
Source File: nltkmgr.py From sia-cog with MIT License | 6 votes |
def tokenize(data, language="english", filterStopWords=False, tagging=False):
    result = {}
    tags = []
    filterChars = [",", ".", "?", ";", ":", "'", "!", "@", "#", "$", "%", "&", "*",
                   "(", ")", "+", "{", "}", "[", "]", "\\", "|"]
    sent_token = nltk.tokenize.sent_tokenize(data, language)
    word_token = nltk.tokenize.word_tokenize(data, language)
    word_token = [w for w in word_token if not w in filterChars]
    if filterStopWords is True:
        stop_words = set(stopwords.words(language))
        word_token = [w for w in word_token if not w in stop_words]
    if tagging is True:
        tags = nltk.pos_tag(word_token)

    result = {"sent_token": sent_token, "word_token": word_token, "pos_tag": tags}
    return json.loads(jsonpickle.encode(result, unpicklable=False))
Example #4
Source File: language_util.py From talk-generator with MIT License | 6 votes |
def get_last_noun_and_article(sentence):
    tokens = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(tokens)

    noun = None
    for tag in reversed(tags):
        if "NN" in tag[1]:
            if noun:
                noun = (tag[0] + " " + noun).strip()
            else:
                noun = tag[0]
        # If encountering an article while there is a noun found
        elif bool(noun):
            if "DT" in tag[1] or "PRP$" in tag[1]:
                return tag[0] + " " + noun
            return noun

    return None
Example #5
Source File: document.py From gender-bias with MIT License | 6 votes |
def words_by_part_of_speech(self) -> dict:
    """
    Compute the parts of speech for each word in the document.

    Uses nltk.pos_tag.

    Returns:
        dict
    """
    words = self.words()
    tagged = nltk.pos_tag(words)
    categories = {}
    for _type in {t[1] for t in tagged}:
        categories[_type] = [t[0] for t in tagged if t[1] == _type]
    return categories
Example #6
Source File: IDAMagicStrings.py From idamagicstrings with GNU Affero General Public License v3.0 | 6 votes |
def nltk_preprocess(strings):
    if not has_nltk:
        return

    strings = "\n".join(map(str, list(strings)))
    tokens = re.findall(FUNCTION_NAMES_REGEXP, strings)
    l = []
    for token in tokens:
        l.append(token[0])
    word_tags = nltk.pos_tag(l)
    for word, tag in word_tags:
        try:
            FOUND_TOKENS[word.lower()].add(tag)
        except:
            FOUND_TOKENS[word.lower()] = set([tag])

#-------------------------------------------------------------------------------
Example #7
Source File: nltk_plugin.py From self-attentive-parser with MIT License | 6 votes |
def _nltk_process_sents(self, sents):
    for sentence in sents:
        if isinstance(sentence, STRING_TYPES):
            if self._tokenizer_lang is None:
                raise ValueError(
                    "No word tokenizer available for this language. "
                    "Please tokenize before calling the parser."
                )
            sentence = nltk.word_tokenize(sentence, self._tokenizer_lang)

        if IS_PY2:
            sentence = [
                word.decode('utf-8', 'ignore') if isinstance(word, str) else word
                for word in sentence
            ]

        if not self._provides_tags:
            sentence = nltk.pos_tag(sentence)
            yield [word for word, tag in sentence], sentence
        else:
            yield sentence, sentence
Example #8
Source File: combined.py From Projects with MIT License | 6 votes |
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text)
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos + tuple[0].lower()][0]
            neg_score += sentiwordnet[pos + tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score
Example #9
Source File: sentiwordnet.py From Projects with MIT License | 6 votes |
def evaluate_sentiment(text):
    pos_score = 0
    neg_score = 0
    tokened = nltk.word_tokenize(text.decode('utf8', 'ignore').replace('<br />', ' '))
    pos_pairs = nltk.pos_tag(tokened)
    for tuple in pos_pairs:
        pos = ''
        if tuple[1] == "NN":
            pos = 'n/'
        if tuple[1] == "JJ":
            pos = 'a/'
        if tuple[1] == "VB":
            pos = 'v/'
        if tuple[1] == "RB":
            pos = 'r/'
        try:
            pos_score += sentiwordnet[pos + tuple[0].lower()][0]
            neg_score += sentiwordnet[pos + tuple[0].lower()][1]
        except:
            pass
    return pos_score, neg_score
Example #10
Source File: Auto_NLP.py From Auto_ViML with Apache License 2.0 | 6 votes |
def process_text(text):
    soup = BeautifulSoup(text, "lxml")
    tags_del = soup.get_text()
    no_html = re.sub('<[^>]*>', '', tags_del)
    tokenized = casual_tokenizer(no_html)
    lower = [item.lower() for item in tokenized]
    decontract = [expandContractions(item, c_re=c_re) for item in lower]
    tagged = nltk.pos_tag(decontract)
    lemma = lemma_wordnet(tagged)
    #no_num = [re.sub('[0-9]+', '', each) for each in lemma]
    no_punc = [w for w in lemma if w not in punc]
    no_stop = [w for w in no_punc if w not in stop_words]
    return no_stop
################################################################################
#### THE ABOVE process_text section re-used with permission from:
#### R O B   S A L G A D O   robert.salgado@gmail.com   Thank YOU!
################################################################################
Example #11
Source File: custom.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 6 votes |
def extract_nnp_phrases(text):
    """
    NNP extractor convenience method.
    :param text:
    :return:
    """
    phrase_list = []

    for sentence in nltk.sent_tokenize(text):
        # Get POS
        tokens = nltk.word_tokenize(sentence)
        pos = nltk.pos_tag(tokens)

        # Get POS
        phrase = []

        for t, p in pos:
            if p in ["NNP", "NNPS"] or t in [",", "&"]:
                phrase.append(t)
            else:
                if len(phrase) > 1:
                    phrase_list.append(clean_nnp_phrase(phrase))
                phrase = []

    return phrase_list
Example #12
Source File: annotate.py From serapis with MIT License | 6 votes |
def annotate_sentence(sentence_dict, term):
    """Annotates a sentence object from a message with Penn Treebank POS tags.

    Args:
        sentence_dict: dict -- Must contain 's' and 's_clean', which is the sentence
            with all occurrences of the search term replaced with '_TERM-'

    Returns:
        dict -- updated sentence_dict with 'pos_tags' field.
    """
    tags = pos_tag(word_tokenize(sentence_dict['s_clean']))
    pos_tags = ['/'.join(b) for b in tags]
    sentence_dict['pos_tags'] = " ".join(pos_tags)
    sentence_dict['features'] = {}
    return sentence_dict
Example #13
Source File: annotate.py From serapis with MIT License | 6 votes |
def annotate_pos_with_term(sentence, term):
    """POS-tag single sentence while preserving _TERM_ using the original term"""
    try:
        pos_term = []
        # replace term if necessary
        if '_term_' not in sentence.lower():
            sentence_term = sentence.lower().replace(term.lower(), '_TERM_')
        else:
            sentence_term = sentence.lower()

        tok = word_tokenize(sentence_term)
        tags = pos_tag(tok)
        for tag in tags:
            if '_TERM_' in tag[0].upper():
                pos_term.append('_TERM_')
            else:
                pos_term.append(tag[1])
        return ' '.join(pos_term)
    except Exception, e:
        log.error('POS annotation error: %s', e)
        return None
Example #14
Source File: raw_data.py From open-sesame with Apache License 2.0 | 6 votes |
def make_data_instance(text, index):
    """
    Takes a line of text and creates a CoNLL09Example instance from it.
    """
    tokenized = nltk.tokenize.word_tokenize(text.lstrip().rstrip())
    pos_tagged = [p[1] for p in nltk.pos_tag(tokenized)]

    lemmatized = [lemmatizer.lemmatize(tokenized[i])
                  if not pos_tagged[i].startswith("V")
                  else lemmatizer.lemmatize(tokenized[i], pos='v')
                  for i in range(len(tokenized))]

    conll_lines = ["{}\t{}\t_\t{}\t_\t{}\t{}\t_\t_\t_\t_\t_\t_\t_\tO\n".format(
        i + 1, tokenized[i], lemmatized[i], pos_tagged[i], index)
        for i in range(len(tokenized))]
    elements = [CoNLL09Element(conll_line) for conll_line in conll_lines]

    sentence = Sentence(syn_type=None, elements=elements)
    instance = CoNLL09Example(sentence, elements)

    return instance
Example #15
Source File: sick_extender.py From Sentence-similarity-classifier-for-pyTorch with MIT License | 6 votes |
def line_prep(self, line):
    """ Tokenizes and POS-tags a line from the SICK corpus to be compatible with WordNet synset lookup. """
    # Split line into sentences + score
    s1, s2, sim_score = line.split('\t')
    # Tokenize
    s1_tokens = word_tokenize(s1)
    s2_tokens = word_tokenize(s2)
    # Assign part of speech tags
    s1_penn_pos = nltk.pos_tag(s1_tokens)
    s2_penn_pos = nltk.pos_tag(s2_tokens)
    # Convert to WordNet POS tags and store word position in sentence for replacement
    # Each tuple contains (word, WordNet_POS_tag, position)
    s1_wn_pos = list()
    s2_wn_pos = list()
    for idx, item in enumerate(s1_penn_pos):
        if self.get_wordnet_pos(item[1]) != 'OTHER':
            s1_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), s1_penn_pos.index(item)))
    for idx, item in enumerate(s2_penn_pos):
        if self.get_wordnet_pos(item[1]) != 'OTHER':
            s2_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), s2_penn_pos.index(item)))

    # Each tuple contains (word, WordNet_POS_tag, position); Source sentence provided for use in disambiguation
    return [(s1_wn_pos, s1_tokens), (s2_wn_pos, s2_tokens)], sim_score
Example #16
Source File: preprocessing.py From TBBTCorpus with Apache License 2.0 | 5 votes |
def __init__(self, speaker, words, scene, act_tag):
    self.speaker = speaker
    self.addresse = []
    self.topic = []
    self.words = []
    self.scene = scene
    self.act_tag = act_tag
    for token, pos in nltk.pos_tag(words):
        self.words.append((token, pos))
    self.speaker_attribute = None
Example #17
Source File: PipelineQ.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def extractPOS():
    while True:
        if queues[0].empty():
            break
        else:
            data = queues[0].get()
            words = data['input']
            postags = nltk.pos_tag(words)
            queues[0].task_done()
            queues[1].put({'uuid': data['uuid'], 'input': postags}, True)
Example #18
Source File: 9.2 Email_Classification.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]

    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]

    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    try:
        tokens = [stemmer.stem(word) for word in tokens]
    except:
        tokens = tokens

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text
Example #19
Source File: Train3.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def buildDictionary():
    dictionary = {}
    for sent in sampleData():
        partsOfSpeechTags = nltk.pos_tag(nltk.word_tokenize(sent))
        for tag in partsOfSpeechTags:
            value = tag[0]
            pos = tag[1]
            dictionary[value] = pos
    return dictionary
Example #20
Source File: Dictionary.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def __init__(self, sentence):
    self.words = nltk.word_tokenize(sentence)
    self.tagged = nltk.pos_tag(self.words)
    self.buildDictionary()
    self.buildReverseDictionary()
Example #21
Source File: 9.5 Skipgram_Keras.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def preprocessing(text):
    text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())

    tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]

    tokens = [word.lower() for word in tokens]

    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]

    tokens = [word for word in tokens if len(word) >= 3]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    tagged_corpus = pos_tag(tokens)

    Noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    Verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()

    def prat_lemmatize(token, tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')

    pre_proc_text = " ".join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])

    return pre_proc_text
Example #22
Source File: ContextTagger.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def getSentenceWords():
    sentwords = []
    for sentence in sentences:
        words = nltk.pos_tag(nltk.word_tokenize(sentence))
        sentwords.append(words)
    return sentwords
Example #23
Source File: Training.py From Natural-Language-Processing-with-Python-Cookbook with MIT License | 5 votes |
def buildIOBTags(text):
    chunkparser = myParser()
    words = nltk.word_tokenize(text)
    postags = nltk.pos_tag(words)
    tree = chunkparser.parse(postags)
    # This whole thing can be replaced by the
    # nltk.chunk.tree2conlltags(tree) function,
    # which returns 3-tuples
    return nltk.chunk.tree2conlltags(tree)
Example #24
Source File: __init__.py From nltk-server with MIT License | 5 votes |
def pos_tag(data):
    data = parse_input(data)
    if data == False:
        return ret_failure(703)
    else:
        try:
            res = nltk.pos_tag(data)
            return ret_success(res)
        except LookupError:
            return ret_failure(704)
        except:
            return ret_failure(702)
Example #25
Source File: keywords.py From cornerwise with MIT License | 5 votes |
def keywords(text):
    """
    :param text: a text string to be evaluated

    :returns: An iterable of strings containing the recognized
    """
    tokenized = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokenized)

    return list(map(join_words, noun_phrases(tagged)))
Example #26
Source File: data.py From augmented_seq2seq with GNU General Public License v3.0 | 5 votes |
def split_and_tag(line):
    wtags = nltk.pos_tag(nltk.word_tokenize(line.strip()))
    words = []
    for w, t in wtags:
        if t == 'CD' or t == 'FW':
            w = t
        words.append(w)
    return words
Example #27
Source File: data.py From augmented_seq2seq with GNU General Public License v3.0 | 5 votes |
def encode_seq(seq, lookup):
    indices = []
    for word in seq:
        if word in lookup:
            indices.append(lookup[word])
        else:
            tag = nltk.pos_tag([word])[-1][-1]
            if tag in lookup:
                indices.append(lookup[tag])
            else:
                indices.append(lookup[UNK])
    return indices
Example #28
Source File: featurizer.py From combine-FEVER-NSMN with MIT License | 5 votes |
def wn_pos_tag(sent):
    sent_with_pos = nltk.pos_tag(sent)
    output = [(w, convert_to_wn_pos(p)) for (w, p) in sent_with_pos]
    return output
Example #29
Source File: custom.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 5 votes |
def get_entity_noun_phrase(text, entity_type):
    """
    NLTK noun phrase extractor convenience method.
    :param text:
    :param entity_type:
    :return:
    """
    p0 = 0
    p1 = None
    pos_list = nltk.pos_tag(nltk.word_tokenize(text))
    for i, _ in enumerate(pos_list):
        if i > 0:
            if pos_list[i][1] == "NNP" and pos_list[i - 1][1] != "NNP":
                p0 = i
            elif pos_list[i][1] != "NNP" and pos_list[i - 1][1] == "NNP":
                p0 = None
        if pos_list[i][0].lower() == entity_type.lower() and pos_list[i][1] == 'NNP':
            p1 = i + 1
            break

    if p0 is not None and p1 is not None:
        entity_noun_phrase = " ".join([x[0] for x in pos_list[p0:p1]])
        return entity_noun_phrase
    else:
        return None
Example #30
Source File: word.py From Valx with GNU General Public License v3.0 | 5 votes |
def word_pos_tagging(words):
    pos = pos_tag(words)
    return pos


# counting the words number for a sentence