Python nltk.tag() Examples
The following are 30 code examples of nltk.tag(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk, or try the search function.
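Before the examples, here is a minimal, self-contained sketch of the most common entry point of the nltk.tag module, nltk.pos_tag. The sentence and the download calls are illustrative only; the exact resource names can differ slightly between NLTK versions.

import nltk

# One-time downloads of the sentence/word tokenizer and the default
# averaged-perceptron POS tagger model (names may vary across NLTK versions).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

tokens = nltk.word_tokenize("NLTK assigns a part-of-speech tag to every token.")
print(nltk.pos_tag(tokens))
# e.g. [('NLTK', 'NNP'), ('assigns', 'VBZ'), ('a', 'DT'), ...]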
Example #1
Source File: glue.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def get_semtypes(self, node):
    """
    Based on the node, return a list of plausible semtypes in order of
    plausibility.
    """
    rel = node['rel'].lower()
    word = node['word'].lower()

    if rel == 'spec':
        if word in SPEC_SEMTYPES:
            return [SPEC_SEMTYPES[word]]
        else:
            return [SPEC_SEMTYPES['default']]
    elif rel in ['nmod', 'vmod']:
        return [node['tag'], rel]
    else:
        return [node['tag']]
Example #2
Source File: phrasemachine.py From phrasemachine with MIT License | 6 votes |
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer

    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))

    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)

# http://www.nltk.org/book/ch05.html
Example #3
Source File: phrasemachine.py From phrasemachine with MIT License | 6 votes |
def extract_JK(pos_seq):
    """The 'JK' method in Handler et al. 2016.
    Returns token positions of valid ngrams."""

    def find_ngrams(input_list, num_):
        '''get ngrams of len n from input list'''
        return zip(*[input_list[i:] for i in range(num_)])

    # copied from M and S chp 5
    patterns = set(['AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'])
    pos_seq = [tag2coarse.get(tag, 'O') for tag in pos_seq]
    pos_seq = [(i, p) for i, p in enumerate(pos_seq)]
    ngrams = [ngram for n in range(1, 4) for ngram in find_ngrams(pos_seq, n)]

    def stringify(s):
        return "".join(a[1] for a in s)

    def positionify(s):
        return tuple(a[0] for a in s)

    ngrams = filter(lambda x: stringify(x) in patterns, ngrams)
    return [set(positionify(n)) for n in ngrams]

########
Example #4
Source File: entity_recognizer_mod.py From nlp-services with MIT License | 6 votes |
def bio_tagger(self, ne_tagged):
    bio_tagged = []
    prev_tag = "O"
    for token, tag in ne_tagged:
        if tag == "O":  # O
            bio_tagged.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O":  # Begin NE
            bio_tagged.append((token, "B-" + tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag:  # Inside NE
            bio_tagged.append((token, "I-" + tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag:  # Adjacent NE
            bio_tagged.append((token, "B-" + tag))
            prev_tag = tag
    return bio_tagged

# Create tree
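As a quick illustration, a minimal sketch of what this conversion does; the input below is hypothetical and only shaped like the (token, NE-tag) pairs this project produces:

# Hypothetical flat NE tagging, with "O" meaning "outside any entity".
ne_tagged = [("Barack", "PERSON"), ("Obama", "PERSON"), ("visited", "O"), ("Paris", "GPE")]
# bio_tagger(ne_tagged) would then return:
# [("Barack", "B-PERSON"), ("Obama", "I-PERSON"), ("visited", "O"), ("Paris", "B-GPE")]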
Example #5
Source File: glue.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def get_semtypes(self, node):
    """
    Based on the node, return a list of plausible semtypes in order of
    plausibility.
    """
    semtype_name = None

    rel = node['rel'].lower()
    word = node['word'].lower()

    if rel == 'spec':
        if word in SPEC_SEMTYPES:
            return [SPEC_SEMTYPES[word]]
        else:
            return [SPEC_SEMTYPES['default']]
    elif rel in ['nmod', 'vmod']:
        return [node['tag'], rel]
    else:
        return [node['tag']]
Example #6
Source File: glue.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def lookup(self, node, depgraph, counter):
    semtype_names = self.get_semtypes(node)

    semtype = None
    for name in semtype_names:
        if name in self:
            semtype = self[name]
            break
    if semtype is None:
        # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
        return []

    self.add_missing_dependencies(node, depgraph)

    lookup = self._lookup_semtype_option(semtype, node, depgraph)

    if not len(lookup):
        raise KeyError, "There is no GlueDict entry for sem type of '%s'"\
                        " with tag '%s', and rel '%s'" %\
                        (node['word'], node['tag'], node['rel'])

    return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
Example #7
Source File: IDAMagicStrings.py From idamagicstrings with GNU Affero General Public License v3.0 | 6 votes |
def nltk_preprocess(strings):
    if not has_nltk:
        return

    strings = "\n".join(map(str, list(strings)))
    tokens = re.findall(FUNCTION_NAMES_REGEXP, strings)
    l = []
    for token in tokens:
        l.append(token[0])
    word_tags = nltk.pos_tag(l)
    for word, tag in word_tags:
        try:
            FOUND_TOKENS[word.lower()].add(tag)
        except:
            FOUND_TOKENS[word.lower()] = set([tag])

#-------------------------------------------------------------------------------
Example #8
Source File: EnglishPOSTagger.py From NeuronBlocks with MIT License | 6 votes |
def postag_multi(self, multi_sentence):
    """ tag multiple sentences one time
    RECOMMAND! Because the efficiency of stanford pos tagger in NLTK is too slow.
    Args:
        multi_sentence: [[token1, token2], ..., [...]]
    Returns:
    """
    #word_pos_pairs_multi_sent = self.eng_tagger.tag_sents(multi_sentence)
    '''
    word_pos_pairs_multi_sent = pos_tag_sents(multi_sentence)
    pos_lists = []
    for word_pos_pairs in word_pos_pairs_multi_sent:
        pos_lists.append([pos for (word, pos) in word_pos_pairs])
    return pos_lists
    '''
    return [self.postag(sent) for sent in multi_sentence]
Example #9
Source File: glue.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def lookup(self, node, depgraph, counter):
    semtype_names = self.get_semtypes(node)

    semtype = None
    for name in semtype_names:
        if name in self:
            semtype = self[name]
            break
    if semtype is None:
        # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
        return []

    self.add_missing_dependencies(node, depgraph)

    lookup = self._lookup_semtype_option(semtype, node, depgraph)

    if not len(lookup):
        raise KeyError(
            "There is no GlueDict entry for sem type of '%s' "
            "with tag '%s', and rel '%s'"
            % (node['word'], node['tag'], node['rel'])
        )

    return self.get_glueformulas_from_semtype_entry(
        lookup, node['word'], node, depgraph, counter
    )
Example #10
Source File: phrasemachine.py From scattertext with Apache License 2.0 | 6 votes |
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    #return pkgutil.get_data('scattertext',
    #                        'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    #tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    #tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))

    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)

    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer like this allows skipping the downloader.
    # It seems the TreebankWordTokenizer uses PTB tokenization = regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)

# http://www.nltk.org/book/ch05.html
Example #11
Source File: phrasemachine.py From scattertext with Apache License 2.0 | 6 votes |
def extract_JK(pos_seq):
    """The 'JK' method in Handler et al. 2016.
    Returns token positions of valid ngrams."""

    def find_ngrams(input_list, num_):
        '''get ngrams of len n from input list'''
        return zip(*[input_list[i:] for i in range(num_)])

    # copied from M and S chp 5
    patterns = set(['AN', 'NN', 'AAN', 'ANN', 'NAN', 'NNN', 'NPN'])
    pos_seq = [tag2coarse.get(tag, 'O') for tag in pos_seq]
    pos_seq = [(i, p) for i, p in enumerate(pos_seq)]
    ngrams = [ngram for n in range(1, 4) for ngram in find_ngrams(pos_seq, n)]

    def stringify(s):
        return "".join(a[1] for a in s)

    def positionify(s):
        return tuple(a[0] for a in s)

    ngrams = filter(lambda x: stringify(x) in patterns, ngrams)
    return [set(positionify(n)) for n in ngrams]

########
Example #12
Source File: glue.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def get_semtypes(self, node):
    """
    Based on the node, return a list of plausible semtypes in order of
    plausibility.
    """
    rel = node['rel'].lower()
    word = node['word'].lower()

    if rel == 'spec':
        if word in SPEC_SEMTYPES:
            return [SPEC_SEMTYPES[word]]
        else:
            return [SPEC_SEMTYPES['default']]
    elif rel in ['nmod', 'vmod']:
        return [node['tag'], rel]
    else:
        return [node['tag']]
Example #13
Source File: glue.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def lookup(self, node, depgraph, counter):
    semtype_names = self.get_semtypes(node)

    semtype = None
    for name in semtype_names:
        if name in self:
            semtype = self[name]
            break
    if semtype is None:
        # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word)
        return []

    self.add_missing_dependencies(node, depgraph)

    lookup = self._lookup_semtype_option(semtype, node, depgraph)

    if not len(lookup):
        raise KeyError(
            "There is no GlueDict entry for sem type of '%s' "
            "with tag '%s', and rel '%s'"
            % (node['word'], node['tag'], node['rel'])
        )

    return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter)
Example #14
Source File: data_processing.py From Densely-Interactive-Inference-Network with Apache License 2.0 | 5 votes |
def generate_pos_feature_tensor(parses, left_padding_and_cropping_pairs):
    pos_vectors = []
    for parse in parses:
        pos = parsing_parse(parse)
        pos_vector = [(idx, POS_dict.get(tag, 0)) for idx, tag in enumerate(pos)]
        pos_vectors.append(pos_vector)

    return construct_one_hot_feature_tensor(pos_vectors, left_padding_and_cropping_pairs, 2, column_size=len(POS_Tagging))
Example #15
Source File: data_processing.py From Densely-Interactive-Inference-Network with Apache License 2.0 | 5 votes |
def parse_to_pos_vector(parse, left_padding_and_cropping_pair=(0, 0)):  # ONE HOT
    pos = parsing_parse(parse)
    pos_vector = [POS_dict.get(tag, 0) for tag in pos]
    left_padding, left_cropping = left_padding_and_cropping_pair
    vector = np.zeros((FIXED_PARAMETERS["seq_length"], len(POS_Tagging)))
    assert left_padding == 0 or left_cropping == 0
    for i in range(FIXED_PARAMETERS["seq_length"]):
        if i < len(pos_vector):
            vector[i + left_padding, pos_vector[i + left_cropping]] = 1
        else:
            break
    return vector
Example #16
Source File: data_processing.py From Densely-Interactive-Inference-Network with Apache License 2.0 | 5 votes |
def generate_quora_pos_feature_tensor(parses, left_padding_and_cropping_pairs):
    pos_vectors = []
    for parse in parses:
        pos = parse.split()
        pos_vector = [(idx, POS_dict.get(tag, 0)) for idx, tag in enumerate(pos)]
        pos_vectors.append(pos_vector)

    return construct_one_hot_feature_tensor(pos_vectors, left_padding_and_cropping_pairs, 2, column_size=len(POS_Tagging))
Example #17
Source File: EnglishPOSTagger.py From NeuronBlocks with MIT License | 5 votes |
def postag(self, word_list):
    """
    Args:
        word_list: word list
    Returns:
        pos tag list
    """
    #word_pos_pairs = self.eng_tagger.tag(word_list)
    #word_pos_pairs = pos_tag(word_list)
    word_pos_pairs = nltk.tag._pos_tag(word_list, None, self.eng_tagger)
    pos_list = [pos for (word, pos) in word_pos_pairs]
    return pos_list
Example #18
Source File: paraphrase.py From textfool with MIT License | 5 votes |
def _synonym_prefilter_fn(token, synonym):
    '''
    Similarity heuristics go here
    '''
    if (len(synonym.text.split()) > 2) or \
            (synonym.lemma == token.lemma) or \
            (synonym.tag != token.tag) or \
            (token.text.lower() == 'be'):
        return False
    else:
        return True
Example #19
Source File: paraphrase.py From textfool with MIT License | 5 votes |
def _get_wordnet_pos(spacy_token):
    '''Wordnet POS tag'''
    pos = spacy_token.tag_[0].lower()
    if pos in ['a', 'n', 'v']:
        return pos
Example #20
Source File: phrasemachine.py From phrasemachine with MIT License | 5 votes |
def tag_tokens(self, tokens):
    word_pos_pairs = self.tagger.tag(tokens)
    return {'tokens': tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}
Example #21
Source File: phrasemachine.py From phrasemachine with MIT License | 5 votes |
def tag_text(self, text):
    '''take input text and return tokens w/ part of speech tags using NLTK'''
    # putting import here instead of top of file b.c. not all will have nltk installed
    sents = self.sent_detector.tokenize(text)  # TODO: this will fail on some unicode chars. I think assumes ascii
    word_pos_pairs = []
    all_tokens = []
    for sent in sents:
        tokens = self.tokenize(sent)
        all_tokens = all_tokens + tokens
        word_pos_pairs = word_pos_pairs + self.tagger.tag(tokens)
    return {'tokens': all_tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}
Example #22
Source File: relextract.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.

    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return sep.join(lst)
    except TypeError:
        if untag:
            return sep.join(tup[0] for tup in lst)
        from nltk.tag import tuple2str
        return sep.join(tuple2str(tup) for tup in lst)
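For reference, nltk.tag.tuple2str (imported in the fallback branch above) renders a (word, tag) pair in the conventional slash notation, which is what _join produces for tagged input when untag is False; a minimal sketch:

from nltk.tag import tuple2str

print(tuple2str(('book', 'NN')))  # -> 'book/NN'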
Example #23
Source File: phrasemachine.py From phrasemachine with MIT License | 5 votes |
def coarse_tag_str(pos_seq):
    """Convert POS sequence to our coarse system, formatted as a string."""
    global tag2coarse
    tags = [tag2coarse.get(tag, 'O') for tag in pos_seq]
    return ''.join(tags)

# POS extraction assuming list of POS tags as input.
# >>> pyre.extract_finditer(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 4)]
# >>> pyre.extract_ngram_filter(["VB","JJ","NN","NN","QQ","QQ",])
# [(1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
Example #24
Source File: entity_recognizer_mod.py From nlp-services with MIT License | 5 votes |
def stanford_tagger(self, token_text):
    st = StanfordNERTagger(self.english_model, self.stanford_jar, encoding='utf-8')
    ne_tagged = st.tag(token_text)
    return (ne_tagged)

# NLTK POS and NER taggers
Example #25
Source File: relextract.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def _join(lst, sep=' ', untag=False):
    """
    Join a list into a string, turning tags tuples into tag strings or just words.

    :param untag: if ``True``, omit the tag from tagged input strings.
    :type lst: list
    :rtype: str
    """
    try:
        return join(lst, sep=sep)
    except TypeError:
        if untag:
            return join([tup[0] for tup in lst], sep=sep)
        from nltk.tag import tuple2str
        return join([tuple2str(tup) for tup in lst], sep=sep)
Example #26
Source File: address_extract.py From address_extraction with MIT License | 5 votes |
def parse(self, tagged_sent):
    chunks = self.tagger.tag(tagged_sent)

    # Transform the result from [((w1, t1), iob1), ...]
    # to the preferred list of triplets format [(w1, t1, iob1), ...]
    iob_triplets = [(w, t, c) for ((w, t), c) in chunks]

    # Transform the list of triplets to nltk.Tree format
    return conlltags2tree(iob_triplets)
Example #27
Source File: keyphrase_test_dataset.py From seq2seq-keyphrase with MIT License | 5 votes |
def get_postag_with_index(sources, idx2word, word2idx):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx in xrange(len(sources)):  # len(test_data_plain)
        test_s_o = sources[idx]
        source_text = keyphrase_utils.cut_zero(test_s_o, idx2word)
        text = pos_tagger.tag(source_text)
        print('[%d/%d] : %s' % (idx, len(sources), str(text)))

        tagged_source.append(text)

    return tagged_source
Example #28
Source File: keyphrase_test_dataset.py From seq2seq-keyphrase with MIT License | 5 votes |
def get_postag_with_record(records, pairs):
    path = os.path.dirname(__file__)
    path = path[:path.rfind(os.sep, 0, len(path) - 10) + 1] + 'stanford-postagger/'
    print(path)
    # jar = '/Users/memray/Project/stanford/stanford-postagger/stanford-postagger.jar'
    jar = path + '/stanford-postagger.jar'
    model = path + '/models/english-bidirectional-distsim.tagger'
    pos_tagger = StanfordPOSTagger(model, jar)
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-left3words-distsim.tagger'
    # model = '/Users/memray/Project/stanford/stanford-postagger/models/english-bidirectional-distsim.tagger'

    stanford_dir = jar.rpartition('/')[0]
    stanford_jars = find_jars_within_path(stanford_dir)
    pos_tagger._stanford_jar = ':'.join(stanford_jars)

    tagged_source = []
    # Predict on testing data
    for idx, (record, pair) in enumerate(zip(records, pairs)):  # len(test_data_plain)
        print('*' * 100)
        print('File: ' + record['name'])
        print('Input: ' + str(pair[0]))
        text = pos_tagger.tag(pair[0])
        print('[%d/%d][%d] : %s' % (idx, len(records), len(pair[0]), str(text)))

        tagged_source.append(text)

    return tagged_source
Example #29
Source File: keyphrase_test_dataset.py From seq2seq-keyphrase with MIT License | 5 votes |
def load_xml(self, xmldir):
    '''
    for KDD/WWW/UMD only
    :return: doclist
    '''
    for filename in os.listdir(xmldir):
        with open(xmldir + filename) as textfile:
            doc = Document()
            doc.name = filename[:filename.find('.xml')]

            import string
            printable = set(string.printable)

            # print((filename))
            try:
                lines = textfile.readlines()
                xml = ''.join([filter(lambda x: x in printable, l) for l in lines])
                root = ET.fromstring(xml)

                doc.title = root.findall("title")[0].text
                doc.abstract = root.findall("abstract")[0].text
                doc.phrases = [n.text for n in root.findall("*/tag")]

                self.doclist.append(doc)

            except UnicodeDecodeError:
                print('UnicodeDecodeError detected! %s' % filename)
Example #30
Source File: phrasemachine.py From scattertext with Apache License 2.0 | 5 votes |
def tag_tokens(self, tokens):
    word_pos_pairs = self.tagger.tag(tokens)
    return {'tokens': tokens, 'pos': [tag for (w, tag) in word_pos_pairs]}