Python nltk.chunk() Examples
The following are 6 code examples of nltk.chunk(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk, or try the search function.
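Before the project examples, here is a minimal, self-contained sketch of what the nltk.chunk module is typically used for; the grammar and the sentence are illustrative only and are not taken from any of the examples below.

import nltk

# Tokenize and POS-tag a sentence (requires the punkt and
# averaged_perceptron_tagger data packages).
sentence = "The little yellow dog barked at the cat"
tagged = nltk.pos_tag(nltk.word_tokenize(sentence))

# Chunk determiner + adjectives + noun sequences into NP phrases.
grammar = "NP: {<DT>?<JJ>*<NN>}"
parser = nltk.RegexpParser(grammar)      # implemented in nltk.chunk.regexp
tree = parser.parse(tagged)              # an nltk.Tree with NP subtrees

# Flatten the tree back into (word, pos, IOB-tag) triples.
print(nltk.chunk.tree2conlltags(tree))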
Example #1
Source File: transformer.py From atap with Apache License 2.0 | 6 votes |
def extract_candidate_phrases(self, sents):
    """
    For a document, parse sentences using our chunker created by our grammar,
    converting the parse tree into a tagged sequence. Extract phrases, rejoin
    with a space, and yield the document represented as a list of its
    keyphrases.
    """
    for sent in sents:
        sent = self.normalize(sent)
        if not sent:
            continue

        chunks = tree2conlltags(self.chunker.parse(sent))
        phrases = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            )
            if key
        ]
        for phrase in phrases:
            yield phrase
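Example #1 depends on a chunker and a normalize method defined elsewhere in its transformer class. A rough standalone sketch of the same tree2conlltags/groupby idiom, using a hypothetical keyphrase grammar and an untagged input sentence, might look like this:

from itertools import groupby

import nltk
from nltk.chunk import tree2conlltags

# Hypothetical keyphrase grammar; the project defines its own grammar elsewhere.
GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
chunker = nltk.RegexpParser(GRAMMAR)

# Requires the punkt and averaged_perceptron_tagger data packages.
sent = nltk.pos_tag(nltk.word_tokenize("minimum viable product for natural language processing"))
chunks = tree2conlltags(chunker.parse(sent))

# Group consecutive tokens whose IOB tag is not 'O' into candidate phrases.
phrases = [
    " ".join(word for word, pos, chunk in group).lower()
    for key, group in groupby(chunks, lambda term: term[-1] != 'O')
    if key
]
print(phrases)  # list of lower-cased candidate keyphrases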
Example #2
Source File: token_merger.py From Parsivar with MIT License | 5 votes |
def train_merger(self, train_file_path, test_split=0.1):
    print("Loading Data...")
    file = open(train_file_path, "r", encoding='utf-8')
    file_content = file.read()
    file_content = file_content.split("\n\n")

    data_list = []
    for line in file_content:
        line = nltk.chunk.util.conllstr2tree(line, chunk_types=('NP',), root_label='S')
        if (len(line) > 0):
            data_list.append(line)

    # train_sents, test_sents = train_test_split(data_list, test_size=test_split, random_state=91)
    train_sents = data_list
    test_sents = []

    print("Training the model ...")
    # Transform the trees in IOB annotated sentences [(word, pos, chunk), ...]
    chunked_sents = [tree2conlltags(sent) for sent in train_sents]

    # Transform the triplets in pairs, make it compatible with the tagger interface [((word, pos), chunk), ...]
    def triplets2tagged_pairs(iob_sent):
        return [((word, pos), chunk) for word, pos, chunk in iob_sent]
    chunked_sents = [triplets2tagged_pairs(sent) for sent in chunked_sents]

    self.feature_detector = self.features
    self.tagger = ClassifierBasedTagger(
        train=chunked_sents,
        feature_detector=self.features)
    token_merger_model = self.tagger

    if len(test_sents) > 0:
        print("evaluating...")
        print(token_merger_model.evaluate(test_sents))

    return token_merger_model
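The training data above is plain CoNLL-style text converted to trees with nltk.chunk.util.conllstr2tree. A small illustration of that conversion on a toy English sentence (not from the Parsivar data):

import nltk

conll_text = """he PRP B-NP
accepted VBD O
the DT B-NP
position NN I-NP"""

tree = nltk.chunk.util.conllstr2tree(conll_text, chunk_types=('NP',), root_label='S')
print(tree)  # roughly: (S (NP he/PRP) accepted/VBD (NP the/DT position/NN))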
Example #3
Source File: entity_recognizer_mod.py From nlp-services with MIT License | 5 votes |
def structure_ne(self, ne_tree):
    ne = []
    for subtree in ne_tree:
        if type(subtree) == Tree:  # If subtree is a noun chunk, i.e. NE != "O"
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            ne.append((ne_string, ne_label))
    return ne

# Nltk Named Entity Recognizer
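A hedged sketch of how a tree like ne_tree is typically produced and consumed, using NLTK's stock named entity chunker; the sentence and the expected output are illustrative only.

import nltk
from nltk import Tree

# Requires the maxent_ne_chunker and words data packages.
tagged = nltk.pos_tag(nltk.word_tokenize("Ada Lovelace worked with Charles Babbage in London"))
ne_tree = nltk.ne_chunk(tagged)

entities = []
for subtree in ne_tree:
    if isinstance(subtree, Tree):  # named entities are subtrees; other tokens are (word, pos) tuples
        label = subtree.label()
        string = " ".join(token for token, pos in subtree.leaves())
        entities.append((string, label))
print(entities)  # e.g. [('Ada Lovelace', 'PERSON'), ('Charles Babbage', 'PERSON'), ('London', 'GPE')]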
Example #4
Source File: do_benchmark.py From PyRATA with Apache License 2.0 | 5 votes |
def measure_pattern_time_v2(iteration_number, size, pattern):
    gw = execnet.makegateway("popen//python=python2.7")
    channel = gw.remote_exec("""
from nltk.corpus import brown
words = brown.words()[:%s]
text = ' '.join(words)

from pattern.en import parsetree
text_tree = parsetree(text,
    tokenize = True,      # Split punctuation marks from words?
    tags = True,          # Parse part-of-speech tags? (NN, JJ, ...)
    chunks = False,       # Parse chunks? (NP, VP, PNP, ...)
    relations = False,    # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata = False,      # Parse lemmata? (ate => eat)
    encoding = 'utf-8',   # Input string encoding.
    tagset = None)        # Penn Treebank II (default) or UNIVERSAL.

from pattern.search import search

def measure_pattern_search():
    global pattern_search_result  # Make measure_me able to modify the value
    pattern_search_result = search("%s", text_tree)
    # print ("clip.pattern len(result)="+str(len(pattern_search_result)))

from timeit import Timer
pattern_search_time = Timer(measure_pattern_search)
# print ('pattern_search_time')

def pattern_search_timeit():
    runtimes = [pattern_search_time.timeit(number=1) for i in range(0, %s)]
    average = sum(runtimes)/len(runtimes)
    # return ''.join(['timit: #runs=', str(%s), ' ; average=', str(average),' ; min=', str(min(runtimes))])
    return [runtimes, average, min(runtimes), len(pattern_search_result)]

channel.send(pattern_search_timeit())
""" % (size, pattern, iteration_number, iteration_number))
    channel.send([])
    return channel.receive()
Example #5
Source File: do_benchmark.py From PyRATA with Apache License 2.0 | 5 votes |
def write_pattern_v2(iteration_number, size, pattern):
    gw = execnet.makegateway("popen//python=python2.7")
    channel = gw.remote_exec("""
from nltk.corpus import brown
size = %s
words = brown.words()[:size]
text = ' '.join(words)

from pattern.en import parsetree
text_tree = parsetree(text,
    tokenize = True,      # Split punctuation marks from words?
    tags = True,          # Parse part-of-speech tags? (NN, JJ, ...)
    chunks = False,       # Parse chunks? (NP, VP, PNP, ...)
    relations = False,    # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata = False,      # Parse lemmata? (ate => eat)
    encoding = 'utf-8',   # Input string encoding.
    tagset = None)        # Penn Treebank II (default) or UNIVERSAL.

def backslash(string):
    # Replace characters that are unsafe in a file name.
    for ch in [' ', '?', '+', '*', '.', '[', ']', '~', '{', '}', '|', '"', "'", ',', ':', '<', '>']:
        if ch in string:
            string = string.replace(ch, '_')
    return string

from pattern.search import search
pattern = "%s"
pattern_search_result = search(pattern, text_tree)
filename = '/tmp/benchmark_'+analyzer_name+'_'+str(size)+"_"+str(len(pattern_search_result))+'_'+backslash(pattern)
thefile = open(filename, 'w')
for item in pattern_search_result:
    print>>thefile, item
channel.send([filename, size, len(pattern_search_result)])
""" % (size, pattern))
    channel.send([])
    return channel.receive()
Example #6
Source File: nltk.py From PyRATA with Apache License 2.0 | 5 votes |
def pyrata2conll(dictList, **kwargs):
    """
    See 3.1 Reading IOB Format and the CoNLL 2000 Corpus
    http://www.nltk.org/book/ch07.html
    Can be used with nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()
    """
    if 'raw' in kwargs.keys():
        rawFeatureName = kwargs['raw']
    if 'pos' in kwargs.keys():
        posFeatureName = kwargs['pos']
    if 'chunk' in kwargs.keys():
        chunkFeatureName = kwargs['chunk']

    text = ''
    for e in dictList:
        # str has no append(); build the CoNLL string by concatenation
        text += ' '.join([e[rawFeatureName], e[posFeatureName], e[chunkFeatureName], '\n'])
    return text

# extend a given dictList
# merge dictList

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Run all the tests
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
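A possible usage of pyrata2conll, following the docstring's suggestion of feeding the result to nltk.chunk.conllstr2tree; the token dicts and the 'raw'/'pos'/'chunk' feature names are made-up assumptions.

import nltk

tokens = [
    {'raw': 'The', 'pos': 'DT', 'chunk': 'B-NP'},
    {'raw': 'cat', 'pos': 'NN', 'chunk': 'I-NP'},
    {'raw': 'sleeps', 'pos': 'VBZ', 'chunk': 'O'},
]

text = pyrata2conll(tokens, raw='raw', pos='pos', chunk='chunk')
tree = nltk.chunk.conllstr2tree(text, chunk_types=['NP'])
print(tree)  # roughly: (S (NP The/DT cat/NN) sleeps/VBZ)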