Python nltk.tree Examples
The following are 30 code examples of the nltk.tree module.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module nltk,
or try the search function.
Example #1
Source File: relextract.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def ieer_headlines():
    """Print the docno and headline tree for the first 20 IEER documents."""
    from nltk.corpus import ieer
    from nltk.tree import Tree

    print("IEER: First 20 Headlines")
    print("=" * 45)

    # Collect (docno, headline-tree) pairs across every IEER file.
    headlines = []
    for fileid in ieer.fileids():
        for doc in ieer.parsed_docs(fileid):
            headlines.append((doc.docno, doc.headline))

    for docno, headline in headlines[:20]:
        print()
        print("%s:\n%s" % (docno, headline))


#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
#############################################
Example #2
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def tree2conlltags(t):
    """Convert a chunk tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :return: a list of 3-tuples ``(word, tag, IOB-tag)``
    :rtype: list(tuple)
    """
    rows = []
    for node in t:
        try:
            label = node.label()
            iob = "B-"
            for leaf in node:
                if isinstance(leaf, Tree):
                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
                rows.append((leaf[0], leaf[1], iob + label))
                iob = "I-"  # only the first token of a chunk is B-
        except AttributeError:
            # No .label(): a bare (word, tag) pair outside any chunk.
            rows.append((node[0], node[1], "O"))
    return rows
Example #3
Source File: util.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def tree2conlltags(t):
    """Return a list of 3-tuples ``(word, tag, IOB-tag)``.

    Converts a chunk tree into the flat CoNLL IOB representation.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """
    tagged = []
    for subtree in t:
        try:
            chunk_label = subtree.label()
            marker = "B-"
            for token in subtree:
                if isinstance(token, Tree):
                    raise ValueError(
                        "Tree is too deeply nested to be printed in CoNLL format"
                    )
                tagged.append((token[0], token[1], marker + chunk_label))
                marker = "I-"
        except AttributeError:
            # Leaf directly under the root: outside any chunk.
            tagged.append((subtree[0], subtree[1], "O"))
    return tagged
Example #4
Source File: relextract.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def ieer_headlines():
    """Display the document number and headline tree for the first 20
    documents in the IEER corpus."""
    from nltk.corpus import ieer
    from nltk.tree import Tree

    print("IEER: First 20 Headlines")
    print("=" * 45)

    pairs = [
        (parsed.docno, parsed.headline)
        for fileid in ieer.fileids()
        for parsed in ieer.parsed_docs(fileid)
    ]
    for pair in pairs[:20]:
        print()
        print("%s:\n%s" % pair)


#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
#############################################
Example #5
Source File: relextract.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def ieer_headlines():
    """Print the docno and headline tree for the first 20 IEER documents.

    Fixed two defects in the original:
      * Python 2 ``print`` statements (a SyntaxError on Python 3).
      * ``doc`` was referenced *after* the list comprehension, so every
        headline was printed with the docno of the last document (and on
        Python 3, where comprehension variables do not leak, it is a
        NameError).  The docno is now captured alongside each headline.
    """
    from nltk.corpus import ieer
    from nltk.tree import Tree

    print("IEER: First 20 Headlines")
    print("=" * 45)

    trees = [
        (doc.docno, doc.headline)
        for fileid in ieer.fileids()
        for doc in ieer.parsed_docs(fileid)
    ]
    for docno, headline in trees[:20]:
        print()
        print("%s:\n%s" % (docno, headline))


#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
#############################################
Example #6
Source File: util.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def tree2conlltags(t):
    """Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.

    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """
    tags = []
    for child in t:
        try:
            # Old nltk API: the chunk label lives on the ``node`` attribute.
            category = child.node
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    # Fixed: ``raise ValueError, "..."`` is Python-2-only
                    # syntax (a SyntaxError on Python 3).
                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
                tags.append((contents[0], contents[1], prefix + category))
                prefix = "I-"
        except AttributeError:
            # A bare (word, tag) leaf: outside any chunk.
            tags.append((child[0], child[1], "O"))
    return tags
Example #7
Source File: viterbi.py From luscan-devel with GNU General Public License v2.0 | 6 votes |
def _trace_production(self, production, p, span, width): """ Print trace output indicating that a given production has been applied at a given location. :param production: The production that has been applied :type production: Production :param p: The probability of the tree produced by the production. :type p: float :param span: The span of the production :type span: tuple :rtype: None """ str = '|' + '.' * span[0] str += '=' * (span[1] - span[0]) str += '.' * (width - span[1]) + '| ' str += '%s' % production if self._trace > 2: str = '%-40s %12.10f ' % (str, p) print str
Example #8
Source File: viterbi.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def _find_instantiations(self, span, constituents): """ :return: a list of the production instantiations that cover a given span of the text. A "production instantiation" is a tuple containing a production and a list of children, where the production's right hand side matches the list of children; and the children cover ``span``. :rtype: list of ``pair`` of ``Production``, (list of (``ProbabilisticTree`` or token. :type span: tuple(int, int) :param span: The section of the text for which we are trying to find production instantiations. The span is specified as a pair of integers, where the first integer is the index of the first token that should be covered by the production instantiation; and the second integer is the index of the first token that should not be covered by the production instantiation. :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree) :param constituents: The most likely constituents table. This table records the most probable tree representation for any given span and node value. See the module documentation for more information. """ rv = [] for production in self._grammar.productions(): childlists = self._match_rhs(production.rhs(), span, constituents) for childlist in childlists: rv.append( (production, childlist) ) return rv
Example #9
Source File: relextract.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def tree2semi_rel(tree):
    """Group a chunk structure into a list of 'semi-relations' of the form
    (list(str), ``Tree``).

    In order to facilitate the construction of (``Tree``, string, ``Tree``)
    triples, this identifies pairs whose first member is a list (possibly
    empty) of terminal strings, and whose second member is a ``Tree`` of the
    form (NE_label, terminals).

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """
    from nltk.tree import Tree

    groups = []
    pending = [[], None]
    for daughter in tree:
        if isinstance(daughter, Tree):
            # An NE subtree closes the current group.
            pending[1] = daughter
            groups.append(pending)
            pending = [[], None]
        else:
            pending[0].append(daughter)
    return groups
Example #10
Source File: chunked.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def _untag(self, tree):
    """Strip POS tags in place: every (word, tag) leaf becomes just the word.

    Recurses into nested subtrees; mutates and returns ``tree``.
    """
    for index, node in enumerate(tree):
        if isinstance(node, tuple):
            tree[index] = node[0]
        elif isinstance(node, Tree):
            self._untag(node)
        else:
            raise ValueError('expected child to be Tree or tuple')
    return tree
Example #11
Source File: util.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def conlltags2tree(
    sentence, chunk_types=('NP', 'PP', 'VP'), root_label='S', strict=False
):
    """
    Convert the CoNLL IOB format to a tree.
    """
    result = Tree(root_label, [])
    for word, postag, chunktag in sentence:
        if chunktag is None:
            # Missing chunk tag: an error in strict mode, otherwise O.
            if strict:
                raise ValueError("Bad conll tag sequence")
            result.append((word, postag))
        elif chunktag == 'O':
            result.append((word, postag))
        elif chunktag.startswith('B-'):
            result.append(Tree(chunktag[2:], [(word, postag)]))
        elif chunktag.startswith('I-'):
            label = chunktag[2:]
            last = result[-1] if len(result) else None
            if isinstance(last, Tree) and last.label() == label:
                last.append((word, postag))
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Continuation without a matching open chunk: treat as B-*.
                result.append(Tree(label, [(word, postag)]))
        else:
            raise ValueError("Bad conll tag {0!r}".format(chunktag))
    return result
Example #12
Source File: util.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def accuracy(chunker, gold):
    """Score the accuracy of ``chunker`` against the gold standard.

    The chunk information is stripped from each gold tree, the result is
    re-chunked using ``chunker``, and the tag-level accuracy is computed.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """
    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        predicted = chunker.parse(gold_tree.flatten())
        gold_tags.extend(tree2conlltags(gold_tree))
        test_tags.extend(tree2conlltags(predicted))
    return _accuracy(gold_tags, test_tags)


# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
#   -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
#
Example #13
Source File: relextract.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def mk_pairs(tree):
    """Group a chunk structure into a list of pairs of the form
    (list(str), ``Tree``).

    In order to facilitate the construction of (``Tree``, string, ``Tree``)
    triples, this identifies pairs whose first member is a list (possibly
    empty) of terminal strings, and whose second member is a ``Tree`` of the
    form (NE_label, terminals).

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """
    from nltk.tree import Tree

    collected = []
    current = [[], None]
    for daughter in tree:
        if isinstance(daughter, Tree):
            # An NE subtree closes the current pair.
            current[1] = daughter
            collected.append(current)
            current = [[], None]
        else:
            current[0].append(daughter)
    return collected
Example #14
Source File: chunked.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def _untag(self, tree):
    """Replace every (word, tag) pair in ``tree`` with the bare word,
    recursing into nested subtrees.  Mutates and returns ``tree``."""
    for position, element in enumerate(tree):
        if isinstance(element, tuple):
            tree[position] = element[0]
        elif isinstance(element, Tree):
            self._untag(element)
        else:
            raise ValueError('expected child to be Tree or tuple')
    return tree
Example #15
Source File: knbc.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def demo():
    """Demo: load the KNB corpus and print sample fileids, words, parsed
    sentences, and tagged sentences.

    Ported from Python 2: all ``print`` statements became ``print()``
    calls; no other behavior was changed.
    """
    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    def _knbc_fileids_sort(x):
        # Sort "a-b-c-d" fileids numerically on the last three fields.
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort),
                            encoding='euc-jp')

    print(knbc.fileids()[:10])
    print(''.join(knbc.words()[:100]))
    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    # NOTE(review): .encode('utf-8') returns bytes on Python 3; kept as in
    # the original, but it is likely unnecessary there -- confirm.
    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
    ).encode('utf-8')

    print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2]))

    print('\n'.join(' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent)
                    for sent in knbc.tagged_sents()[0:2]))
Example #16
Source File: knbc.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def __init__(self, root, fileids, encoding=None, morphs2str=_morphs2str_default):
    """
    Initialize KNBCorpusReader.

    :param root: path to the corpus root directory
    :param fileids: the file identifiers to read
    :param encoding: character encoding used to decode the corpus files
    :param morphs2str: function converting a morph list to a string, used
        for the tree representation built by ``_parse()``
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    # Kept on the instance so _parse() can render morphemes (per docstring).
    self.morphs2str = morphs2str
Example #17
Source File: viterbi.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def _match_rhs(self, rhs, span, constituents): """ :return: a set of all the lists of children that cover ``span`` and that match ``rhs``. :rtype: list(list(ProbabilisticTree or token) :type rhs: list(Nonterminal or any) :param rhs: The list specifying what kinds of children need to cover ``span``. Each nonterminal in ``rhs`` specifies that the corresponding child should be a tree whose node value is that nonterminal's symbol. Each terminal in ``rhs`` specifies that the corresponding child should be a token whose type is that terminal. :type span: tuple(int, int) :param span: The section of the text for which we are trying to find child lists. The span is specified as a pair of integers, where the first integer is the index of the first token that should be covered by the child list; and the second integer is the index of the first token that should not be covered by the child list. :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree) :param constituents: The most likely constituents table. This table records the most probable tree representation for any given span and node value. See the module documentation for more information. """ (start, end) = span # Base case if start >= end and rhs == (): return [[]] if start >= end or rhs == (): return [] # Find everything that matches the 1st symbol of the RHS childlists = [] for split in range(start, end+1): l=constituents.get((start,split,rhs[0])) if l is not None: rights = self._match_rhs(rhs[1:], (split,end), constituents) childlists += [[l]+r for r in rights] return childlists
Example #18
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def accuracy(chunker, gold):
    """Compute tag-level accuracy of ``chunker`` on gold-standard trees.

    Each gold tree is flattened (chunk structure stripped), re-chunked by
    ``chunker``, and the two IOB tag sequences are compared.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """
    reference = []
    test = []
    for gold_tree in gold:
        predicted = chunker.parse(gold_tree.flatten())
        reference += tree2conlltags(gold_tree)
        test += tree2conlltags(predicted)
    return _accuracy(reference, test)


# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
#   -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
#
Example #19
Source File: viterbi.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def parse(self, tokens):
    # Inherit docs from ParserI
    """Return the most probable parse covering all of ``tokens``, found by
    bottom-up dynamic programming over spans, or None if no constituent
    with the grammar's start symbol spans the whole text."""
    tokens = list(tokens)
    self._grammar.check_coverage(tokens)

    # The most likely constituent table. This table specifies the
    # most likely constituent for a given span and type.
    # Constituents can be either Trees or tokens. For Trees,
    # the "type" is the Nonterminal for the tree's root node
    # value. For Tokens, the "type" is the token's type.
    # The table is stored as a dictionary, since it is sparse.
    constituents = {}

    # Initialize the constituents dictionary with the words from
    # the text.
    if self._trace:
        print ('Inserting tokens into the most likely'+
               ' constituents table...')
    for index in range(len(tokens)):
        token = tokens[index]
        # A single token trivially covers the span (index, index+1).
        constituents[index,index+1,token] = token
        if self._trace > 1:
            self._trace_lexical_insertion(token, index, len(tokens))

    # Consider each span of length 1, 2, ..., n; and add any trees
    # that might cover that span to the constituents dictionary.
    # Shorter spans are completed first so longer spans can build on them.
    for length in range(1, len(tokens)+1):
        if self._trace:
            print ('Finding the most likely constituents'+
                   ' spanning %d text elements...' % length)
        for start in range(len(tokens)-length+1):
            span = (start, start+length)
            self._add_constituents_spanning(span, constituents, tokens)

    # Return the tree that spans the entire text & have the right cat
    return constituents.get((0, len(tokens), self._grammar.start()))
Example #20
Source File: util.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'), top_node='S', strict=False):
    """
    Convert the CoNLL IOB format to a tree.
    """
    result = Tree(top_node, [])
    for word, postag, chunktag in sentence:
        if chunktag is None:
            # Missing chunk tag: an error in strict mode, otherwise O.
            if strict:
                raise ValueError("Bad conll tag sequence")
            result.append((word, postag))
        elif chunktag == 'O':
            result.append((word, postag))
        elif chunktag.startswith('B-'):
            result.append(Tree(chunktag[2:], [(word, postag)]))
        elif chunktag.startswith('I-'):
            label = chunktag[2:]
            open_chunk = result[-1] if len(result) else None
            # Old nltk API: the chunk label lives on the ``node`` attribute.
            if isinstance(open_chunk, Tree) and open_chunk.node == label:
                open_chunk.append((word, postag))
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Continuation without a matching open chunk: treat as B-*.
                result.append(Tree(label, [(word, postag)]))
        else:
            raise ValueError("Bad conll tag %r" % chunktag)
    return result
Example #21
Source File: util.py From luscan-devel with GNU General Public License v2.0 | 5 votes |
def accuracy(chunker, gold):
    """Score ``chunker`` against gold-standard chunk structures.

    Strips the chunk information from each gold tree, re-chunks the flat
    sentence with ``chunker``, and measures tag-level agreement.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """
    gold_sequence = []
    test_sequence = []
    for reference_tree in gold:
        rechunked = chunker.parse(reference_tree.flatten())
        gold_sequence.extend(tree2conlltags(reference_tree))
        test_sequence.extend(tree2conlltags(rechunked))
    return _accuracy(gold_sequence, test_sequence)


# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
#   -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
#
Example #22
Source File: svo.py From py-nltk-svo with MIT License | 5 votes |
def get_object(self, sub_tree):
    """
    Returns an Object with all attributes of an object

    Scans the children of ``sub_tree``: for the first NP/PP child, takes
    the leaves of its first noun-typed subtree; otherwise looks for an
    adjective-typed subtree in each child.  Returns ``{'object': leaves}``
    (``None`` if nothing matched) and clears
    ``self.pred_verb_phrase_siblings`` as a side effect.
    """
    # NOTE(review): ``siblings`` is assigned but never used below.
    siblings = self.pred_verb_phrase_siblings
    Object = None
    for each_tree in sub_tree:
        if each_tree.label() in ["NP", "PP"]:
            sub_nodes = each_tree.subtrees()
            # Keep only subtrees that yield at least one (word, tag) pair.
            sub_nodes = [each for each in sub_nodes if each.pos()]
            for each in sub_nodes:
                if each.label() in self.noun_types:
                    Object = each.leaves()
                    break
            # Stop after the first NP/PP child, whether or not a noun matched.
            break
        else:
            sub_nodes = each_tree.subtrees()
            sub_nodes = [each for each in sub_nodes if each.pos()]
            for each in sub_nodes:
                if each.label() in self.adjective_types:
                    Object = each.leaves()
                    break
            # Get first noun in the tree
    self.pred_verb_phrase_siblings = None
    return {'object': Object}
Example #23
Source File: relextract.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def tree2semi_rel(tree):
    """Group a chunk structure into a list of 'semi-relations' of the form
    (list(str), ``Tree``).

    In order to facilitate the construction of (``Tree``, string, ``Tree``)
    triples, this identifies pairs whose first member is a list (possibly
    empty) of terminal strings, and whose second member is a ``Tree`` of the
    form (NE_label, terminals).

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """
    from nltk.tree import Tree

    semi_rels = []
    words_before = []
    for dtr in tree:
        if isinstance(dtr, Tree):
            # Pair the accumulated words with this NE subtree.
            semi_rels.append([words_before, dtr])
            words_before = []
        else:
            words_before.append(dtr)
    return semi_rels
Example #24
Source File: chunked.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def _untag(self, tree):
    """Strip part-of-speech tags from ``tree`` in place and return it."""
    for slot, entry in enumerate(tree):
        if not isinstance(entry, (Tree, tuple)):
            raise ValueError('expected child to be Tree or tuple')
        if isinstance(entry, tuple):
            tree[slot] = entry[0]
        else:
            self._untag(entry)
    return tree
Example #25
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 5 votes |
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'), root_label='S', strict=False):
    """
    Convert the CoNLL IOB format to a tree.
    """
    out = Tree(root_label, [])
    for word, postag, chunktag in sentence:
        if chunktag is None:
            # Missing chunk tag: an error in strict mode, otherwise O.
            if strict:
                raise ValueError("Bad conll tag sequence")
            out.append((word, postag))
        elif chunktag == 'O':
            out.append((word, postag))
        elif chunktag.startswith('B-'):
            out.append(Tree(chunktag[2:], [(word, postag)]))
        elif chunktag.startswith('I-'):
            wanted = chunktag[2:]
            prev = out[-1] if len(out) else None
            if isinstance(prev, Tree) and prev.label() == wanted:
                prev.append((word, postag))
            elif strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Continuation without a matching open chunk: treat as B-*.
                out.append(Tree(wanted, [(word, postag)]))
        else:
            raise ValueError("Bad conll tag %r" % chunktag)
    return out
Example #26
Source File: util.py From luscan-devel with GNU General Public License v2.0 | 4 votes |
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), top_node="S"):
    """
    Return a chunk structure for a single sentence encoded in the given
    CONLL 2000 style string.

    This function converts a CoNLL IOB string into a tree.  It uses the
    specified chunk types (defaults to NP, PP and VP), and creates a tree
    rooted at a node labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param top_node: The node label to use for the root.
    :type top_node: str
    :rtype: Tree
    """
    stack = [Tree(top_node, [])]

    for lineno, line in enumerate(s.split('\n')):
        if not line.strip():
            continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            # Fixed: ``raise ValueError, '...'`` is Python-2-only syntax
            # (a SyntaxError on Python 3).
            raise ValueError('Error on line %d' % lineno)
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if (chunk_types is not None and
            chunk_type not in chunk_types):
            state = 'O'

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        # (Old nltk API: the chunk label lives on the ``node`` attribute.)
        mismatch_I = state == 'I' and chunk_type != stack[-1].node
        if state in 'BO' or mismatch_I:
            if len(stack) == 2:
                stack.pop()

        # For "Begin", start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0]
Example #27
Source File: util.py From luscan-devel with GNU General Public License v2.0 | 4 votes |
def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'):
    """Divide a string of bracketted tagged text into chunks and unchunked
    tokens, and produce a Tree.

    Chunks are marked by square brackets (``[...]``).  Words are delimited
    by whitespace, and each word should have the form ``text/tag``.  Words
    that do not contain a slash are assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_node: The label to use for chunk nodes
    :type chunk_node: str
    :param top_node: The label to use for the root of the tree
    :type top_node: str
    :rtype: Tree
    """
    token_re = re.compile(r'\[|\]|[^\[\]\s]+')
    frames = [Tree(top_node, [])]
    for match in token_re.finditer(s):
        piece = match.group()
        if piece[0] == '[':
            # Chunks may not nest: '[' is only legal at the top level.
            if len(frames) != 1:
                raise ValueError('Unexpected [ at char %d' % match.start())
            opened = Tree(chunk_node, [])
            frames[-1].append(opened)
            frames.append(opened)
        elif piece[0] == ']':
            if len(frames) != 2:
                raise ValueError('Unexpected ] at char %d' % match.start())
            frames.pop()
        elif sep is None:
            frames[-1].append(piece)
        else:
            frames[-1].append(str2tuple(piece, sep))
    if len(frames) != 1:
        raise ValueError('Expected ] at char %d' % len(s))
    return frames[0]


### CONLL
Example #28
Source File: util.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 4 votes |
def tagstr2tree(
    s, chunk_label="NP", root_label="S", sep='/', source_tagset=None, target_tagset=None
):
    """Divide a string of bracketted tagged text into chunks and unchunked
    tokens, and produce a Tree.

    Chunks are marked by square brackets (``[...]``).  Words are delimited
    by whitespace, and each word should have the form ``text/tag``.  Words
    that do not contain a slash are assigned a ``tag`` of None.  Tags are
    optionally mapped from ``source_tagset`` to ``target_tagset``.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """
    token_re = re.compile(r'\[|\]|[^\[\]\s]+')
    frames = [Tree(root_label, [])]
    for match in token_re.finditer(s):
        piece = match.group()
        if piece[0] == '[':
            # Chunks may not nest: '[' is only legal at the top level.
            if len(frames) != 1:
                raise ValueError('Unexpected [ at char {:d}'.format(match.start()))
            opened = Tree(chunk_label, [])
            frames[-1].append(opened)
            frames.append(opened)
        elif piece[0] == ']':
            if len(frames) != 2:
                raise ValueError('Unexpected ] at char {:d}'.format(match.start()))
            frames.pop()
        elif sep is None:
            frames[-1].append(piece)
        else:
            word, tag = str2tuple(piece, sep)
            if source_tagset and target_tagset:
                tag = map_tag(source_tagset, target_tagset, tag)
            frames[-1].append((word, tag))
    if len(frames) != 1:
        raise ValueError('Expected ] at char {:d}'.format(len(s)))
    return frames[0]


### CONLL
Example #29
Source File: util.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 4 votes |
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
    """
    Return a chunk structure for a single sentence encoded in the given
    CONLL 2000 style string.

    This function converts a CoNLL IOB string into a tree.  It uses the
    specified chunk types (defaults to NP, PP and VP), and creates a tree
    rooted at a node labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    """
    frames = [Tree(root_label, [])]
    for lineno, line in enumerate(s.split('\n')):
        if not line.strip():
            continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError('Error on line {:d}'.format(lineno))
        word, tag, state, chunk_type = match.groups()

        # Unrecognized chunk types are demoted to "Outside".
        if chunk_types is not None and chunk_type not in chunk_types:
            state = 'O'

        # Close the open chunk on B/O, or on an I naming a different type.
        broken = state == 'I' and chunk_type != frames[-1].label()
        if (state in 'BO' or broken) and len(frames) == 2:
            frames.pop()

        # B (or a mismatched I) opens a fresh chunk.
        if state == 'B' or broken:
            opened = Tree(chunk_type, [])
            frames[-1].append(opened)
            frames.append(opened)

        frames[-1].append((word, tag))
    return frames[0]
Example #30
Source File: util.py From razzy-spinner with GNU General Public License v3.0 | 4 votes |
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
    """
    Return a chunk structure for a single sentence encoded in the given
    CONLL 2000 style string.

    This function converts a CoNLL IOB string into a tree.  It uses the
    specified chunk types (defaults to NP, PP and VP), and creates a tree
    rooted at a node labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    """
    tree_stack = [Tree(root_label, [])]
    for lineno, line in enumerate(s.split('\n')):
        if not line.strip():
            continue

        # Decode the line.
        parsed = _LINE_RE.match(line)
        if parsed is None:
            raise ValueError('Error on line %d' % lineno)
        word, tag, state, chunk_type = parsed.groups()

        # Unrecognized chunk types are demoted to "Outside".
        if chunk_types is not None and chunk_type not in chunk_types:
            state = 'O'

        # Close the open chunk on B/O, or on an I naming a different type.
        mismatched = state == 'I' and chunk_type != tree_stack[-1].label()
        if (state in 'BO' or mismatched) and len(tree_stack) == 2:
            tree_stack.pop()

        # B (or a mismatched I) opens a fresh chunk.
        if state == 'B' or mismatched:
            fresh = Tree(chunk_type, [])
            tree_stack[-1].append(fresh)
            tree_stack.append(fresh)

        tree_stack[-1].append((word, tag))
    return tree_stack[0]