Python nltk.ChartParser() Examples
The following are 8 code examples of nltk.ChartParser().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions and classes of the module nltk, or try the search function.
Example #1
Source File: GrammarVAE_codes.py From selfies with Apache License 2.0 | 6 votes |
def to_one_hot(smiles, MaxNumSymbols, check=True):
    """Encode a list of SMILES strings as one-hot production-sequence arrays.

    Each string is tokenized, parsed with the zinc grammar (first parse is
    taken), and its production sequence mapped to production indices. The
    result is a float32 array of shape (len(smiles), MaxNumSymbols, NCHARS);
    timesteps past a molecule's production count are padded with the final
    (no-op) symbol.

    NOTE(review): `check` is currently unused — the debug print it guarded
    was commented out; kept for interface compatibility.
    """
    # isinstance is the idiomatic type check (type(x) == list rejects subclasses)
    assert isinstance(smiles, list)
    # production -> index lookup, built once over the whole grammar
    prod_map = {prod: ix for ix, prod in enumerate(zinc_grammar.GCFG.productions())}
    tokenize = get_zinc_tokenizer(zinc_grammar.GCFG)
    tokens = list(map(tokenize, smiles))
    parser = nltk.ChartParser(zinc_grammar.GCFG)
    # take the first parse for each token sequence
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    indices = [np.array([prod_map[prod] for prod in entry], dtype=int)
               for entry in productions_seq]
    one_hot = np.zeros((len(indices), MaxNumSymbols, NCHARS), dtype=np.float32)
    for i, idx in enumerate(indices):
        num_productions = len(idx)
        one_hot[i][np.arange(num_productions), idx] = 1.
        # pad the remaining timesteps with the last ("nothing") symbol
        one_hot[i][np.arange(num_productions, MaxNumSymbols), -1] = 1.
    return one_hot
Example #2
Source File: cfg_util.py From guacamol_baselines with MIT License | 5 votes |
def encode(smiles):
    """Encode one SMILES string as the index sequence of grammar productions.

    Parses *smiles* with the grammar's chart parser (first parse wins) and
    returns a 1-D int array of production indices into GCFG.productions().
    """
    GCFG = smiles_grammar.GCFG
    tokenize = get_smiles_tokenizer(GCFG)
    tokens = tokenize(smiles)
    parser = nltk.ChartParser(GCFG)
    # next() is the idiomatic way to take the first parse (not .__next__())
    parse_tree = next(parser.parse(tokens))
    productions_seq = parse_tree.productions()
    # production -> index lookup over all grammar productions
    prod_map = {prod: ix for ix, prod in enumerate(GCFG.productions())}
    return np.array([prod_map[prod] for prod in productions_seq], dtype=int)
Example #3
Source File: parse.py From atap with Apache License 2.0 | 5 votes |
def parse(sent):
    """Tokenize *sent* and return an iterator over its parse trees."""
    chart_parser = nltk.ChartParser(grammar)
    words = nltk.wordpunct_tokenize(sent)
    return chart_parser.parse(words)
Example #4
Source File: cfg_parser.py From sdvae with MIT License | 5 votes |
def load(self, filepath):
    """Load a CFG from *filepath* and build rule-matching lookup tables.

    Populates:
      self.cfg_parser       -- nltk.ChartParser over the parsed grammar
      self.head_to_rules    -- head Nonterminal -> list of RHS tuples
      self.valid_tokens     -- set of all terminal strings
      self.first_head       -- head of the first rule in the file
      self.rule_ranges      -- head -> (start, end) range in the flat rule order
      self.total_num_rules  -- total number of productions
    """
    # read the grammar text; 'with' closes the file (the original leaked the handle)
    with open(filepath) as f:
        cfg_string = f.read()

    # parse from nltk
    cfg_grammar = nltk.CFG.fromstring(cfg_string)
    # self.cfg_parser = nltk.RecursiveDescentParser(cfg_grammar)
    self.cfg_parser = nltk.ChartParser(cfg_grammar)

    # our info for rule matching
    self.head_to_rules = head_to_rules = {}
    self.valid_tokens = valid_tokens = set()
    rule_ranges = {}
    total_num_rules = 0
    first_head = None
    for line in cfg_string.split('\n'):
        if len(line.strip()) > 0:
            head, rules = line.split('->')
            head = Nonterminal(head.strip())  # remove space
            rules = [_.strip() for _ in rules.split('|')]  # split and remove space
            # quoted items are terminals (quotes stripped); the rest are nonterminals
            rules = [tuple([Nonterminal(_) if not _.startswith("'") else _[1:-1]
                            for _ in rule.split()])
                     for rule in rules]
            head_to_rules[head] = rules
            for rule in rules:
                for t in rule:
                    if isinstance(t, str):
                        valid_tokens.add(t)
            if first_head is None:
                first_head = head
            rule_ranges[head] = (total_num_rules, total_num_rules + len(rules))
            total_num_rules += len(rules)
    self.first_head = first_head
    self.rule_ranges = rule_ranges
    self.total_num_rules = total_num_rules
Example #5
Source File: evaluate.py From sdvae with MIT License | 5 votes |
def get_parser(production_file):
    """Build an nltk.ChartParser from a production-rules file.

    Reads one production per line from *production_file*, appends the
    padding rule 'Nothing -> None', and compiles the result into a CFG.
    """
    # 'with' closes the file handle (the original leaked it); iterating the
    # file replaces readlines() with no intermediate list of raw lines
    with open(production_file) as f:
        prods = [line.strip() for line in f] + ['Nothing -> None']
    GCFG = nltk.CFG.fromstring('\n'.join(prods))
    return nltk.ChartParser(GCFG)
Example #6
Source File: cfg_parser.py From sdvae with MIT License | 5 votes |
def load(self, filepath):
    """Load a CFG from *filepath* and build rule-matching lookup tables.

    Populates:
      self.cfg_parser       -- nltk.ChartParser over the parsed grammar
      self.head_to_rules    -- head Nonterminal -> list of RHS tuples
      self.valid_tokens     -- set of all terminal strings
      self.first_head       -- head of the first rule in the file
      self.rule_ranges      -- head -> (start, end) range in the flat rule order
      self.total_num_rules  -- total number of productions
    """
    # read the grammar text; 'with' closes the file (the original leaked the handle)
    with open(filepath) as f:
        cfg_string = f.read()

    # parse from nltk
    cfg_grammar = nltk.CFG.fromstring(cfg_string)
    # self.cfg_parser = nltk.RecursiveDescentParser(cfg_grammar)
    self.cfg_parser = nltk.ChartParser(cfg_grammar)

    # our info for rule matching
    self.head_to_rules = head_to_rules = {}
    self.valid_tokens = valid_tokens = set()
    rule_ranges = {}
    total_num_rules = 0
    first_head = None
    for line in cfg_string.split('\n'):
        if len(line.strip()) > 0:
            head, rules = line.split('->')
            head = Nonterminal(head.strip())  # remove space
            rules = [_.strip() for _ in rules.split('|')]  # split and remove space
            # quoted items are terminals (quotes stripped); the rest are nonterminals
            rules = [
                tuple([Nonterminal(_) if not _.startswith("'") else _[1:-1]
                       for _ in rule.split()])
                for rule in rules
            ]
            head_to_rules[head] = rules
            for rule in rules:
                for t in rule:
                    if isinstance(t, str):
                        valid_tokens.add(t)
            if first_head is None:
                first_head = head
            rule_ranges[head] = (total_num_rules, total_num_rules + len(rules))
            total_num_rules += len(rules)
    self.first_head = first_head
    self.rule_ranges = rule_ranges
    self.total_num_rules = total_num_rules
Example #7
Source File: GrammarVAE_codes.py From selfies with Apache License 2.0 | 5 votes |
def SizeOneHot(smiles, check=True):
    """Return the production-sequence length of the FIRST SMILES string.

    Parses every string in *smiles* with the zinc grammar but returns only
    len(indices[0]) — the number of productions in the first entry's parse.

    NOTE(review): `check` is unused; kept for interface compatibility.
    """
    # isinstance is the idiomatic type check (type(x) == list rejects subclasses)
    assert isinstance(smiles, list)
    # production -> index lookup, built once over the whole grammar
    prod_map = {prod: ix for ix, prod in enumerate(zinc_grammar.GCFG.productions())}
    tokenize = get_zinc_tokenizer(zinc_grammar.GCFG)
    tokens = list(map(tokenize, smiles))
    parser = nltk.ChartParser(zinc_grammar.GCFG)
    # take the first parse for each token sequence
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    indices = [np.array([prod_map[prod] for prod in entry], dtype=int)
               for entry in productions_seq]
    return len(indices[0])


# SINGLE EXAMPLE
#smile = [L[0]]
##smile = ['C']
#one_hot_single = to_one_hot(smile, )
#print(one_hot_single.shape)
#print(one_hot_single)

# GOING THROUGH ALL OF ZINC....
#OH = np.zeros((len(L),MAX_LEN,NCHARS))
#for i in range(0, len(L), 100):
#    print('Processing: i=[' + str(i) + ':' + str(i+100) + ']')
#    onehot = to_one_hot(L[i:i+100], False)
#    OH[i:i+100,:,:] = onehot
#
#h5f = h5py.File('zinc_grammar_dataset.h5','w')
#h5f.create_dataset('data', data=OH)
#h5f.close()
Example #8
Source File: Cfg.py From Texygen with MIT License | 5 votes |
def __init__(self, cfg_grammar=None, test_file=None):
    """Initialize the CFG oracle.

    cfg_grammar -- grammar source string for nltk.CFG.fromstring; defaults
                   to a small arithmetic-expression grammar.
    test_file   -- path to the test data file (stored, not opened here).
    """
    super().__init__()
    self.name = 'cfg'
    if cfg_grammar is None:
        # CFG.fromstring is line-oriented, so each production must sit on
        # its own line (the rules were collapsed onto one line, which
        # fromstring cannot parse)
        cfg_grammar = """
          S -> S PLUS x | S SUB x | S PROD x | S DIV x | x | '(' S ')'
          PLUS -> '+'
          SUB -> '-'
          PROD -> '*'
          DIV -> '/'
          x -> 'x' | 'y'
        """
    self.grammar = nltk.CFG.fromstring(cfg_grammar)
    self.parser = nltk.ChartParser(self.grammar)
    self.test_file = test_file