Python nltk.ChartParser() Examples

The following are 8 code examples of nltk.ChartParser(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the nltk module, or try the search function.
Example #1
Source File: GrammarVAE_codes.py    From selfies with Apache License 2.0 6 votes vote down vote up
def to_one_hot(smiles, MaxNumSymbols, check=True):
    """Encode a list of SMILES strings as one-hot production sequences.

    Each string is tokenized, parsed with the zinc grammar, and the
    resulting production sequence is one-hot encoded into a
    (len(smiles), MaxNumSymbols, NCHARS) float32 array.  Unused time
    steps are padded with the final "no-op" production column.

    NOTE(review): the `check` parameter is currently unused; it is kept
    for backward compatibility with existing callers.
    """
    assert isinstance(smiles, list)
    # Map each grammar production object to its column index.
    prod_map = {prod: ix for ix, prod in enumerate(zinc_grammar.GCFG.productions())}
    tokenize = get_zinc_tokenizer(zinc_grammar.GCFG)
    tokens = list(map(tokenize, smiles))
    parser = nltk.ChartParser(zinc_grammar.GCFG)
    # Take the first parse tree for each token sequence.
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    indices = [np.array([prod_map[prod] for prod in entry], dtype=int) for entry in productions_seq]
    one_hot = np.zeros((len(indices), MaxNumSymbols, NCHARS), dtype=np.float32)
    for i, idx in enumerate(indices):
        num_productions = len(idx)
        # Mark the productions actually used by this parse ...
        one_hot[i][np.arange(num_productions), idx] = 1.
        # ... and pad the remaining steps with the last (padding) symbol.
        one_hot[i][np.arange(num_productions, MaxNumSymbols), -1] = 1.
    return one_hot
Example #2
Source File: cfg_util.py    From guacamol_baselines with MIT License 5 votes vote down vote up
def encode(smiles):
    """Encode a single SMILES string as grammar-production indices.

    Parses *smiles* with the SMILES CFG and returns a 1-D int array where
    each element is the index (within ``GCFG.productions()``) of the
    production applied at that step of the leftmost parse.
    """
    GCFG = smiles_grammar.GCFG
    tokenize = get_smiles_tokenizer(GCFG)
    tokens = tokenize(smiles)
    parser = nltk.ChartParser(GCFG)
    # next(...) is the idiomatic way to take the first parse tree.
    parse_tree = next(parser.parse(tokens))
    productions_seq = parse_tree.productions()
    # Map each production object to its index in the grammar.
    prod_map = {prod: ix for ix, prod in enumerate(GCFG.productions())}
    return np.array([prod_map[prod] for prod in productions_seq], dtype=int)
Example #3
Source File: parse.py    From atap with Apache License 2.0 5 votes vote down vote up
def parse(sent):
    """Tokenize *sent* and return an iterator over its chart parses."""
    words = nltk.wordpunct_tokenize(sent)
    chart_parser = nltk.ChartParser(grammar)
    return chart_parser.parse(words)
Example #4
Source File: cfg_parser.py    From sdvae with MIT License 5 votes vote down vote up
def load(self, filepath):
        """Load a context-free grammar from *filepath* and build rule metadata.

        Populates:
          - self.cfg_parser: an nltk ChartParser over the grammar
          - self.head_to_rules: Nonterminal head -> list of RHS tuples
          - self.valid_tokens: set of terminal strings in the grammar
          - self.first_head: head of the first rule (assumed start symbol)
          - self.rule_ranges: head -> (start, end) index range in the flat rule list
          - self.total_num_rules: total number of productions

        Assumes each non-blank line of the file is a single rule of the
        form ``HEAD -> alt1 | alt2 | ...`` with terminals single-quoted.
        """
        # Read once via a context manager so the file handle is closed
        # deterministically (the original leaked an open file object).
        with open(filepath) as f:
            cfg_string = f.read()

        # parse from nltk
        cfg_grammar = nltk.CFG.fromstring(cfg_string)
        # self.cfg_parser = cfg_parser = nltk.RecursiveDescentParser(cfg_grammar)
        self.cfg_parser = cfg_parser = nltk.ChartParser(cfg_grammar)

        # our info for rule matching
        self.head_to_rules = head_to_rules = {}
        self.valid_tokens = valid_tokens = set()
        rule_ranges = {}
        total_num_rules = 0
        first_head = None
        for line in cfg_string.split('\n'):
            if len(line.strip()) > 0:
                head, rules = line.split('->')
                head = Nonterminal(head.strip())  # remove space
                rules = [_.strip() for _ in rules.split('|')]  # split and remove space
                # Quoted symbols are terminals (quotes stripped); others become Nonterminals.
                rules = [tuple([Nonterminal(_) if not _.startswith("'") else _[1:-1] for _ in rule.split()]) for rule in rules]
                head_to_rules[head] = rules

                for rule in rules:
                    for t in rule:
                        if isinstance(t, str):
                            valid_tokens.add(t)

                if first_head is None:
                    first_head = head

                rule_ranges[head] = (total_num_rules, total_num_rules + len(rules))
                total_num_rules += len(rules)

        self.first_head = first_head

        self.rule_ranges = rule_ranges
        self.total_num_rules = total_num_rules
Example #5
Source File: evaluate.py    From sdvae with MIT License 5 votes vote down vote up
def get_parser(production_file):
    """Build an nltk ChartParser from a file of production-rule strings.

    A ``Nothing -> None`` padding rule is appended so decoded sequences
    can be filled with a no-op production.
    """
    # Context manager closes the file (the original leaked the handle).
    with open(production_file) as f:
        prods = [line.strip() for line in f] + ['Nothing -> None']
    GCFG = nltk.CFG.fromstring('\n'.join(prods))
    return nltk.ChartParser(GCFG)
Example #6
Source File: cfg_parser.py    From sdvae with MIT License 5 votes vote down vote up
def load(self, filepath):
        """Load a CFG from a text file and derive rule-matching tables.

        Side effects: sets self.cfg_parser, self.head_to_rules,
        self.valid_tokens, self.first_head, self.rule_ranges and
        self.total_num_rules from the grammar text.

        Each non-blank line must hold one rule ``HEAD -> alt | alt ...``;
        single-quoted symbols are treated as terminal strings.
        """
        # Fix: read via a context manager so the file is always closed.
        with open(filepath) as f:
            cfg_string = f.read()

        # parse from nltk
        cfg_grammar = nltk.CFG.fromstring(cfg_string)
        # self.cfg_parser = cfg_parser = nltk.RecursiveDescentParser(cfg_grammar)
        self.cfg_parser = cfg_parser = nltk.ChartParser(cfg_grammar)

        # our info for rule matching
        self.head_to_rules = head_to_rules = {}
        self.valid_tokens = valid_tokens = set()
        rule_ranges = {}
        total_num_rules = 0
        first_head = None
        for line in cfg_string.split('\n'):
            if len(line.strip()) > 0:
                head, rules = line.split('->')
                head = Nonterminal(head.strip())    # remove space
                rules = [_.strip() for _ in rules.split('|')]    # split and remove space
                # Terminals keep their quoted text (quotes stripped);
                # everything else becomes a Nonterminal.
                rules = [
                    tuple([Nonterminal(_) if not _.startswith("'") else _[1:-1] for _ in rule.split()])
                    for rule in rules
                ]
                head_to_rules[head] = rules

                for rule in rules:
                    for t in rule:
                        if isinstance(t, str):
                            valid_tokens.add(t)

                if first_head is None:
                    first_head = head

                rule_ranges[head] = (total_num_rules, total_num_rules + len(rules))
                total_num_rules += len(rules)

        self.first_head = first_head

        self.rule_ranges = rule_ranges
        self.total_num_rules = total_num_rules
Example #7
Source File: GrammarVAE_codes.py    From selfies with Apache License 2.0 5 votes vote down vote up
def SizeOneHot(smiles, check=True):
    """Return the production-sequence length of the FIRST string in *smiles*.

    Parses every string in the list (so invalid entries still raise), but
    only the first parse's length is returned.

    NOTE(review): the `check` parameter is unused; kept for backward
    compatibility with existing callers.
    """
    assert isinstance(smiles, list)
    # Map each grammar production object to its index.
    prod_map = {prod: ix for ix, prod in enumerate(zinc_grammar.GCFG.productions())}
    tokenize = get_zinc_tokenizer(zinc_grammar.GCFG)
    tokens = list(map(tokenize, smiles))
    parser = nltk.ChartParser(zinc_grammar.GCFG)
    # First parse tree per token sequence.
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    indices = [np.array([prod_map[prod] for prod in entry], dtype=int) for entry in productions_seq]
    return len(indices[0])


# SINGLE EXAMPLE
#smile = [L[0]]
##smile = ['C']
#one_hot_single =  to_one_hot(smile, )
#print(one_hot_single.shape)
#print(one_hot_single)


# GOING THROUGH ALL OF ZINC....

#OH = np.zeros((len(L),MAX_LEN,NCHARS))
#for i in range(0, len(L), 100):
#    print('Processing: i=[' + str(i) + ':' + str(i+100) + ']')
#    onehot = to_one_hot(L[i:i+100], False)
#    OH[i:i+100,:,:] = onehot
#
#h5f = h5py.File('zinc_grammar_dataset.h5','w')
#h5f.create_dataset('data', data=OH)
#h5f.close() 
Example #8
Source File: Cfg.py    From Texygen with MIT License 5 votes vote down vote up
def __init__(self, cfg_grammar=None, test_file=None):
        """Initialise the CFG metric.

        Falls back to a small arithmetic-expression grammar over the
        terminals 'x'/'y' and '+ - * /' when no grammar string is given.
        """
        super().__init__()
        self.name = 'cfg'
        self.test_file = test_file
        if cfg_grammar is None:
            cfg_grammar = """
              S -> S PLUS x | S SUB x |  S PROD x | S DIV x | x | '(' S ')'
              PLUS -> '+'
              SUB -> '-'
              PROD -> '*'
              DIV -> '/'
              x -> 'x' | 'y'
            """
        self.grammar = nltk.CFG.fromstring(cfg_grammar)
        self.parser = nltk.ChartParser(self.grammar)