Python nltk.tokenize.treebank.TreebankWordDetokenizer() Examples
The following are 7 code examples of nltk.tokenize.treebank.TreebankWordDetokenizer().
Each example lists its original project and source file.
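For reference, a minimal sketch of what the detokenizer does (assuming NLTK is installed; the sample sentence is purely illustrative):

from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer

# Tokenize into Penn-Treebank-style tokens, e.g. ['Do', "n't", 'detokenize', ...]
tokens = TreebankWordTokenizer().tokenize("Don't detokenize this, please!")

# Rejoin the tokens into a surface string close to the original sentence.
text = TreebankWordDetokenizer().detokenize(tokens)
print(text)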
Example #1
Source File: treebank_encoder.py From PyTorch-NLP with BSD 3-Clause "New" or "Revised" License
def __init__(self, *args, **kwargs):
    if 'tokenize' in kwargs:
        raise TypeError('``TreebankEncoder`` does not take keyword argument ``tokenize``.')

    if 'detokenize' in kwargs:
        raise TypeError('``TreebankEncoder`` does not take keyword argument ``detokenize``.')

    try:
        import nltk

        # Required for moses
        nltk.download('perluniprops')
        nltk.download('nonbreaking_prefixes')

        from nltk.tokenize.treebank import TreebankWordTokenizer
        from nltk.tokenize.treebank import TreebankWordDetokenizer
    except ImportError:
        print("Please install NLTK. "
              "See the docs at http://nltk.org for more information.")
        raise

    super().__init__(
        *args,
        tokenize=TreebankWordTokenizer().tokenize,
        detokenize=TreebankWordDetokenizer().detokenize,
        **kwargs)
Example #2
Source File: solver.py From neural_chat with MIT License
def __init__(self, config, train_data_loader, eval_data_loader, vocab, is_train=True, model=None):
    self.config = config
    self.epoch_i = 0
    self.train_data_loader = train_data_loader
    self.eval_data_loader = eval_data_loader
    self.vocab = vocab
    self.is_train = is_train
    self.model = model
    self.detokenizer = Detok()

    if config.emotion or config.infersent or config.context_input_only:
        self.botmoji = Botmoji()
        self.botsent = Botsent(config.dataset_dir.joinpath('train'), version=1, explained_var=0.95)

    # Info for saving epoch metrics to a csv file
    if self.config.mode == 'train':
        self.pandas_path = os.path.join(config.save_path, "metrics.csv")
        self.outfile_dict = {k: getattr(config, k) for k in OUTPUT_FILE_PARAMS}
        self.df = pd.DataFrame()

    self.save_priming_sentences()
Example #3
Source File: solver.py From neural_chat with MIT License
def __init__(self, config, train_data_loader, eval_data_loader, vocab, is_train=True, model=None):
    self.config = config
    self.epoch_i = 0
    self.train_data_loader = train_data_loader
    self.eval_data_loader = eval_data_loader
    self.vocab = vocab
    self.is_train = is_train
    self.model = model
    self.detokenizer = Detok()

    if config.emotion or config.infersent or config.context_input_only:
        self.botmoji = Botmoji()
        self.botsent = Botsent(config.dataset_dir.joinpath('train'), version=1, explained_var=0.95)

    # Info for saving epoch metrics to a csv file
    if self.config.mode == 'train':
        self.pandas_path = os.path.join(config.save_path, "metrics.csv")
        self.outfile_dict = {k: getattr(config, k) for k in OUTPUT_FILE_PARAMS}
        self.df = pd.DataFrame()

    self.save_priming_sentences()
Example #4
Source File: utils.py From ConvLab with MIT License
def get_detokenize():
    return lambda x: TreebankWordDetokenizer().detokenize(x)
Example #5
Source File: utils.py From NeuralDialog-LaRL with Apache License 2.0
def get_detokenize():
    return lambda x: TreebankWordDetokenizer().detokenize(x)
Example #6
Source File: utils.py From NeuralDialog-ZSDG with Apache License 2.0
def get_dekenize():
    return lambda x: TreebankWordDetokenizer().detokenize(x)
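Examples #4 through #6 all wrap the detokenizer in a small factory function that returns a callable. A brief usage sketch of that pattern (the token list below is a made-up illustration):

from nltk.tokenize.treebank import TreebankWordDetokenizer

def get_detokenize():
    return lambda x: TreebankWordDetokenizer().detokenize(x)

detok = get_detokenize()
# The returned callable joins a list of tokens back into a string,
# attaching punctuation without a leading space.
print(detok(["hello", ",", "world", "!"]))  # roughly: hello, world!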
Example #7
Source File: cnndm.py From nlp-recipes with MIT License
def detokenize(line):
    """
    Detokenizes the processed CNN/DM dataset to recover the original dataset,
    e.g. converts "-LRB-" back to "(" and "-RRB-" back to ")".
    """
    line = line.strip().replace("``", '"').replace("''", '"').replace("`", "'")
    twd = TreebankWordDetokenizer()
    s_list = [
        twd.detokenize(x.strip().split(" "), convert_parentheses=True)
        for x in line.split("<S_SEP>")
    ]
    return " ".join(s_list)
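A hedged illustration of how this function might be called: the input line is invented, but it follows the CNN/DM conventions the code handles (Penn-Treebank escapes such as -LRB-/-RRB- and the <S_SEP> sentence separator), and the exact spacing of the output depends on NLTK's detokenizer rules:

from nltk.tokenize.treebank import TreebankWordDetokenizer

sample = "officials -LRB- unnamed -RRB- said `` no comment '' . <S_SEP> the decision is final ."
print(detokenize(sample))
# Expected to read roughly: officials (unnamed) said "no comment". the decision is final.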