Python torchtext.data.Pipeline() Examples

The following are 16 code examples of torchtext.data.Pipeline(), drawn from open-source projects. The originating project, source file, and license are noted above each example. You may also want to check out all available functions and classes of the torchtext.data module.
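Before the examples, here is a minimal sketch of how a Pipeline behaves, assuming a torchtext release that still ships the legacy data API (in newer releases the class lives under torchtext.legacy.data): a Pipeline wraps a callable, applies it to a single string or element-wise to a list of strings, and can be composed with add_before/add_after.

    # A minimal sketch, assuming a torchtext version that still provides
    # torchtext.data.Pipeline (newer releases: torchtext.legacy.data).
    from torchtext import data

    # Wrap a plain callable; the pipeline applies it to a string and,
    # element-wise, to a list of strings.
    pipeline = data.Pipeline(str.lower)
    assert pipeline("Test STring") == "test string"
    assert pipeline(["ABC", "DeF"]) == ["abc", "def"]

    # Extra steps can be composed before or after the wrapped callable,
    # either as plain callables or as other Pipeline objects.
    pipeline.add_before(str.strip)        # strip whitespace first
    pipeline.add_after(str.capitalize)    # capitalize the lowered result
    assert pipeline("  heLLO  ") == "Hello"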
Example #1
Source File: test_pipeline.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_composition(self):
        id_pipeline = data.Pipeline()
        pipeline = data.Pipeline(TestPipeline.repeat_n)
        pipeline.add_before(id_pipeline)
        pipeline.add_after(id_pipeline)
        pipeline.add_before(six.text_type.lower)
        pipeline.add_after(six.text_type.capitalize)

        other_pipeline = data.Pipeline(six.text_type.swapcase)
        other_pipeline.add_before(pipeline)

        # Assert pipeline gives proper results after composition
        # (test that we aren't modifying the pipes member)
        assert pipeline("teST") == "Testtesttest"
        assert pipeline(["ElE1", "eLe2"]) == ["Ele1ele1ele1", "Ele2ele2ele2"]

        # Assert pipeline that we added to gives proper results
        assert other_pipeline("teST") == "tESTTESTTEST"
        assert other_pipeline(["ElE1", "eLe2"]) == ["eLE1ELE1ELE1", "eLE2ELE2ELE2"] 
Example #2
Source File: test_field.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_preprocess(self):
        # Default case.
        field = data.Field()
        assert field.preprocess("Test string.") == ["Test", "string."]

        # Test that lowercase is properly applied.
        field_lower = data.Field(lower=True)
        assert field_lower.preprocess("Test string.") == ["test", "string."]

        # Test that custom preprocessing pipelines are properly applied.
        preprocess_pipeline = data.Pipeline(lambda x: x + "!")
        field_preprocessing = data.Field(preprocessing=preprocess_pipeline,
                                         lower=True)
        assert field_preprocessing.preprocess("Test string.") == ["test!", "string.!"]

        # Test that non-sequential data is properly handled.
        field_not_sequential = data.Field(sequential=False, lower=True,
                                          preprocessing=preprocess_pipeline)
        assert field_not_sequential.preprocess("Test string.") == "test string.!"

        # Non-regression test that we do not try to decode unicode strings to unicode
        field_not_sequential = data.Field(sequential=False, lower=True,
                                          preprocessing=preprocess_pipeline)
        assert field_not_sequential.preprocess("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t!" 
Example #3
Source File: classification_datasets.py    From DiPS with Apache License 2.0
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
        """Create an MR dataset instance given a path and fields.
        Arguments:
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            path: Path to the data file.
            examples: The examples contain all the data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        # text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]
        if examples is None:
            path = self.dirname if path is None else path
            examples = []
            with codecs.open(os.path.join(path, 'rt-polarity.neg'),'r','utf8') as f:
                examples += [
                    data.Example.fromlist([line, 'negative'], fields) for line in f]
            with codecs.open(os.path.join(path, 'rt-polarity.pos'),'r','utf8') as f:
                examples += [
                    data.Example.fromlist([line, 'positive'], fields) for line in f]
        super(MR, self).__init__(examples, fields, **kwargs) 
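For context, here is a hypothetical usage sketch of the class above; the field settings and the data directory are assumptions, not part of the original project.

    # Hypothetical usage of the MR dataset defined above; assumes the
    # rt-polarity.pos / rt-polarity.neg files live in ./rt-polaritydata.
    text_field = data.Field(lower=True, preprocessing=data.Pipeline(lambda s: s.strip()))
    label_field = data.Field(sequential=False)

    dataset = MR(text_field, label_field, path='./rt-polaritydata')
    text_field.build_vocab(dataset)
    label_field.build_vocab(dataset)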
Example #4
Source File: test_pipeline.py    From text with BSD 3-Clause "New" or "Revised" License
def test_composition(self):
        id_pipeline = data.Pipeline()
        pipeline = data.Pipeline(TestPipeline.repeat_n)
        pipeline.add_before(id_pipeline)
        pipeline.add_after(id_pipeline)
        pipeline.add_before(str.lower)
        pipeline.add_after(str.capitalize)

        other_pipeline = data.Pipeline(str.swapcase)
        other_pipeline.add_before(pipeline)

        # Assert pipeline gives proper results after composition
        # (test that we aren't modifying the pipes member)
        assert pipeline("teST") == "Testtesttest"
        assert pipeline(["ElE1", "eLe2"]) == ["Ele1ele1ele1", "Ele2ele2ele2"]

        # Assert pipeline that we added to gives proper results
        assert other_pipeline("teST") == "tESTTESTTEST"
        assert other_pipeline(["ElE1", "eLe2"]) == ["eLE1ELE1ELE1", "eLE2ELE2ELE2"] 
Example #5
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_preprocess(self):
        # Default case.
        field = data.Field()
        assert field.preprocess("Test string.") == ["Test", "string."]

        # Test that lowercase is properly applied.
        field_lower = data.Field(lower=True)
        assert field_lower.preprocess("Test string.") == ["test", "string."]

        # Test that custom preprocessing pipelines are properly applied.
        preprocess_pipeline = data.Pipeline(lambda x: x + "!")
        field_preprocessing = data.Field(preprocessing=preprocess_pipeline,
                                         lower=True)
        assert field_preprocessing.preprocess("Test string.") == ["test!", "string.!"]

        # Test that non-sequential data is properly handled.
        field_not_sequential = data.Field(sequential=False, lower=True,
                                          preprocessing=preprocess_pipeline)
        assert field_not_sequential.preprocess("Test string.") == "test string.!"

        # Non-regression test that we do not try to decode unicode strings to unicode
        field_not_sequential = data.Field(sequential=False, lower=True,
                                          preprocessing=preprocess_pipeline)
        assert field_not_sequential.preprocess("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t!" 
Example #6
Source File: test_pipeline.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_pipeline(self):
        id_pipeline = data.Pipeline()
        assert id_pipeline("Test STring") == "Test STring"
        assert id_pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T"
        assert id_pipeline(["1241", "Some String"]) == ["1241", "Some String"]

        pipeline = data.Pipeline(six.text_type.lower)
        assert pipeline("Test STring") == "test string"
        assert pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t"
        assert pipeline(["1241", "Some String"]) == ["1241", "some string"]

        args_pipeline = data.Pipeline(TestPipeline.repeat_n)
        assert args_pipeline("test", 5) == "testtesttesttesttest"
        assert args_pipeline(["ele1", "ele2"], 2) == ["ele1ele1", "ele2ele2"] 
Example #7
Source File: test_pipeline.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_exceptions(self):
        with self.assertRaises(ValueError):
            data.Pipeline("Not Callable") 
Example #8
Source File: test_pipeline.py    From text with BSD 3-Clause "New" or "Revised" License
def test_pipeline(self):
        id_pipeline = data.Pipeline()
        assert id_pipeline("Test STring") == "Test STring"
        assert id_pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T"
        assert id_pipeline(["1241", "Some String"]) == ["1241", "Some String"]

        pipeline = data.Pipeline(str.lower)
        assert pipeline("Test STring") == "test string"
        assert pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t"
        assert pipeline(["1241", "Some String"]) == ["1241", "some string"]

        args_pipeline = data.Pipeline(TestPipeline.repeat_n)
        assert args_pipeline("test", 5) == "testtesttesttesttest"
        assert args_pipeline(["ele1", "ele2"], 2) == ["ele1ele1", "ele2ele2"] 
Example #9
Source File: test_pipeline.py    From text with BSD 3-Clause "New" or "Revised" License
def test_exceptions(self):
        with self.assertRaises(ValueError):
            data.Pipeline("Not Callable") 
Example #10
Source File: bridge.py    From castor with Apache License 2.0
def __init__(self, args):
        if not args.cuda:
            args.gpu = -1
        if torch.cuda.is_available() and args.cuda:
            print("Note: You are using GPU for training")
            torch.cuda.set_device(args.gpu)
            torch.cuda.manual_seed(args.seed)
        if torch.cuda.is_available() and not args.cuda:
            print("Warning: You have Cuda but do not use it. You are using CPU for training")

        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        random.seed(args.seed)

        self.QID = data.Field(sequential=False)
        self.QUESTION = data.Field(batch_first=True)
        self.ANSWER = data.Field(batch_first=True)
        self.LABEL = data.Field(sequential=False)
        self.EXTERNAL = data.Field(sequential=True, tensor_type=torch.FloatTensor, batch_first=True, use_vocab=False,
                              postprocessing=data.Pipeline(lambda arr, _, train: [float(y) for y in arr]))

        if 'TrecQA' in args.dataset:
            train, dev, test = TrecDataset.splits(self.QID, self.QUESTION, self.ANSWER, self.EXTERNAL, self.LABEL)
        elif 'WikiQA' in args.dataset:
            train, dev, test = WikiDataset.splits(self.QID, self.QUESTION, self.ANSWER, self.EXTERNAL, self.LABEL)
        else:
            print("Unsupported dataset")
            exit()

        self.QID.build_vocab(train, dev, test)
        self.QUESTION.build_vocab(train, dev, test)
        self.ANSWER.build_vocab(train, dev, test)
        self.LABEL.build_vocab(train, dev, test)

        if args.cuda:
            self.model = torch.load(args.model, map_location=lambda storage, location: storage.cuda(args.gpu))
        else:
            self.model = torch.load(args.model, map_location=lambda storage, location: storage)

        self.gpu = args.gpu 
Example #11
Source File: train_E2E.py    From conv-emotion with MIT License
def get_E2E_loaders(path, valid=0.1, batch_size=32):
    utterance = data.Field(tokenize=tokenizer, lower=True)
    label     = data.Field(sequential=False, postprocessing=Pipeline(convert_token=convert_token))
    id        = data.Field(use_vocab=False,sequential=False)
    fields = [('id', id),
              ('turn1', utterance),
              ('turn2', utterance),
              ('turn3', utterance),
              ('label', label)]

    train = data.TabularDataset('{}/train.txt'.format(path),
                                format='tsv',
                                fields=fields,
                                skip_header=True)
    valid = data.TabularDataset('{}/valid.txt'.format(path),
                                format='tsv',
                                fields=fields,
                                skip_header=True)

    test = data.TabularDataset('{}/test.txt'.format(path),
                                format='tsv',
                                fields=fields,
                                skip_header=True)
    vectors = vocab.Vectors(name='emojiplusglove.txt', cache='/media/backup/nlp-cic/DialogueRNN/')
    utterance.build_vocab(train, valid, test, vectors=vectors)
    #utterance.build_vocab(train, valid, test, vectors='glove.840B.300d')
    label.build_vocab(train)
    train_iter = BucketIterator(train,
                                  train=True,
                                  batch_size=batch_size,
                                  sort_key=lambda x: len(x.turn3),
                                  device=torch.device(0))
    valid_iter = BucketIterator(valid,
                                  batch_size=batch_size,
                                  sort_key=lambda x: len(x.turn3),
                                  device=torch.device(0))
    test_iter = BucketIterator(test,
                                  batch_size=batch_size,
                                  sort_key=lambda x: len(x.turn3),
                                  device=torch.device(0))
    return train_iter, valid_iter, test_iter,\
            utterance.vocab.vectors if not args.cuda else utterance.vocab.vectors.cuda(),\
            label.vocab.itos 
Example #12
Source File: mydatasets.py    From pytorch-in-action with MIT License
def __init__(self, text_field, label_field, path=None, text_cnt=1000, examples=None, **kwargs):
        """Create an MR dataset instance given a path and fields.

        Arguments:
            text_field: The field that will be used for text data.

            label_field: The field that will be used for label data.
            path: Path to the data file.
            examples: The examples contain all the data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """

        def clean_str(string):
            """
            Tokenization/string cleaning for all datasets except for SST.
            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
            """
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip().lower()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
        if examples is None:
            path = self.dirname if path is None else path
            examples = []
            for sub_path in categories:

                sub_path_one = os.path.join(path, sub_path)
                sub_paths_two = os.listdir(sub_path_one)
                cnt = 0
                for sub_path_two in sub_paths_two:
                    lines = ""
                    with open(os.path.join(sub_path_one, sub_path_two), encoding="utf8", errors='ignore') as f:
                        lines = f.read()
                    examples += [data.Example.fromlist([lines, sub_path], fields)]
                    cnt += 1

        super(NEWS_20, self).__init__(examples, fields, **kwargs) 
Example #13
Source File: inputs.py    From torchnlp with Apache License 2.0
def get_input_processor_words(vocab_word, vocab_char=None, convert_digits=True):
    """
    Returns a function that converts text into a processed batch. Required during
    inference.
    Parameters:
        vocab_word: Instance of torchtext.Vocab for input word vocabulary
        vocab_char[optional]: Instance of torchtext.Vocab for input per-word 
                              character vocabulary
        convert_digits: If True will convert numbers to single 0's
    """
    inputs_word = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True,
                                preprocessing=data.Pipeline(
                                    lambda w: '0' if convert_digits and w.isdigit() else w ))
    # Set the vocab object manually without building from training dataset
    inputs_word.vocab = vocab_word

    if vocab_char is not None:
        inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>", 
                                        batch_first=True)

        inputs_char = data.NestedField(inputs_char_nesting, 
                                        init_token="<bos>", eos_token="<eos>")
        # Set the vocab object manually without building from training dataset
        inputs_char.vocab = inputs_char_nesting.vocab = vocab_char
        
        fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))]
    else:
        fields = [('inputs_word', inputs_word)]


    def input_processor_fn(inputs):
        if not isinstance(inputs, list):
            inputs = [inputs]

        examples = []
        for line in inputs:
            examples.append(data.Example.fromlist([line], fields))
        
        dataset = data.Dataset(examples, fields)
        # Entire input in one batch
        return data.Batch(data=dataset, 
                          dataset=dataset,
                          device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))

    return input_processor_fn 
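A hedged usage sketch for the function above; the vocabulary object and the input sentences are assumptions, not part of the original file.

    # Hypothetical usage; `word_vocab` is assumed to be a torchtext Vocab
    # built over the training data and saved alongside the model.
    process = get_input_processor_words(word_vocab)

    batch = process(["The quick brown fox .", "It was built in 1987 ."])
    # batch.inputs_word is a LongTensor of word ids with shape
    # [batch_size, max_len + 2]; the +2 comes from the <bos>/<eos> tokens.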
Example #14
Source File: mydatasets.py    From cnn-text-classification-pytorch with Apache License 2.0
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
        """Create an MR dataset instance given a path and fields.

        Arguments:
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            path: Path to the data file.
            examples: The examples contain all the data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        def clean_str(string):
            """
            Tokenization/string cleaning for all datasets except for SST.
            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
            """
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = self.dirname if path is None else path
            examples = []
            with open(os.path.join(path, 'rt-polarity.neg'), errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, 'negative'], fields) for line in f]
            with open(os.path.join(path, 'rt-polarity.pos'), errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, 'positive'], fields) for line in f]
        super(MR, self).__init__(examples, fields, **kwargs) 
Example #15
Source File: mydatasets.py    From char-cnn-text-classification-pytorch with Apache License 2.0
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
        """Create an MR dataset instance given a path and fields.

        Arguments:
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            path: Path to the data file.
            examples: The examples contain all the data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        def clean_str(string):
            """
            Tokenization/string cleaning for all datasets except for SST.
            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
            """
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = self.dirname if path is None else path
            examples = []
            with codecs.open(os.path.join(path, 'rt-polarity.neg'), encoding='utf-8', errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, 'negative'], fields) for line in f]
            with codecs.open(os.path.join(path, 'rt-polarity.pos'), encoding='utf-8', errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, 'positive'], fields) for line in f]
        super(MR, self).__init__(examples, fields, **kwargs) 
Example #16
Source File: data_loader_txt.py    From char-cnn-text-classification-pytorch with Apache License 2.0
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
        """Create an MR dataset instance given a path and fields.

        Arguments:
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            path: Path to the data file.
            examples: The examples contain all the data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        def clean_str(string):
            """
            Tokenization/string cleaning for all datasets except for SST.
            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
            """
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = self.dirname if path is None else path
            examples = []
            with codecs.open(os.path.join(path, 'rt-polarity.neg'), encoding='utf-8', errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, 'negative'], fields) for line in f]
            with codecs.open(os.path.join(path, 'rt-polarity.pos'), encoding='utf-8', errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, 'positive'], fields) for line in f]
        super(MR, self).__init__(examples, fields, **kwargs)