Python torchtext.data.Pipeline() Examples
The following are 16 code examples of torchtext.data.Pipeline().
Each example names its original project and source file.
You may also want to check out all available functions/classes of the module torchtext.data.
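Pipeline wraps a callable and applies it to a single value or, element-wise, to a list of values; pipelines compose via add_before and add_after. As a quick orientation before the examples, here is a minimal sketch, assuming the legacy torchtext API in which these classes live under torchtext.data (later releases moved them to torchtext.legacy.data before removing them):

from torchtext.data import Pipeline  # legacy torchtext data API

# With no callable, a Pipeline acts as the identity.
identity = Pipeline()
assert identity("Hello") == "Hello"

# A Pipeline built from a callable applies it to a string,
# or to each element of a list.
lower = Pipeline(str.lower)
assert lower("Hello") == "hello"
assert lower(["Foo", "Bar"]) == ["foo", "bar"]

# add_before/add_after compose pipelines: str.lower runs first here,
# then str.capitalize.
composed = Pipeline(str.lower)
composed.add_after(str.capitalize)
assert composed("teST") == "Test"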
Example #1
Source File: test_pipeline.py From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_composition(self):
    id_pipeline = data.Pipeline()
    pipeline = data.Pipeline(TestPipeline.repeat_n)
    pipeline.add_before(id_pipeline)
    pipeline.add_after(id_pipeline)
    pipeline.add_before(six.text_type.lower)
    pipeline.add_after(six.text_type.capitalize)
    other_pipeline = data.Pipeline(six.text_type.swapcase)
    other_pipeline.add_before(pipeline)

    # Assert pipeline gives proper results after composition
    # (test that we aren't modifying the pipes member)
    assert pipeline("teST") == "Testtesttest"
    assert pipeline(["ElE1", "eLe2"]) == ["Ele1ele1ele1", "Ele2ele2ele2"]

    # Assert that the pipeline we added to gives proper results
    assert other_pipeline("teST") == "tESTTESTTEST"
    assert other_pipeline(["ElE1", "eLe2"]) == ["eLE1ELE1ELE1", "eLE2ELE2ELE2"]
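The helper TestPipeline.repeat_n is not shown on this page; judging from the assertions here and in Example #6, it repeats its input n times with a default of n=3, so "teST" is lowercased, tripled, and finally capitalized, yielding "Testtesttest". A plausible sketch of the missing helper:

@staticmethod
def repeat_n(x, n=3):
    # Repeat the input n times; Pipeline applies this to each list element.
    return x * n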
Example #2
Source File: test_field.py From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_preprocess(self):
    # Default case.
    field = data.Field()
    assert field.preprocess("Test string.") == ["Test", "string."]

    # Test that lowercase is properly applied.
    field_lower = data.Field(lower=True)
    assert field_lower.preprocess("Test string.") == ["test", "string."]

    # Test that custom preprocessing pipelines are properly applied.
    preprocess_pipeline = data.Pipeline(lambda x: x + "!")
    field_preprocessing = data.Field(preprocessing=preprocess_pipeline, lower=True)
    assert field_preprocessing.preprocess("Test string.") == ["test!", "string.!"]

    # Test that non-sequential data is properly handled.
    field_not_sequential = data.Field(sequential=False, lower=True,
                                      preprocessing=preprocess_pipeline)
    assert field_not_sequential.preprocess("Test string.") == "test string.!"

    # Non-regression test that we do not try to decode unicode strings to unicode
    field_not_sequential = data.Field(sequential=False, lower=True,
                                      preprocessing=preprocess_pipeline)
    assert field_not_sequential.preprocess("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t!"
Example #3
Source File: classification_datasets.py From DiPS with Apache License 2.0
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
    """Create an MR dataset instance given a path and fields.

    Arguments:
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        path: Path to the data file.
        examples: The examples contain all the data.
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    # text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]
    if examples is None:
        path = self.dirname if path is None else path
        examples = []
        with codecs.open(os.path.join(path, 'rt-polarity.neg'), 'r', 'utf8') as f:
            examples += [data.Example.fromlist([line, 'negative'], fields) for line in f]
        with codecs.open(os.path.join(path, 'rt-polarity.pos'), 'r', 'utf8') as f:
            examples += [data.Example.fromlist([line, 'positive'], fields) for line in f]
    super(MR, self).__init__(examples, fields, **kwargs)
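A subtlety worth noting: as the test_preprocess examples above show, a Field's preprocessing Pipeline runs after tokenization and is applied to each token separately, so a sentence-level cleaner like the commented-out clean_str here (and the active ones in Examples #12 and #14-#16) ends up operating token by token rather than on the whole line.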
Example #4
Source File: test_pipeline.py From text with BSD 3-Clause "New" or "Revised" License
def test_composition(self):
    id_pipeline = data.Pipeline()
    pipeline = data.Pipeline(TestPipeline.repeat_n)
    pipeline.add_before(id_pipeline)
    pipeline.add_after(id_pipeline)
    pipeline.add_before(str.lower)
    pipeline.add_after(str.capitalize)
    other_pipeline = data.Pipeline(str.swapcase)
    other_pipeline.add_before(pipeline)

    # Assert pipeline gives proper results after composition
    # (test that we aren't modifying the pipes member)
    assert pipeline("teST") == "Testtesttest"
    assert pipeline(["ElE1", "eLe2"]) == ["Ele1ele1ele1", "Ele2ele2ele2"]

    # Assert that the pipeline we added to gives proper results
    assert other_pipeline("teST") == "tESTTESTTEST"
    assert other_pipeline(["ElE1", "eLe2"]) == ["eLE1ELE1ELE1", "eLE2ELE2ELE2"]
Example #5
Source File: test_field.py From text with BSD 3-Clause "New" or "Revised" License
def test_preprocess(self):
    # Default case.
    field = data.Field()
    assert field.preprocess("Test string.") == ["Test", "string."]

    # Test that lowercase is properly applied.
    field_lower = data.Field(lower=True)
    assert field_lower.preprocess("Test string.") == ["test", "string."]

    # Test that custom preprocessing pipelines are properly applied.
    preprocess_pipeline = data.Pipeline(lambda x: x + "!")
    field_preprocessing = data.Field(preprocessing=preprocess_pipeline, lower=True)
    assert field_preprocessing.preprocess("Test string.") == ["test!", "string.!"]

    # Test that non-sequential data is properly handled.
    field_not_sequential = data.Field(sequential=False, lower=True,
                                      preprocessing=preprocess_pipeline)
    assert field_not_sequential.preprocess("Test string.") == "test string.!"

    # Non-regression test that we do not try to decode unicode strings to unicode
    field_not_sequential = data.Field(sequential=False, lower=True,
                                      preprocessing=preprocess_pipeline)
    assert field_not_sequential.preprocess("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t!"
Example #6
Source File: test_pipeline.py From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_pipeline(self):
    id_pipeline = data.Pipeline()
    assert id_pipeline("Test STring") == "Test STring"
    assert id_pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T"
    assert id_pipeline(["1241", "Some String"]) == ["1241", "Some String"]

    pipeline = data.Pipeline(six.text_type.lower)
    assert pipeline("Test STring") == "test string"
    assert pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t"
    assert pipeline(["1241", "Some String"]) == ["1241", "some string"]

    args_pipeline = data.Pipeline(TestPipeline.repeat_n)
    assert args_pipeline("test", 5) == "testtesttesttesttest"
    assert args_pipeline(["ele1", "ele2"], 2) == ["ele1ele1", "ele2ele2"]
Example #7
Source File: test_pipeline.py From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_exceptions(self):
    with self.assertRaises(ValueError):
        data.Pipeline("Not Callable")
Example #8
Source File: test_pipeline.py From text with BSD 3-Clause "New" or "Revised" License
def test_pipeline(self):
    id_pipeline = data.Pipeline()
    assert id_pipeline("Test STring") == "Test STring"
    assert id_pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T"
    assert id_pipeline(["1241", "Some String"]) == ["1241", "Some String"]

    pipeline = data.Pipeline(str.lower)
    assert pipeline("Test STring") == "test string"
    assert pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t"
    assert pipeline(["1241", "Some String"]) == ["1241", "some string"]

    args_pipeline = data.Pipeline(TestPipeline.repeat_n)
    assert args_pipeline("test", 5) == "testtesttesttesttest"
    assert args_pipeline(["ele1", "ele2"], 2) == ["ele1ele1", "ele2ele2"]
Example #9
Source File: test_pipeline.py From text with BSD 3-Clause "New" or "Revised" License
def test_exceptions(self):
    with self.assertRaises(ValueError):
        data.Pipeline("Not Callable")
Example #10
Source File: bridge.py From castor with Apache License 2.0
def __init__(self, args):
    if not args.cuda:
        args.gpu = -1
    if torch.cuda.is_available() and args.cuda:
        print("Note: You are using GPU for training")
        torch.cuda.set_device(args.gpu)
        torch.cuda.manual_seed(args.seed)
    if torch.cuda.is_available() and not args.cuda:
        print("Warning: You have Cuda but do not use it. You are using CPU for training")
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    self.QID = data.Field(sequential=False)
    self.QUESTION = data.Field(batch_first=True)
    self.ANSWER = data.Field(batch_first=True)
    self.LABEL = data.Field(sequential=False)
    self.EXTERNAL = data.Field(sequential=True, tensor_type=torch.FloatTensor,
                               batch_first=True, use_vocab=False,
                               postprocessing=data.Pipeline(
                                   lambda arr, _, train: [float(y) for y in arr]))

    if 'TrecQA' in args.dataset:
        train, dev, test = TrecDataset.splits(self.QID, self.QUESTION,
                                              self.ANSWER, self.EXTERNAL, self.LABEL)
    elif 'WikiQA' in args.dataset:
        train, dev, test = WikiDataset.splits(self.QID, self.QUESTION,
                                              self.ANSWER, self.EXTERNAL, self.LABEL)
    else:
        print("Unsupported dataset")
        exit()

    self.QID.build_vocab(train, dev, test)
    self.QUESTION.build_vocab(train, dev, test)
    self.ANSWER.build_vocab(train, dev, test)
    self.LABEL.build_vocab(train, dev, test)

    if args.cuda:
        self.model = torch.load(args.model,
                                map_location=lambda storage, location: storage.cuda(args.gpu))
    else:
        self.model = torch.load(args.model,
                                map_location=lambda storage, location: storage)

    self.gpu = args.gpu
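The three-argument lambda for EXTERNAL reflects the old torchtext release this example targets: there, a Field's postprocessing Pipeline received the field's vocabulary and a train flag as extra arguments, and because Pipeline applies its callable element-wise, the lambda sees one example's list of raw values at a time. A hedged sketch of the same field with a named function (under that old API; tensor_type was renamed dtype in later releases):

import torch
from torchtext import data  # old torchtext data API, as used above


def to_floats(arr, vocab, train):
    # arr: one example's list of raw values; vocab is unused (use_vocab=False)
    # and train merely indicates whether this is the training split.
    return [float(y) for y in arr]

EXTERNAL = data.Field(sequential=True, tensor_type=torch.FloatTensor,
                      batch_first=True, use_vocab=False,
                      postprocessing=data.Pipeline(to_floats))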
Example #11
Source File: train_E2E.py From conv-emotion with MIT License
def get_E2E_loaders(path, valid=0.1, batch_size=32):
    utterance = data.Field(tokenize=tokenizer, lower=True)
    label = data.Field(sequential=False,
                       postprocessing=Pipeline(convert_token=convert_token))
    id = data.Field(use_vocab=False, sequential=False)
    fields = [('id', id), ('turn1', utterance), ('turn2', utterance),
              ('turn3', utterance), ('label', label)]
    train = data.TabularDataset('{}/train.txt'.format(path),
                                format='tsv', fields=fields, skip_header=True)
    valid = data.TabularDataset('{}/valid.txt'.format(path),
                                format='tsv', fields=fields, skip_header=True)
    test = data.TabularDataset('{}/test.txt'.format(path),
                               format='tsv', fields=fields, skip_header=True)
    vectors = vocab.Vectors(name='emojiplusglove.txt',
                            cache='/media/backup/nlp-cic/DialogueRNN/')
    utterance.build_vocab(train, valid, test, vectors=vectors)
    # utterance.build_vocab(train, valid, test, vectors='glove.840B.300d')
    label.build_vocab(train)
    train_iter = BucketIterator(train, train=True, batch_size=batch_size,
                                sort_key=lambda x: len(x.turn3),
                                device=torch.device(0))
    valid_iter = BucketIterator(valid, batch_size=batch_size,
                                sort_key=lambda x: len(x.turn3),
                                device=torch.device(0))
    test_iter = BucketIterator(test, batch_size=batch_size,
                               sort_key=lambda x: len(x.turn3),
                               device=torch.device(0))
    return (train_iter, valid_iter, test_iter,
            utterance.vocab.vectors if not args.cuda else utterance.vocab.vectors.cuda(),
            label.vocab.itos)
Example #12
Source File: mydatasets.py From pytorch-in-action with MIT License
def __init__(self, text_field, label_field, path=None, text_cnt=1000, examples=None, **kwargs):
    """Create an MR dataset instance given a path and fields.

    Arguments:
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        path: Path to the data file.
        examples: The examples contain all the data.
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip().lower()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]
    categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
    if examples is None:
        path = self.dirname if path is None else path
        examples = []
        for sub_path in categories:
            sub_path_one = os.path.join(path, sub_path)
            sub_paths_two = os.listdir(sub_path_one)
            cnt = 0
            for sub_path_two in sub_paths_two:
                lines = ""
                with open(os.path.join(sub_path_one, sub_path_two),
                          encoding="utf8", errors='ignore') as f:
                    lines = f.read()
                examples += [data.Example.fromlist([lines, sub_path], fields)]
                cnt += 1
    super(NEWS_20, self).__init__(examples, fields, **kwargs)
Example #13
Source File: inputs.py From torchnlp with Apache License 2.0
def get_input_processor_words(vocab_word, vocab_char=None, convert_digits=True):
    """
    Returns a function that converts text into a processed batch. Required
    during inference.

    Parameters:
        vocab_word: Instance of torchtext.Vocab for input word vocabulary
        vocab_char[optional]: Instance of torchtext.Vocab for input per-word
                              character vocabulary
        convert_digits: If True, will convert numbers to single 0's
    """
    inputs_word = data.Field(init_token="<bos>", eos_token="<eos>",
                             batch_first=True, lower=True,
                             preprocessing=data.Pipeline(
                                 lambda w: '0' if convert_digits and w.isdigit() else w))
    # Set the vocab object manually without building from training dataset
    inputs_word.vocab = vocab_word

    if vocab_char is not None:
        inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>",
                                         eos_token="<eos>", batch_first=True)
        inputs_char = data.NestedField(inputs_char_nesting,
                                       init_token="<bos>", eos_token="<eos>")
        # Set the vocab object manually without building from training dataset
        inputs_char.vocab = inputs_char_nesting.vocab = vocab_char
        fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))]
    else:
        fields = [('inputs_word', inputs_word)]

    def input_processor_fn(inputs):
        if not isinstance(inputs, list):
            inputs = [inputs]
        examples = []
        for line in inputs:
            examples.append(data.Example.fromlist([line], fields))
        dataset = data.Dataset(examples, fields)
        # Entire input in one batch
        return data.Batch(data=dataset, dataset=dataset,
                          device=torch.device("cuda:0" if torch.cuda.is_available()
                                              else "cpu"))

    return input_processor_fn
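A hedged usage sketch of the returned closure; vocab_word stands for a torchtext Vocab saved from training, and the input strings are purely illustrative:

# Hypothetical usage: vocab_word is restored from a trained model checkpoint.
process = get_input_processor_words(vocab_word)
batch = process(["The quick brown fox", "There were 7 of them"])
# batch.inputs_word is a LongTensor of shape (2, max_tokens + 2): +2 for the
# <bos>/<eos> markers, and "7" was rewritten to "0" by the convert_digits
# preprocessing Pipeline before the vocabulary lookup.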
Example #14
Source File: mydatasets.py From cnn-text-classification-pytorch with Apache License 2.0
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
    """Create an MR dataset instance given a path and fields.

    Arguments:
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        path: Path to the data file.
        examples: The examples contain all the data.
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]
    if examples is None:
        path = self.dirname if path is None else path
        examples = []
        with open(os.path.join(path, 'rt-polarity.neg'), errors='ignore') as f:
            examples += [data.Example.fromlist([line, 'negative'], fields) for line in f]
        with open(os.path.join(path, 'rt-polarity.pos'), errors='ignore') as f:
            examples += [data.Example.fromlist([line, 'positive'], fields) for line in f]
    super(MR, self).__init__(examples, fields, **kwargs)
Example #15
Source File: mydatasets.py From char-cnn-text-classification-pytorch with Apache License 2.0
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
    """Create an MR dataset instance given a path and fields.

    Arguments:
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        path: Path to the data file.
        examples: The examples contain all the data.
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]
    if examples is None:
        path = self.dirname if path is None else path
        examples = []
        with codecs.open(os.path.join(path, 'rt-polarity.neg'),
                         encoding='utf-8', errors='ignore') as f:
            examples += [data.Example.fromlist([line, 'negative'], fields) for line in f]
        with codecs.open(os.path.join(path, 'rt-polarity.pos'),
                         encoding='utf-8', errors='ignore') as f:
            examples += [data.Example.fromlist([line, 'positive'], fields) for line in f]
    super(MR, self).__init__(examples, fields, **kwargs)
Example #16
Source File: data_loader_txt.py From char-cnn-text-classification-pytorch with Apache License 2.0
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
    """Create an MR dataset instance given a path and fields.

    Arguments:
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        path: Path to the data file.
        examples: The examples contain all the data.
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]
    if examples is None:
        path = self.dirname if path is None else path
        examples = []
        with codecs.open(os.path.join(path, 'rt-polarity.neg'),
                         encoding='utf-8', errors='ignore') as f:
            examples += [data.Example.fromlist([line, 'negative'], fields) for line in f]
        with codecs.open(os.path.join(path, 'rt-polarity.pos'),
                         encoding='utf-8', errors='ignore') as f:
            examples += [data.Example.fromlist([line, 'positive'], fields) for line in f]
    super(MR, self).__init__(examples, fields, **kwargs)