Python torchtext.data.TabularDataset() Examples
The following are 30 code examples of torchtext.data.TabularDataset(), collected from open-source projects. Each example names its original project and source file, so you can follow it back to its full context. You may also want to check out all available functions and classes of the torchtext.data module.
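Before the examples, here is a minimal sketch of the typical TabularDataset workflow; the file names and column layout below are invented for illustration. For csv/tsv files, fields is a list of (attribute name, Field) pairs given in column order; for json files, it is a dict mapping each JSON key to such a pair.

from torchtext import data

# Fields define how each column is tokenized and numericalized;
# columns paired with None are skipped entirely.
TEXT = data.Field(sequential=True, lower=True)
LABEL = data.LabelField()

# TSV/CSV: fields is a list of (name, Field) pairs, one per column.
tsv_dataset = data.TabularDataset(
    path="reviews.tsv",    # hypothetical file with "text<TAB>label" rows
    format="tsv",
    fields=[("text", TEXT), ("label", LABEL)])

# JSON lines: fields is a dict of {json key: (name, Field)}.
json_dataset = data.TabularDataset(
    path="reviews.json",   # hypothetical file, one JSON object per line
    format="json",
    fields={"text": ("text", TEXT), "label": ("label", LABEL)})

# Vocabularies must be built before numericalizing or iterating.
TEXT.build_vocab(tsv_dataset)
LABEL.build_vocab(tsv_dataset)

train_iter = data.Iterator(tsv_dataset, batch_size=32,
                           sort_key=lambda x: len(x.text))

Note that the examples below target the classic torchtext.data API; in torchtext 0.9 through 0.11 these classes lived under torchtext.legacy.data, and they were removed entirely in 0.12.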
Example #1
Source File: test_field.py (from text, BSD 3-Clause "New" or "Revised" License)

def test_numericalize_basic(self):
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)
    question_field.build_vocab(tsv_dataset)

    test_example_data = [["When", "do", "you", "use", "シ",
                          "instead", "of", "し?"],
                         ["What", "is", "2+2", "<pad>", "<pad>",
                          "<pad>", "<pad>", "<pad>"],
                         ["Here", "is", "a", "sentence", "with",
                          "some", "oovs", "<pad>"]]

    # Test default
    default_numericalized = question_field.numericalize(test_example_data)
    verify_numericalized_example(question_field, test_example_data,
                                 default_numericalized)
Example #2
Source File: test_batch.py (from text, BSD 3-Clause "New" or "Revised" License)

def test_batch_iter(self):
    self.write_test_numerical_features_dataset()
    FLOAT = data.Field(use_vocab=False, sequential=False,
                       dtype=torch.float)
    INT = data.Field(use_vocab=False, sequential=False, is_target=True)
    TEXT = data.Field(sequential=False)

    dst = data.TabularDataset(path=self.test_numerical_features_dataset_path,
                              format="tsv", skip_header=False,
                              fields=[("float", FLOAT),
                                      ("int", INT),
                                      ("text", TEXT)])
    TEXT.build_vocab(dst)
    itr = data.Iterator(dst, batch_size=2, device=-1, shuffle=False)
    fld_order = [k for k, v in dst.fields.items()
                 if v is not None and not v.is_target]
    batch = next(iter(itr))
    (x1, x2), y = batch
    x = (x1, x2)[fld_order.index("float")]

    self.assertEquals(y.data[0], 1)
    self.assertEquals(y.data[1], 12)
    self.assertAlmostEqual(x.data[0], 0.1, places=4)
    self.assertAlmostEqual(x.data[1], 0.5, places=4)
Example #3
Source File: test_field.py (from decaNLP, BSD 3-Clause "New" or "Revised" License)

def test_numericalize_include_lengths(self):
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True, include_lengths=True)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)
    question_field.build_vocab(tsv_dataset)

    test_example_data = [["When", "do", "you", "use", "シ",
                          "instead", "of", "し?"],
                         ["What", "is", "2+2", "<pad>", "<pad>",
                          "<pad>", "<pad>", "<pad>"],
                         ["Here", "is", "a", "sentence", "with",
                          "some", "oovs", "<pad>"]]
    test_example_lengths = [8, 3, 7]

    # Test with include_lengths
    include_lengths_numericalized = question_field.numericalize(
        (test_example_data, test_example_lengths), device=-1)
    verify_numericalized_example(question_field,
                                 test_example_data,
                                 include_lengths_numericalized,
                                 test_example_lengths)
Example #4
Source File: relation_task.py (from DIAG-NRE, MIT License)

def init_test_set(self):
    test_file_path = self.config['test_file']
    print('Loading test set {}'.format(test_file_path))

    self.test_set = tt_data.TabularDataset(path=test_file_path,
                                           format='csv',
                                           fields=[('Id', self.ID),
                                                   ('Text', self.TEXT),
                                                   ('Pos1', self.POS),
                                                   ('Pos2', self.POS),
                                                   ('Label', self.LABEL)],
                                           skip_header=False)

    self.test_iter = tt_data.Iterator(self.test_set,
                                      sort_key=lambda x: len(x.Text),
                                      batch_size=self.config['test_batch_size'],
                                      train=False,
                                      repeat=False,
                                      sort_within_batch=True,
                                      device=self.device)
Example #5
Source File: relation_task.py (from DIAG-NRE, MIT License)

def init_dev_set(self):
    dev_file_path = self.config['dev_file']
    print('Loading dev set from {}'.format(dev_file_path))

    self.dev_set = tt_data.TabularDataset(path=dev_file_path,
                                          format='csv',
                                          fields=[('Id', self.ID),
                                                  ('Text', self.TEXT),
                                                  ('Pos1', self.POS),
                                                  ('Pos2', self.POS),
                                                  ('Label', self.LABEL)],
                                          skip_header=False)

    self.dev_iter = tt_data.Iterator(self.dev_set,
                                     sort_key=lambda x: len(x.Text),
                                     batch_size=self.config['test_batch_size'],
                                     train=False,
                                     repeat=False,
                                     sort_within_batch=True,
                                     device=self.device)
Example #6
Source File: test_field.py (from decaNLP, BSD 3-Clause "New" or "Revised" License)

def test_errors(self):
    # Test that passing a non-tuple (of data and length) to numericalize
    # with Field.include_lengths = True raises an error.
    with self.assertRaises(ValueError):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True, include_lengths=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)
        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]
        question_field.numericalize(
            test_example_data, device=-1)
Example #7
Source File: relation_task.py (from DIAG-NRE, MIT License)

def init_train_set(self):
    set_all_random_seed(self.config['random_seed'])

    train_file_path = self.config['train_file']
    print('Loading train set from {}'.format(train_file_path))

    self.train_set = tt_data.TabularDataset(path=train_file_path,
                                            format='csv',
                                            fields=[('Id', self.ID),
                                                    ('Text', self.TEXT),
                                                    ('Pos1', self.POS),
                                                    ('Pos2', self.POS),
                                                    ('Label', self.TRAIN_LABEL)],
                                            skip_header=False)

    self.train_iter = tt_data.Iterator(self.train_set,
                                       sort_key=lambda x: len(x.Text),
                                       batch_size=self.config['train_batch_size'],
                                       train=True,
                                       repeat=False,
                                       sort_within_batch=True,
                                       device=self.device)
Example #8
Source File: test_field.py (from text, BSD 3-Clause "New" or "Revised" License)

def test_numericalize_include_lengths(self):
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True, include_lengths=True)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)
    question_field.build_vocab(tsv_dataset)

    test_example_data = [["When", "do", "you", "use", "シ",
                          "instead", "of", "し?"],
                         ["What", "is", "2+2", "<pad>", "<pad>",
                          "<pad>", "<pad>", "<pad>"],
                         ["Here", "is", "a", "sentence", "with",
                          "some", "oovs", "<pad>"]]
    test_example_lengths = [8, 3, 7]

    # Test with include_lengths
    include_lengths_numericalized = question_field.numericalize(
        (test_example_data, test_example_lengths))
    verify_numericalized_example(question_field,
                                 test_example_data,
                                 include_lengths_numericalized,
                                 test_example_lengths)
Example #9
Source File: test_field.py (from text, BSD 3-Clause "New" or "Revised" License)

def test_vocab_size(self):
    # Set up fields
    question_field = data.Field(sequential=True)
    label_field = data.LabelField()

    # Copied from test_build_vocab with minor changes
    # Write TSV dataset and construct a Dataset
    self.write_test_ppid_dataset(data_format="tsv")
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", label_field)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)

    # Skipping json dataset as we can rely on the original build vocab test
    label_field.build_vocab(tsv_dataset)
    assert label_field.vocab.freqs == Counter({'1': 2, '0': 1})
    expected_stoi = {'1': 0, '0': 1}  # No <unk>
    assert dict(label_field.vocab.stoi) == expected_stoi
    # Turn the stoi dictionary into an itos list
    expected_itos = [x[0] for x in
                     sorted(expected_stoi.items(), key=lambda tup: tup[1])]
    assert label_field.vocab.itos == expected_itos
Example #10
Source File: test_field.py (from text, BSD 3-Clause "New" or "Revised" License)

def test_numericalize_batch_first(self):
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True, batch_first=True)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)
    question_field.build_vocab(tsv_dataset)

    test_example_data = [["When", "do", "you", "use", "シ",
                          "instead", "of", "し?"],
                         ["What", "is", "2+2", "<pad>", "<pad>",
                          "<pad>", "<pad>", "<pad>"],
                         ["Here", "is", "a", "sentence", "with",
                          "some", "oovs", "<pad>"]]

    # Test with batch_first
    include_lengths_numericalized = question_field.numericalize(
        test_example_data)
    verify_numericalized_example(question_field,
                                 test_example_data,
                                 include_lengths_numericalized,
                                 batch_first=True)
Example #11
Source File: test_field.py (from text, BSD 3-Clause "New" or "Revised" License)

def test_errors(self):
    # Test that passing a non-tuple (of data and length) to numericalize
    # with Field.include_lengths = True raises an error.
    with self.assertRaises(ValueError):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True, include_lengths=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)
        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]
        question_field.numericalize(test_example_data)
Example #12
Source File: test_dataset.py (from text, BSD 3-Clause "New" or "Revised" License)

def test_csv_dataset_quotechar(self):
    # Based on issue #349
    example_data = [("text", "label"),
                    ('" hello world', "0"),
                    ('goodbye " world', "1"),
                    ('this is a pen " ', "0")]

    with tempfile.NamedTemporaryFile(dir=self.test_dir) as f:
        for example in example_data:
            f.write("{}\n".format(",".join(example)).encode("latin-1"))

        TEXT = data.Field(lower=True, tokenize=lambda x: x.split())
        fields = {
            "label": ("label", data.Field(use_vocab=False,
                                          sequential=False)),
            "text": ("text", TEXT)
        }

        f.seek(0)

        dataset = data.TabularDataset(
            path=f.name, format="csv",
            skip_header=False, fields=fields,
            csv_reader_params={"quotechar": None})

        TEXT.build_vocab(dataset)

        self.assertEqual(len(dataset), len(example_data) - 1)

        for i, example in enumerate(dataset):
            self.assertEqual(example.text,
                             example_data[i + 1][0].lower().split())
            self.assertEqual(example.label, example_data[i + 1][1])
Example #13
Source File: test_dataset.py (from text, BSD 3-Clause "New" or "Revised" License)

def test_json_valid_and_invalid_nested_key(self):
    self.write_test_nested_key_json_dataset()
    valid_fields = {'foods.vegetables.name': ('vegs', data.Field()),
                    'foods.fruits': ('fruits', data.Field())}
    invalid_fields = {'foods.vegetables.color': ('vegs', data.Field())}

    expected_examples = [
        {"fruits": ["Apple", "Banana"],
         "vegs": ["Broccoli", "Cabbage"]},
        {"fruits": ["Cherry", "Grape", "Lemon"],
         "vegs": ["Cucumber", "Lettuce"]},
        {"fruits": ["Orange", "Pear", "Strawberry"],
         "vegs": ["Marrow", "Spinach"]}
    ]
    dataset = data.TabularDataset(
        path=self.test_nested_key_json_dataset_path,
        format="json",
        fields=valid_fields)
    # check results
    for example, expect in zip(dataset.examples, expected_examples):
        self.assertEqual(example.vegs, expect['vegs'])
        self.assertEqual(example.fruits, expect['fruits'])

    with self.assertRaises(ValueError):
        data.TabularDataset(
            path=self.test_nested_key_json_dataset_path,
            format="json",
            fields=invalid_fields)
Example #14
Source File: test_field.py (from text, BSD 3-Clause "New" or "Revised" License)

def test_numericalize_stop_words(self):
    # Based on request from #354
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True, batch_first=True,
                                stop_words=set(["do", "you"]))
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)
    question_field.build_vocab(tsv_dataset)

    test_example_data = question_field.pad(
        [question_field.preprocess(x) for x in
         [["When", "do", "you", "use", "シ",
           "instead", "of", "し?"],
          ["What", "is", "2+2", "<pad>", "<pad>",
           "<pad>", "<pad>", "<pad>"],
          ["Here", "is", "a", "sentence", "with",
           "some", "oovs", "<pad>"]]]
    )

    # Test with batch_first
    stopwords_removed_numericalized = question_field.numericalize(
        test_example_data)
    verify_numericalized_example(question_field,
                                 test_example_data,
                                 stopwords_removed_numericalized,
                                 batch_first=True)
Example #15
Source File: test_field.py (from text, BSD 3-Clause "New" or "Revised" License)

def test_numerical_features_no_vocab(self):
    self.write_test_numerical_features_dataset()
    # Test basic usage
    int_field = data.Field(sequential=False, use_vocab=False)
    float_field = data.Field(sequential=False, use_vocab=False,
                             dtype=torch.float)
    tsv_fields = [("int", int_field), ("float", float_field),
                  ("string", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_numerical_features_dataset_path,
        format="tsv", fields=tsv_fields)
    int_field.build_vocab(tsv_dataset)
    float_field.build_vocab(tsv_dataset)
    test_int_data = ["1", "0", "1", "3", "19"]
    test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

    numericalized_int = int_field.numericalize(test_int_data)
    self.assertEqual(numericalized_int.data, [1, 0, 1, 3, 19])
    numericalized_float = float_field.numericalize(test_float_data)
    self.assertEqual(numericalized_float.data, [1.1, 0.1, 3.91, 0.2, 10.2])

    # Test with postprocessing applied
    int_field = data.Field(sequential=False, use_vocab=False,
                           postprocessing=lambda arr, _: [x + 1 for x in arr])
    float_field = data.Field(sequential=False, use_vocab=False,
                             dtype=torch.float,
                             postprocessing=lambda arr, _: [x * 0.5 for x in arr])
    tsv_fields = [("int", int_field), ("float", float_field),
                  ("string", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_numerical_features_dataset_path,
        format="tsv", fields=tsv_fields)
    int_field.build_vocab(tsv_dataset)
    float_field.build_vocab(tsv_dataset)
    test_int_data = ["1", "0", "1", "3", "19"]
    test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

    numericalized_int = int_field.numericalize(test_int_data)
    self.assertEqual(numericalized_int.data, [2, 1, 2, 4, 20])
    numericalized_float = float_field.numericalize(test_float_data)
    self.assertEqual(numericalized_float.data, [0.55, 0.05, 1.955, 0.1, 5.1])
Example #16
Source File: test_field.py (from text, BSD 3-Clause "New" or "Revised" License)

def test_serialization_built_vocab(self):
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)

    question_field.build_vocab(tsv_dataset)

    question_pickle_filename = "question.pl"
    question_pickle_path = os.path.join(self.test_dir,
                                        question_pickle_filename)
    torch.save(question_field, question_pickle_path)

    loaded_question_field = torch.load(question_pickle_path)

    assert loaded_question_field == question_field

    test_example_data = [["When", "do", "you", "use", "シ",
                          "instead", "of", "し?"],
                         ["What", "is", "2+2", "<pad>", "<pad>",
                          "<pad>", "<pad>", "<pad>"],
                         ["Here", "is", "a", "sentence", "with",
                          "some", "oovs", "<pad>"]]

    # Test results of numericalization
    original_numericalization = question_field.numericalize(test_example_data)
    pickled_numericalization = loaded_question_field.numericalize(test_example_data)

    assert torch.all(torch.eq(original_numericalization,
                              pickled_numericalization))
Example #17
Source File: tool.py (from lightKG, Apache License 2.0)

def get_dataset(self, path: str, fields=Fields, file_type='csv',
                skip_header=False):
    logger.info('loading dataset from {}'.format(path))
    rl_dataset = TabularDataset(path, format=file_type, fields=fields,
                                skip_header=skip_header)
    logger.info('successfully loaded dataset')
    return rl_dataset
Example #18
Source File: relation_task.py (from DIAG-NRE, MIT License)

def init_heldout_test_set(self):
    # TODO: change this into input arguments
    data_dir_path = os.path.dirname(self.config['test_file'])
    heldout_test_file_path = os.path.join(data_dir_path,
                                          'nyt_heldout_test.csv')
    heldout_test_entitypair_fp = os.path.join(data_dir_path,
                                              'nyt_heldout_test_entitypair.csv')

    def read_entity_pair_info(entitypair_file_path):
        tmp_df = pd.read_csv(entitypair_file_path, header=None)
        tmp_df.columns = ['span1_guid', 'span2_guid', 'span1', 'span2']
        entitypair_infos = tmp_df.to_dict(orient='records')
        entity_pairs = []
        for ep_info in entitypair_infos:
            entity_pairs.append((ep_info['span1_guid'], ep_info['span2_guid']))
        return entity_pairs

    print('Loading heldout test set {}'.format(heldout_test_file_path))
    self.heldout_test_set = tt_data.TabularDataset(path=heldout_test_file_path,
                                                   format='csv',
                                                   fields=[('Id', self.ID),
                                                           ('Text', self.TEXT),
                                                           ('Pos1', self.POS),
                                                           ('Pos2', self.POS),
                                                           ('Label', self.LABEL)],
                                                   skip_header=False)
    self.heldout_entity_pairs = read_entity_pair_info(heldout_test_entitypair_fp)

    self.heldout_test_iter = tt_data.Iterator(self.heldout_test_set,
                                              sort_key=lambda x: len(x.Text),
                                              batch_size=self.config['test_batch_size'],
                                              train=False,
                                              repeat=False,
                                              sort_within_batch=True,
                                              device=self.device)
Example #19
Source File: dataset_reader.py (from nlp-experiments-in-pytorch, MIT License)

def read_dataset(self, batch_size=128, split_ratio=0.7, format="tsv"):
    sf, nlf, clf = self.create_fields()
    if self.task == "classification":
        dataset = data.TabularDataset(path=self.data_path,
                                      format=format,
                                      skip_header=True,
                                      fields=[("category_labels", clf),
                                              ("ner_labels", None),
                                              ("sentence", sf)])
    elif self.task == "ner":
        dataset = data.TabularDataset(path=self.data_path,
                                      format=format,
                                      skip_header=True,
                                      fields=[("category_labels", None),
                                              ("ner_labels", nlf),
                                              ("sentence", sf)])
    else:
        raise ValueError("Training task is not defined! "
                         "It can be 'classification' or 'ner'")

    logger.info("Splitting dataset into train/dev/test")
    train, val, test = self.create_splits(dataset, split_ratio)
    logger.info("Splitting done!")

    logger.info("Creating vocabulary")
    self.create_vocabs(dataset, sf, clf, nlf)
    logger.info("Vocabulary created!")

    logger.info("Creating iterators")
    self.create_iterator(train, val, test, batch_size)

    return train, val, test
Example #20
Source File: dataset.py (from pytorch-sentiment-analysis-classification, MIT License)

def __init__(self, root_dir='data', batch_size=64, use_vector=True):
    self.TEXT = Field(sequential=True, use_vocab=True,
                      tokenize='spacy', lower=True, batch_first=True)
    self.LABEL = LabelField(tensor_type=torch.FloatTensor)
    vectors = Vectors(name='mr_vocab.txt', cache='./')

    dataset_path = os.path.join(root_dir, '{}.tsv')
    self.dataset = {}
    self.dataloader = {}
    for target in ['train', 'dev', 'test']:
        self.dataset[target] = TabularDataset(
            path=dataset_path.format(target),
            format='tsv',
            fields=[('text', self.TEXT), ('label', self.LABEL)]
        )
        if use_vector:
            self.TEXT.build_vocab(self.dataset[target],
                                  max_size=25000, vectors=vectors)
        else:
            self.TEXT.build_vocab(self.dataset[target], max_size=25000)

        self.LABEL.build_vocab(self.dataset[target])
        self.dataloader[target] = Iterator(self.dataset[target],
                                           batch_size=batch_size,
                                           device=None,
                                           repeat=False,
                                           sort_key=lambda x: len(x.text),
                                           shuffle=True)
Example #21
Source File: tool.py (from lightNLP, Apache License 2.0)

def get_dataset(self, path: str, fields=Fields, file_type='tsv',
                skip_header=True):
    logger.info('loading dataset from {}'.format(path))
    st_dataset = TabularDataset(path, format=file_type, fields=fields,
                                skip_header=skip_header)
    logger.info('successfully loaded dataset')
    return st_dataset
Example #22
Source File: test_dataset.py (from decaNLP, BSD 3-Clause "New" or "Revised" License)

def test_json_dataset_one_key_multiple_fields(self):
    self.write_test_ppid_dataset(data_format="json")

    question_field = data.Field(sequential=True)
    spacy_tok_question_field = data.Field(sequential=True, tokenize="spacy")
    label_field = data.Field(sequential=False)
    fields = {"question1": [("q1", question_field),
                            ("q1_spacy", spacy_tok_question_field)],
              "question2": [("q2", question_field),
                            ("q2_spacy", spacy_tok_question_field)],
              "label": ("label", label_field)}
    dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="json",
        fields=fields)
    expected_examples = [
        (["When", "do", "you", "use", "シ", "instead", "of", "し?"],
         ["When", "do", "you", "use", "シ", "instead", "of", "し", "?"],
         ["When", "do", "you", "use", "\"&\"", "instead", "of", "\"and\"?"],
         ["When", "do", "you", "use", "\"", "&", "\"", "instead", "of",
          "\"", "and", "\"", "?"], "0"),
        (["Where", "was", "Lincoln", "born?"],
         ["Where", "was", "Lincoln", "born", "?"],
         ["Which", "location", "was", "Abraham", "Lincoln", "born?"],
         ["Which", "location", "was", "Abraham", "Lincoln", "born", "?"],
         "1"),
        (["What", "is", "2+2"], ["What", "is", "2", "+", "2"],
         ["2+2=?"], ["2", "+", "2=", "?"], "1")]
    for i, example in enumerate(dataset):
        self.assertEqual(example.q1, expected_examples[i][0])
        self.assertEqual(example.q1_spacy, expected_examples[i][1])
        self.assertEqual(example.q2, expected_examples[i][2])
        self.assertEqual(example.q2_spacy, expected_examples[i][3])
        self.assertEqual(example.label, expected_examples[i][4])
Example #23
Source File: test_dataset.py (from decaNLP, BSD 3-Clause "New" or "Revised" License)

def test_errors(self):
    # Ensure that trying to retrieve a key not in JSON data errors
    self.write_test_ppid_dataset(data_format="json")

    question_field = data.Field(sequential=True)
    label_field = data.Field(sequential=False)
    # "qeustion1" is misspelled on purpose: the nonexistent key
    # should trigger the ValueError.
    fields = {"qeustion1": ("q1", question_field),
              "question2": ("q2", question_field),
              "label": ("label", label_field)}

    with self.assertRaises(ValueError):
        data.TabularDataset(
            path=self.test_ppid_dataset_path, format="json",
            fields=fields)
Example #24
Source File: test_field.py (from decaNLP, BSD 3-Clause "New" or "Revised" License)

def test_numericalize_basic(self):
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)
    question_field.build_vocab(tsv_dataset)

    test_example_data = [["When", "do", "you", "use", "シ",
                          "instead", "of", "し?"],
                         ["What", "is", "2+2", "<pad>", "<pad>",
                          "<pad>", "<pad>", "<pad>"],
                         ["Here", "is", "a", "sentence", "with",
                          "some", "oovs", "<pad>"]]

    # Test default
    default_numericalized = question_field.numericalize(
        test_example_data, device=-1)
    verify_numericalized_example(question_field, test_example_data,
                                 default_numericalized)

    # Test with train=False
    volatile_numericalized = question_field.numericalize(
        test_example_data, device=-1, train=False)
    verify_numericalized_example(question_field, test_example_data,
                                 volatile_numericalized, train=False)
Example #25
Source File: test_field.py (from decaNLP, BSD 3-Clause "New" or "Revised" License)

def test_numericalize_postprocessing(self):
    self.write_test_ppid_dataset(data_format="tsv")

    def reverse_postprocess(arr, vocab, train):
        return [list(reversed(sentence)) for sentence in arr]

    question_field = data.Field(sequential=True,
                                postprocessing=reverse_postprocess)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]

    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)
    question_field.build_vocab(tsv_dataset)

    test_example_data = [["When", "do", "you", "use", "シ",
                          "instead", "of", "し?"],
                         ["What", "is", "2+2", "<pad>", "<pad>",
                          "<pad>", "<pad>", "<pad>"],
                         ["Here", "is", "a", "sentence", "with",
                          "some", "oovs", "<pad>"]]
    reversed_test_example_data = [list(reversed(sentence)) for sentence in
                                  test_example_data]

    postprocessed_numericalized = question_field.numericalize(
        (test_example_data), device=-1)
    verify_numericalized_example(question_field,
                                 reversed_test_example_data,
                                 postprocessed_numericalized)
Example #26
Source File: test_field.py (from decaNLP, BSD 3-Clause "New" or "Revised" License)

def test_numerical_features_no_vocab(self):
    self.write_test_numerical_features_dataset()
    # Test basic usage
    int_field = data.Field(sequential=False, use_vocab=False)
    float_field = data.Field(sequential=False, use_vocab=False,
                             tensor_type=torch.FloatTensor)
    tsv_fields = [("int", int_field), ("float", float_field),
                  ("string", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_numerical_features_dataset_path,
        format="tsv", fields=tsv_fields)
    int_field.build_vocab(tsv_dataset)
    float_field.build_vocab(tsv_dataset)
    test_int_data = ["1", "0", "1", "3", "19"]
    test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

    numericalized_int = int_field.numericalize(test_int_data, device=-1)
    assert_allclose(numericalized_int.data.numpy(), [1, 0, 1, 3, 19])
    numericalized_float = float_field.numericalize(test_float_data, device=-1)
    assert_allclose(numericalized_float.data.numpy(),
                    [1.1, 0.1, 3.91, 0.2, 10.2])

    # Test with postprocessing applied
    int_field = data.Field(sequential=False, use_vocab=False,
                           postprocessing=lambda arr, _, __: [x + 1 for x in arr])
    float_field = data.Field(sequential=False, use_vocab=False,
                             tensor_type=torch.FloatTensor,
                             postprocessing=lambda arr, _, __: [x * 0.5 for x in arr])
    tsv_fields = [("int", int_field), ("float", float_field),
                  ("string", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_numerical_features_dataset_path,
        format="tsv", fields=tsv_fields)
    int_field.build_vocab(tsv_dataset)
    float_field.build_vocab(tsv_dataset)
    test_int_data = ["1", "0", "1", "3", "19"]
    test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

    numericalized_int = int_field.numericalize(test_int_data, device=-1)
    assert_allclose(numericalized_int.data.numpy(), [2, 1, 2, 4, 20])
    numericalized_float = float_field.numericalize(test_float_data, device=-1)
    assert_allclose(numericalized_float.data.numpy(),
                    [0.55, 0.05, 1.955, 0.1, 5.1])
Example #27
Source File: Process.py (from Transformer, Apache License 2.0)

def create_dataset(opt, SRC, TRG):
    print("creating dataset and iterator... ")

    raw_data = {'src': [line for line in opt.src_data],
                'trg': [line for line in opt.trg_data]}
    df = pd.DataFrame(raw_data, columns=["src", "trg"])

    mask = ((df['src'].str.count(' ') < opt.max_strlen)
            & (df['trg'].str.count(' ') < opt.max_strlen))
    df = df.loc[mask]

    df.to_csv("translate_transformer_temp.csv", index=False)

    data_fields = [('src', SRC), ('trg', TRG)]
    train = data.TabularDataset('./translate_transformer_temp.csv',
                                format='csv', fields=data_fields)

    train_iter = MyIterator(train, batch_size=opt.batchsize,
                            device=opt.device, repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True,
                            shuffle=True)

    os.remove('translate_transformer_temp.csv')

    if opt.load_weights is None:
        SRC.build_vocab(train)
        TRG.build_vocab(train)
        if opt.checkpoint > 0:
            try:
                os.mkdir("weights")
            except:
                print("weights folder already exists, "
                      "run program with -load_weights weights to load them")
                quit()
            pickle.dump(SRC, open('weights/SRC.pkl', 'wb'))
            pickle.dump(TRG, open('weights/TRG.pkl', 'wb'))

    opt.src_pad = SRC.vocab.stoi['<pad>']
    opt.trg_pad = TRG.vocab.stoi['<pad>']

    opt.train_len = get_len(train_iter)

    return train_iter
Example #28
Source File: tool.py (from lightNLP, Apache License 2.0)

def get_dataset(self, path: str, fields=Fields, file_type='tsv',
                skip_header=True):
    logger.info('loading dataset from {}'.format(path))
    te_dataset = TabularDataset(path, format=file_type, fields=fields,
                                skip_header=skip_header)
    logger.info('successfully loaded dataset')
    return te_dataset
Example #29
Source File: tool.py (from lightNLP, Apache License 2.0)

def get_dataset(self, path: str, fields=Fields, file_type='tsv',
                skip_header=True):
    logger.info('loading dataset from {}'.format(path))
    st_dataset = TabularDataset(path, format=file_type, fields=fields,
                                skip_header=skip_header)
    logger.info('successfully loaded dataset')
    return st_dataset
Example #30
Source File: test_dataset.py (from text, BSD 3-Clause "New" or "Revised" License)

def test_errors(self):
    # Ensure that trying to retrieve a key not in JSON data errors
    self.write_test_ppid_dataset(data_format="json")

    question_field = data.Field(sequential=True)
    label_field = data.Field(sequential=False)
    # "qeustion1" is misspelled on purpose: the nonexistent key
    # should trigger the ValueError.
    fields = {"qeustion1": ("q1", question_field),
              "question2": ("q2", question_field),
              "label": ("label", label_field)}

    with self.assertRaises(ValueError):
        data.TabularDataset(
            path=self.test_ppid_dataset_path, format="json",
            fields=fields)