Python torchtext.data.TabularDataset() Examples

The following are 30 code examples of torchtext.data.TabularDataset(), collected from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the torchtext.data module.
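Most of the examples below follow the same pattern: define one Field per column, load a delimited file with TabularDataset, build a vocabulary from the dataset, and batch it with an Iterator. Here is a minimal, self-contained sketch of that pattern using the legacy torchtext.data API shown in these examples; the file name train.tsv, its two-column layout, and the batch size are illustrative assumptions, not taken from any particular project.

from torchtext import data

# One Field per column; the label is kept as a raw integer target
# (no vocabulary), mirroring Example #2 below.
TEXT = data.Field(sequential=True, lower=True)
LABEL = data.Field(sequential=False, use_vocab=False, is_target=True)

# train.tsv is a hypothetical tab-separated file with two columns: text, label.
dataset = data.TabularDataset(path="train.tsv", format="tsv",
                              skip_header=True,
                              fields=[("text", TEXT), ("label", LABEL)])

# The vocabulary must be built before batches can be numericalized.
TEXT.build_vocab(dataset)

# Device conventions vary by torchtext version: older releases pass -1
# for CPU (as several examples below do), newer ones accept "cpu"/"cuda".
train_iter = data.Iterator(dataset, batch_size=32, train=True, repeat=False,
                           sort_key=lambda x: len(x.text), device="cpu")

for batch in train_iter:
    text, label = batch.text, batch.label  # padded LongTensor and target tensor
    break

Note that for csv/tsv files the fields argument is an ordered list of (name, Field) tuples matching the column order (pass None as the Field to skip a column), while for format="json" it is a dict mapping JSON keys, possibly nested via dots, to (name, Field) tuples, as Examples #13 and #22 show.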
Example #1
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_numericalize_basic(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]

        # Test default
        default_numericalized = question_field.numericalize(test_example_data)
        verify_numericalized_example(question_field, test_example_data,
                                     default_numericalized) 
Example #2
Source File: test_batch.py    From text with BSD 3-Clause "New" or "Revised" License
def test_batch_iter(self):
        self.write_test_numerical_features_dataset()
        FLOAT = data.Field(use_vocab=False, sequential=False,
                           dtype=torch.float)
        INT = data.Field(use_vocab=False, sequential=False, is_target=True)
        TEXT = data.Field(sequential=False)

        dst = data.TabularDataset(path=self.test_numerical_features_dataset_path,
                                  format="tsv", skip_header=False,
                                  fields=[("float", FLOAT),
                                          ("int", INT),
                                          ("text", TEXT)])
        TEXT.build_vocab(dst)
        itr = data.Iterator(dst, batch_size=2, device=-1, shuffle=False)
        fld_order = [k for k, v in dst.fields.items() if
                     v is not None and not v.is_target]
        batch = next(iter(itr))
        (x1, x2), y = batch
        x = (x1, x2)[fld_order.index("float")]
        self.assertEqual(y.data[0], 1)
        self.assertEqual(y.data[1], 12)
        self.assertAlmostEqual(x.data[0], 0.1, places=4)
        self.assertAlmostEqual(x.data[1], 0.5, places=4) 
Example #3
Source File: test_field.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_numericalize_include_lengths(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True, include_lengths=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]
        test_example_lengths = [8, 3, 7]

        # Test with include_lengths
        include_lengths_numericalized = question_field.numericalize(
            (test_example_data, test_example_lengths), device=-1)
        verify_numericalized_example(question_field,
                                     test_example_data,
                                     include_lengths_numericalized,
                                     test_example_lengths) 
Example #4
Source File: relation_task.py    From DIAG-NRE with MIT License
def init_test_set(self):
        test_file_path = self.config['test_file']
        print('Loading test set {}'.format(test_file_path))
        self.test_set = tt_data.TabularDataset(path=test_file_path,
                                               format='csv',
                                               fields=[('Id', self.ID),
                                                       ('Text', self.TEXT),
                                                       ('Pos1', self.POS),
                                                       ('Pos2', self.POS),
                                                       ('Label', self.LABEL)],
                                               skip_header=False)
        self.test_iter = tt_data.Iterator(self.test_set,
                                          sort_key=lambda x: len(x.Text),
                                          batch_size=self.config['test_batch_size'],
                                          train=False,
                                          repeat=False,
                                          sort_within_batch=True,
                                          device=self.device) 
Example #5
Source File: relation_task.py    From DIAG-NRE with MIT License
def init_dev_set(self):
        dev_file_path = self.config['dev_file']
        print('Loading dev set from {}'.format(dev_file_path))
        self.dev_set = tt_data.TabularDataset(path=dev_file_path,
                                              format='csv',
                                              fields=[('Id', self.ID),
                                                      ('Text', self.TEXT),
                                                      ('Pos1', self.POS),
                                                      ('Pos2', self.POS),
                                                      ('Label', self.LABEL)],
                                              skip_header=False)
        self.dev_iter = tt_data.Iterator(self.dev_set,
                                         sort_key=lambda x: len(x.Text),
                                         batch_size=self.config['test_batch_size'],
                                         train=False,
                                         repeat=False,
                                         sort_within_batch=True,
                                         device=self.device) 
Example #6
Source File: test_field.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_errors(self):
        # Test that passing a non-tuple (of data and length) to numericalize
        # with Field.include_lengths = True raises an error.
        with self.assertRaises(ValueError):
            self.write_test_ppid_dataset(data_format="tsv")
            question_field = data.Field(sequential=True, include_lengths=True)
            tsv_fields = [("id", None), ("q1", question_field),
                          ("q2", question_field), ("label", None)]
            tsv_dataset = data.TabularDataset(
                path=self.test_ppid_dataset_path, format="tsv",
                fields=tsv_fields)
            question_field.build_vocab(tsv_dataset)
            test_example_data = [["When", "do", "you", "use", "シ",
                                  "instead", "of", "し?"],
                                 ["What", "is", "2+2", "<pad>", "<pad>",
                                  "<pad>", "<pad>", "<pad>"],
                                 ["Here", "is", "a", "sentence", "with",
                                  "some", "oovs", "<pad>"]]
            question_field.numericalize(
                test_example_data, device=-1) 
Example #7
Source File: relation_task.py    From DIAG-NRE with MIT License
def init_train_set(self):
        set_all_random_seed(self.config['random_seed'])
        train_file_path = self.config['train_file']
        print('Loading train set from {}'.format(train_file_path))
        self.train_set = tt_data.TabularDataset(path=train_file_path,
                                                format='csv',
                                                fields=[('Id', self.ID),
                                                        ('Text', self.TEXT),
                                                        ('Pos1', self.POS),
                                                        ('Pos2', self.POS),
                                                        ('Label', self.TRAIN_LABEL)],
                                                skip_header=False)
        self.train_iter = tt_data.Iterator(self.train_set,
                                           sort_key=lambda x: len(x.Text),
                                           batch_size=self.config['train_batch_size'],
                                           train=True,
                                           repeat=False,
                                           sort_within_batch=True,
                                           device=self.device) 
Example #8
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_numericalize_include_lengths(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True, include_lengths=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]
        test_example_lengths = [8, 3, 7]

        # Test with include_lengths
        include_lengths_numericalized = question_field.numericalize(
            (test_example_data, test_example_lengths))
        verify_numericalized_example(question_field,
                                     test_example_data,
                                     include_lengths_numericalized,
                                     test_example_lengths) 
Example #9
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_vocab_size(self):
        # Set up fields
        question_field = data.Field(sequential=True)
        label_field = data.LabelField()

        # Copied from test_build_vocab with minor changes
        # Write TSV dataset and construct a Dataset
        self.write_test_ppid_dataset(data_format="tsv")
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", label_field)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)

        # Skipping json dataset as we can rely on the original build vocab test
        label_field.build_vocab(tsv_dataset)
        assert label_field.vocab.freqs == Counter({'1': 2, '0': 1})
        expected_stoi = {'1': 0, '0': 1}  # No <unk>
        assert dict(label_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                              key=lambda tup: tup[1])]
        assert label_field.vocab.itos == expected_itos 
Example #10
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_numericalize_batch_first(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True, batch_first=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]

        # Test with batch_first
        include_lengths_numericalized = question_field.numericalize(
            test_example_data)
        verify_numericalized_example(question_field,
                                     test_example_data,
                                     include_lengths_numericalized,
                                     batch_first=True) 
Example #11
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_errors(self):
        # Test that passing a non-tuple (of data and length) to numericalize
        # with Field.include_lengths = True raises an error.
        with self.assertRaises(ValueError):
            self.write_test_ppid_dataset(data_format="tsv")
            question_field = data.Field(sequential=True, include_lengths=True)
            tsv_fields = [("id", None), ("q1", question_field),
                          ("q2", question_field), ("label", None)]
            tsv_dataset = data.TabularDataset(
                path=self.test_ppid_dataset_path, format="tsv",
                fields=tsv_fields)
            question_field.build_vocab(tsv_dataset)
            test_example_data = [["When", "do", "you", "use", "シ",
                                  "instead", "of", "し?"],
                                 ["What", "is", "2+2", "<pad>", "<pad>",
                                  "<pad>", "<pad>", "<pad>"],
                                 ["Here", "is", "a", "sentence", "with",
                                  "some", "oovs", "<pad>"]]
            question_field.numericalize(
                test_example_data) 
Example #12
Source File: test_dataset.py    From text with BSD 3-Clause "New" or "Revised" License
def test_csv_dataset_quotechar(self):
        # Based on issue #349
        example_data = [("text", "label"),
                        ('" hello world', "0"),
                        ('goodbye " world', "1"),
                        ('this is a pen " ', "0")]

        with tempfile.NamedTemporaryFile(dir=self.test_dir) as f:
            for example in example_data:
                f.write("{}\n".format(",".join(example)).encode("latin-1"))

            TEXT = data.Field(lower=True, tokenize=lambda x: x.split())
            fields = {
                "label": ("label", data.Field(use_vocab=False,
                                              sequential=False)),
                "text": ("text", TEXT)
            }

            f.seek(0)

            dataset = data.TabularDataset(
                path=f.name, format="csv",
                skip_header=False, fields=fields,
                csv_reader_params={"quotechar": None})

            TEXT.build_vocab(dataset)

            self.assertEqual(len(dataset), len(example_data) - 1)

            for i, example in enumerate(dataset):
                self.assertEqual(example.text,
                                 example_data[i + 1][0].lower().split())
                self.assertEqual(example.label, example_data[i + 1][1]) 
Example #13
Source File: test_dataset.py    From text with BSD 3-Clause "New" or "Revised" License
def test_json_valid_and_invalid_nested_key(self):
        self.write_test_nested_key_json_dataset()
        valid_fields = {'foods.vegetables.name': ('vegs', data.Field()),
                        'foods.fruits': ('fruits', data.Field())}
        invalid_fields = {'foods.vegetables.color': ('vegs', data.Field())}

        expected_examples = [
            {"fruits": ["Apple", "Banana"],
             "vegs": ["Broccoli", "Cabbage"]},
            {"fruits": ["Cherry", "Grape", "Lemon"],
             "vegs": ["Cucumber", "Lettuce"]},
            {"fruits": ["Orange", "Pear", "Strawberry"],
             "vegs": ["Marrow", "Spinach"]}
        ]
        dataset = data.TabularDataset(
            path=self.test_nested_key_json_dataset_path,
            format="json",
            fields=valid_fields)
        # check results
        for example, expect in zip(dataset.examples, expected_examples):
            self.assertEqual(example.vegs, expect['vegs'])
            self.assertEqual(example.fruits, expect['fruits'])

        with self.assertRaises(ValueError):
            data.TabularDataset(
                path=self.test_nested_key_json_dataset_path,
                format="json",
                fields=invalid_fields) 
Example #14
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_numericalize_stop_words(self):
        # Based on request from #354
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True, batch_first=True,
                                    stop_words=set(["do", "you"]))
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = question_field.pad(
            [question_field.preprocess(x) for x in
             [["When", "do", "you", "use", "シ",
               "instead", "of", "し?"],
              ["What", "is", "2+2", "<pad>", "<pad>",
               "<pad>", "<pad>", "<pad>"],
              ["Here", "is", "a", "sentence", "with",
               "some", "oovs", "<pad>"]]]
        )

        # Test with batch_first
        stopwords_removed_numericalized = question_field.numericalize(test_example_data)
        verify_numericalized_example(question_field,
                                     test_example_data,
                                     stopwords_removed_numericalized,
                                     batch_first=True) 
Example #15
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_numerical_features_no_vocab(self):
        self.write_test_numerical_features_dataset()
        # Test basic usage
        int_field = data.Field(sequential=False, use_vocab=False)
        float_field = data.Field(sequential=False, use_vocab=False,
                                 dtype=torch.float)
        tsv_fields = [("int", int_field), ("float", float_field), ("string", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_numerical_features_dataset_path, format="tsv",
            fields=tsv_fields)
        int_field.build_vocab(tsv_dataset)
        float_field.build_vocab(tsv_dataset)
        test_int_data = ["1", "0", "1", "3", "19"]
        test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

        numericalized_int = int_field.numericalize(test_int_data)
        self.assertEqual(numericalized_int.data, [1, 0, 1, 3, 19])
        numericalized_float = float_field.numericalize(test_float_data)
        self.assertEqual(numericalized_float.data, [1.1, 0.1, 3.91, 0.2, 10.2])

        # Test with postprocessing applied
        int_field = data.Field(sequential=False, use_vocab=False,
                               postprocessing=lambda arr, _: [x + 1 for x in arr])
        float_field = data.Field(sequential=False, use_vocab=False,
                                 dtype=torch.float,
                                 postprocessing=lambda arr, _: [x * 0.5 for x in arr])
        tsv_fields = [("int", int_field), ("float", float_field), ("string", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_numerical_features_dataset_path, format="tsv",
            fields=tsv_fields)
        int_field.build_vocab(tsv_dataset)
        float_field.build_vocab(tsv_dataset)
        test_int_data = ["1", "0", "1", "3", "19"]
        test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

        numericalized_int = int_field.numericalize(test_int_data)
        self.assertEqual(numericalized_int.data, [2, 1, 2, 4, 20])
        numericalized_float = float_field.numericalize(test_float_data)
        self.assertEqual(numericalized_float.data, [0.55, 0.05, 1.955, 0.1, 5.1]) 
Example #16
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_serialization_built_vocab(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)

        question_field.build_vocab(tsv_dataset)

        question_pickle_filename = "question.pl"
        question_pickle_path = os.path.join(self.test_dir, question_pickle_filename)
        torch.save(question_field, question_pickle_path)

        loaded_question_field = torch.load(question_pickle_path)

        assert loaded_question_field == question_field

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]

        # Test results of numericalization
        original_numericalization = question_field.numericalize(test_example_data)
        pickled_numericalization = loaded_question_field.numericalize(test_example_data)

        assert torch.all(torch.eq(original_numericalization, pickled_numericalization)) 
Example #17
Source File: tool.py    From lightKG with Apache License 2.0
def get_dataset(self, path: str, fields=Fields, file_type='csv', skip_header=False):
        logger.info('loading dataset from {}'.format(path))
        rl_dataset = TabularDataset(path, format=file_type, fields=fields, skip_header=skip_header)
        logger.info('successfully loaded dataset')
        return rl_dataset 
Example #18
Source File: relation_task.py    From DIAG-NRE with MIT License
def init_heldout_test_set(self):
        # TODO: change this into input arguments
        data_dir_path = os.path.dirname(self.config['test_file'])
        heldout_test_file_path = os.path.join(data_dir_path, 'nyt_heldout_test.csv')
        heldout_test_entitypair_fp = os.path.join(data_dir_path, 'nyt_heldout_test_entitypair.csv')

        def read_entity_pair_info(entitypair_file_path):
            tmp_df = pd.read_csv(entitypair_file_path, header=None)
            tmp_df.columns = ['span1_guid', 'span2_guid', 'span1', 'span2']
            entitypair_infos = tmp_df.to_dict(orient='records')
            entity_pairs = []
            for ep_info in entitypair_infos:
                entity_pairs.append((ep_info['span1_guid'], ep_info['span2_guid']))

            return entity_pairs

        print('Loading heldout test set {}'.format(heldout_test_file_path))
        self.heldout_test_set = tt_data.TabularDataset(path=heldout_test_file_path,
                                                       format='csv',
                                                       fields=[('Id', self.ID),
                                                               ('Text', self.TEXT),
                                                               ('Pos1', self.POS),
                                                               ('Pos2', self.POS),
                                                               ('Label', self.LABEL)],
                                                       skip_header=False)
        self.heldout_entity_pairs = read_entity_pair_info(heldout_test_entitypair_fp)
        self.heldout_test_iter = tt_data.Iterator(self.heldout_test_set,
                                                  sort_key=lambda x: len(x.Text),
                                                  batch_size=self.config['test_batch_size'],
                                                  train=False,
                                                  repeat=False,
                                                  sort_within_batch=True,
                                                  device=self.device) 
Example #19
Source File: dataset_reader.py    From nlp-experiments-in-pytorch with MIT License
def read_dataset(self, batch_size=128, split_ratio=0.7, format="tsv"):
        sf, nlf, clf = self.create_fields()
        if self.task == "classification":
            dataset = data.TabularDataset(path=self.data_path,
                                          format=format,
                                          skip_header=True,
                                          fields=[("category_labels", clf),
                                                  ("ner_labels", None),
                                                  ("sentence", sf)])
        elif self.task == "ner":
            dataset = data.TabularDataset(path=self.data_path,
                                          format=format,
                                          skip_header=True,
                                          fields=[("category_labels", None),
                                                  ("ner_labels", nlf),
                                                  ("sentence", sf)])
        else:
            raise ValueError("Training task is not defined! It can be 'classification' or 'ner'")

        logger.info("Splitting dataset into train/dev/test")
        train, val, test = self.create_splits(dataset, split_ratio)
        logger.info("Splitting done!")
        logger.info("Creating vocabulary")
        self.create_vocabs(dataset, sf, clf, nlf)
        logger.info("Vocabulary created!")
        logger.info("Creating iterators")
        self.create_iterator(train, val, test, batch_size)
        return train, val, test 
Example #20
Source File: dataset.py    From pytorch-sentiment-analysis-classification with MIT License
def __init__(self, root_dir='data', batch_size=64, use_vector=True):
        self.TEXT = Field(sequential=True, use_vocab=True,
                          tokenize='spacy', lower=True, batch_first=True)
        self.LABEL = LabelField(tensor_type=torch.FloatTensor)
        vectors = Vectors(name='mr_vocab.txt', cache='./')

        dataset_path = os.path.join(root_dir, '{}.tsv')
        self.dataset = {}
        self.dataloader = {}
        for target in ['train', 'dev', 'test']:
            self.dataset[target] = TabularDataset(
                path=dataset_path.format(target),
                format='tsv',
                fields=[('text', self.TEXT), ('label', self.LABEL)]
            )
            if use_vector:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000, vectors=vectors)
            else:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000)

            self.LABEL.build_vocab(self.dataset[target])
            self.dataloader[target] = Iterator(self.dataset[target],
                                               batch_size=batch_size,
                                               device=None,
                                               repeat=False,
                                               sort_key=lambda x: len(x.text),
                                               shuffle=True) 
Example #21
Source File: tool.py    From lightNLP with Apache License 2.0
def get_dataset(self, path: str, fields=Fields, file_type='tsv', skip_header=True):
        logger.info('loading dataset from {}'.format(path))
        st_dataset = TabularDataset(path, format=file_type, fields=fields, skip_header=skip_header)
        logger.info('successfully loaded dataset')
        return st_dataset 
Example #22
Source File: test_dataset.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_json_dataset_one_key_multiple_fields(self):
        self.write_test_ppid_dataset(data_format="json")

        question_field = data.Field(sequential=True)
        spacy_tok_question_field = data.Field(sequential=True, tokenize="spacy")
        label_field = data.Field(sequential=False)
        fields = {"question1": [("q1", question_field),
                                ("q1_spacy", spacy_tok_question_field)],
                  "question2": [("q2", question_field),
                                ("q2_spacy", spacy_tok_question_field)],
                  "label": ("label", label_field)}
        dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="json", fields=fields)
        expected_examples = [
            (["When", "do", "you", "use", "シ", "instead", "of", "し?"],
             ["When", "do", "you", "use", "シ", "instead", "of", "し", "?"],
             ["When", "do", "you", "use", "\"&\"",
              "instead", "of", "\"and\"?"],
             ["When", "do", "you", "use", "\"", "&", "\"",
              "instead", "of", "\"", "and", "\"", "?"], "0"),
            (["Where", "was", "Lincoln", "born?"],
             ["Where", "was", "Lincoln", "born", "?"],
             ["Which", "location", "was", "Abraham", "Lincoln", "born?"],
             ["Which", "location", "was", "Abraham", "Lincoln", "born", "?"],
             "1"),
            (["What", "is", "2+2"], ["What", "is", "2", "+", "2"],
             ["2+2=?"], ["2", "+", "2=", "?"], "1")]
        for i, example in enumerate(dataset):
            self.assertEqual(example.q1, expected_examples[i][0])
            self.assertEqual(example.q1_spacy, expected_examples[i][1])
            self.assertEqual(example.q2, expected_examples[i][2])
            self.assertEqual(example.q2_spacy, expected_examples[i][3])
            self.assertEqual(example.label, expected_examples[i][4]) 
Example #23
Source File: test_dataset.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_errors(self):
        # Ensure that trying to retrieve a key not in JSON data errors
        self.write_test_ppid_dataset(data_format="json")

        question_field = data.Field(sequential=True)
        label_field = data.Field(sequential=False)
        fields = {"qeustion1": ("q1", question_field),
                  "question2": ("q2", question_field),
                  "label": ("label", label_field)}

        with self.assertRaises(ValueError):
            data.TabularDataset(
                path=self.test_ppid_dataset_path, format="json", fields=fields) 
Example #24
Source File: test_field.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_numericalize_basic(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]

        # Test default
        default_numericalized = question_field.numericalize(
            test_example_data, device=-1)
        verify_numericalized_example(question_field, test_example_data,
                                     default_numericalized)
        # Test with train=False
        volatile_numericalized = question_field.numericalize(
            test_example_data, device=-1, train=False)
        verify_numericalized_example(question_field, test_example_data,
                                     volatile_numericalized, train=False) 
Example #25
Source File: test_field.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_numericalize_postprocessing(self):
        self.write_test_ppid_dataset(data_format="tsv")

        def reverse_postprocess(arr, vocab, train):
            return [list(reversed(sentence)) for sentence in arr]

        question_field = data.Field(sequential=True,
                                    postprocessing=reverse_postprocess)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]

        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]
        reversed_test_example_data = [list(reversed(sentence)) for sentence in
                                      test_example_data]

        postprocessed_numericalized = question_field.numericalize(
            test_example_data, device=-1)
        verify_numericalized_example(question_field,
                                     reversed_test_example_data,
                                     postprocessed_numericalized) 
Example #26
Source File: test_field.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_numerical_features_no_vocab(self):
        self.write_test_numerical_features_dataset()
        # Test basic usage
        int_field = data.Field(sequential=False, use_vocab=False)
        float_field = data.Field(sequential=False, use_vocab=False,
                                 tensor_type=torch.FloatTensor)
        tsv_fields = [("int", int_field), ("float", float_field), ("string", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_numerical_features_dataset_path, format="tsv",
            fields=tsv_fields)
        int_field.build_vocab(tsv_dataset)
        float_field.build_vocab(tsv_dataset)
        test_int_data = ["1", "0", "1", "3", "19"]
        test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

        numericalized_int = int_field.numericalize(test_int_data, device=-1)
        assert_allclose(numericalized_int.data.numpy(), [1, 0, 1, 3, 19])
        numericalized_float = float_field.numericalize(test_float_data, device=-1)
        assert_allclose(numericalized_float.data.numpy(), [1.1, 0.1, 3.91, 0.2, 10.2])

        # Test with postprocessing applied
        int_field = data.Field(sequential=False, use_vocab=False,
                               postprocessing=lambda arr, _, __: [x + 1 for x in arr])
        float_field = data.Field(sequential=False, use_vocab=False,
                                 tensor_type=torch.FloatTensor,
                                 postprocessing=lambda arr, _, __: [x * 0.5 for x in arr])
        tsv_fields = [("int", int_field), ("float", float_field), ("string", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_numerical_features_dataset_path, format="tsv",
            fields=tsv_fields)
        int_field.build_vocab(tsv_dataset)
        float_field.build_vocab(tsv_dataset)
        test_int_data = ["1", "0", "1", "3", "19"]
        test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

        numericalized_int = int_field.numericalize(test_int_data, device=-1)
        assert_allclose(numericalized_int.data.numpy(), [2, 1, 2, 4, 20])
        numericalized_float = float_field.numericalize(test_float_data, device=-1)
        assert_allclose(numericalized_float.data.numpy(), [0.55, 0.05, 1.955, 0.1, 5.1]) 
Example #27
Source File: Process.py    From Transformer with Apache License 2.0
def create_dataset(opt, SRC, TRG):

    print("creating dataset and iterator... ")

    raw_data = {'src' : [line for line in opt.src_data], 'trg': [line for line in opt.trg_data]}
    df = pd.DataFrame(raw_data, columns=["src", "trg"])
    
    mask = (df['src'].str.count(' ') < opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
    df = df.loc[mask]

    df.to_csv("translate_transformer_temp.csv", index=False)
    
    data_fields = [('src', SRC), ('trg', TRG)]
    train = data.TabularDataset('./translate_transformer_temp.csv', format='csv', fields=data_fields)

    train_iter = MyIterator(train, batch_size=opt.batchsize, device=opt.device,
                        repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                        batch_size_fn=batch_size_fn, train=True, shuffle=True)
    
    os.remove('translate_transformer_temp.csv')

    if opt.load_weights is None:
        SRC.build_vocab(train)
        TRG.build_vocab(train)
        if opt.checkpoint > 0:
            try:
                os.mkdir("weights")
            except FileExistsError:
                print("weights folder already exists, run program with -load_weights weights to load them")
                quit()
            pickle.dump(SRC, open('weights/SRC.pkl', 'wb'))
            pickle.dump(TRG, open('weights/TRG.pkl', 'wb'))

    opt.src_pad = SRC.vocab.stoi['<pad>']
    opt.trg_pad = TRG.vocab.stoi['<pad>']

    opt.train_len = get_len(train_iter)

    return train_iter 
Example #28
Source File: tool.py    From lightNLP with Apache License 2.0
def get_dataset(self, path: str, fields=Fields, file_type='tsv', skip_header=True):
        logger.info('loading dataset from {}'.format(path))
        te_dataset = TabularDataset(path, format=file_type, fields=fields, skip_header=skip_header)
        logger.info('successfully loaded dataset')
        return te_dataset 
Example #29
Source File: tool.py    From lightNLP with Apache License 2.0
def get_dataset(self, path: str, fields=Fields, file_type='tsv', skip_header=True):
        logger.info('loading dataset from {}'.format(path))
        st_dataset = TabularDataset(path, format=file_type, fields=fields, skip_header=skip_header)
        logger.info('successfully loaded dataset')
        return st_dataset 
Example #30
Source File: test_dataset.py    From text with BSD 3-Clause "New" or "Revised" License
def test_errors(self):
        # Ensure that trying to retrieve a key not in JSON data errors
        self.write_test_ppid_dataset(data_format="json")

        question_field = data.Field(sequential=True)
        label_field = data.Field(sequential=False)
        fields = {"qeustion1": ("q1", question_field),
                  "question2": ("q2", question_field),
                  "label": ("label", label_field)}

        with self.assertRaises(ValueError):
            data.TabularDataset(
                path=self.test_ppid_dataset_path, format="json", fields=fields)