Python torchtext.data.TabularDataset() Examples

The following are 30 code examples of torchtext.data.TabularDataset(), collected from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the torchtext.data module.
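Most of the examples below follow the same pattern: define one Field per column, load a delimited file with TabularDataset, build a vocabulary from the dataset, and batch it with an Iterator. Here is a minimal, self-contained sketch of that pattern using the legacy torchtext.data API shown in these examples; the file name train.tsv, its two-column layout, and the batch size are illustrative assumptions, not taken from any particular project.

from torchtext import data

# One Field per column; the label is kept as a raw integer target
# (no vocabulary), mirroring Example #2 below.
TEXT = data.Field(sequential=True, lower=True)
LABEL = data.Field(sequential=False, use_vocab=False, is_target=True)

# train.tsv is a hypothetical tab-separated file with two columns: text, label.
dataset = data.TabularDataset(path="train.tsv", format="tsv",
                              skip_header=True,
                              fields=[("text", TEXT), ("label", LABEL)])

# The vocabulary must be built before batches can be numericalized.
TEXT.build_vocab(dataset)

# Device conventions vary by torchtext version: older releases pass -1
# for CPU (as several examples below do), newer ones accept "cpu"/"cuda".
train_iter = data.Iterator(dataset, batch_size=32, train=True, repeat=False,
                           sort_key=lambda x: len(x.text), device="cpu")

for batch in train_iter:
    text, label = batch.text, batch.label  # padded LongTensor and target tensor
    break

Note that for csv/tsv files the fields argument is an ordered list of (name, Field) tuples matching the column order (pass None as the Field to skip a column), while for format="json" it is a dict mapping JSON keys, possibly nested via dots, to (name, Field) tuples, as Examples #13 and #22 show.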
Example #1
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_numericalize_basic(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]

        # Test default
        default_numericalized = question_field.numericalize(test_example_data)
        verify_numericalized_example(question_field, test_example_data,
                                     default_numericalized) 
Example #2
Source File: test_batch.py    From text with BSD 3-Clause "New" or "Revised" License
def test_batch_iter(self):
        self.write_test_numerical_features_dataset()
        FLOAT = data.Field(use_vocab=False, sequential=False,
                           dtype=torch.float)
        INT = data.Field(use_vocab=False, sequential=False, is_target=True)
        TEXT = data.Field(sequential=False)

        dst = data.TabularDataset(path=self.test_numerical_features_dataset_path,
                                  format="tsv", skip_header=False,
                                  fields=[("float", FLOAT),
                                          ("int", INT),
                                          ("text", TEXT)])
        TEXT.build_vocab(dst)
        itr = data.Iterator(dst, batch_size=2, device=-1, shuffle=False)
        fld_order = [k for k, v in dst.fields.items() if
                     v is not None and not v.is_target]
        batch = next(iter(itr))
        (x1, x2), y = batch
        x = (x1, x2)[fld_order.index("float")]
        self.assertEqual(y.data[0], 1)
        self.assertEqual(y.data[1], 12)
        self.assertAlmostEqual(x.data[0], 0.1, places=4)
        self.assertAlmostEqual(x.data[1], 0.5, places=4) 
Example #3
Source File: test_field.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_numericalize_include_lengths(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True, include_lengths=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]
        test_example_lengths = [8, 3, 7]

        # Test with include_lengths
        include_lengths_numericalized = question_field.numericalize(
            (test_example_data, test_example_lengths), device=-1)
        verify_numericalized_example(question_field,
                                     test_example_data,
                                     include_lengths_numericalized,
                                     test_example_lengths) 
Example #4
Source File: relation_task.py    From DIAG-NRE with MIT License
def init_test_set(self):
        test_file_path = self.config['test_file']
        print('Loading test set {}'.format(test_file_path))
        self.test_set = tt_data.TabularDataset(path=test_file_path,
                                               format='csv',
                                               fields=[('Id', self.ID),
                                                       ('Text', self.TEXT),
                                                       ('Pos1', self.POS),
                                                       ('Pos2', self.POS),
                                                       ('Label', self.LABEL)],
                                               skip_header=False)
        self.test_iter = tt_data.Iterator(self.test_set,
                                          sort_key=lambda x: len(x.Text),
                                          batch_size=self.config['test_batch_size'],
                                          train=False,
                                          repeat=False,
                                          sort_within_batch=True,
                                          device=self.device) 
Example #5
Source File: relation_task.py    From DIAG-NRE with MIT License
def init_dev_set(self):
        dev_file_path = self.config['dev_file']
        print('Loading dev set from {}'.format(dev_file_path))
        self.dev_set = tt_data.TabularDataset(path=dev_file_path,
                                              format='csv',
                                              fields=[('Id', self.ID),
                                                      ('Text', self.TEXT),
                                                      ('Pos1', self.POS),
                                                      ('Pos2', self.POS),
                                                      ('Label', self.LABEL)],
                                              skip_header=False)
        self.dev_iter = tt_data.Iterator(self.dev_set,
                                         sort_key=lambda x: len(x.Text),
                                         batch_size=self.config['test_batch_size'],
                                         train=False,
                                         repeat=False,
                                         sort_within_batch=True,
                                         device=self.device) 
Example #6
Source File: test_field.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_errors(self):
        # Test that passing a non-tuple (of data and length) to numericalize
        # with Field.include_lengths = True raises an error.
        with self.assertRaises(ValueError):
            self.write_test_ppid_dataset(data_format="tsv")
            question_field = data.Field(sequential=True, include_lengths=True)
            tsv_fields = [("id", None), ("q1", question_field),
                          ("q2", question_field), ("label", None)]
            tsv_dataset = data.TabularDataset(
                path=self.test_ppid_dataset_path, format="tsv",
                fields=tsv_fields)
            question_field.build_vocab(tsv_dataset)
            test_example_data = [["When", "do", "you", "use", "シ",
                                  "instead", "of", "し?"],
                                 ["What", "is", "2+2", "<pad>", "<pad>",
                                  "<pad>", "<pad>", "<pad>"],
                                 ["Here", "is", "a", "sentence", "with",
                                  "some", "oovs", "<pad>"]]
            question_field.numericalize(
                test_example_data, device=-1) 
Example #7
Source File: relation_task.py    From DIAG-NRE with MIT License
def init_train_set(self):
        set_all_random_seed(self.config['random_seed'])
        train_file_path = self.config['train_file']
        print('Loading train set from {}'.format(train_file_path))
        self.train_set = tt_data.TabularDataset(path=train_file_path,
                                                format='csv',
                                                fields=[('Id', self.ID),
                                                        ('Text', self.TEXT),
                                                        ('Pos1', self.POS),
                                                        ('Pos2', self.POS),
                                                        ('Label', self.TRAIN_LABEL)],
                                                skip_header=False)
        self.train_iter = tt_data.Iterator(self.train_set,
                                           sort_key=lambda x: len(x.Text),
                                           batch_size=self.config['train_batch_size'],
                                           train=True,
                                           repeat=False,
                                           sort_within_batch=True,
                                           device=self.device) 
Example #8
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_numericalize_include_lengths(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True, include_lengths=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]
        test_example_lengths = [8, 3, 7]

        # Test with include_lengths
        include_lengths_numericalized = question_field.numericalize(
            (test_example_data, test_example_lengths))
        verify_numericalized_example(question_field,
                                     test_example_data,
                                     include_lengths_numericalized,
                                     test_example_lengths) 
Example #9
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_vocab_size(self):
        # Set up fields
        question_field = data.Field(sequential=True)
        label_field = data.LabelField()

        # Copied from test_build_vocab with minor changes
        # Write TSV dataset and construct a Dataset
        self.write_test_ppid_dataset(data_format="tsv")
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", label_field)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)

        # Skipping json dataset as we can rely on the original build vocab test
        label_field.build_vocab(tsv_dataset)
        assert label_field.vocab.freqs == Counter({'1': 2, '0': 1})
        expected_stoi = {'1': 0, '0': 1}  # No <unk>
        assert dict(label_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                              key=lambda tup: tup[1])]
        assert label_field.vocab.itos == expected_itos 
Example #10
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_numericalize_batch_first(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True, batch_first=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]

        # Test with batch_first
        include_lengths_numericalized = question_field.numericalize(
            test_example_data)
        verify_numericalized_example(question_field,
                                     test_example_data,
                                     include_lengths_numericalized,
                                     batch_first=True) 
Example #11
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_errors(self):
        # Test that passing a non-tuple (of data and length) to numericalize
        # with Field.include_lengths = True raises an error.
        with self.assertRaises(ValueError):
            self.write_test_ppid_dataset(data_format="tsv")
            question_field = data.Field(sequential=True, include_lengths=True)
            tsv_fields = [("id", None), ("q1", question_field),
                          ("q2", question_field), ("label", None)]
            tsv_dataset = data.TabularDataset(
                path=self.test_ppid_dataset_path, format="tsv",
                fields=tsv_fields)
            question_field.build_vocab(tsv_dataset)
            test_example_data = [["When", "do", "you", "use", "シ",
                                  "instead", "of", "し?"],
                                 ["What", "is", "2+2", "<pad>", "<pad>",
                                  "<pad>", "<pad>", "<pad>"],
                                 ["Here", "is", "a", "sentence", "with",
                                  "some", "oovs", "<pad>"]]
            question_field.numericalize(
                test_example_data) 
Example #12
Source File: test_dataset.py    From text with BSD 3-Clause "New" or "Revised" License
def test_csv_dataset_quotechar(self):
        # Based on issue #349
        example_data = [("text", "label"),
                        ('" hello world', "0"),
                        ('goodbye " world', "1"),
                        ('this is a pen " ', "0")]

        with tempfile.NamedTemporaryFile(dir=self.test_dir) as f:
            for example in example_data:
                f.write("{}\n".format(",".join(example)).encode("latin-1"))

            TEXT = data.Field(lower=True, tokenize=lambda x: x.split())
            fields = {
                "label": ("label", data.Field(use_vocab=False,
                                              sequential=False)),
                "text": ("text", TEXT)
            }

            f.seek(0)

            dataset = data.TabularDataset(
                path=f.name, format="csv",
                skip_header=False, fields=fields,
                csv_reader_params={"quotechar": None})

            TEXT.build_vocab(dataset)

            self.assertEqual(len(dataset), len(example_data) - 1)

            for i, example in enumerate(dataset):
                self.assertEqual(example.text,
                                 example_data[i + 1][0].lower().split())
                self.assertEqual(example.label, example_data[i + 1][1]) 
Example #13
Source File: test_dataset.py    From text with BSD 3-Clause "New" or "Revised" License
def test_json_valid_and_invalid_nested_key(self):
        self.write_test_nested_key_json_dataset()
        valid_fields = {'foods.vegetables.name': ('vegs', data.Field()),
                        'foods.fruits': ('fruits', data.Field())}
        invalid_fields = {'foods.vegetables.color': ('vegs', data.Field())}

        expected_examples = [
            {"fruits": ["Apple", "Banana"],
             "vegs": ["Broccoli", "Cabbage"]},
            {"fruits": ["Cherry", "Grape", "Lemon"],
             "vegs": ["Cucumber", "Lettuce"]},
            {"fruits": ["Orange", "Pear", "Strawberry"],
             "vegs": ["Marrow", "Spinach"]}
        ]
        dataset = data.TabularDataset(
            path=self.test_nested_key_json_dataset_path,
            format="json",
            fields=valid_fields)
        # check results
        for example, expect in zip(dataset.examples, expected_examples):
            self.assertEqual(example.vegs, expect['vegs'])
            self.assertEqual(example.fruits, expect['fruits'])

        with self.assertRaises(ValueError):
            data.TabularDataset(
                path=self.test_nested_key_json_dataset_path,
                format="json",
                fields=invalid_fields) 
Example #14
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_numericalize_stop_words(self):
        # Based on request from #354
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True, batch_first=True,
                                    stop_words=set(["do", "you"]))
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = question_field.pad(
            [question_field.preprocess(x) for x in
             [["When", "do", "you", "use", "シ",
               "instead", "of", "し?"],
              ["What", "is", "2+2", "<pad>", "<pad>",
               "<pad>", "<pad>", "<pad>"],
              ["Here", "is", "a", "sentence", "with",
               "some", "oovs", "<pad>"]]]
        )

        # Test with batch_first
        stopwords_removed_numericalized = question_field.numericalize(test_example_data)
        verify_numericalized_example(question_field,
                                     test_example_data,
                                     stopwords_removed_numericalized,
                                     batch_first=True) 
Example #15
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_numerical_features_no_vocab(self):
        self.write_test_numerical_features_dataset()
        # Test basic usage
        int_field = data.Field(sequential=False, use_vocab=False)
        float_field = data.Field(sequential=False, use_vocab=False,
                                 dtype=torch.float)
        tsv_fields = [("int", int_field), ("float", float_field), ("string", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_numerical_features_dataset_path, format="tsv",
            fields=tsv_fields)
        int_field.build_vocab(tsv_dataset)
        float_field.build_vocab(tsv_dataset)
        test_int_data = ["1", "0", "1", "3", "19"]
        test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

        numericalized_int = int_field.numericalize(test_int_data)
        self.assertEqual(numericalized_int.data, [1, 0, 1, 3, 19])
        numericalized_float = float_field.numericalize(test_float_data)
        self.assertEqual(numericalized_float.data, [1.1, 0.1, 3.91, 0.2, 10.2])

        # Test with postprocessing applied
        int_field = data.Field(sequential=False, use_vocab=False,
                               postprocessing=lambda arr, _: [x + 1 for x in arr])
        float_field = data.Field(sequential=False, use_vocab=False,
                                 dtype=torch.float,
                                 postprocessing=lambda arr, _: [x * 0.5 for x in arr])
        tsv_fields = [("int", int_field), ("float", float_field), ("string", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_numerical_features_dataset_path, format="tsv",
            fields=tsv_fields)
        int_field.build_vocab(tsv_dataset)
        float_field.build_vocab(tsv_dataset)
        test_int_data = ["1", "0", "1", "3", "19"]
        test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

        numericalized_int = int_field.numericalize(test_int_data)
        self.assertEqual(numericalized_int.data, [2, 1, 2, 4, 20])
        numericalized_float = float_field.numericalize(test_float_data)
        self.assertEqual(numericalized_float.data, [0.55, 0.05, 1.955, 0.1, 5.1]) 
Example #16
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License
def test_serialization_built_vocab(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)

        question_field.build_vocab(tsv_dataset)

        question_pickle_filename = "question.pl"
        question_pickle_path = os.path.join(self.test_dir, question_pickle_filename)
        torch.save(question_field, question_pickle_path)

        loaded_question_field = torch.load(question_pickle_path)

        assert loaded_question_field == question_field

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]

        # Test results of numericalization
        original_numericalization = question_field.numericalize(test_example_data)
        pickled_numericalization = loaded_question_field.numericalize(test_example_data)

        assert torch.all(torch.eq(original_numericalization, pickled_numericalization)) 
Example #17
Source File: tool.py    From lightKG with Apache License 2.0
def get_dataset(self, path: str, fields=Fields, file_type='csv', skip_header=False):
        logger.info('loading dataset from {}'.format(path))
        rl_dataset = TabularDataset(path, format=file_type, fields=fields, skip_header=skip_header)
        logger.info('successfully loaded dataset')
        return rl_dataset 
Example #18
Source File: relation_task.py    From DIAG-NRE with MIT License
def init_heldout_test_set(self):
        # TODO: change this into input arguments
        data_dir_path = os.path.dirname(self.config['test_file'])
        heldout_test_file_path = os.path.join(data_dir_path, 'nyt_heldout_test.csv')
        heldout_test_entitypair_fp = os.path.join(data_dir_path, 'nyt_heldout_test_entitypair.csv')

        def read_entity_pair_info(entitypair_file_path):
            tmp_df = pd.read_csv(entitypair_file_path, header=None)
            tmp_df.columns = ['span1_guid', 'span2_guid', 'span1', 'span2']
            entitypair_infos = tmp_df.to_dict(orient='records')
            entity_pairs = []
            for ep_info in entitypair_infos:
                entity_pairs.append((ep_info['span1_guid'], ep_info['span2_guid']))

            return entity_pairs

        print('Loading heldout test set {}'.format(heldout_test_file_path))
        self.heldout_test_set = tt_data.TabularDataset(path=heldout_test_file_path,
                                                       format='csv',
                                                       fields=[('Id', self.ID),
                                                               ('Text', self.TEXT),
                                                               ('Pos1', self.POS),
                                                               ('Pos2', self.POS),
                                                               ('Label', self.LABEL)],
                                                       skip_header=False)
        self.heldout_entity_pairs = read_entity_pair_info(heldout_test_entitypair_fp)
        self.heldout_test_iter = tt_data.Iterator(self.heldout_test_set,
                                                  sort_key=lambda x: len(x.Text),
                                                  batch_size=self.config['test_batch_size'],
                                                  train=False,
                                                  repeat=False,
                                                  sort_within_batch=True,
                                                  device=self.device) 
Example #19
Source File: dataset_reader.py    From nlp-experiments-in-pytorch with MIT License
def read_dataset(self, batch_size=128, split_ratio=0.7, format="tsv"):
        sf, nlf, clf = self.create_fields()
        if self.task == "classification":
            dataset = data.TabularDataset(path=self.data_path,
                                          format=format,
                                          skip_header=True,
                                          fields=[("category_labels", clf),
                                                  ("ner_labels", None),
                                                  ("sentence", sf)])
        elif self.task == "ner":
            dataset = data.TabularDataset(path=self.data_path,
                                          format=format,
                                          skip_header=True,
                                          fields=[("category_labels", None),
                                                  ("ner_labels", nlf),
                                                  ("sentence", sf)])
        else:
            raise ValueError("Training task is not defined! It can be 'classification' or 'ner'")

        logger.info("Splitting dataset into train/dev/test")
        train, val, test = self.create_splits(dataset, split_ratio)
        logger.info("Splitting done!")
        logger.info("Creating vocabulary")
        self.create_vocabs(dataset, sf, clf, nlf)
        logger.info("Vocabulary created!")
        logger.info("Creating iterators")
        self.create_iterator(train, val, test, batch_size)
        return train, val, test 
Example #20
Source File: dataset.py    From pytorch-sentiment-analysis-classification with MIT License
def __init__(self, root_dir='data', batch_size=64, use_vector=True):
        self.TEXT = Field(sequential=True, use_vocab=True,
                          tokenize='spacy', lower=True, batch_first=True)
        self.LABEL = LabelField(tensor_type=torch.FloatTensor)
        vectors = Vectors(name='mr_vocab.txt', cache='./')

        dataset_path = os.path.join(root_dir, '{}.tsv')
        self.dataset = {}
        self.dataloader = {}
        for target in ['train', 'dev', 'test']:
            self.dataset[target] = TabularDataset(
                path=dataset_path.format(target),
                format='tsv',
                fields=[('text', self.TEXT), ('label', self.LABEL)]
            )
            if use_vector:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000, vectors=vectors)
            else:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000)

            self.LABEL.build_vocab(self.dataset[target])
            self.dataloader[target] = Iterator(self.dataset[target],
                                               batch_size=batch_size,
                                               device=None,
                                               repeat=False,
                                               sort_key=lambda x: len(x.text),
                                               shuffle=True) 
Example #21
Source File: tool.py    From lightNLP with Apache License 2.0
def get_dataset(self, path: str, fields=Fields, file_type='tsv', skip_header=True):
        logger.info('loading dataset from {}'.format(path))
        st_dataset = TabularDataset(path, format=file_type, fields=fields, skip_header=skip_header)
        logger.info('successfully loaded dataset')
        return st_dataset 
Example #22
Source File: test_dataset.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_json_dataset_one_key_multiple_fields(self):
        self.write_test_ppid_dataset(data_format="json")

        question_field = data.Field(sequential=True)
        spacy_tok_question_field = data.Field(sequential=True, tokenize="spacy")
        label_field = data.Field(sequential=False)
        fields = {"question1": [("q1", question_field),
                                ("q1_spacy", spacy_tok_question_field)],
                  "question2": [("q2", question_field),
                                ("q2_spacy", spacy_tok_question_field)],
                  "label": ("label", label_field)}
        dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="json", fields=fields)
        expected_examples = [
            (["When", "do", "you", "use", "シ", "instead", "of", "し?"],
             ["When", "do", "you", "use", "シ", "instead", "of", "し", "?"],
             ["When", "do", "you", "use", "\"&\"",
              "instead", "of", "\"and\"?"],
             ["When", "do", "you", "use", "\"", "&", "\"",
              "instead", "of", "\"", "and", "\"", "?"], "0"),
            (["Where", "was", "Lincoln", "born?"],
             ["Where", "was", "Lincoln", "born", "?"],
             ["Which", "location", "was", "Abraham", "Lincoln", "born?"],
             ["Which", "location", "was", "Abraham", "Lincoln", "born", "?"],
             "1"),
            (["What", "is", "2+2"], ["What", "is", "2", "+", "2"],
             ["2+2=?"], ["2", "+", "2=", "?"], "1")]
        for i, example in enumerate(dataset):
            self.assertEqual(example.q1, expected_examples[i][0])
            self.assertEqual(example.q1_spacy, expected_examples[i][1])
            self.assertEqual(example.q2, expected_examples[i][2])
            self.assertEqual(example.q2_spacy, expected_examples[i][3])
            self.assertEqual(example.label, expected_examples[i][4]) 
Example #23
Source File: test_dataset.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_errors(self):
        # Ensure that trying to retrieve a key not in JSON data errors
        self.write_test_ppid_dataset(data_format="json")

        question_field = data.Field(sequential=True)
        label_field = data.Field(sequential=False)
        fields = {"qeustion1": ("q1", question_field),
                  "question2": ("q2", question_field),
                  "label": ("label", label_field)}

        with self.assertRaises(ValueError):
            data.TabularDataset(
                path=self.test_ppid_dataset_path, format="json", fields=fields) 
Example #24
Source File: test_field.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_numericalize_basic(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]

        # Test default
        default_numericalized = question_field.numericalize(
            test_example_data, device=-1)
        verify_numericalized_example(question_field, test_example_data,
                                     default_numericalized)
        # Test with train=False
        volatile_numericalized = question_field.numericalize(
            test_example_data, device=-1, train=False)
        verify_numericalized_example(question_field, test_example_data,
                                     volatile_numericalized, train=False) 
Example #25
Source File: test_field.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_numericalize_postprocessing(self):
        self.write_test_ppid_dataset(data_format="tsv")

        def reverse_postprocess(arr, vocab, train):
            return [list(reversed(sentence)) for sentence in arr]

        question_field = data.Field(sequential=True,
                                    postprocessing=reverse_postprocess)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]

        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]
        reversed_test_example_data = [list(reversed(sentence)) for sentence in
                                      test_example_data]

        postprocessed_numericalized = question_field.numericalize(
            test_example_data, device=-1)
        verify_numericalized_example(question_field,
                                     reversed_test_example_data,
                                     postprocessed_numericalized) 
Example #26
Source File: test_field.py    From decaNLP with BSD 3-Clause "New" or "Revised" License
def test_numerical_features_no_vocab(self):
        self.write_test_numerical_features_dataset()
        # Test basic usage
        int_field = data.Field(sequential=False, use_vocab=False)
        float_field = data.Field(sequential=False, use_vocab=False,
                                 tensor_type=torch.FloatTensor)
        tsv_fields = [("int", int_field), ("float", float_field), ("string", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_numerical_features_dataset_path, format="tsv",
            fields=tsv_fields)
        int_field.build_vocab(tsv_dataset)
        float_field.build_vocab(tsv_dataset)
        test_int_data = ["1", "0", "1", "3", "19"]
        test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

        numericalized_int = int_field.numericalize(test_int_data, device=-1)
        assert_allclose(numericalized_int.data.numpy(), [1, 0, 1, 3, 19])
        numericalized_float = float_field.numericalize(test_float_data, device=-1)
        assert_allclose(numericalized_float.data.numpy(), [1.1, 0.1, 3.91, 0.2, 10.2])

        # Test with postprocessing applied
        int_field = data.Field(sequential=False, use_vocab=False,
                               postprocessing=lambda arr, _, __: [x + 1 for x in arr])
        float_field = data.Field(sequential=False, use_vocab=False,
                                 tensor_type=torch.FloatTensor,
                                 postprocessing=lambda arr, _, __: [x * 0.5 for x in arr])
        tsv_fields = [("int", int_field), ("float", float_field), ("string", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_numerical_features_dataset_path, format="tsv",
            fields=tsv_fields)
        int_field.build_vocab(tsv_dataset)
        float_field.build_vocab(tsv_dataset)
        test_int_data = ["1", "0", "1", "3", "19"]
        test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

        numericalized_int = int_field.numericalize(test_int_data, device=-1)
        assert_allclose(numericalized_int.data.numpy(), [2, 1, 2, 4, 20])
        numericalized_float = float_field.numericalize(test_float_data, device=-1)
        assert_allclose(numericalized_float.data.numpy(), [0.55, 0.05, 1.955, 0.1, 5.1]) 
Example #27
Source File: Process.py    From Transformer with Apache License 2.0
def create_dataset(opt, SRC, TRG):

    print("creating dataset and iterator... ")

    raw_data = {'src' : [line for line in opt.src_data], 'trg': [line for line in opt.trg_data]}
    df = pd.DataFrame(raw_data, columns=["src", "trg"])
    
    mask = (df['src'].str.count(' ') < opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
    df = df.loc[mask]

    df.to_csv("translate_transformer_temp.csv", index=False)
    
    data_fields = [('src', SRC), ('trg', TRG)]
    train = data.TabularDataset('./translate_transformer_temp.csv', format='csv', fields=data_fields)

    train_iter = MyIterator(train, batch_size=opt.batchsize, device=opt.device,
                        repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                        batch_size_fn=batch_size_fn, train=True, shuffle=True)
    
    os.remove('translate_transformer_temp.csv')

    if opt.load_weights is None:
        SRC.build_vocab(train)
        TRG.build_vocab(train)
        if opt.checkpoint > 0:
            try:
                os.mkdir("weights")
            except FileExistsError:
                print("weights folder already exists, run program with -load_weights weights to load them")
                quit()
            pickle.dump(SRC, open('weights/SRC.pkl', 'wb'))
            pickle.dump(TRG, open('weights/TRG.pkl', 'wb'))

    opt.src_pad = SRC.vocab.stoi['<pad>']
    opt.trg_pad = TRG.vocab.stoi['<pad>']

    opt.train_len = get_len(train_iter)

    return train_iter 
Example #28
Source File: tool.py    From lightNLP with Apache License 2.0
def get_dataset(self, path: str, fields=Fields, file_type='tsv', skip_header=True):
        logger.info('loading dataset from {}'.format(path))
        te_dataset = TabularDataset(path, format=file_type, fields=fields, skip_header=skip_header)
        logger.info('successfully loaded dataset')
        return te_dataset 
Example #29
Source File: tool.py    From lightNLP with Apache License 2.0
def get_dataset(self, path: str, fields=Fields, file_type='tsv', skip_header=True):
        logger.info('loading dataset from {}'.format(path))
        st_dataset = TabularDataset(path, format=file_type, fields=fields, skip_header=skip_header)
        logger.info('successfully loaded dataset')
        return st_dataset 
Example #30
Source File: test_dataset.py    From text with BSD 3-Clause "New" or "Revised" License
def test_errors(self):
        # Ensure that trying to retrieve a key not in JSON data errors
        self.write_test_ppid_dataset(data_format="json")

        question_field = data.Field(sequential=True)
        label_field = data.Field(sequential=False)
        fields = {"qeustion1": ("q1", question_field),
                  "question2": ("q2", question_field),
                  "label": ("label", label_field)}

        with self.assertRaises(ValueError):
            data.TabularDataset(
                path=self.test_ppid_dataset_path, format="json", fields=fields)