Python torchtext.data.NestedField() Examples

The following are 16 code examples of torchtext.data.NestedField(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module torchtext.data , or try the search function .
Example #1
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_init_full(self):
        nesting_field = data.Field()
        field = data.NestedField(
            nesting_field,
            use_vocab=False,
            init_token="<s>",
            eos_token="</s>",
            fix_length=10,
            dtype=torch.float,
            preprocessing=lambda xs: list(reversed(xs)),
            postprocessing=lambda xs: [x.upper() for x in xs],
            tokenize=list,
            pad_first=True,
        )

        assert not field.use_vocab
        assert field.init_token == "<s>"
        assert field.eos_token == "</s>"
        assert field.fix_length == 10
        assert field.dtype is torch.float
        assert field.preprocessing("a b c".split()) == "c b a".split()
        assert field.postprocessing("a b c".split()) == "A B C".split()
        assert field.tokenize("abc") == ["a", "b", "c"]
        assert field.pad_first 
Example #2
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_build_vocab_from_dataset(self):
        nesting_field = data.Field(tokenize=list, unk_token="<cunk>", pad_token="<cpad>",
                                   init_token="<w>", eos_token="</w>")
        CHARS = data.NestedField(nesting_field, init_token="<s>", eos_token="</s>")
        ex1 = data.Example.fromlist(["aaa bbb c"], [("chars", CHARS)])
        ex2 = data.Example.fromlist(["bbb aaa"], [("chars", CHARS)])
        dataset = data.Dataset([ex1, ex2], [("chars", CHARS)])

        CHARS.build_vocab(dataset, min_freq=2)

        expected = "a b <w> </w> <s> </s> <cunk> <cpad>".split()
        assert len(CHARS.vocab) == len(expected)
        for c in expected:
            assert c in CHARS.vocab.stoi

        expected_freqs = Counter({"a": 6, "b": 6, "c": 1})
        assert CHARS.vocab.freqs == CHARS.nesting_field.vocab.freqs == expected_freqs 
Example #3
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def test_pad_when_no_init_and_eos_tokens(self):
        nesting_field = data.Field(tokenize=list, unk_token="<cunk>", pad_token="<cpad>",
                                   init_token="<w>", eos_token="</w>")
        CHARS = data.NestedField(nesting_field)
        minibatch = [
            ["john", "loves", "mary"],
            ["mary", "cries"]
        ]
        expected = [
            [
                ["<w>"] + list("john") + ["</w>", "<cpad>"],
                ["<w>"] + list("loves") + ["</w>"],
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ],
            [
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>"] + list("cries") + ["</w>"],
                ["<cpad>"] * 7,
            ]
        ]

        assert CHARS.pad(minibatch) == expected 
Example #4
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_init_minimal(self):
        nesting_field = data.Field()
        field = data.NestedField(nesting_field)

        assert isinstance(field, data.Field)
        assert field.nesting_field is nesting_field
        assert field.sequential
        assert field.use_vocab
        assert field.init_token is None
        assert field.eos_token is None
        assert field.unk_token == nesting_field.unk_token
        assert field.fix_length is None
        assert field.dtype is torch.long
        assert field.preprocessing is None
        assert field.postprocessing is None
        assert field.lower == nesting_field.lower
        assert field.tokenize("a b c") == "a b c".split()
        assert not field.include_lengths
        assert field.batch_first
        assert field.pad_token == nesting_field.pad_token
        assert not field.pad_first 
Example #5
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_init_when_nesting_field_is_not_sequential(self):
        nesting_field = data.Field(sequential=False)
        field = data.NestedField(nesting_field)

        assert field.pad_token == "<pad>" 
Example #6
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_init_when_nesting_field_has_include_lengths_equal_true(self):
        nesting_field = data.Field(include_lengths=True)

        with pytest.raises(ValueError) as excinfo:
            data.NestedField(nesting_field)
        assert "nesting field cannot have include_lengths=True" in str(excinfo.value) 
Example #7
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_init_with_nested_field_as_nesting_field(self):
        nesting_field = data.NestedField(data.Field())

        with pytest.raises(ValueError) as excinfo:
            data.NestedField(nesting_field)
        assert "nesting field must not be another NestedField" in str(excinfo.value) 
Example #8
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_build_vocab_from_iterable(self):
        nesting_field = data.Field(unk_token="<cunk>", pad_token="<cpad>")
        CHARS = data.NestedField(nesting_field)
        CHARS.build_vocab(
            [[list("aaa"), list("bbb"), ["c"]], [list("bbb"), list("aaa")]],
            [[list("ccc"), list("bbb")], [list("bbb")]],
        )

        expected = "a b c <cunk> <cpad>".split()
        assert len(CHARS.vocab) == len(expected)
        for c in expected:
            assert c in CHARS.vocab.stoi

        expected_freqs = Counter({"a": 6, "b": 12, "c": 4})
        assert CHARS.vocab.freqs == CHARS.nesting_field.vocab.freqs == expected_freqs 
Example #9
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_pad(self):
        nesting_field = data.Field(tokenize=list, unk_token="<cunk>", pad_token="<cpad>",
                                   init_token="<w>", eos_token="</w>")
        CHARS = data.NestedField(nesting_field, init_token="<s>", eos_token="</s>")
        minibatch = [
            [list("john"), list("loves"), list("mary")],
            [list("mary"), list("cries")],
        ]
        expected = [
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("john") + ["</w>", "<cpad>"],
                ["<w>"] + list("loves") + ["</w>"],
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ],
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>"] + list("cries") + ["</w>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                ["<cpad>"] * 7,
            ]
        ]

        assert CHARS.pad(minibatch) == expected

        # test include_length
        nesting_field = data.Field(tokenize=list, unk_token="<cunk>", pad_token="<cpad>",
                                   init_token="<w>", eos_token="</w>")
        CHARS = data.NestedField(nesting_field, init_token="<s>",
                                 eos_token="</s>", include_lengths=True)
        arr, seq_len, words_len = CHARS.pad(minibatch)
        assert arr == expected
        assert seq_len == [5, 4]
        assert words_len == [[3, 6, 7, 6, 3], [3, 6, 7, 3, 0]] 
Example #10
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_pad_when_nesting_field_is_not_sequential(self):
        nesting_field = data.Field(sequential=False, unk_token="<cunk>",
                                   pad_token="<cpad>", init_token="<w>", eos_token="</w>")
        CHARS = data.NestedField(nesting_field, init_token="<s>", eos_token="</s>")
        minibatch = [
            ["john", "loves", "mary"],
            ["mary", "cries"]
        ]
        expected = [
            ["<s>", "john", "loves", "mary", "</s>"],
            ["<s>", "mary", "cries", "</s>", "<pad>"],
        ]

        assert CHARS.pad(minibatch) == expected 
Example #11
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_pad_when_nesting_field_has_fix_length(self):
        nesting_field = data.Field(tokenize=list, unk_token="<cunk>", pad_token="<cpad>",
                                   init_token="<w>", eos_token="</w>", fix_length=5)
        CHARS = data.NestedField(nesting_field, init_token="<s>", eos_token="</s>")
        minibatch = [
            ["john", "loves", "mary"],
            ["mary", "cries"]
        ]
        expected = [
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 2,
                ["<w>"] + list("joh") + ["</w>"],
                ["<w>"] + list("lov") + ["</w>"],
                ["<w>"] + list("mar") + ["</w>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 2,
            ],
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 2,
                ["<w>"] + list("mar") + ["</w>"],
                ["<w>"] + list("cri") + ["</w>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 2,
                ["<cpad>"] * 5,
            ]
        ]

        assert CHARS.pad(minibatch) == expected

        # test include length
        nesting_field = data.Field(tokenize=list, unk_token="<cunk>", pad_token="<cpad>",
                                   init_token="<w>", eos_token="</w>", fix_length=5)
        CHARS = data.NestedField(nesting_field, init_token="<s>",
                                 eos_token="</s>", include_lengths=True)
        arr, seq_len, words_len = CHARS.pad(minibatch)
        assert arr == expected
        assert seq_len == [5, 4]
        assert words_len == [[3, 5, 5, 5, 3], [3, 5, 5, 3, 0]] 
Example #12
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_pad_when_pad_first_is_true(self):
        nesting_field = data.Field(tokenize=list, unk_token="<cunk>", pad_token="<cpad>",
                                   init_token="<w>", eos_token="</w>")
        CHARS = data.NestedField(nesting_field, init_token="<s>", eos_token="</s>",
                                 pad_first=True)
        minibatch = [
            [list("john"), list("loves"), list("mary")],
            [list("mary"), list("cries")],
        ]
        expected = [
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("john") + ["</w>", "<cpad>"],
                ["<w>"] + list("loves") + ["</w>"],
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ],
            [
                ["<cpad>"] * 7,
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>"] + list("cries") + ["</w>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ]
        ]

        assert CHARS.pad(minibatch) == expected

        # test include_length
        nesting_field = data.Field(tokenize=list, unk_token="<cunk>", pad_token="<cpad>",
                                   init_token="<w>", eos_token="</w>")
        CHARS = data.NestedField(nesting_field, init_token="<s>",
                                 eos_token="</s>", include_lengths=True,
                                 pad_first=True)
        arr, seq_len, words_len = CHARS.pad(minibatch)
        assert arr == expected
        assert seq_len == [5, 4]
        assert words_len == [[3, 6, 7, 6, 3], [0, 3, 6, 7, 3]] 
Example #13
Source File: test_field.py    From text with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_serialization(self):
        nesting_field = data.Field(batch_first=True)
        field = data.NestedField(nesting_field)
        ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
        ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
        dataset = data.Dataset([ex1, ex2], [("words", field)])
        field.build_vocab(dataset)
        examples_data = [
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("john") + ["</w>", "<cpad>"],
                ["<w>"] + list("loves") + ["</w>"],
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ],
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>"] + list("cries") + ["</w>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                ["<cpad>"] * 7,
            ]
        ]

        field_pickle_filename = "char_field.pl"
        field_pickle_path = os.path.join(self.test_dir, field_pickle_filename)
        torch.save(field, field_pickle_path)

        loaded_field = torch.load(field_pickle_path)
        assert loaded_field == field

        original_numericalization = field.numericalize(examples_data)
        pickled_numericalization = loaded_field.numericalize(examples_data)

        assert torch.all(torch.eq(original_numericalization, pickled_numericalization)) 
Example #14
Source File: dataset_reader.py    From nlp-experiments-in-pytorch with MIT License 5 votes vote down vote up
def create_fields(self, seq_input=True, seq_ner=True, seq_cat=False):
        if self.level == "word":
            sentence_field = data.Field(sequential=seq_input, preprocessing=self.preprocessor, fix_length=self.fix_length,
                                        init_token="<start>", eos_token="<end>")
        elif self.level == "char":
            sentence_field = data.Field(sequential=seq_input, tokenize=self.evil_workaround_tokenizer, fix_length=1014)
            # sentence_field = data.NestedField(nested_field)
        else:
            raise KeyError("Sentence_field is undefined!")

        ner_label_field = data.Field(sequential=seq_ner, init_token="<start>", eos_token="<end>", unk_token=None)
        category_label_field = data.LabelField(sequential=seq_cat)
        return sentence_field, ner_label_field, category_label_field 
Example #15
Source File: iterator.py    From TD-DMN with MIT License 4 votes vote down vote up
def __init__(self):
        super(DMNIterator, self).__init__()
        # Define text nested field
        self.text_sent = data.Field(sequential=True,
                                    lower=True,
                                    tokenize=lambda x: x.split(" "))
        self.text_doc = data.NestedField(self.text_sent,
                                         tokenize=lambda x: x.split("<EOS>"),
                                         include_lengths=True)

        # Define entity nested field
        self.entity_sent = data.Field(sequential=True,
                                      tokenize=lambda x: x.split(" "),
                                      unk_token=None)
        self.entity_doc = data.NestedField(self.entity_sent,
                                           tokenize=lambda x: x.split("<EOS>"))

        # Define label nested field
        self.label_sent = data.Field(sequential=True,
                                     tokenize=lambda x: x.split(" "),
                                     unk_token=None)
        self.label_doc = data.NestedField(self.label_sent,
                                          tokenize=lambda x: x.split("<EOS>"))

        # Define offset nested field
        self.offset_sent = self.InfoField(sequential=True,
                                          tokenize=lambda x: x.split(" "),
                                          use_vocab=False)
        self.offset_doc = self.NestedInfoField(self.offset_sent,
                                               tokenize=lambda x: x.split("<EOS>"),
                                               use_vocab=False)

        # Define length nested field
        self.length_sent = self.InfoField(sequential=True,
                                          tokenize=lambda x: x.split(" "),
                                          use_vocab=False,
                                          pad_token=None)
        self.length_doc = self.NestedInfoField(self.length_sent,
                                               tokenize=lambda x: x.split("<EOS>"),
                                               use_vocab=False)

        # Define word attention field
        self.word_attn_sent = self.InfoField(sequential=True,
                                             tokenize=lambda x: x.split(" "),
                                             use_vocab=False)
        self.word_attn_doc = self.NestedInfoField(self.word_attn_sent,
                                                  tokenize=lambda x: x.split("<EOS>"),
                                                  use_vocab=False)
        # Define sentence attention field
        self.sent_attn_doc = self.InfoField(sequential=True,
                                            tokenize=lambda x: x.split("<EOS>"),
                                            use_vocab=False)

        # Define doc id field
        self.doc_id = self.InfoField(sequential=False, use_vocab=False)

        self.vectors = None 
Example #16
Source File: inputs.py    From torchnlp with Apache License 2.0 4 votes vote down vote up
def get_input_processor_words(vocab_word, vocab_char=None, convert_digits=True):
    """
    Returns a function that converts text into a processed batch. Required duing
    inference.
    Parameters:
        vocab_word: Instance of torchtext.Vocab for input word vocabulary
        vocab_char[optional]: Instance of torchtext.Vocab for input per-word 
                              character vocabulary
        convert_digits: If True will convert numbers to single 0's
    """
    inputs_word = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True,
                                preprocessing=data.Pipeline(
                                    lambda w: '0' if convert_digits and w.isdigit() else w ))
    # Set the vocab object manually without building from training dataset
    inputs_word.vocab = vocab_word

    if vocab_char is not None:
        inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>", 
                                        batch_first=True)

        inputs_char = data.NestedField(inputs_char_nesting, 
                                        init_token="<bos>", eos_token="<eos>")
        # Set the vocab object manually without building from training dataset
        inputs_char.vocab = inputs_char_nesting.vocab = vocab_char
        
        fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))]
    else:
        fields = [('inputs_word', inputs_word)]


    def input_processor_fn(inputs):
        if not isinstance(inputs, list):
            inputs = [inputs]

        examples = []
        for line in inputs:
            examples.append(data.Example.fromlist([line], fields))
        
        dataset = data.Dataset(examples, fields)
        # Entire input in one batch
        return data.Batch(data=dataset, 
                          dataset=dataset,
                          device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))

    return input_processor_fn