Python allennlp.data.Token() Examples

The following are 30 code examples of allennlp.data.Token(), collected from open-source projects. The project and source file for each example are noted above it. You may also want to browse the other functions and classes available in the allennlp.data module.
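Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the typical workflow: wrap raw strings in Token objects, put them in a TextField with a token indexer, build a Vocabulary, and convert the field to tensors. The import paths follow recent allennlp releases; the magnitude examples below use an older API.

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

# Wrap raw strings in Token objects and attach a single-id indexer to the field.
tokens = [Token(t) for t in ["This", "is", "a", "sentence", "."]]
field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
instance = Instance({"text": field})

# Build a vocabulary from the instance, index the field, and pad it into tensors.
vocab = Vocabulary.from_instances([instance])
field.index(vocab)
tensors = field.as_tensor(field.get_padding_lengths())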
Example #1
Source File: vocabulary_test.py    From allennlp with Apache License 2.0
def test_from_params_extend_config(self):

        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.save_to_files(vocab_dir)

        text_field = TextField(
            [Token(t) for t in ["a", "b"]], {"tokens": SingleIdTokenIndexer("tokens")}
        )
        instances = Batch([Instance({"text": text_field})])

        # If you ask to extend vocab from `directory`, instances must be passed
        # in Vocabulary constructor, or else there is nothing to extend to.
        params = Params({"type": "extend", "directory": vocab_dir})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params)

        # If you ask to extend vocab, `directory` key must be present in params,
        # or else there is nothing to extend from.
        params = Params({"type": "extend"})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances=instances) 
Example #2
Source File: elmo_test.py    From magnitude with MIT License
def get_vocab_and_both_elmo_indexed_ids(batch):
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        indexer2 = SingleIdTokenIndexer()
        for sentence in batch:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens,
                              {u'character_ids': indexer,
                               u'tokens': indexer2})
            instance = Instance({u"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        dataset.index_instances(vocab)
        return vocab, dataset.as_tensor_dict()[u"elmo"] 
Example #3
Source File: dataset_test.py    From allennlp with Apache License 2.0
def get_instances(self):
        field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence", "."]], self.token_indexer
        )
        field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
            self.token_indexer,
        )
        field3 = TextField(
            [Token(t) for t in ["here", "is", "a", "sentence", "."]], self.token_indexer
        )
        field4 = TextField([Token(t) for t in ["this", "is", "short"]], self.token_indexer)
        instances = [
            Instance({"text1": field1, "text2": field2}),
            Instance({"text1": field3, "text2": field4}),
        ]
        return instances 
Example #4
Source File: single_id_token_indexer_test.py    From allennlp with Apache License 2.0
def test_count_vocab_items_with_non_default_feature_name(self):
        tokenizer = SpacyTokenizer(parse=True)
        tokens = tokenizer.tokenize("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = SingleIdTokenIndexer(
            namespace="dep_labels", feature_name="dep_", default_value="NONE"
        )
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)

        assert counter["dep_labels"] == {
            "ROOT": 1,
            "nsubj": 1,
            "det": 1,
            "NONE": 2,
            "attr": 1,
            "punct": 1,
        } 
Example #5
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_token_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict": DictReturningTokenIndexer(token_min_padding_length=3),
                "words": SingleIdTokenIndexer("words", token_min_padding_length=3),
                "characters": TokenCharactersIndexer(
                    "characters", min_padding_length=1, token_min_padding_length=3
                ),
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "field_with_dict___token_ids": 5,
            "field_with_dict___additional_key": 3,
            "words___tokens": 3,
            "characters___token_characters": 3,
            "characters___num_token_characters": 8,
        }
        tensors = field.as_tensor(padding_lengths)
        assert tensors["field_with_dict"]["additional_key"].tolist()[-1] == 0
        assert tensors["words"]["tokens"].tolist()[-1] == 0
        assert tensors["characters"]["token_characters"].tolist()[-1] == [0] * 8 
Example #6
Source File: character_token_indexer_test.py    From allennlp with Apache License 2.0
def test_start_and_end_tokens(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace="characters")  # 2
        vocab.add_token_to_namespace("s", namespace="characters")  # 3
        vocab.add_token_to_namespace("e", namespace="characters")  # 4
        vocab.add_token_to_namespace("n", namespace="characters")  # 5
        vocab.add_token_to_namespace("t", namespace="characters")  # 6
        vocab.add_token_to_namespace("c", namespace="characters")  # 7
        vocab.add_token_to_namespace("<", namespace="characters")  # 8
        vocab.add_token_to_namespace(">", namespace="characters")  # 9
        vocab.add_token_to_namespace("/", namespace="characters")  # 10

        indexer = TokenCharactersIndexer(
            "characters", start_tokens=["<s>"], end_tokens=["</s>"], min_padding_length=1
        )
        indices = indexer.tokens_to_indices([Token("sentential")], vocab)
        assert indices == {
            "token_characters": [[8, 3, 9], [3, 4, 5, 6, 4, 5, 6, 1, 1, 1], [8, 10, 3, 9]]
        } 
Example #7
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_as_tensor_handles_characters(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters", min_padding_length=1)
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array(
            [
                [1, 1, 1, 3, 0, 0, 0, 0],
                [1, 3, 0, 0, 0, 0, 0, 0],
                [1, 0, 0, 0, 0, 0, 0, 0],
                [3, 4, 5, 6, 4, 5, 7, 4],
                [1, 0, 0, 0, 0, 0, 0, 0],
            ]
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"]["token_characters"].detach().cpu().numpy(),
            expected_character_array,
        ) 
Example #8
Source File: vocabulary_test.py    From allennlp with Apache License 2.0
def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding="utf-8")
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer, min_padding_length=2)
        tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
        text_field = TextField(tokens, {"characters": token_indexer})
        dataset = Batch([Instance({"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)

        vocab_dir = self.TEST_DIR / "vocab_save"
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)
        assert indexed_tokens == indexed_tokens2 
Example #9
Source File: vocabulary_test.py    From allennlp with Apache License 2.0
def test_max_vocab_size_partial_dict(self):
        indexers = {
            "tokens": SingleIdTokenIndexer(),
            "token_characters": TokenCharactersIndexer(min_padding_length=3),
        }
        instance = Instance(
            {
                "text": TextField(
                    [Token(w) for w in "Abc def ghi jkl mno pqr stu vwx yz".split(" ")], indexers
                )
            }
        )
        dataset = Batch([instance])
        params = Params({"max_vocab_size": {"tokens": 1}})

        vocab = Vocabulary.from_params(params=params, instances=dataset)
        assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 + 2
        assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 + 2 
Example #10
Source File: openai_transformer_byte_pair_indexer_test.py    From magnitude with MIT License
def test_bpe(self):

        # [e, w, o, e</w>] -> best pair (e, w)
        # [ew, o, e</w>] -> best pair (o, e</w>)
        # [ew, oe</w>] -> done
        token = Token(u"ewoe")
        assert self.indexer.byte_pair_encode(token) == [u'ew', u'oe</w>']

        # Prefer "ew" to "we"
        token = Token(u"ewe")
        assert self.indexer.byte_pair_encode(token) == [u'ew', u'e</w>']

        # Prefer ending a word
        token = Token(u"eee")
        assert self.indexer.byte_pair_encode(token) == [u'e', u'ee</w>']

        # Encodes up to a single symbol when appropriate
        token = Token(u"woe")
        assert self.indexer.byte_pair_encode(token) == [u'woe</w>'] 
Example #11
Source File: openai_transformer_byte_pair_indexer_test.py    From magnitude with MIT License
def test_tokens_to_indices(self):
        tokens = [Token(u'ewoe'), Token(u'woe'), Token(u'ewe'), Token(u'ee')]

        indices = self.indexer.tokens_to_indices(tokens, None, u'test')

        assert set(indices.keys()) == set([u"test", u"test-offsets", u"mask"])

        text_tokens = indices[u'test']
        offsets = indices[u'test-offsets']

        assert text_tokens[:6] == [
                self.indexer.encoder.get(symbol, 0)
                for symbol in [u'ew', u'oe</w>'] + [u'woe</w>'] + [u'ew', u'e</w>'] + [u'ee</w>']
        ]

        assert offsets == [
                1,  # end of first word
                2,  # end of second word
                4,  # end of third word
                5,  # end of last word
        ] 
Example #12
Source File: relation_instances_reader.py    From comb_dist_direct_relex with Apache License 2.0
def _tokens_distances(self, tokens):
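        # Strip the inline entity markers (<e1>...</e1>, <e2>...</e2>) from the
        # token texts, record each marker's (index, 'start'/'end') location, and
        # then derive per-entity position features via self._positions().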
        e1_loc = []
        e2_loc = []

        while len(tokens) < 5:  # a hack to make sure all sentences are at least 5 tokens. CNN breaks otherwise.
            tokens.append(Token(text='.'))

        for i, token in enumerate(tokens):
            if token.text.startswith('<e1>'):
                e1_loc.append((i, 'start'))
                token.text = token.text[4:]
            if token.text.endswith('</e1>'):
                e1_loc.append((i, 'end'))
                token.text = token.text[:-5]
            if token.text.startswith('<e2>'):
                e2_loc.append((i, 'start'))
                token.text = token.text[4:]
            if token.text.endswith('</e2>'):
                e2_loc.append((i, 'end'))
                token.text = token.text[:-5]

        positions1 = self._positions(len(tokens), e1_loc)
        positions2 = self._positions(len(tokens), e2_loc)

        return tokens, positions1, positions2 
Example #13
Source File: index_field_test.py    From allennlp with Apache License 2.0
def test_equality(self):
        index_field1 = IndexField(4, self.text)
        index_field2 = IndexField(4, self.text)
        index_field3 = IndexField(
            4,
            TextField(
                [Token(t) for t in ["AllenNLP", "is", "the", "bomb", "!"]],
                {"words": SingleIdTokenIndexer("words")},
            ),
        )

        assert index_field1 == 4
        assert index_field1 == index_field1
        assert index_field1 == index_field2

        assert index_field1 != index_field3
        assert index_field2 != index_field3
        assert index_field3 == index_field3 
Example #14
Source File: character_token_indexer_test.py    From magnitude with MIT License
def test_count_vocab_items_respects_casing(self):
        indexer = TokenCharactersIndexer(u"characters")
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token(u"Hello"), counter)
        indexer.count_vocab_items(Token(u"hello"), counter)
        assert counter[u"characters"] == {u"h": 1, u"H": 1, u"e": 2, u"l": 4, u"o": 2}

        indexer = TokenCharactersIndexer(u"characters", CharacterTokenizer(lowercase_characters=True))
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token(u"Hello"), counter)
        indexer.count_vocab_items(Token(u"hello"), counter)
        assert counter[u"characters"] == {u"h": 2, u"e": 2, u"l": 4, u"o": 2} 
Example #15
Source File: character_token_indexer_test.py    From magnitude with MIT License
def test_tokens_to_indices_produces_correct_characters(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace(u"A", namespace=u'characters')
        vocab.add_token_to_namespace(u"s", namespace=u'characters')
        vocab.add_token_to_namespace(u"e", namespace=u'characters')
        vocab.add_token_to_namespace(u"n", namespace=u'characters')
        vocab.add_token_to_namespace(u"t", namespace=u'characters')
        vocab.add_token_to_namespace(u"c", namespace=u'characters')

        indexer = TokenCharactersIndexer(u"characters")
        indices = indexer.tokens_to_indices([Token(u"sentential")], vocab, u"char")
        assert indices == {u"char": [[3, 4, 5, 6, 4, 5, 6, 1, 1, 1]]} 
Example #16
Source File: elmo_indexer_test.py    From magnitude with MIT License
def test_bos_to_char_ids(self):
        indexer = ELMoTokenCharactersIndexer()
        indices = indexer.tokens_to_indices([Token(u'<S>')], Vocabulary(), u"test-elmo")
        expected_indices = [259, 257, 260, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261]
        assert indices == {u"test-elmo": [expected_indices]} 
Example #17
Source File: fever_sselection_reader.py    From combine-FEVER-NSMN with MIT License
def text_to_instance(self,  # type: ignore
                         premise: str,
                         hypothesis: str,
                         pid: str = None,
                         label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        premise_tokens = [Token(t) for t in premise.split(' ')]  # Removing code for parentheses in NLI
        hypothesis_tokens = [Token(t) for t in hypothesis.split(' ')]

        if self.max_l is not None:
            premise_tokens = premise_tokens[:self.max_l]
            hypothesis_tokens = hypothesis_tokens[:self.max_l]

        fields['premise'] = TextField(premise_tokens, self._token_indexers)
        fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)

        if label:
            fields['selection_label'] = LabelField(label, label_namespace='selection_labels')

        if pid:
            fields['pid'] = IdField(pid)

        return Instance(fields) 
Example #18
Source File: openai_transformer_byte_pair_indexer_test.py    From magnitude with MIT License
def test_raises_with_too_long_sentence(self):
        tokens = [Token(u'a') for _ in range(513)]

        with pytest.raises(RuntimeError):
            self.indexer.tokens_to_indices(tokens, None, u'should-fail') 
Example #19
Source File: util_test.py    From allennlp with Apache License 2.0
def test_get_token_ids_from_text_field_tensors(self):
        # Setting up a number of different indexers that we can test later.
        string_tokens = ["This", "is", "a", "test"]
        tokens = [Token(x) for x in string_tokens]
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(string_tokens, "tokens")
        vocab.add_tokens_to_namespace(
            set([char for token in string_tokens for char in token]), "token_characters"
        )
        elmo_indexer = ELMoTokenCharactersIndexer()
        token_chars_indexer = TokenCharactersIndexer()
        single_id_indexer = SingleIdTokenIndexer()
        indexers = {"elmo": elmo_indexer, "chars": token_chars_indexer, "tokens": single_id_indexer}

        # In all of the tests below, we'll want to recover the token ids that were produced by the
        # single_id indexer, so we grab that output first.
        text_field = TextField(tokens, {"tokens": single_id_indexer})
        text_field.index(vocab)
        tensors = text_field.as_tensor(text_field.get_padding_lengths())
        expected_token_ids = tensors["tokens"]["tokens"]

        # Now the actual tests.
        text_field = TextField(tokens, indexers)
        text_field.index(vocab)
        tensors = text_field.as_tensor(text_field.get_padding_lengths())
        token_ids = util.get_token_ids_from_text_field_tensors(tensors)
        assert (token_ids == expected_token_ids).all() 
Example #20
Source File: elmo_indexer_test.py    From magnitude with MIT License
def test_eos_to_char_ids(self):
        indexer = ELMoTokenCharactersIndexer()
        indices = indexer.tokens_to_indices([Token(u'</S>')], Vocabulary(), u"test-eos")
        expected_indices = [259, 258, 260, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261]
        assert indices == {u"test-eos": [expected_indices]} 
Example #21
Source File: sequence_label_field_test.py    From allennlp with Apache License 2.0
def setup_method(self):
        super().setup_method()
        self.text = TextField(
            [Token(t) for t in ["here", "are", "some", "words", "."]],
            {"words": SingleIdTokenIndexer("words")},
        ) 
Example #22
Source File: text_field_test.py    From allennlp with Apache License 2.0
def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[int]]:
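        # Test-only helper from text_field_test.py (presumably the DictReturningTokenIndexer
        # referenced in the other examples): it returns two keys, "token_ids" and
        # "additional_key", so the tests can exercise indexers that emit multiple arrays.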
        return {
            "token_ids": (
                [10, 15]
                + [vocabulary.get_token_index(token.text, "words") for token in tokens]
                + [25]
            ),
            "additional_key": [22, 29],
        } 
Example #23
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_sequence_methods(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], {})

        assert len(field) == 5
        assert field[1].text == "is"
        assert [token.text for token in field] == ["This", "is", "a", "sentence", "."] 
Example #24
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_token_indexer_returns_dict(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict": DictReturningTokenIndexer(),
                "words": SingleIdTokenIndexer("words"),
                "characters": TokenCharactersIndexer("characters", min_padding_length=1),
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "field_with_dict___token_ids": 5,
            "field_with_dict___additional_key": 2,
            "words___tokens": 2,
            "characters___token_characters": 2,
            "characters___num_token_characters": 8,
        }
        padding_lengths["field_with_dict___token_ids"] = 7
        padding_lengths["field_with_dict___additional_key"] = 3
        padding_lengths["words___tokens"] = 4
        padding_lengths["characters___token_characters"] = 4
        tensors = field.as_tensor(padding_lengths)
        assert list(tensors["field_with_dict"]["token_ids"].shape) == [7]
        assert list(tensors["field_with_dict"]["additional_key"].shape) == [3]
        assert list(tensors["words"]["tokens"].shape) == [4]
        assert list(tensors["characters"]["token_characters"].shape) == [4, 8] 
Example #25
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_printing_doesnt_crash(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            {"words": SingleIdTokenIndexer(namespace="words")},
        )
        print(field) 
Example #26
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_as_tensor_handles_longer_lengths(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")},
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["words___tokens"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"].detach().cpu().numpy(),
            numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]),
        ) 
Example #27
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_as_tensor_handles_words(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")},
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"].detach().cpu().numpy(), numpy.array([1, 1, 1, 2, 1])
        ) 
Example #28
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")},
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"words___tokens": 5}

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters", min_padding_length=1)
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "characters___token_characters": 5,
            "characters___num_token_characters": 8,
        }

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters", min_padding_length=1),
                "words": SingleIdTokenIndexer("words"),
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "characters___token_characters": 5,
            "characters___num_token_characters": 8,
            "words___tokens": 5,
        } 
Example #29
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_get_padding_lengths_raises_if_no_indexed_tokens(self):

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")},
        )
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths() 
Example #30
Source File: index_field_test.py    From allennlp with Apache License 2.0
def setup_method(self):
        super().setup_method()
        self.text = TextField(
            [Token(t) for t in ["here", "is", "a", "sentence", "."]],
            {"words": SingleIdTokenIndexer("words")},
        )