Python allennlp.data.Token() Examples

The following are 30 code examples of allennlp.data.Token(), collected from open-source projects. The project and source file for each example are noted above it. You may also want to browse the other functions and classes available in the allennlp.data module.
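Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the typical workflow: wrap raw strings in Token objects, put them in a TextField with a token indexer, build a Vocabulary, and convert the field to tensors. The import paths follow recent allennlp releases; the magnitude examples below use an older API.

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

# Wrap raw strings in Token objects and attach a single-id indexer to the field.
tokens = [Token(t) for t in ["This", "is", "a", "sentence", "."]]
field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
instance = Instance({"text": field})

# Build a vocabulary from the instance, index the field, and pad it into tensors.
vocab = Vocabulary.from_instances([instance])
field.index(vocab)
tensors = field.as_tensor(field.get_padding_lengths())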
Example #1
Source File: vocabulary_test.py    From allennlp with Apache License 2.0
def test_from_params_extend_config(self):

        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.save_to_files(vocab_dir)

        text_field = TextField(
            [Token(t) for t in ["a", "b"]], {"tokens": SingleIdTokenIndexer("tokens")}
        )
        instances = Batch([Instance({"text": text_field})])

        # If you ask to extend vocab from `directory`, instances must be passed
        # in Vocabulary constructor, or else there is nothing to extend to.
        params = Params({"type": "extend", "directory": vocab_dir})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params)

        # If you ask to extend vocab, `directory` key must be present in params,
        # or else there is nothing to extend from.
        params = Params({"type": "extend"})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances=instances) 
Example #2
Source File: elmo_test.py    From magnitude with MIT License
def get_vocab_and_both_elmo_indexed_ids(batch):
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        indexer2 = SingleIdTokenIndexer()
        for sentence in batch:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens,
                              {u'character_ids': indexer,
                               u'tokens': indexer2})
            instance = Instance({u"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        dataset.index_instances(vocab)
        return vocab, dataset.as_tensor_dict()[u"elmo"] 
Example #3
Source File: dataset_test.py    From allennlp with Apache License 2.0
def get_instances(self):
        field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence", "."]], self.token_indexer
        )
        field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
            self.token_indexer,
        )
        field3 = TextField(
            [Token(t) for t in ["here", "is", "a", "sentence", "."]], self.token_indexer
        )
        field4 = TextField([Token(t) for t in ["this", "is", "short"]], self.token_indexer)
        instances = [
            Instance({"text1": field1, "text2": field2}),
            Instance({"text1": field3, "text2": field4}),
        ]
        return instances 
Example #4
Source File: single_id_token_indexer_test.py    From allennlp with Apache License 2.0
def test_count_vocab_items_with_non_default_feature_name(self):
        tokenizer = SpacyTokenizer(parse=True)
        tokens = tokenizer.tokenize("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = SingleIdTokenIndexer(
            namespace="dep_labels", feature_name="dep_", default_value="NONE"
        )
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)

        assert counter["dep_labels"] == {
            "ROOT": 1,
            "nsubj": 1,
            "det": 1,
            "NONE": 2,
            "attr": 1,
            "punct": 1,
        } 
Example #5
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_token_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict": DictReturningTokenIndexer(token_min_padding_length=3),
                "words": SingleIdTokenIndexer("words", token_min_padding_length=3),
                "characters": TokenCharactersIndexer(
                    "characters", min_padding_length=1, token_min_padding_length=3
                ),
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "field_with_dict___token_ids": 5,
            "field_with_dict___additional_key": 3,
            "words___tokens": 3,
            "characters___token_characters": 3,
            "characters___num_token_characters": 8,
        }
        tensors = field.as_tensor(padding_lengths)
        assert tensors["field_with_dict"]["additional_key"].tolist()[-1] == 0
        assert tensors["words"]["tokens"].tolist()[-1] == 0
        assert tensors["characters"]["token_characters"].tolist()[-1] == [0] * 8 
Example #6
Source File: character_token_indexer_test.py    From allennlp with Apache License 2.0
def test_start_and_end_tokens(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace("A", namespace="characters")  # 2
        vocab.add_token_to_namespace("s", namespace="characters")  # 3
        vocab.add_token_to_namespace("e", namespace="characters")  # 4
        vocab.add_token_to_namespace("n", namespace="characters")  # 5
        vocab.add_token_to_namespace("t", namespace="characters")  # 6
        vocab.add_token_to_namespace("c", namespace="characters")  # 7
        vocab.add_token_to_namespace("<", namespace="characters")  # 8
        vocab.add_token_to_namespace(">", namespace="characters")  # 9
        vocab.add_token_to_namespace("/", namespace="characters")  # 10

        indexer = TokenCharactersIndexer(
            "characters", start_tokens=["<s>"], end_tokens=["</s>"], min_padding_length=1
        )
        indices = indexer.tokens_to_indices([Token("sentential")], vocab)
        assert indices == {
            "token_characters": [[8, 3, 9], [3, 4, 5, 6, 4, 5, 6, 1, 1, 1], [8, 10, 3, 9]]
        } 
Example #7
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_as_tensor_handles_characters(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters", min_padding_length=1)
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array(
            [
                [1, 1, 1, 3, 0, 0, 0, 0],
                [1, 3, 0, 0, 0, 0, 0, 0],
                [1, 0, 0, 0, 0, 0, 0, 0],
                [3, 4, 5, 6, 4, 5, 7, 4],
                [1, 0, 0, 0, 0, 0, 0, 0],
            ]
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"]["token_characters"].detach().cpu().numpy(),
            expected_character_array,
        ) 
Example #8
Source File: vocabulary_test.py    From allennlp with Apache License 2.0
def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding="utf-8")
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer, min_padding_length=2)
        tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
        text_field = TextField(tokens, {"characters": token_indexer})
        dataset = Batch([Instance({"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)

        vocab_dir = self.TEST_DIR / "vocab_save"
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)
        assert indexed_tokens == indexed_tokens2 
Example #9
Source File: vocabulary_test.py    From allennlp with Apache License 2.0
def test_max_vocab_size_partial_dict(self):
        indexers = {
            "tokens": SingleIdTokenIndexer(),
            "token_characters": TokenCharactersIndexer(min_padding_length=3),
        }
        instance = Instance(
            {
                "text": TextField(
                    [Token(w) for w in "Abc def ghi jkl mno pqr stu vwx yz".split(" ")], indexers
                )
            }
        )
        dataset = Batch([instance])
        params = Params({"max_vocab_size": {"tokens": 1}})

        vocab = Vocabulary.from_params(params=params, instances=dataset)
        assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 + 2
        assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 + 2 
Example #10
Source File: openai_transformer_byte_pair_indexer_test.py    From magnitude with MIT License
def test_bpe(self):

        # [e, w, o, e</w>] -> best pair (e, w)
        # [ew, o, e</w>] -> best pair (o, e</w>)
        # [ew, oe</w>] -> done
        token = Token(u"ewoe")
        assert self.indexer.byte_pair_encode(token) == [u'ew', u'oe</w>']

        # Prefer "ew" to "we"
        token = Token(u"ewe")
        assert self.indexer.byte_pair_encode(token) == [u'ew', u'e</w>']

        # Prefer ending a word
        token = Token(u"eee")
        assert self.indexer.byte_pair_encode(token) == [u'e', u'ee</w>']

        # Encodes up to a single symbol when appropriate
        token = Token(u"woe")
        assert self.indexer.byte_pair_encode(token) == [u'woe</w>'] 
Example #11
Source File: openai_transformer_byte_pair_indexer_test.py    From magnitude with MIT License
def test_tokens_to_indices(self):
        tokens = [Token(u'ewoe'), Token(u'woe'), Token(u'ewe'), Token(u'ee')]

        indices = self.indexer.tokens_to_indices(tokens, None, u'test')

        assert set(indices.keys()) == set([u"test", u"test-offsets", u"mask"])

        text_tokens = indices[u'test']
        offsets = indices[u'test-offsets']

        assert text_tokens[:6] == [
                self.indexer.encoder.get(symbol, 0)
                for symbol in [u'ew', u'oe</w>'] + [u'woe</w>'] + [u'ew', u'e</w>'] + [u'ee</w>']
        ]

        assert offsets == [
                1,  # end of first word
                2,  # end of second word
                4,  # end of third word
                5,  # end of last word
        ] 
Example #12
Source File: relation_instances_reader.py    From comb_dist_direct_relex with Apache License 2.0
def _tokens_distances(self, tokens):
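        # Strip the inline entity markers (<e1>...</e1>, <e2>...</e2>) from the
        # token texts, record each marker's (index, 'start'/'end') location, and
        # then derive per-entity position features via self._positions().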
        e1_loc = []
        e2_loc = []

        while len(tokens) < 5:  # a hack to make sure all sentences are at least 5 tokens. CNN breaks otherwise.
            tokens.append(Token(text='.'))

        for i, token in enumerate(tokens):
            if token.text.startswith('<e1>'):
                e1_loc.append((i, 'start'))
                token.text = token.text[4:]
            if token.text.endswith('</e1>'):
                e1_loc.append((i, 'end'))
                token.text = token.text[:-5]
            if token.text.startswith('<e2>'):
                e2_loc.append((i, 'start'))
                token.text = token.text[4:]
            if token.text.endswith('</e2>'):
                e2_loc.append((i, 'end'))
                token.text = token.text[:-5]

        positions1 = self._positions(len(tokens), e1_loc)
        positions2 = self._positions(len(tokens), e2_loc)

        return tokens, positions1, positions2 
Example #13
Source File: index_field_test.py    From allennlp with Apache License 2.0
def test_equality(self):
        index_field1 = IndexField(4, self.text)
        index_field2 = IndexField(4, self.text)
        index_field3 = IndexField(
            4,
            TextField(
                [Token(t) for t in ["AllenNLP", "is", "the", "bomb", "!"]],
                {"words": SingleIdTokenIndexer("words")},
            ),
        )

        assert index_field1 == 4
        assert index_field1 == index_field1
        assert index_field1 == index_field2

        assert index_field1 != index_field3
        assert index_field2 != index_field3
        assert index_field3 == index_field3 
Example #14
Source File: character_token_indexer_test.py    From magnitude with MIT License
def test_count_vocab_items_respects_casing(self):
        indexer = TokenCharactersIndexer(u"characters")
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token(u"Hello"), counter)
        indexer.count_vocab_items(Token(u"hello"), counter)
        assert counter[u"characters"] == {u"h": 1, u"H": 1, u"e": 2, u"l": 4, u"o": 2}

        indexer = TokenCharactersIndexer(u"characters", CharacterTokenizer(lowercase_characters=True))
        counter = defaultdict(lambda: defaultdict(int))
        indexer.count_vocab_items(Token(u"Hello"), counter)
        indexer.count_vocab_items(Token(u"hello"), counter)
        assert counter[u"characters"] == {u"h": 2, u"e": 2, u"l": 4, u"o": 2} 
Example #15
Source File: character_token_indexer_test.py    From magnitude with MIT License
def test_tokens_to_indices_produces_correct_characters(self):
        vocab = Vocabulary()
        vocab.add_token_to_namespace(u"A", namespace=u'characters')
        vocab.add_token_to_namespace(u"s", namespace=u'characters')
        vocab.add_token_to_namespace(u"e", namespace=u'characters')
        vocab.add_token_to_namespace(u"n", namespace=u'characters')
        vocab.add_token_to_namespace(u"t", namespace=u'characters')
        vocab.add_token_to_namespace(u"c", namespace=u'characters')

        indexer = TokenCharactersIndexer(u"characters")
        indices = indexer.tokens_to_indices([Token(u"sentential")], vocab, u"char")
        assert indices == {u"char": [[3, 4, 5, 6, 4, 5, 6, 1, 1, 1]]} 
Example #16
Source File: elmo_indexer_test.py    From magnitude with MIT License
def test_bos_to_char_ids(self):
        indexer = ELMoTokenCharactersIndexer()
        indices = indexer.tokens_to_indices([Token(u'<S>')], Vocabulary(), u"test-elmo")
        expected_indices = [259, 257, 260, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261]
        assert indices == {u"test-elmo": [expected_indices]} 
Example #17
Source File: fever_sselection_reader.py    From combine-FEVER-NSMN with MIT License
def text_to_instance(self,  # type: ignore
                         premise: str,
                         hypothesis: str,
                         pid: str = None,
                         label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        premise_tokens = [Token(t) for t in premise.split(' ')]  # Removing code for parentheses in NLI
        hypothesis_tokens = [Token(t) for t in hypothesis.split(' ')]

        if self.max_l is not None:
            premise_tokens = premise_tokens[:self.max_l]
            hypothesis_tokens = hypothesis_tokens[:self.max_l]

        fields['premise'] = TextField(premise_tokens, self._token_indexers)
        fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)

        if label:
            fields['selection_label'] = LabelField(label, label_namespace='selection_labels')

        if pid:
            fields['pid'] = IdField(pid)

        return Instance(fields) 
Example #18
Source File: openai_transformer_byte_pair_indexer_test.py    From magnitude with MIT License
def test_raises_with_too_long_sentence(self):
        tokens = [Token(u'a') for _ in range(513)]

        with pytest.raises(RuntimeError):
            self.indexer.tokens_to_indices(tokens, None, u'should-fail') 
Example #19
Source File: util_test.py    From allennlp with Apache License 2.0
def test_get_token_ids_from_text_field_tensors(self):
        # Setting up a number of different indexers that we can test later.
        string_tokens = ["This", "is", "a", "test"]
        tokens = [Token(x) for x in string_tokens]
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(string_tokens, "tokens")
        vocab.add_tokens_to_namespace(
            set([char for token in string_tokens for char in token]), "token_characters"
        )
        elmo_indexer = ELMoTokenCharactersIndexer()
        token_chars_indexer = TokenCharactersIndexer()
        single_id_indexer = SingleIdTokenIndexer()
        indexers = {"elmo": elmo_indexer, "chars": token_chars_indexer, "tokens": single_id_indexer}

        # In all of the tests below, we'll want to recover the token ids that were produced by the
        # single_id indexer, so we grab that output first.
        text_field = TextField(tokens, {"tokens": single_id_indexer})
        text_field.index(vocab)
        tensors = text_field.as_tensor(text_field.get_padding_lengths())
        expected_token_ids = tensors["tokens"]["tokens"]

        # Now the actual tests.
        text_field = TextField(tokens, indexers)
        text_field.index(vocab)
        tensors = text_field.as_tensor(text_field.get_padding_lengths())
        token_ids = util.get_token_ids_from_text_field_tensors(tensors)
        assert (token_ids == expected_token_ids).all() 
Example #20
Source File: elmo_indexer_test.py    From magnitude with MIT License
def test_eos_to_char_ids(self):
        indexer = ELMoTokenCharactersIndexer()
        indices = indexer.tokens_to_indices([Token(u'</S>')], Vocabulary(), u"test-eos")
        expected_indices = [259, 258, 260, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261, 261, 261, 261, 261,
                            261, 261, 261, 261, 261]
        assert indices == {u"test-eos": [expected_indices]} 
Example #21
Source File: sequence_label_field_test.py    From allennlp with Apache License 2.0
def setup_method(self):
        super().setup_method()
        self.text = TextField(
            [Token(t) for t in ["here", "are", "some", "words", "."]],
            {"words": SingleIdTokenIndexer("words")},
        ) 
Example #22
Source File: text_field_test.py    From allennlp with Apache License 2.0
def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[int]]:
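        # Test-only helper from text_field_test.py (presumably the DictReturningTokenIndexer
        # referenced in the other examples): it returns two keys, "token_ids" and
        # "additional_key", so the tests can exercise indexers that emit multiple arrays.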
        return {
            "token_ids": (
                [10, 15]
                + [vocabulary.get_token_index(token.text, "words") for token in tokens]
                + [25]
            ),
            "additional_key": [22, 29],
        } 
Example #23
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_sequence_methods(self):
        field = TextField([Token(t) for t in ["This", "is", "a", "sentence", "."]], {})

        assert len(field) == 5
        assert field[1].text == "is"
        assert [token.text for token in field] == ["This", "is", "a", "sentence", "."] 
Example #24
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_token_indexer_returns_dict(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict": DictReturningTokenIndexer(),
                "words": SingleIdTokenIndexer("words"),
                "characters": TokenCharactersIndexer("characters", min_padding_length=1),
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "field_with_dict___token_ids": 5,
            "field_with_dict___additional_key": 2,
            "words___tokens": 2,
            "characters___token_characters": 2,
            "characters___num_token_characters": 8,
        }
        padding_lengths["field_with_dict___token_ids"] = 7
        padding_lengths["field_with_dict___additional_key"] = 3
        padding_lengths["words___tokens"] = 4
        padding_lengths["characters___token_characters"] = 4
        tensors = field.as_tensor(padding_lengths)
        assert list(tensors["field_with_dict"]["token_ids"].shape) == [7]
        assert list(tensors["field_with_dict"]["additional_key"].shape) == [3]
        assert list(tensors["words"]["tokens"].shape) == [4]
        assert list(tensors["characters"]["token_characters"].shape) == [4, 8] 
Example #25
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_printing_doesnt_crash(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            {"words": SingleIdTokenIndexer(namespace="words")},
        )
        print(field) 
Example #26
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_as_tensor_handles_longer_lengths(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")},
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["words___tokens"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"].detach().cpu().numpy(),
            numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]),
        ) 
Example #27
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_as_tensor_handles_words(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")},
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"].detach().cpu().numpy(), numpy.array([1, 1, 1, 2, 1])
        ) 
Example #28
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")},
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {"words___tokens": 5}

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters", min_padding_length=1)
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "characters___token_characters": 5,
            "characters___num_token_characters": 8,
        }

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters", min_padding_length=1),
                "words": SingleIdTokenIndexer("words"),
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "characters___token_characters": 5,
            "characters___num_token_characters": 8,
            "words___tokens": 5,
        } 
Example #29
Source File: text_field_test.py    From allennlp with Apache License 2.0
def test_get_padding_lengths_raises_if_no_indexed_tokens(self):

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")},
        )
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths() 
Example #30
Source File: index_field_test.py    From allennlp with Apache License 2.0
def setup_method(self):
        super().setup_method()
        self.text = TextField(
            [Token(t) for t in ["here", "is", "a", "sentence", "."]],
            {"words": SingleIdTokenIndexer("words")},
        )