Python allennlp.data.fields.TextField() Examples

The following are 30 code examples of allennlp.data.fields.TextField(), drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.data.fields, or try the search function.
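Before diving into the project code below, here is a minimal, self-contained sketch of the typical TextField life cycle: wrap pre-tokenized text and one or more token indexers in a TextField, build a Vocabulary from the resulting Instance, index the field, and convert it to tensors. This snippet is not taken from any of the projects listed here; the import paths follow recent AllenNLP releases, and the exact nesting of the returned tensor dictionary varies slightly between versions.

from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

# Wrap pre-tokenized text in a TextField with a single-id word indexer.
tokens = [Token(t) for t in ["This", "is", "a", "sentence", "."]]
field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
instance = Instance({"sentence": field})

# Build a vocabulary from the instance, index the field, and tensorize it.
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)
tensors = instance.as_tensor_dict()  # nested dict of token-id tensors, keyed by field and indexer name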
Example #1
Source File: instance_test.py    From allennlp with Apache License 2.0    6 votes
def test_duplicate(self):
        # Verify the `duplicate()` method works with a `PretrainedTransformerIndexer` in
        # a `TextField`. See https://github.com/allenai/allennlp/issues/4270.
        instance = Instance(
            {
                "words": TextField(
                    [Token("hello")], {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}
                )
            }
        )

        other = instance.duplicate()
        assert other == instance

        # Adding new fields to the original instance should not affect the duplicate.
        instance.add_field("labels", LabelField("some_label"))
        assert "labels" not in other.fields
        assert other != instance  # sanity check on the '__eq__' method. 
Example #2
Source File: dataset_reader.py    From ConvLab with MIT License    6 votes
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
        intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        # print([t.text for t in context_tokens])
        fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
        fields["tokens"] = TextField(tokens, self._token_indexers)
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
        if tags is not None:
            fields["tags"] = SequenceLabelField(tags, fields["tokens"])
        if intents is not None:
            fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
        if dialog_act is not None:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                                "dialog_act": dialog_act})
        else:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens], "dialog_act": {}})
        return Instance(fields) 
Example #3
Source File: data_loading.py    From teaching with GNU General Public License v3.0    6 votes
def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        query_tokenized = self._tokenizer.tokenize(query_sequence)
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_pos_tokenized = self._tokenizer.tokenize(doc_pos_sequence)
        if self.max_doc_length > -1:
            doc_pos_tokenized = doc_pos_tokenized[:self.max_doc_length]

        doc_pos_field = TextField(doc_pos_tokenized, self._token_indexers)

        doc_neg_tokenized = self._tokenizer.tokenize(doc_neg_sequence)
        if self.max_doc_length > -1:
            doc_neg_tokenized = doc_neg_tokenized[:self.max_doc_length]

        doc_neg_field = TextField(doc_neg_tokenized, self._token_indexers)

        return Instance({
            "query_tokens": query_field,
            "doc_pos_tokens": doc_pos_field,
            "doc_neg_tokens": doc_neg_field}) 
Example #4
Source File: ebmnlp.py    From scibert with Apache License 2.0    6 votes
def _read(self, file_path: str) -> Iterable[Instance]:
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)

            # Group into alternating divider / sentence chunks.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    fields = [list(field) for field in zip(*fields)]
                    tokens_, _, _, pico_tags = fields
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens_]

                    yield self.text_to_instance(tokens, pico_tags) 
Example #5
Source File: semeval_2010_task_8_reader.py    From DISTRE with Apache License 2.0    6 votes
def text_to_instance(self,  # type: ignore
                         tokens: List[str],
                         entity_1: Tuple[int, int],
                         entity_2: Tuple[int, int],
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        
        tokens = [OpenAISplitter._standardize(token) for token in tokens]
        tokens = ['__start__'] + tokens[entity_1[0]:entity_1[1]+1] + ['__del1__'] + tokens[entity_2[0]:entity_2[1]+1] + ['__del2__'] + tokens + ['__clf__']
            
        sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
        fields['sentence'] = sentence
        #fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
        #fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)
        
        if label:
            fields['label'] = LabelField(label)

        return Instance(fields) 
Example #6
Source File: data_loading.py    From teaching with GNU General Public License v3.0    6 votes
def text_to_instance(self, query_id:str, doc_id:str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ

        query_id_field = LabelField(int(query_id), skip_indexing=True)
        doc_id_field = LabelField(int(doc_id), skip_indexing=True)

        query_tokenized = self._tokenizer.tokenize(query_sequence)
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_tokenized = self._tokenizer.tokenize(doc_sequence)
        if self.max_doc_length > -1:
            doc_tokenized = doc_tokenized[:self.max_doc_length]

        doc_field = TextField(doc_tokenized, self._token_indexers)

        return Instance({
            "query_id": query_id_field,
            "doc_id": doc_id_field,
            "query_tokens": query_field,
            "doc_tokens": doc_field}) 
Example #7
Source File: dataset_reader.py    From ConvLab with MIT License    6 votes
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
        intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        sequence = TextField(tokens, self._token_indexers)
        fields["tokens"] = sequence
        if tags:
            fields["tags"] = SequenceLabelField(tags, sequence)
        if domain:
            fields["domain"] = LabelField(domain, label_namespace="domain_labels")
        if intent:
            fields["intent"] = LabelField(intent, label_namespace="intent_labels")
        if dialog_act is not None:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                                "dialog_act": dialog_act})
        else:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens], "dialog_act": {}})
        return Instance(fields) 
Example #8
Source File: dataset_test.py    From allennlp with Apache License 2.0    6 votes
def get_instances(self):
        field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence", "."]], self.token_indexer
        )
        field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
            self.token_indexer,
        )
        field3 = TextField(
            [Token(t) for t in ["here", "is", "a", "sentence", "."]], self.token_indexer
        )
        field4 = TextField([Token(t) for t in ["this", "is", "short"]], self.token_indexer)
        instances = [
            Instance({"text1": field1, "text2": field2}),
            Instance({"text1": field3, "text2": field4}),
        ]
        return instances 
Example #9
Source File: vocabulary_test.py    From allennlp with Apache License 2.0    6 votes
def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding="utf-8")
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer, min_padding_length=2)
        tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
        text_field = TextField(tokens, {"characters": token_indexer})
        dataset = Batch([Instance({"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)

        vocab_dir = self.TEST_DIR / "vocab_save"
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)
        assert indexed_tokens == indexed_tokens2 
Example #10
Source File: vocabulary_test.py    From allennlp with Apache License 2.0    6 votes
def test_from_params_extend_config(self):

        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.save_to_files(vocab_dir)

        text_field = TextField(
            [Token(t) for t in ["a", "b"]], {"tokens": SingleIdTokenIndexer("tokens")}
        )
        instances = Batch([Instance({"text": text_field})])

        # If you ask to extend vocab from `directory`, instances must be passed
        # in Vocabulary constructor, or else there is nothing to extend to.
        params = Params({"type": "extend", "directory": vocab_dir})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params)

        # If you ask to extend vocab, `directory` key must be present in params,
        # or else there is nothing to extend from.
        params = Params({"type": "extend"})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances=instances) 
Example #11
Source File: vocabulary_test.py    From allennlp with Apache License 2.0    6 votes
def test_max_vocab_size_partial_dict(self):
        indexers = {
            "tokens": SingleIdTokenIndexer(),
            "token_characters": TokenCharactersIndexer(min_padding_length=3),
        }
        instance = Instance(
            {
                "text": TextField(
                    [Token(w) for w in "Abc def ghi jkl mno pqr stu vwx yz".split(" ")], indexers
                )
            }
        )
        dataset = Batch([instance])
        params = Params({"max_vocab_size": {"tokens": 1}})

        vocab = Vocabulary.from_params(params=params, instances=dataset)
        assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 + 2
        assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 + 2 
Example #12
Source File: index_field_test.py    From allennlp with Apache License 2.0    6 votes
def test_equality(self):
        index_field1 = IndexField(4, self.text)
        index_field2 = IndexField(4, self.text)
        index_field3 = IndexField(
            4,
            TextField(
                [Token(t) for t in ["AllenNLP", "is", "the", "bomb", "!"]],
                {"words": SingleIdTokenIndexer("words")},
            ),
        )

        assert index_field1 == 4
        assert index_field1 == index_field1
        assert index_field1 == index_field2

        assert index_field1 != index_field3
        assert index_field2 != index_field3
        assert index_field3 == index_field3 
Example #13
Source File: text_field_test.py    From allennlp with Apache License 2.0    6 votes
def test_as_tensor_handles_characters(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={
                "characters": TokenCharactersIndexer("characters", min_padding_length=1)
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        expected_character_array = numpy.array(
            [
                [1, 1, 1, 3, 0, 0, 0, 0],
                [1, 3, 0, 0, 0, 0, 0, 0],
                [1, 0, 0, 0, 0, 0, 0, 0],
                [3, 4, 5, 6, 4, 5, 7, 4],
                [1, 0, 0, 0, 0, 0, 0, 0],
            ]
        )
        numpy.testing.assert_array_almost_equal(
            tensor_dict["characters"]["token_characters"].detach().cpu().numpy(),
            expected_character_array,
        ) 
Example #14
Source File: text_field_test.py    From allennlp with Apache License 2.0    6 votes
def test_token_padding_lengths_are_computed_correctly(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict": DictReturningTokenIndexer(token_min_padding_length=3),
                "words": SingleIdTokenIndexer("words", token_min_padding_length=3),
                "characters": TokenCharactersIndexer(
                    "characters", min_padding_length=1, token_min_padding_length=3
                ),
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "field_with_dict___token_ids": 5,
            "field_with_dict___additional_key": 3,
            "words___tokens": 3,
            "characters___token_characters": 3,
            "characters___num_token_characters": 8,
        }
        tensors = field.as_tensor(padding_lengths)
        assert tensors["field_with_dict"]["additional_key"].tolist()[-1] == 0
        assert tensors["words"]["tokens"].tolist()[-1] == 0
        assert tensors["characters"]["token_characters"].tolist()[-1] == [0] * 8 
Example #15
Source File: conll2003.py    From magnitude with MIT License    6 votes
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, u"r") as data_file:
            logger.info(u"Reading instances from lines in file at: %s", file_path)

            # Group into alternating divider / sentence chunks.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    tokens, pos_tags, chunk_tags, ner_tags = [list(field) for field in izip(*fields)]
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens]

                    yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags) 
Example #16
Source File: snli.py    From magnitude with MIT License    6 votes
def text_to_instance(self,  # type: ignore
                         premise,
                         hypothesis,
                         label=None):
        # pylint: disable=arguments-differ
        fields = {}
        premise_tokens = self._tokenizer.tokenize(premise)
        hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
        fields[u'premise'] = TextField(premise_tokens, self._token_indexers)
        fields[u'hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
        if label:
            fields[u'label'] = LabelField(label)

        metadata = {u"premise_tokens": [x.text for x in premise_tokens],
                    u"hypothesis_tokens": [x.text for x in hypothesis_tokens]}
        fields[u"metadata"] = MetadataField(metadata)
        return Instance(fields) 
Example #17
Source File: semantic_role_labeling.py    From magnitude with MIT License    6 votes
def text_to_instance(self,  # type: ignore
                         tokens,
                         verb_label,
                         tags=None):
        u"""
        We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
        one-hot binary vector, the same length as the tokens, indicating the position of the verb
        to find arguments for.
        """
        # pylint: disable=arguments-differ
        fields = {}
        text_field = TextField(tokens, token_indexers=self._token_indexers)
        fields[u'tokens'] = text_field
        fields[u'verb_indicator'] = SequenceLabelField(verb_label, text_field)
        if tags:
            fields[u'tags'] = SequenceLabelField(tags, text_field)

        if all([x == 0 for x in verb_label]):
            verb = None
        else:
            verb = tokens[verb_label.index(1)].text
        fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens],
                                            u"verb": verb})
        return Instance(fields) 
Example #18
Source File: seq2seq.py    From magnitude with MIT License    5 votes
def text_to_instance(self, source_string, target_string=None):  # type: ignore
        # pylint: disable=arguments-differ
        tokenized_source = self._source_tokenizer.tokenize(source_string)
        if self._source_add_start_token:
            tokenized_source.insert(0, Token(START_SYMBOL))
        tokenized_source.append(Token(END_SYMBOL))
        source_field = TextField(tokenized_source, self._source_token_indexers)
        if target_string is not None:
            tokenized_target = self._target_tokenizer.tokenize(target_string)
            tokenized_target.insert(0, Token(START_SYMBOL))
            tokenized_target.append(Token(END_SYMBOL))
            target_field = TextField(tokenized_target, self._target_token_indexers)
            return Instance({u"source_tokens": source_field, u"target_tokens": target_field})
        else:
            return Instance({u'source_tokens': source_field}) 
Example #19
Source File: ebmnlp.py    From scibert with Apache License 2.0    5 votes
def text_to_instance(self,
                         tokens: List[Token],
                         pico_tags: List[str] = None):
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}
        instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
        
        # Set the field 'labels' according to the specified PIO element
        if pico_tags is not None:
            instance_fields['tags'] = SequenceLabelField(pico_tags, sequence, self.label_namespace)

        return Instance(instance_fields) 
Example #20
Source File: text_field_test.py    From allennlp with Apache License 2.0    5 votes
def test_as_tensor_handles_longer_lengths(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")},
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        padding_lengths["words___tokens"] = 10
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"].detach().cpu().numpy(),
            numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]),
        ) 
Example #21
Source File: text_field_test.py    From allennlp with Apache License 2.0    5 votes
def test_as_tensor_handles_words(self):
        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")},
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        tensor_dict = field.as_tensor(padding_lengths)
        numpy.testing.assert_array_almost_equal(
            tensor_dict["words"]["tokens"].detach().cpu().numpy(), numpy.array([1, 1, 1, 2, 1])
        ) 
Example #22
Source File: text_field_test.py    From allennlp with Apache License 2.0    5 votes
def test_get_padding_lengths_raises_if_no_indexed_tokens(self):

        field = TextField(
            [Token(t) for t in ["This", "is", "a", "sentence", "."]],
            token_indexers={"words": SingleIdTokenIndexer("words")},
        )
        with pytest.raises(ConfigurationError):
            field.get_padding_lengths() 
Example #23
Source File: text_field_test.py    From allennlp with Apache License 2.0    5 votes
def test_index_converts_field_correctly(self):
        vocab = Vocabulary()
        sentence_index = vocab.add_token_to_namespace("sentence", namespace="words")
        capital_a_index = vocab.add_token_to_namespace("A", namespace="words")
        capital_a_char_index = vocab.add_token_to_namespace("A", namespace="characters")
        s_index = vocab.add_token_to_namespace("s", namespace="characters")
        e_index = vocab.add_token_to_namespace("e", namespace="characters")
        n_index = vocab.add_token_to_namespace("n", namespace="characters")
        t_index = vocab.add_token_to_namespace("t", namespace="characters")
        c_index = vocab.add_token_to_namespace("c", namespace="characters")

        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            {"words": SingleIdTokenIndexer(namespace="words")},
        )
        field.index(vocab)

        assert field._indexed_tokens["words"]["tokens"] == [capital_a_index, sentence_index]

        field1 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            {"characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1)},
        )
        field1.index(vocab)
        assert field1._indexed_tokens["characters"]["token_characters"] == [
            [capital_a_char_index],
            [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index],
        ]
        field2 = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "words": SingleIdTokenIndexer(namespace="words"),
                "characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1),
            },
        )
        field2.index(vocab)
        assert field2._indexed_tokens["words"]["tokens"] == [capital_a_index, sentence_index]
        assert field2._indexed_tokens["characters"]["token_characters"] == [
            [capital_a_char_index],
            [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index],
        ] 
Example #24
Source File: ontonotes_ner.py    From magnitude with MIT License    5 votes
def text_to_instance(self, # type: ignore
                         tokens,
                         ner_tags=None):
        u"""
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        sequence = TextField(tokens, self._token_indexers)
        instance_fields = {u'tokens': sequence}
        instance_fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
        # Add "tag label" to instance
        if ner_tags is not None:
            if self._coding_scheme == u"BIOUL":
                ner_tags = to_bioul(ner_tags, encoding=u"BIO")
            instance_fields[u'tags'] = SequenceLabelField(ner_tags, sequence)
        return Instance(instance_fields) 
Example #25
Source File: classification_dataset_reader.py    From scibert with Apache License 2.0    5 votes
def text_to_instance(self,
                         text: str,
                         label: str = None,
                         metadata: Any = None) -> Instance:  # type: ignore
        text_tokens = self._tokenizer.tokenize(text)
        fields = {
            'text': TextField(text_tokens, self._token_indexers),
        }
        if label is not None:
            fields['label'] = LabelField(label)

        if metadata:
            fields['metadata'] = MetadataField(metadata)
        return Instance(fields) 
Example #26
Source File: index_field_test.py    From allennlp with Apache License 2.0    5 votes
def setup_method(self):
        super().setup_method()
        self.text = TextField(
            [Token(t) for t in ["here", "is", "a", "sentence", "."]],
            {"words": SingleIdTokenIndexer("words")},
        ) 
Example #27
Source File: list_field_test.py    From allennlp with Apache License 2.0    5 votes
def test_empty_list_can_be_tensorized(self):
        tokenizer = SpacyTokenizer()
        tokens = tokenizer.tokenize("Foo")
        text_field = TextField(tokens, self.word_indexer)
        list_field = ListField([text_field.empty_field()])
        fields = {
            "list": list_field,
            "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
        }
        instance = Instance(fields)
        instance.index_fields(self.vocab)
        instance.as_tensor_dict() 
Example #28
Source File: sequence_tagging.py    From magnitude with MIT License    5 votes
def text_to_instance(self, tokens, tags=None):  # type: ignore
        u"""
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields = {}
        sequence = TextField(tokens, self._token_indexers)
        fields[u"tokens"] = sequence
        fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
        if tags is not None:
            fields[u"tags"] = SequenceLabelField(tags, sequence)
        return Instance(fields) 
Example #29
Source File: span_field_test.py    From allennlp with Apache License 2.0    5 votes
def test_equality(self):
        span_field1 = SpanField(2, 3, self.text)
        span_field2 = SpanField(2, 3, self.text)
        span_field3 = SpanField(
            2, 3, TextField([Token(t) for t in ["not", "the", "same", "tokens"]], self.indexers)
        )

        assert span_field1 == (2, 3)
        assert span_field1 == span_field1
        assert span_field1 == span_field2
        assert span_field1 != span_field3
        assert span_field2 != span_field3 
Example #30
Source File: text_field_test.py    From allennlp with Apache License 2.0    5 votes
def test_token_indexer_returns_dict(self):
        field = TextField(
            [Token(t) for t in ["A", "sentence"]],
            token_indexers={
                "field_with_dict": DictReturningTokenIndexer(),
                "words": SingleIdTokenIndexer("words"),
                "characters": TokenCharactersIndexer("characters", min_padding_length=1),
            },
        )
        field.index(self.vocab)
        padding_lengths = field.get_padding_lengths()
        assert padding_lengths == {
            "field_with_dict___token_ids": 5,
            "field_with_dict___additional_key": 2,
            "words___tokens": 2,
            "characters___token_characters": 2,
            "characters___num_token_characters": 8,
        }
        padding_lengths["field_with_dict___token_ids"] = 7
        padding_lengths["field_with_dict___additional_key"] = 3
        padding_lengths["words___tokens"] = 4
        padding_lengths["characters___token_characters"] = 4
        tensors = field.as_tensor(padding_lengths)
        assert list(tensors["field_with_dict"]["token_ids"].shape) == [7]
        assert list(tensors["field_with_dict"]["additional_key"].shape) == [3]
        assert list(tensors["words"]["tokens"].shape) == [4]
        assert list(tensors["characters"]["token_characters"].shape) == [4, 8]