Python allennlp.data.instance.Instance() Examples
The following are 30 code examples of allennlp.data.instance.Instance(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.data.instance, or try the search function.
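Before the examples, here is a minimal sketch of how an Instance is typically built by hand and indexed against a Vocabulary. It assumes the allennlp 1.x import paths (older releases expose the same classes under slightly different modules), and the tokens and label are purely illustrative:

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

# Build the fields that make up a single example.
token_indexers = {"tokens": SingleIdTokenIndexer()}
tokens = [Token(t) for t in "AllenNLP makes instances easy .".split()]
fields = {
    "tokens": TextField(tokens, token_indexers),
    "label": LabelField("positive"),
}
instance = Instance(fields)

# A Vocabulary is built from instances, and an instance must be indexed
# against it before it can be turned into tensors.
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)
print(instance.get_padding_lengths())

Everything a DatasetReader's text_to_instance method does in the examples below is some variation of this pattern: build Fields, wrap them in an Instance, and let the framework handle indexing and padding.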
Example #1
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
                     intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    if tags:
        fields["tags"] = SequenceLabelField(tags, sequence)
    if domain:
        fields["domain"] = LabelField(domain, label_namespace="domain_labels")
    if intent:
        fields["intent"] = LabelField(intent, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
    return Instance(fields)
Example #2
Source File: dataset_reader.py From magnitude with MIT License | 6 votes |
def text_to_instance(self, *inputs):
    u"""
    Does whatever tokenization or processing is necessary to go from textual input to an
    ``Instance``.  The primary intended use for this is with a
    :class:`~allennlp.service.predictors.predictor.Predictor`, which gets text input as a JSON
    object and needs to process it to be input to a model.

    The intent here is to share code between :func:`_read` and what happens at model serving
    time, or any other time you want to make a prediction from new data.  We need to process
    the data in the same way it was done at training time.  Allowing the ``DatasetReader`` to
    process new text lets us accomplish this, as we can just call
    ``DatasetReader.text_to_instance`` when serving predictions.

    The input type here is rather vaguely specified, unfortunately.  The ``Predictor`` will
    have to make some assumptions about the kind of ``DatasetReader`` that it's using, in
    order to pass it the right information.
    """
    raise NotImplementedError
Example #3
Source File: snli.py From magnitude with MIT License | 6 votes |
def text_to_instance(self,  # type: ignore
                     premise,
                     hypothesis,
                     label=None):
    # pylint: disable=arguments-differ
    fields = {}
    premise_tokens = self._tokenizer.tokenize(premise)
    hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
    fields[u'premise'] = TextField(premise_tokens, self._token_indexers)
    fields[u'hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    if label:
        fields[u'label'] = LabelField(label)
    metadata = {u"premise_tokens": [x.text for x in premise_tokens],
                u"hypothesis_tokens": [x.text for x in hypothesis_tokens]}
    fields[u"metadata"] = MetadataField(metadata)
    return Instance(fields)
Example #4
Source File: semantic_role_labeling.py From magnitude with MIT License | 6 votes |
def text_to_instance(self,  # type: ignore
                     tokens,
                     verb_label,
                     tags=None):
    u"""
    We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
    one-hot binary vector, the same length as the tokens, indicating the position of the verb
    to find arguments for.
    """
    # pylint: disable=arguments-differ
    fields = {}
    text_field = TextField(tokens, token_indexers=self._token_indexers)
    fields[u'tokens'] = text_field
    fields[u'verb_indicator'] = SequenceLabelField(verb_label, text_field)
    if tags:
        fields[u'tags'] = SequenceLabelField(tags, text_field)

    if all([x == 0 for x in verb_label]):
        verb = None
    else:
        verb = tokens[verb_label.index(1)].text
    fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens],
                                         u"verb": verb})
    return Instance(fields)
Example #5
Source File: data_loading.py From teaching with GNU General Public License v3.0 | 6 votes |
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_id_field = LabelField(int(query_id), skip_indexing=True)
    doc_id_field = LabelField(int(doc_id), skip_indexing=True)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]
    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]
    doc_field = TextField(doc_tokenized, self._token_indexers)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field})
Example #6
Source File: data_loading.py From teaching with GNU General Public License v3.0 | 6 votes |
def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]
    query_field = TextField(query_tokenized, self._token_indexers)

    doc_pos_tokenized = self._tokenizer.tokenize(doc_pos_sequence)
    if self.max_doc_length > -1:
        doc_pos_tokenized = doc_pos_tokenized[:self.max_doc_length]
    doc_pos_field = TextField(doc_pos_tokenized, self._token_indexers)

    doc_neg_tokenized = self._tokenizer.tokenize(doc_neg_sequence)
    if self.max_doc_length > -1:
        doc_neg_tokenized = doc_neg_tokenized[:self.max_doc_length]
    doc_neg_field = TextField(doc_neg_tokenized, self._token_indexers)

    return Instance({
        "query_tokens": query_field,
        "doc_pos_tokens": doc_pos_field,
        "doc_neg_tokens": doc_neg_field})
Example #7
Source File: language_modeling.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, u"r") as text_file:
        instance_strings = text_file.readlines()

    if self._tokens_per_instance is not None:
        all_text = u" ".join([x.replace(u"\n", u" ").strip() for x in instance_strings])
        tokenized_text = self._tokenizer.tokenize(all_text)
        num_tokens = self._tokens_per_instance + 1
        tokenized_strings = []
        logger.info(u"Creating dataset from all text in file: %s", file_path)
        for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
            tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
    else:
        tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

    for tokenized_string in tokenized_strings:
        input_field = TextField(tokenized_string[:-1], self._token_indexers)
        output_field = TextField(tokenized_string[1:], self._output_indexer)
        yield Instance({u'input_tokens': input_field,
                        u'output_tokens': output_field})

#overrides
Example #8
Source File: dataset.py From magnitude with MIT License | 6 votes |
def get_padding_lengths(self):
    u"""
    Gets the maximum padding lengths from all ``Instances`` in this batch.  Each ``Instance``
    has multiple ``Fields``, and each ``Field`` could have multiple things that need padding.
    We look at all fields in all instances, and find the max values for each (field_name,
    padding_key) pair, returning them in a dictionary.

    This can then be used to convert this batch into arrays of consistent length, or to set
    model parameters, etc.
    """
    padding_lengths = defaultdict(dict)
    all_instance_lengths = [instance.get_padding_lengths() for instance in self.instances]
    if not all_instance_lengths:
        return padding_lengths
    all_field_lengths = defaultdict(list)
    for instance_lengths in all_instance_lengths:
        for field_name, instance_field_lengths in instance_lengths.items():
            all_field_lengths[field_name].append(instance_field_lengths)
    for field_name, field_lengths in all_field_lengths.items():
        for padding_key in field_lengths[0].keys():
            max_value = max(x[padding_key] if padding_key in x else 0 for x in field_lengths)
            padding_lengths[field_name][padding_key] = max_value
    return padding_lengths
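As a rough usage sketch (not taken from the project above), the dictionary returned by get_padding_lengths is exactly what as_tensor_dict consumes. Batch lives at allennlp.data.dataset in the 0.x code shown here and is re-exported as allennlp.data.Batch in 1.x; the exact padding keys depend on the allennlp version and on the token indexers attached to each field:

from allennlp.data import Batch, Instance, Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

indexers = {"tokens": SingleIdTokenIndexer()}
instances = [
    Instance({"tokens": TextField([Token(t) for t in text.split()], indexers)})
    for text in ("a short sentence", "a somewhat longer example sentence")
]

batch = Batch(instances)
batch.index_instances(Vocabulary.from_instances(instances))
padding_lengths = batch.get_padding_lengths()
# e.g. {"tokens": {"tokens___tokens": 5}} -- the exact keys vary by version
# and by indexer, but the maximum over all instances is taken per key.
tensors = batch.as_tensor_dict(padding_lengths)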
Example #9
Source File: ebmnlp.py From scibert with Apache License 2.0 | 6 votes |
def _read(self, file_path: str) -> Iterable[Instance]:
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if not is_divider:
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                fields = [list(field) for field in zip(*fields)]
                tokens_, _, _, pico_tags = fields
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens_]

                yield self.text_to_instance(tokens, pico_tags)
Example #10
Source File: prolocal_dataset_reader.py From propara with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # type: ignore
                     sentence_tokens: List[str],
                     verb_vector: List[int],
                     entity_vector: List[int],
                     state_change_types: Optional[List[str]] = None,
                     state_change_tags: Optional[List[str]] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    # encode inputs
    token_field = TextField([Token(word) for word in sentence_tokens], self._token_indexers)
    fields['tokens'] = token_field
    fields['verb_span'] = SequenceLabelField(verb_vector, token_field, 'indicator_tags')
    fields['entity_span'] = SequenceLabelField(entity_vector, token_field, 'indicator_tags')

    # encode outputs
    if state_change_types:
        fields['state_change_type_labels'] = LabelField(state_change_types, 'state_change_type_labels')
    if state_change_tags:
        fields['state_change_tags'] = SequenceLabelField(state_change_tags, token_field, 'state_change_tags')

    return Instance(fields)
Example #11
Source File: semeval_2010_task_8_reader.py From DISTRE with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     entity_1: Tuple[int],
                     entity_2: Tuple[int],
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    tokens = [OpenAISplitter._standardize(token) for token in tokens]
    tokens = (['__start__'] + tokens[entity_1[0]:entity_1[1]+1] + ['__del1__']
              + tokens[entity_2[0]:entity_2[1]+1] + ['__del2__'] + tokens + ['__clf__'])

    sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
    fields['sentence'] = sentence
    #fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
    #fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)

    if label:
        fields['label'] = LabelField(label)

    return Instance(fields)
Example #12
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
                     intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    # print([t.text for t in context_tokens])
    fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
    fields["tokens"] = TextField(tokens, self._token_indexers)
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, fields["tokens"])
    if intents is not None:
        fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
    return Instance(fields)
Example #13
Source File: dataset.py From magnitude with MIT License | 6 votes |
def print_statistics(self):
    pass
    # # Make sure it has been indexed first
    # sequence_field_lengths = defaultdict(list)
    # for instance in self.instances:
    #     if not instance.indexed:
    #         raise ConfigurationError(u"Instances must be indexed with vocabulary "
    #                                  u"before asking to print dataset statistics.")
    #     for field, field_padding_lengths in instance.get_padding_lengths().items():
    #         for key, value in field_padding_lengths.items():
    #             sequence_field_lengths["{field}.{key}"].append(value)

    # print(u"\n\n----Dataset Statistics----\n")
    # for name, lengths in sequence_field_lengths.items():
    #     print("Statistics for {name}:")
    #     print("\tLengths: Mean: {numpy.mean(lengths)}, Standard Dev: {numpy.std(lengths)}, "
    #           "Max: {numpy.max(lengths)}, Min: {numpy.min(lengths)}")

    # print(u"\n10 Random instances: ")
    # for i in list(numpy.random.randint(len(self.instances), size=10)):
    #     print("Instance {i}:")
    #     print("\t{self.instances[i]}")
Example #14
Source File: lazy_dataset_reader_test.py From allennlp with Apache License 2.0 | 5 votes |
def _read(self, _: str) -> Iterable[Instance]:
    self.num_reads += 1
    return (instance for instance in self._instances)
Example #15
Source File: ontonotes_ner.py From magnitude with MIT License | 5 votes |
def text_to_instance(self,  # type: ignore
                     tokens,
                     ner_tags=None):
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields = {u'tokens': sequence}
    instance_fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})

    # Add "tag label" to instance
    if ner_tags is not None:
        if self._coding_scheme == u"BIOUL":
            ner_tags = to_bioul(ner_tags, encoding=u"BIO")
        instance_fields[u'tags'] = SequenceLabelField(ner_tags, sequence)
    return Instance(instance_fields)
Example #16
Source File: universal_dependencies.py From magnitude with MIT License | 5 votes |
def text_to_instance(self,  # type: ignore
                     words,
                     upos_tags,
                     dependencies=None):
    # pylint: disable=arguments-differ
    u"""
    Parameters
    ----------
    words : ``List[str]``, required.
        The words in the sentence to be encoded.
    upos_tags : ``List[str]``, required.
        The universal dependencies POS tags for each word.
    dependencies : ``List[Tuple[str, int]]``, optional (default = None)
        A list of (head tag, head index) tuples. Indices are 1 indexed, meaning an index of 0
        corresponds to that word being the root of the dependency tree.

    Returns
    -------
    An instance containing words, upos tags, dependency head tags and head indices as fields.
    """
    fields = {}

    tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields[u"words"] = tokens
    fields[u"pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace=u"pos")
    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields[u"head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                  tokens,
                                                  label_namespace=u"head_tags")
        fields[u"head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                     tokens,
                                                     label_namespace=u"head_index_tags")

    fields[u"metadata"] = MetadataField({u"words": words, u"pos": upos_tags})
    return Instance(fields)
Example #17
Source File: sequence_tagging.py From magnitude with MIT License | 5 votes |
def text_to_instance(self, tokens, tags=None):  # type: ignore
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields = {}
    sequence = TextField(tokens, self._token_indexers)
    fields[u"tokens"] = sequence
    fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
    if tags is not None:
        fields[u"tags"] = SequenceLabelField(tags, sequence)
    return Instance(fields)
Example #18
Source File: language_modeling.py From magnitude with MIT License | 5 votes |
def text_to_instance(self, sentence):  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_string = self._tokenizer.tokenize(sentence)
    input_field = TextField(tokenized_string[:-1], self._token_indexers)
    output_field = TextField(tokenized_string[1:], self._output_indexer)
    return Instance({u'input_tokens': input_field, u'output_tokens': output_field})
Example #19
Source File: decompatt_predictor.py From scitail with Apache License 2.0 | 5 votes |
def _json_to_instance(self,  # type: ignore
                      json_dict: JsonDict) -> Instance:
    # pylint: disable=arguments-differ
    premise_text = json_dict["sentence1"]
    hypothesis_text = json_dict["sentence2"]
    return self._dataset_reader.text_to_instance(premise_text, hypothesis_text)
Example #20
Source File: entailment_tuple_reader.py From scitail with Apache License 2.0 | 5 votes |
def text_to_instance(self,
                     premise: str,
                     hypothesis: str,
                     hypothesis_structure: str,
                     label: str = None) -> Instance:
    fields: Dict[str, Field] = {}
    premise_tokens = self._tokenizer.tokenize(premise)[-self._max_tokens:]
    hypothesis_tokens = self._tokenizer.tokenize(hypothesis)[-self._max_tokens:]

    fields['premise'] = TextField(premise_tokens, self._token_indexers)
    fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    metadata = {
        'premise': premise,
        'hypothesis': hypothesis,
        'premise_tokens': [token.text for token in premise_tokens],
        'hypothesis_tokens': [token.text for token in hypothesis_tokens]
    }
    fields['metadata'] = MetadataField(metadata)
    self._add_structure_to_fields(hypothesis_structure, fields)
    if label:
        fields['label'] = LabelField(label)
    return Instance(fields)
Example #21
Source File: util_test.py From allennlp with Apache License 2.0 | 5 votes |
def train_util_test_reader():
    @DatasetReader.register("train-util-test-reader")
    class TrainUtilTestReader(DatasetReader):
        def _read(self, data_path):
            logger.info("...train-util-test-reader reading from %s", data_path)
            for i in range(10):
                yield self.text_to_instance(i)

        def text_to_instance(self, index: int) -> Instance:  # type: ignore
            return Instance({"index": LabelField(index, skip_indexing=True)})

    yield TrainUtilTestReader

    del DatasetReader._registry[DatasetReader]["train-util-test-reader"]
Example #22
Source File: dataloader_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_loader_uses_all_instances_when_batches_per_epochs_set(lazy):
    NUM_INSTANCES = 20
    BATCH_SIZE = 2
    BATCHES_PER_EPOCH = 3
    EPOCHS = 4

    class FakeDatasetReader(DatasetReader):
        def _read(self, filename: str) -> Iterable[Instance]:
            for i in range(NUM_INSTANCES):
                yield Instance({"index": LabelField(i, skip_indexing=True)})

    reader = FakeDatasetReader(lazy=lazy)
    dataset = reader.read("blah")

    loader = PyTorchDataLoader(dataset, batch_size=BATCH_SIZE, batches_per_epoch=BATCHES_PER_EPOCH)
    epoch_batches = []
    for epoch in range(EPOCHS):
        batches = []
        for batch in loader:
            instances = []
            for index in batch["index"]:
                instances.append(index)
            batches.append(instances)
        epoch_batches.append(batches)

    assert epoch_batches == [
        # Epoch 0.
        [[0, 1], [2, 3], [4, 5]],
        # Epoch 1.
        [[6, 7], [8, 9], [10, 11]],
        # Epoch 2.
        [[12, 13], [14, 15], [16, 17]],
        # Epoch 3.
        [[18, 19], [0, 1], [2, 3]],
    ]
Example #23
Source File: dataloader_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_multi_processing_with_lazy_dataset_warns():
    def fake_instance_generator(file_name: str) -> Iterable[Instance]:
        yield from []

    with pytest.warns(UserWarning, match=r".*deadlocks.*"):
        PyTorchDataLoader(
            AllennlpLazyDataset(fake_instance_generator, "nonexistent_file"), num_workers=1
        )
Example #24
Source File: sharded_dataset_reader_test.py From allennlp with Apache License 2.0 | 5 votes |
def fingerprint(instance: Instance) -> Tuple[str, ...]:
    """
    Get a hashable representation of a sequence tagging instance
    that can be put in a Counter.
    """
    text_tuple = tuple(t.text for t in instance.fields["tokens"].tokens)  # type: ignore
    labels_tuple = tuple(instance.fields["tags"].labels)  # type: ignore
    return text_tuple + labels_tuple
Example #25
Source File: lazy_dataset_reader_test.py From allennlp with Apache License 2.0 | 5 votes |
def __init__(self, instances: List[Instance], lazy: bool) -> None:
    super().__init__()
    self.lazy = lazy
    self._instances = instances
    self.num_reads = 0
Example #26
Source File: dataloader.py From allennlp with Apache License 2.0 | 5 votes |
def allennlp_collate(instances: List[Instance]) -> TensorDict:
    batch = Batch(instances)
    return batch.as_tensor_dict(batch.get_padding_lengths())
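A hedged usage sketch (not from the project above): allennlp_collate is meant to be handed to a plain torch.utils.data.DataLoader as its collate_fn. The dataset here is an AllennlpDataset built directly from instances and a vocabulary, purely for illustration; in practice it usually comes from DatasetReader.read followed by dataset.index_with(vocab):

import torch.utils.data

from allennlp.data import AllennlpDataset, Instance, Token, Vocabulary
from allennlp.data.dataloader import allennlp_collate
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

indexers = {"tokens": SingleIdTokenIndexer()}
instances = [
    Instance({
        "tokens": TextField([Token(t) for t in text.split()], indexers),
        "label": LabelField(label),
    })
    for text, label in [("a good movie", "pos"), ("a dull movie", "neg")]
]

vocab = Vocabulary.from_instances(instances)
dataset = AllennlpDataset(instances, vocab)  # instances are indexed against `vocab` on access

# allennlp_collate pads each batch to a consistent shape, exactly as Batch does above.
loader = torch.utils.data.DataLoader(dataset, batch_size=2, collate_fn=allennlp_collate)
for tensor_dict in loader:
    pass  # tensor_dict maps field names to padded tensors, ready for a Model's forward()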
Example #27
Source File: span_pred_reader.py From semanticRetrievalMRS with MIT License | 5 votes |
def text_to_instance(self,  # type: ignore
                     example) -> Instance:
    fields: Dict[str, Field] = {}

    joint_tokens_seq = ['[CLS]'] + example['query_c_tokens'] + ['[SEP]'] + example['context_c_tokens'] + ['[SEP]']
    assert len(joint_tokens_seq) < 512

    text1_len = len(example['query_c_tokens']) + 2
    text2_len = len(example['context_c_tokens']) + 1
    segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

    joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
    assert len(joint_tokens_ids) == len(segments_ids)

    fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
    fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

    # This text span is begin inclusive and end exclusive.
    text1_span = (1, 1 + len(example['query_c_tokens']))  # End is exclusive (important for later use)
    text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(example['context_c_tokens']))

    fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
    fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])
    # fields['bert_s2_span'] = SpanField(text2_span)
    # fields['bert_s1_span'] = MetadataField(text1_span)
    # fields['bert_s2_span'] = MetadataField(text2_span)

    # However, the ground truth span is begin and end both inclusive
    fields['gt_span'] = SpanField(example['start_position'], example['end_position'], fields['paired_sequence'])

    fields['fid'] = IdField(example['fid'])
    fields['uid'] = IdField(example['uid'])

    return Instance(fields)
Example #28
Source File: interleaving_dataset_reader_test.py From allennlp with Apache License 2.0 | 5 votes |
def text_to_instance(self, line: str) -> Instance:  # type: ignore
    tokens = self._tokenizer.tokenize(line)
    return Instance({"line": TextField(tokens, self._token_indexers)})
Example #29
Source File: interleaving_dataset_reader_test.py From allennlp with Apache License 2.0 | 5 votes |
def _read(self, file_path: str) -> Iterable[Instance]:
    with open(file_path) as input_file:
        for line in input_file:
            yield self.text_to_instance(line)
Example #30
Source File: pretrained_transformer_mismatched_embedder_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_token_without_wordpieces(self):
    token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")

    sentence1 = ["A", "", "AllenNLP", "sentence", "."]
    sentence2 = ["AllenNLP", "", "great"]
    tokens1 = [Token(word) for word in sentence1]
    tokens2 = [Token(word) for word in sentence2]
    vocab = Vocabulary()

    params = Params(
        {
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer_mismatched",
                    "model_name": "bert-base-uncased",
                }
            }
        }
    )
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    assert tokens["bert"]["offsets"].tolist() == [
        [[1, 1], [-1, -1], [2, 4], [5, 5], [6, 6]],
        [[1, 3], [-1, -1], [4, 4], [0, 0], [0, 0]],
    ]

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, max(len(sentence1), len(sentence2)), 768)
    assert not torch.isnan(bert_vectors).any()
    assert all(bert_vectors[0, 1] == 0)
    assert all(bert_vectors[1, 1] == 0)