Python allennlp.data.Instance() Examples

The following are 30 code examples of allennlp.data.Instance(), collected from open-source projects. The originating project, source file, and license are noted above each example. You may also want to check out the other available functions and classes of the allennlp.data module.
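For orientation, here is a minimal sketch of how an Instance is typically built from fields, indexed against a Vocabulary, and converted to tensors. It assumes a recent AllenNLP release; the field names "tokens" and "label" and the choice of SingleIdTokenIndexer are illustrative, not required by the API.

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

# An Instance is a named collection of Fields.
tokens = [Token(t) for t in ["this", "is", "a", "sentence"]]
text_field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
instance = Instance({"tokens": text_field, "label": LabelField("positive")})

# Build a vocabulary from the instance, index its fields, then tensorize.
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)
tensors = instance.as_tensor_dict()
print(list(tensors.keys()))  # ['tokens', 'label']

The examples below show the same pattern in context: dataset readers return Instances from text_to_instance(), predictors build them from JSON, and models consume them via forward_on_instance(s).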
Example #1
Source File: __init__.py    From nanigonet with MIT License
def predict_batch(self, texts):
        instances = []
        for text in texts:
            tokens = self._tokenizer.tokenize(text)
            instance = Instance({'tokens': TextField(tokens, self._token_indexers)})
            instances.append(instance)

        result = self.model.forward_on_instances(instances)

        results = []
        for instance_result, text in zip(result, texts):
            result = self._format_instance_result(instance_result)
            result['text'] = text
            results.append(result)

        return results 
Example #2
Source File: elmo_test.py    From magnitude with MIT License
def get_vocab_and_both_elmo_indexed_ids(batch):
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        indexer2 = SingleIdTokenIndexer()
        for sentence in batch:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens,
                              {u'character_ids': indexer,
                               u'tokens': indexer2})
            instance = Instance({u"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        dataset.index_instances(vocab)
        return vocab, dataset.as_tensor_dict()[u"elmo"] 
Example #3
Source File: predictor.py    From allennlp with Apache License 2.0
def json_to_labeled_instances(self, inputs: JsonDict) -> List[Instance]:
        """
        Converts incoming JSON to an [`Instance`](../data/instance.md),
        runs the model on the newly created instance, and adds labels to the
        `Instance`s given by the model's output.

        # Returns

        `List[Instance]`
            A list of `Instance`s.
        """

        instance = self._json_to_instance(inputs)
        outputs = self._model.forward_on_instance(instance)
        new_instances = self.predictions_to_labeled_instances(instance, outputs)
        return new_instances 
Example #4
Source File: predictor.py    From udify with MIT License
def _predict_unknown(self, instance: Instance):
        """
        Maps each unknown label in each namespace to a default token
        :param instance: the instance containing a list of labels for each namespace
        """
        def replace_tokens(instance: Instance, namespace: str, token: str):
            if namespace not in instance.fields:
                return

            instance.fields[namespace].labels = [label
                                                 if label in self._model.vocab._token_to_index[namespace]
                                                 else token
                                                 for label in instance.fields[namespace].labels]

        replace_tokens(instance, "lemmas", "↓0;d¦")
        replace_tokens(instance, "feats", "_")
        replace_tokens(instance, "xpos", "_")
        replace_tokens(instance, "upos", "NOUN")
        replace_tokens(instance, "head_tags", "case") 
Example #5
Source File: vocabulary_test.py    From magnitude with MIT License
def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding=u'utf-8')
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
        tokens = [Token(t) for t in [u"Øyvind", u"für", u"汉字"]]
        text_field = TextField(tokens, {u"characters": token_indexer})
        dataset = Batch([Instance({u"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

        vocab_dir = self.TEST_DIR / u'vocab_save'
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {u"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
        assert indexed_tokens == indexed_tokens2 
Example #6
Source File: vocabulary_test.py    From magnitude with MIT License
def test_registrability(self):

        class MyVocabulary(object):
            @classmethod
            def from_params(cls, params, instances=None):
                # pylint: disable=unused-argument
                return MyVocabulary()


        MyVocabulary = Vocabulary.register(u'my-vocabulary')(MyVocabulary)

        params = Params({u'type': u'my-vocabulary'})

        instance = Instance(fields={})

        vocab = Vocabulary.from_params(params=params, instances=[instance])

        assert isinstance(vocab, MyVocabulary) 
Example #7
Source File: dataset_test.py    From allennlp with Apache License 2.0
def get_instances(self):
        field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence", "."]], self.token_indexer
        )
        field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
            self.token_indexer,
        )
        field3 = TextField(
            [Token(t) for t in ["here", "is", "a", "sentence", "."]], self.token_indexer
        )
        field4 = TextField([Token(t) for t in ["this", "is", "short"]], self.token_indexer)
        instances = [
            Instance({"text1": field1, "text2": field2}),
            Instance({"text1": field3, "text2": field4}),
        ]
        return instances 
Example #8
Source File: instance_test.py    From allennlp with Apache License 2.0
def test_duplicate(self):
        # Verify the `duplicate()` method works with a `PretrainedTransformerIndexer` in
        # a `TextField`. See https://github.com/allenai/allennlp/issues/4270.
        instance = Instance(
            {
                "words": TextField(
                    [Token("hello")], {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}
                )
            }
        )

        other = instance.duplicate()
        assert other == instance

        # Adding new fields to the original instance should not affect the duplicate.
        instance.add_field("labels", LabelField("some_label"))
        assert "labels" not in other.fields
        assert other != instance  # sanity check on the '__eq__' method. 
Example #9
Source File: dataset_reader.py    From nanigonet with MIT License
def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:

        if len(tokens) > self._max_token_len:
            tokens = tokens[:self._max_token_len]
            print(f'Length of tokens exceeded the limit {self._max_token_len}. Truncating...')
            if tags:
                tags = tags[:self._max_token_len]

        fields = {}

        text_field = TextField(tokens, self._token_indexers)
        fields['tokens'] = text_field
        if tags:
            fields['tags'] = SequenceLabelField(tags, text_field)

        return Instance(fields) 
Example #10
Source File: nlvr_parser.py    From allennlp-semparse with Apache License 2.0
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence = json_dict["sentence"]
        if "worlds" in json_dict:
            # This is grouped data
            worlds = json_dict["worlds"]
            if isinstance(worlds, str):
                worlds = json.loads(worlds)
        else:
            structured_rep = json_dict["structured_rep"]
            if isinstance(structured_rep, str):
                structured_rep = json.loads(structured_rep)
            worlds = [structured_rep]
        identifier = json_dict["identifier"] if "identifier" in json_dict else None
        instance = self._dataset_reader.text_to_instance(
            sentence=sentence,  # type: ignore
            structured_representations=worlds,
            identifier=identifier,
        )
        return instance 
Example #11
Source File: evaluate_custom.py    From OpenBookQA with Apache License 2.0
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             output_file: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))
        for batch in generator_tqdm:
            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                id2label = model.vocab.get_index_to_token_vocabulary("labels")
                _persist_data(file_handle, batch.get("metadata"), model_output, id2label=id2label)
            description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics() 
Example #12
Source File: evaluate_predictions_qa_mc_know_visualize.py    From OpenBookQA with Apache License 2.0
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             output_file: str = None,
             eval_type: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))
        for batch in generator_tqdm:
            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                _persist_data(file_handle, batch.get("metadata"), model_output, eval_type)
            description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics(reset=True) 
Example #13
Source File: vocabulary_test.py    From allennlp with Apache License 2.0
def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding="utf-8")
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer, min_padding_length=2)
        tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
        text_field = TextField(tokens, {"characters": token_indexer})
        dataset = Batch([Instance({"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)

        vocab_dir = self.TEST_DIR / "vocab_save"
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)
        assert indexed_tokens == indexed_tokens2 
Example #14
Source File: vocabulary_test.py    From allennlp with Apache License 2.0
def test_from_params_extend_config(self):

        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.save_to_files(vocab_dir)

        text_field = TextField(
            [Token(t) for t in ["a", "b"]], {"tokens": SingleIdTokenIndexer("tokens")}
        )
        instances = Batch([Instance({"text": text_field})])

        # If you ask to extend vocab from `directory`, instances must be passed
        # in Vocabulary constructor, or else there is nothing to extend to.
        params = Params({"type": "extend", "directory": vocab_dir})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params)

        # If you ask to extend vocab, `directory` key must be present in params,
        # or else there is nothing to extend from.
        params = Params({"type": "extend"})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances=instances) 
Example #15
Source File: vocabulary_test.py    From allennlp with Apache License 2.0
def test_max_vocab_size_partial_dict(self):
        indexers = {
            "tokens": SingleIdTokenIndexer(),
            "token_characters": TokenCharactersIndexer(min_padding_length=3),
        }
        instance = Instance(
            {
                "text": TextField(
                    [Token(w) for w in "Abc def ghi jkl mno pqr stu vwx yz".split(" ")], indexers
                )
            }
        )
        dataset = Batch([instance])
        params = Params({"max_vocab_size": {"tokens": 1}})

        vocab = Vocabulary.from_params(params=params, instances=dataset)
        assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 + 2
        assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 + 2 
Example #16
Source File: atis_parser.py    From allennlp-semparse with Apache License 2.0
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"utterance": "..."}``.
        """
        utterance = json_dict["utterance"]
        return self._dataset_reader.text_to_instance([utterance]) 
Example #17
Source File: __init__.py    From nanigonet with MIT License
def predict(self, text):
        tokens = self._tokenizer.tokenize(text)
        instance = Instance({'tokens': TextField(tokens, self._token_indexers)})

        result = self.model.forward_on_instance(instance)
        result = self._format_instance_result(result)
        result['text'] = text
        return result 
Example #18
Source File: wikitables_parser.py    From allennlp-semparse with Apache License 2.0
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"question": "...", "table": "..."}``.
        """
        question_text = json_dict["question"]
        table_rows = json_dict["table"].split("\n")

        # We are directly passing the raw table rows here. The code in ``TableQuestionContext`` will do some
        # minimal processing to extract dates and numbers from the cells.
        instance = self._dataset_reader.text_to_instance(
            question_text,  # type: ignore
            table_rows,
        )
        return instance 
Example #19
Source File: dataset_reader.py    From nanigonet with MIT License
def _read(self, file_path: str) -> Iterable[Instance]:
        file_path = cached_path(file_path)

        with open(file_path) as f:
            for line in f:
                data = json.loads(line)
                tokens = self._tokenizer.tokenize(data['text'])
                tags = data.get('labels')

                yield self.text_to_instance(tokens, tags) 
Example #20
Source File: text_predictor.py    From udify with MIT License
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        sentence = json_dict["sentence"]
        tokens = [word.text for word in self._dataset_reader.tokenizer.split_words(sentence)]
        return self._dataset_reader.text_to_instance(tokens) 
Example #21
Source File: dataset_test.py    From magnitude with MIT License
def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({u"tag": (LabelField(1, skip_indexing=True))})
        instance2 = Instance({u"words": TextField([Token(u"hello")], {})})
        with pytest.raises(ConfigurationError):
            _ = Batch([instance1, instance2]) 
Example #22
Source File: text_predictor.py    From udify with MIT License
def _predict_unknown(self, instance: Instance):
        """
        Maps each unknown label in each namespace to a default token
        :param instance: the instance containing a list of labels for each namespace
        """
        return self.predictor._predict_unknown(instance) 
Example #23
Source File: text_predictor.py    From udify with MIT License
def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
        return self.predictor.predict_batch_instance(instances) 
Example #24
Source File: predictor.py    From udify with MIT License
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        sentence = json_dict["sentence"]
        tokens = sentence.split()
        return self._dataset_reader.text_to_instance(tokens) 
Example #25
Source File: predictor.py    From udify with MIT License
def predict_instance(self, instance: Instance) -> JsonDict:
        if "@@UNKNOWN@@" not in self._model.vocab._token_to_index["lemmas"]:
            # Handle cases where the labels are present in the test set but not training set
            self._predict_unknown(instance)
        outputs = self._model.forward_on_instance(instance)
        return sanitize(outputs) 
Example #26
Source File: predictor.py    From udify with MIT License
def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
        if "@@UNKNOWN@@" not in self._model.vocab._token_to_index["lemmas"]:
            # Handle cases where the labels are present in the test set but not training set
            for instance in instances:
                self._predict_unknown(instance)
        outputs = self._model.forward_on_instances(instances)
        return sanitize(outputs) 
Example #27
Source File: list_field_test.py    From allennlp with Apache License 2.0
def test_empty_list_can_be_tensorized(self):
        tokenizer = SpacyTokenizer()
        tokens = tokenizer.tokenize("Foo")
        text_field = TextField(tokens, self.word_indexer)
        list_field = ListField([text_field.empty_field()])
        fields = {
            "list": list_field,
            "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
        }
        instance = Instance(fields)
        instance.index_fields(self.vocab)
        instance.as_tensor_dict() 
Example #28
Source File: dataset_reader_test.py    From allennlp with Apache License 2.0
def text_to_instance(self, index: int):  # type: ignore
        return Instance({"index": LabelField(index, skip_indexing=True)}) 
Example #29
Source File: dataset_reader_test.py    From allennlp with Apache License 2.0
def text_to_instance(self, index: int):  # type: ignore
        return Instance({"index": LabelField(index, skip_indexing=True)}) 
Example #30
Source File: dataset_reader_test.py    From allennlp with Apache License 2.0
def text_to_instance(self, index: int):  # type: ignore
        return Instance({"index": LabelField(index, skip_indexing=True)})