Python allennlp.data.Instance() Examples

The following are 30 code examples of allennlp.data.Instance(), collected from open-source projects. The originating project, source file, and license are noted above each example. You may also want to check out the other available functions and classes of the allennlp.data module.
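For orientation, here is a minimal sketch of how an Instance is typically built from fields, indexed against a Vocabulary, and converted to tensors. It assumes a recent AllenNLP release; the field names "tokens" and "label" and the choice of SingleIdTokenIndexer are illustrative, not required by the API.

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

# An Instance is a named collection of Fields.
tokens = [Token(t) for t in ["this", "is", "a", "sentence"]]
text_field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
instance = Instance({"tokens": text_field, "label": LabelField("positive")})

# Build a vocabulary from the instance, index its fields, then tensorize.
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)
tensors = instance.as_tensor_dict()
print(list(tensors.keys()))  # ['tokens', 'label']

The examples below show the same pattern in context: dataset readers return Instances from text_to_instance(), predictors build them from JSON, and models consume them via forward_on_instance(s).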
Example #1
Source File: __init__.py    From nanigonet with MIT License
def predict_batch(self, texts):
        instances = []
        for text in texts:
            tokens = self._tokenizer.tokenize(text)
            instance = Instance({'tokens': TextField(tokens, self._token_indexers)})
            instances.append(instance)

        result = self.model.forward_on_instances(instances)

        results = []
        for instance_result, text in zip(result, texts):
            result = self._format_instance_result(instance_result)
            result['text'] = text
            results.append(result)

        return results 
Example #2
Source File: elmo_test.py    From magnitude with MIT License
def get_vocab_and_both_elmo_indexed_ids(batch):
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        indexer2 = SingleIdTokenIndexer()
        for sentence in batch:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens,
                              {u'character_ids': indexer,
                               u'tokens': indexer2})
            instance = Instance({u"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        dataset.index_instances(vocab)
        return vocab, dataset.as_tensor_dict()[u"elmo"] 
Example #3
Source File: predictor.py    From allennlp with Apache License 2.0
def json_to_labeled_instances(self, inputs: JsonDict) -> List[Instance]:
        """
        Converts incoming JSON to an [`Instance`](../data/instance.md),
        runs the model on the newly created instance, and adds labels to the
        `Instance`s given by the model's output.

        # Returns

        `List[Instance]`
            A list of `Instance`s.
        """

        instance = self._json_to_instance(inputs)
        outputs = self._model.forward_on_instance(instance)
        new_instances = self.predictions_to_labeled_instances(instance, outputs)
        return new_instances 
Example #4
Source File: predictor.py    From udify with MIT License
def _predict_unknown(self, instance: Instance):
        """
        Maps each unknown label in each namespace to a default token
        :param instance: the instance containing a list of labels for each namespace
        """
        def replace_tokens(instance: Instance, namespace: str, token: str):
            if namespace not in instance.fields:
                return

            instance.fields[namespace].labels = [label
                                                 if label in self._model.vocab._token_to_index[namespace]
                                                 else token
                                                 for label in instance.fields[namespace].labels]

        replace_tokens(instance, "lemmas", "↓0;d¦")
        replace_tokens(instance, "feats", "_")
        replace_tokens(instance, "xpos", "_")
        replace_tokens(instance, "upos", "NOUN")
        replace_tokens(instance, "head_tags", "case") 
Example #5
Source File: vocabulary_test.py    From magnitude with MIT License
def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding=u'utf-8')
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
        tokens = [Token(t) for t in [u"Øyvind", u"für", u"汉字"]]
        text_field = TextField(tokens, {u"characters": token_indexer})
        dataset = Batch([Instance({u"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

        vocab_dir = self.TEST_DIR / u'vocab_save'
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {u"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
        assert indexed_tokens == indexed_tokens2 
Example #6
Source File: vocabulary_test.py    From magnitude with MIT License
def test_registrability(self):

        class MyVocabulary(object):
            @classmethod
            def from_params(cls, params, instances=None):
                # pylint: disable=unused-argument
                return MyVocabulary()


        MyVocabulary = Vocabulary.register(u'my-vocabulary')(MyVocabulary)

        params = Params({u'type': u'my-vocabulary'})

        instance = Instance(fields={})

        vocab = Vocabulary.from_params(params=params, instances=[instance])

        assert isinstance(vocab, MyVocabulary) 
Example #7
Source File: dataset_test.py    From allennlp with Apache License 2.0
def get_instances(self):
        field1 = TextField(
            [Token(t) for t in ["this", "is", "a", "sentence", "."]], self.token_indexer
        )
        field2 = TextField(
            [Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
            self.token_indexer,
        )
        field3 = TextField(
            [Token(t) for t in ["here", "is", "a", "sentence", "."]], self.token_indexer
        )
        field4 = TextField([Token(t) for t in ["this", "is", "short"]], self.token_indexer)
        instances = [
            Instance({"text1": field1, "text2": field2}),
            Instance({"text1": field3, "text2": field4}),
        ]
        return instances 
Example #8
Source File: instance_test.py    From allennlp with Apache License 2.0
def test_duplicate(self):
        # Verify the `duplicate()` method works with a `PretrainedTransformerIndexer` in
        # a `TextField`. See https://github.com/allenai/allennlp/issues/4270.
        instance = Instance(
            {
                "words": TextField(
                    [Token("hello")], {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}
                )
            }
        )

        other = instance.duplicate()
        assert other == instance

        # Adding new fields to the original instance should not affect the duplicate.
        instance.add_field("labels", LabelField("some_label"))
        assert "labels" not in other.fields
        assert other != instance  # sanity check on the '__eq__' method. 
Example #9
Source File: dataset_reader.py    From nanigonet with MIT License
def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:

        if len(tokens) > self._max_token_len:
            tokens = tokens[:self._max_token_len]
            print(f'Length of tokens exceeded the limit {self._max_token_len}. Truncating...')
            if tags:
                tags = tags[:self._max_token_len]

        fields = {}

        text_field = TextField(tokens, self._token_indexers)
        fields['tokens'] = text_field
        if tags:
            fields['tags'] = SequenceLabelField(tags, text_field)

        return Instance(fields) 
Example #10
Source File: nlvr_parser.py    From allennlp-semparse with Apache License 2.0
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence = json_dict["sentence"]
        if "worlds" in json_dict:
            # This is grouped data
            worlds = json_dict["worlds"]
            if isinstance(worlds, str):
                worlds = json.loads(worlds)
        else:
            structured_rep = json_dict["structured_rep"]
            if isinstance(structured_rep, str):
                structured_rep = json.loads(structured_rep)
            worlds = [structured_rep]
        identifier = json_dict["identifier"] if "identifier" in json_dict else None
        instance = self._dataset_reader.text_to_instance(
            sentence=sentence,  # type: ignore
            structured_representations=worlds,
            identifier=identifier,
        )
        return instance 
Example #11
Source File: evaluate_custom.py    From OpenBookQA with Apache License 2.0
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             output_file: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))
        for batch in generator_tqdm:
            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                id2label = model.vocab.get_index_to_token_vocabulary("labels")
                _persist_data(file_handle, batch.get("metadata"), model_output, id2label=id2label)
            description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics() 
Example #12
Source File: evaluate_predictions_qa_mc_know_visualize.py    From OpenBookQA with Apache License 2.0
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             output_file: str = None,
             eval_type: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))
        for batch in generator_tqdm:
            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                _persist_data(file_handle, batch.get("metadata"), model_output, eval_type)
            description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics(reset=True) 
Example #13
Source File: vocabulary_test.py    From allennlp with Apache License 2.0
def test_saving_and_loading_works_with_byte_encoding(self):
        # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
        # vocab, load the vocab, then index the text field again, and make sure we get the same
        # result.
        tokenizer = CharacterTokenizer(byte_encoding="utf-8")
        token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer, min_padding_length=2)
        tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
        text_field = TextField(tokens, {"characters": token_indexer})
        dataset = Batch([Instance({"sentence": text_field})])
        vocab = Vocabulary.from_instances(dataset)
        text_field.index(vocab)
        indexed_tokens = deepcopy(text_field._indexed_tokens)

        vocab_dir = self.TEST_DIR / "vocab_save"
        vocab.save_to_files(vocab_dir)
        vocab2 = Vocabulary.from_files(vocab_dir)
        text_field2 = TextField(tokens, {"characters": token_indexer})
        text_field2.index(vocab2)
        indexed_tokens2 = deepcopy(text_field2._indexed_tokens)
        assert indexed_tokens == indexed_tokens2 
Example #14
Source File: vocabulary_test.py    From allennlp with Apache License 2.0
def test_from_params_extend_config(self):

        vocab_dir = self.TEST_DIR / "vocab_save"
        original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
        original_vocab.add_token_to_namespace("a", namespace="tokens")
        original_vocab.save_to_files(vocab_dir)

        text_field = TextField(
            [Token(t) for t in ["a", "b"]], {"tokens": SingleIdTokenIndexer("tokens")}
        )
        instances = Batch([Instance({"text": text_field})])

        # If you ask to extend vocab from `directory`, instances must be passed
        # in Vocabulary constructor, or else there is nothing to extend to.
        params = Params({"type": "extend", "directory": vocab_dir})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params)

        # If you ask to extend vocab, `directory` key must be present in params,
        # or else there is nothing to extend from.
        params = Params({"type": "extend"})
        with pytest.raises(ConfigurationError):
            _ = Vocabulary.from_params(params, instances=instances) 
Example #15
Source File: vocabulary_test.py    From allennlp with Apache License 2.0
def test_max_vocab_size_partial_dict(self):
        indexers = {
            "tokens": SingleIdTokenIndexer(),
            "token_characters": TokenCharactersIndexer(min_padding_length=3),
        }
        instance = Instance(
            {
                "text": TextField(
                    [Token(w) for w in "Abc def ghi jkl mno pqr stu vwx yz".split(" ")], indexers
                )
            }
        )
        dataset = Batch([instance])
        params = Params({"max_vocab_size": {"tokens": 1}})

        vocab = Vocabulary.from_params(params=params, instances=dataset)
        assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 + 2
        assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 + 2 
Example #16
Source File: atis_parser.py    From allennlp-semparse with Apache License 2.0
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"utterance": "..."}``.
        """
        utterance = json_dict["utterance"]
        return self._dataset_reader.text_to_instance([utterance]) 
Example #17
Source File: __init__.py    From nanigonet with MIT License
def predict(self, text):
        tokens = self._tokenizer.tokenize(text)
        instance = Instance({'tokens': TextField(tokens, self._token_indexers)})

        result = self.model.forward_on_instance(instance)
        result = self._format_instance_result(result)
        result['text'] = text
        return result 
Example #18
Source File: wikitables_parser.py    From allennlp-semparse with Apache License 2.0
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"question": "...", "table": "..."}``.
        """
        question_text = json_dict["question"]
        table_rows = json_dict["table"].split("\n")

        # We are directly passing the raw table rows here. The code in ``TableQuestionContext`` will do some
        # minimal processing to extract dates and numbers from the cells.
        instance = self._dataset_reader.text_to_instance(
            question_text,  # type: ignore
            table_rows,
        )
        return instance 
Example #19
Source File: dataset_reader.py    From nanigonet with MIT License
def _read(self, file_path: str) -> Iterable[Instance]:
        file_path = cached_path(file_path)

        with open(file_path) as f:
            for line in f:
                data = json.loads(line)
                tokens = self._tokenizer.tokenize(data['text'])
                tags = data.get('labels')

                yield self.text_to_instance(tokens, tags) 
Example #20
Source File: text_predictor.py    From udify with MIT License
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        sentence = json_dict["sentence"]
        tokens = [word.text for word in self._dataset_reader.tokenizer.split_words(sentence)]
        return self._dataset_reader.text_to_instance(tokens) 
Example #21
Source File: dataset_test.py    From magnitude with MIT License
def test_instances_must_have_homogeneous_fields(self):
        instance1 = Instance({u"tag": (LabelField(1, skip_indexing=True))})
        instance2 = Instance({u"words": TextField([Token(u"hello")], {})})
        with pytest.raises(ConfigurationError):
            _ = Batch([instance1, instance2]) 
Example #22
Source File: text_predictor.py    From udify with MIT License
def _predict_unknown(self, instance: Instance):
        """
        Maps each unknown label in each namespace to a default token
        :param instance: the instance containing a list of labels for each namespace
        """
        return self.predictor._predict_unknown(instance) 
Example #23
Source File: text_predictor.py    From udify with MIT License
def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
        return self.predictor.predict_batch_instance(instances) 
Example #24
Source File: predictor.py    From udify with MIT License
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Runs the underlying model, and adds the ``"words"`` to the output.
        """
        sentence = json_dict["sentence"]
        tokens = sentence.split()
        return self._dataset_reader.text_to_instance(tokens) 
Example #25
Source File: predictor.py    From udify with MIT License
def predict_instance(self, instance: Instance) -> JsonDict:
        if "@@UNKNOWN@@" not in self._model.vocab._token_to_index["lemmas"]:
            # Handle cases where the labels are present in the test set but not training set
            self._predict_unknown(instance)
        outputs = self._model.forward_on_instance(instance)
        return sanitize(outputs) 
Example #26
Source File: predictor.py    From udify with MIT License
def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
        if "@@UNKNOWN@@" not in self._model.vocab._token_to_index["lemmas"]:
            # Handle cases where the labels are present in the test set but not training set
            for instance in instances:
                self._predict_unknown(instance)
        outputs = self._model.forward_on_instances(instances)
        return sanitize(outputs) 
Example #27
Source File: list_field_test.py    From allennlp with Apache License 2.0
def test_empty_list_can_be_tensorized(self):
        tokenizer = SpacyTokenizer()
        tokens = tokenizer.tokenize("Foo")
        text_field = TextField(tokens, self.word_indexer)
        list_field = ListField([text_field.empty_field()])
        fields = {
            "list": list_field,
            "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
        }
        instance = Instance(fields)
        instance.index_fields(self.vocab)
        instance.as_tensor_dict() 
Example #28
Source File: dataset_reader_test.py    From allennlp with Apache License 2.0
def text_to_instance(self, index: int):  # type: ignore
        return Instance({"index": LabelField(index, skip_indexing=True)}) 
Example #29
Source File: dataset_reader_test.py    From allennlp with Apache License 2.0
def text_to_instance(self, index: int):  # type: ignore
        return Instance({"index": LabelField(index, skip_indexing=True)}) 
Example #30
Source File: dataset_reader_test.py    From allennlp with Apache License 2.0
def text_to_instance(self, index: int):  # type: ignore
        return Instance({"index": LabelField(index, skip_indexing=True)})