Python allennlp.data.Instance() Examples
The following are 30 code examples of allennlp.data.Instance(), drawn from open-source projects; the source file and license for each example are noted above it. You may also want to check out all available functions and classes of the module allennlp.data.
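Before the examples, a minimal sketch of the typical Instance lifecycle may help: fields go in by name, a Vocabulary is built from instances, and indexed instances convert to tensors. This sketch is not taken from the projects below; it assumes a recent AllenNLP release, and the field names ("text", "label") are arbitrary.

from allennlp.data import Instance, Vocabulary
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

# Build an Instance from named fields.
tokens = [Token(t) for t in ["a", "simple", "example"]]
text_field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
instance = Instance({"text": text_field, "label": LabelField("positive")})

# Build a vocabulary from the instances, index the fields, and convert to tensors.
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)
tensors = instance.as_tensor_dict()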
Example #1
Source File: __init__.py From nanigonet with MIT License
def predict_batch(self, texts):
    instances = []
    for text in texts:
        tokens = self._tokenizer.tokenize(text)
        instance = Instance({'tokens': TextField(tokens, self._token_indexers)})
        instances.append(instance)

    result = self.model.forward_on_instances(instances)

    results = []
    for instance_result, text in zip(result, texts):
        result = self._format_instance_result(instance_result)
        result['text'] = text
        results.append(result)

    return results
Example #2
Source File: elmo_test.py From magnitude with MIT License
def get_vocab_and_both_elmo_indexed_ids(batch):
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    indexer2 = SingleIdTokenIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {u'character_ids': indexer, u'tokens': indexer2})
        instance = Instance({u"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    dataset.index_instances(vocab)
    return vocab, dataset.as_tensor_dict()[u"elmo"]
Example #3
Source File: predictor.py From allennlp with Apache License 2.0
def json_to_labeled_instances(self, inputs: JsonDict) -> List[Instance]:
    """
    Converts incoming json to an [`Instance`](../data/instance.md),
    runs the model on the newly created instance, and adds labels to the
    `Instance`s given by the model's output.

    # Returns

    `List[Instance]`
        A list of `Instance`s.
    """
    instance = self._json_to_instance(inputs)
    outputs = self._model.forward_on_instance(instance)
    new_instances = self.predictions_to_labeled_instances(instance, outputs)
    return new_instances
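A hypothetical call on a concrete Predictor subclass; the `predictor` variable and the "sentence" input key are assumptions, since the accepted keys depend on that subclass's `_json_to_instance`:

labeled_instances = predictor.json_to_labeled_instances({"sentence": "AllenNLP is great."})
for labeled in labeled_instances:
    # Each returned Instance carries the original fields plus the fields
    # added by predictions_to_labeled_instances (e.g. a predicted label).
    print(labeled.fields.keys())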
Example #4
Source File: predictor.py From udify with MIT License
def _predict_unknown(self, instance: Instance):
    """
    Maps each unknown label in each namespace to a default token
    :param instance: the instance containing a list of labels for each namespace
    """
    def replace_tokens(instance: Instance, namespace: str, token: str):
        if namespace not in instance.fields:
            return

        instance.fields[namespace].labels = [label
                                             if label in self._model.vocab._token_to_index[namespace]
                                             else token
                                             for label in instance.fields[namespace].labels]

    replace_tokens(instance, "lemmas", "↓0;d¦")
    replace_tokens(instance, "feats", "_")
    replace_tokens(instance, "xpos", "_")
    replace_tokens(instance, "upos", "NOUN")
    replace_tokens(instance, "head_tags", "case")
Example #5
Source File: vocabulary_test.py From magnitude with MIT License
def test_saving_and_loading_works_with_byte_encoding(self):
    # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
    # vocab, load the vocab, then index the text field again, and make sure we get the same
    # result.
    tokenizer = CharacterTokenizer(byte_encoding=u'utf-8')
    token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer)
    tokens = [Token(t) for t in [u"Øyvind", u"für", u"汉字"]]
    text_field = TextField(tokens, {u"characters": token_indexer})
    dataset = Batch([Instance({u"sentence": text_field})])
    vocab = Vocabulary.from_instances(dataset)
    text_field.index(vocab)
    indexed_tokens = deepcopy(text_field._indexed_tokens)  # pylint: disable=protected-access

    vocab_dir = self.TEST_DIR / u'vocab_save'
    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    text_field2 = TextField(tokens, {u"characters": token_indexer})
    text_field2.index(vocab2)
    indexed_tokens2 = deepcopy(text_field2._indexed_tokens)  # pylint: disable=protected-access
    assert indexed_tokens == indexed_tokens2
Example #6
Source File: vocabulary_test.py From magnitude with MIT License
def test_registrability(self):

    class MyVocabulary(object):
        @classmethod
        def from_params(cls, params, instances=None):  # pylint: disable=unused-argument
            return MyVocabulary()

    MyVocabulary = Vocabulary.register(u'my-vocabulary')(MyVocabulary)

    params = Params({u'type': u'my-vocabulary'})
    instance = Instance(fields={})
    vocab = Vocabulary.from_params(params=params, instances=[instance])
    assert isinstance(vocab, MyVocabulary)
Example #7
Source File: dataset_test.py From allennlp with Apache License 2.0
def get_instances(self):
    field1 = TextField(
        [Token(t) for t in ["this", "is", "a", "sentence", "."]], self.token_indexer
    )
    field2 = TextField(
        [Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
        self.token_indexer,
    )
    field3 = TextField(
        [Token(t) for t in ["here", "is", "a", "sentence", "."]], self.token_indexer
    )
    field4 = TextField([Token(t) for t in ["this", "is", "short"]], self.token_indexer)
    instances = [
        Instance({"text1": field1, "text2": field2}),
        Instance({"text1": field3, "text2": field4}),
    ]
    return instances
Example #8
Source File: instance_test.py From allennlp with Apache License 2.0
def test_duplicate(self):
    # Verify the `duplicate()` method works with a `PretrainedTransformerIndexer` in
    # a `TextField`. See https://github.com/allenai/allennlp/issues/4270.
    instance = Instance(
        {
            "words": TextField(
                [Token("hello")], {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}
            )
        }
    )
    other = instance.duplicate()
    assert other == instance

    # Adding new fields to the original instance should not affect the duplicate.
    instance.add_field("labels", LabelField("some_label"))
    assert "labels" not in other.fields
    assert other != instance  # sanity check on the '__eq__' method.
Example #9
Source File: dataset_reader.py From nanigonet with MIT License
def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:
    if len(tokens) > self._max_token_len:
        tokens = tokens[:self._max_token_len]
        print(f'Length of tokens exceeded the limit {self._max_token_len}. Truncating...')
        if tags:
            tags = tags[:self._max_token_len]

    fields = {}

    text_field = TextField(tokens, self._token_indexers)
    fields['tokens'] = text_field

    if tags:
        fields['tags'] = SequenceLabelField(tags, text_field)

    return Instance(fields)
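A hypothetical call to the reader above; the `reader` variable and the per-character tag values are assumptions made for illustration:

tokens = [Token(c) for c in "hello"]
instance = reader.text_to_instance(tokens, tags=["en", "en", "en", "en", "en"])
# 'tags' becomes a SequenceLabelField aligned one-to-one with the 'tokens' TextField.
print(instance.fields["tags"])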
Example #10
Source File: nlvr_parser.py From allennlp-semparse with Apache License 2.0
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    sentence = json_dict["sentence"]
    if "worlds" in json_dict:
        # This is grouped data
        worlds = json_dict["worlds"]
        if isinstance(worlds, str):
            worlds = json.loads(worlds)
    else:
        structured_rep = json_dict["structured_rep"]
        if isinstance(structured_rep, str):
            structured_rep = json.loads(structured_rep)
        worlds = [structured_rep]
    identifier = json_dict["identifier"] if "identifier" in json_dict else None
    instance = self._dataset_reader.text_to_instance(
        sentence=sentence,  # type: ignore
        structured_representations=worlds,
        identifier=identifier,
    )
    return instance
Example #11
Source File: evaluate_custom.py From OpenBookQA with Apache License 2.0
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             output_file: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))

        for batch in generator_tqdm:
            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                id2label = model.vocab.get_index_to_token_vocabulary("labels")
                _persist_data(file_handle, batch.get("metadata"), model_output, id2label=id2label)
            description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics()
Example #12
Source File: evaluate_predictions_qa_mc_know_visualize.py From OpenBookQA with Apache License 2.0
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             output_file: str = None,
             eval_type: str = None) -> Dict[str, Any]:
    model.eval()

    iterator = data_iterator(instances, num_epochs=1)
    logger.info("Iterating over dataset")
    generator_tqdm = Tqdm.tqdm(iterator, total=data_iterator.get_num_batches(instances))
    with ExitStack() as stack:
        if output_file is None:
            file_handle = None
        else:
            file_handle = stack.enter_context(open(output_file, 'w'))

        for batch in generator_tqdm:
            model_output = model(**batch)
            metrics = model.get_metrics()
            if file_handle:
                _persist_data(file_handle, batch.get("metadata"), model_output, eval_type)
            description = ', '.join(["%s: %.2f" % (name, value) for name, value in metrics.items()]) + " ||"
            generator_tqdm.set_description(description)

    return model.get_metrics(reset=True)
Example #13
Source File: vocabulary_test.py From allennlp with Apache License 2.0
def test_saving_and_loading_works_with_byte_encoding(self):
    # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
    # vocab, load the vocab, then index the text field again, and make sure we get the same
    # result.
    tokenizer = CharacterTokenizer(byte_encoding="utf-8")
    token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer, min_padding_length=2)
    tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
    text_field = TextField(tokens, {"characters": token_indexer})
    dataset = Batch([Instance({"sentence": text_field})])
    vocab = Vocabulary.from_instances(dataset)
    text_field.index(vocab)
    indexed_tokens = deepcopy(text_field._indexed_tokens)

    vocab_dir = self.TEST_DIR / "vocab_save"
    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)

    text_field2 = TextField(tokens, {"characters": token_indexer})
    text_field2.index(vocab2)
    indexed_tokens2 = deepcopy(text_field2._indexed_tokens)
    assert indexed_tokens == indexed_tokens2
Example #14
Source File: vocabulary_test.py From allennlp with Apache License 2.0
def test_from_params_extend_config(self):
    vocab_dir = self.TEST_DIR / "vocab_save"
    original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    original_vocab.add_token_to_namespace("a", namespace="tokens")
    original_vocab.save_to_files(vocab_dir)

    text_field = TextField(
        [Token(t) for t in ["a", "b"]], {"tokens": SingleIdTokenIndexer("tokens")}
    )
    instances = Batch([Instance({"text": text_field})])

    # If you ask to extend vocab from `directory`, instances must be passed
    # in Vocabulary constructor, or else there is nothing to extend to.
    params = Params({"type": "extend", "directory": vocab_dir})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params)

    # If you ask to extend vocab, `directory` key must be present in params,
    # or else there is nothing to extend from.
    params = Params({"type": "extend"})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances=instances)
Example #15
Source File: vocabulary_test.py From allennlp with Apache License 2.0
def test_max_vocab_size_partial_dict(self):
    indexers = {
        "tokens": SingleIdTokenIndexer(),
        "token_characters": TokenCharactersIndexer(min_padding_length=3),
    }
    instance = Instance(
        {
            "text": TextField(
                [Token(w) for w in "Abc def ghi jkl mno pqr stu vwx yz".split(" ")], indexers
            )
        }
    )
    dataset = Batch([instance])
    params = Params({"max_vocab_size": {"tokens": 1}})

    vocab = Vocabulary.from_params(params=params, instances=dataset)
    assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 kept token + 2 special tokens (padding, OOV)
    assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 characters + 2 special tokens
Example #16
Source File: atis_parser.py From allennlp-semparse with Apache License 2.0
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    """
    Expects JSON that looks like ``{"utterance": "..."}``.
    """
    utterance = json_dict["utterance"]
    return self._dataset_reader.text_to_instance([utterance])
Example #17
Source File: __init__.py From nanigonet with MIT License
def predict(self, text):
    tokens = self._tokenizer.tokenize(text)
    instance = Instance({'tokens': TextField(tokens, self._token_indexers)})

    result = self.model.forward_on_instance(instance)
    result = self._format_instance_result(result)
    result['text'] = text
    return result
Example #18
Source File: wikitables_parser.py From allennlp-semparse with Apache License 2.0
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    """
    Expects JSON that looks like ``{"question": "...", "table": "..."}``.
    """
    question_text = json_dict["question"]
    table_rows = json_dict["table"].split("\n")
    # We are directly passing the raw table rows here. The code in ``TableQuestionContext``
    # will do some minimal processing to extract dates and numbers from the cells.
    instance = self._dataset_reader.text_to_instance(
        question_text,  # type: ignore
        table_rows,
    )
    return instance
Example #19
Source File: dataset_reader.py From nanigonet with MIT License
def _read(self, file_path: str) -> Iterable[Instance]:
    file_path = cached_path(file_path)

    with open(file_path) as f:
        for line in f:
            data = json.loads(line)
            tokens = self._tokenizer.tokenize(data['text'])
            tags = data.get('labels')
            yield self.text_to_instance(tokens, tags)
Example #20
Source File: text_predictor.py From udify with MIT License
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    """
    Expects JSON that looks like ``{"sentence": "..."}``.
    Runs the underlying model, and adds the ``"words"`` to the output.
    """
    sentence = json_dict["sentence"]
    tokens = [word.text for word in self._dataset_reader.tokenizer.split_words(sentence)]
    return self._dataset_reader.text_to_instance(tokens)
Example #21
Source File: dataset_test.py From magnitude with MIT License
def test_instances_must_have_homogeneous_fields(self):
    instance1 = Instance({u"tag": (LabelField(1, skip_indexing=True))})
    instance2 = Instance({u"words": TextField([Token(u"hello")], {})})
    with pytest.raises(ConfigurationError):
        _ = Batch([instance1, instance2])
Example #22
Source File: text_predictor.py From udify with MIT License
def _predict_unknown(self, instance: Instance):
    """
    Maps each unknown label in each namespace to a default token
    :param instance: the instance containing a list of labels for each namespace
    """
    return self.predictor._predict_unknown(instance)
Example #23
Source File: text_predictor.py From udify with MIT License
def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
    return self.predictor.predict_batch_instance(instances)
Example #24
Source File: predictor.py From udify with MIT License
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    """
    Expects JSON that looks like ``{"sentence": "..."}``.
    Runs the underlying model, and adds the ``"words"`` to the output.
    """
    sentence = json_dict["sentence"]
    tokens = sentence.split()
    return self._dataset_reader.text_to_instance(tokens)
Example #25
Source File: predictor.py From udify with MIT License
def predict_instance(self, instance: Instance) -> JsonDict:
    if "@@UNKNOWN@@" not in self._model.vocab._token_to_index["lemmas"]:
        # Handle cases where the labels are present in the test set but not the training set
        self._predict_unknown(instance)
    outputs = self._model.forward_on_instance(instance)
    return sanitize(outputs)
Example #26
Source File: predictor.py From udify with MIT License
def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
    if "@@UNKNOWN@@" not in self._model.vocab._token_to_index["lemmas"]:
        # Handle cases where the labels are present in the test set but not the training set
        for instance in instances:
            self._predict_unknown(instance)
    outputs = self._model.forward_on_instances(instances)
    return sanitize(outputs)
Example #27
Source File: list_field_test.py From allennlp with Apache License 2.0
def test_empty_list_can_be_tensorized(self):
    tokenizer = SpacyTokenizer()
    tokens = tokenizer.tokenize("Foo")
    text_field = TextField(tokens, self.word_indexer)
    list_field = ListField([text_field.empty_field()])
    fields = {
        "list": list_field,
        "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
    }
    instance = Instance(fields)
    instance.index_fields(self.vocab)
    instance.as_tensor_dict()
Example #28
Source File: dataset_reader_test.py From allennlp with Apache License 2.0
def text_to_instance(self, index: int):  # type: ignore
    return Instance({"index": LabelField(index, skip_indexing=True)})
Example #29
Source File: dataset_reader_test.py From allennlp with Apache License 2.0
def text_to_instance(self, index: int):  # type: ignore
    return Instance({"index": LabelField(index, skip_indexing=True)})
Example #30
Source File: dataset_reader_test.py From allennlp with Apache License 2.0
def text_to_instance(self, index: int):  # type: ignore
    return Instance({"index": LabelField(index, skip_indexing=True)})