Python allennlp.data.instance.Instance() Examples
The following are 30 code examples of allennlp.data.instance.Instance(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.data.instance, or try the search function.
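Before the examples, here is a minimal sketch of how an Instance is typically built by hand and indexed against a Vocabulary. It assumes the allennlp 1.x import paths (older releases expose the same classes under slightly different modules), and the tokens and label are purely illustrative:

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

# Build the fields that make up a single example.
token_indexers = {"tokens": SingleIdTokenIndexer()}
tokens = [Token(t) for t in "AllenNLP makes instances easy .".split()]
fields = {
    "tokens": TextField(tokens, token_indexers),
    "label": LabelField("positive"),
}
instance = Instance(fields)

# A Vocabulary is built from instances, and an instance must be indexed
# against it before it can be turned into tensors.
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)
print(instance.get_padding_lengths())

Everything a DatasetReader's text_to_instance method does in the examples below is some variation of this pattern: build Fields, wrap them in an Instance, and let the framework handle indexing and padding.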
Example #1
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
                     intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    if tags:
        fields["tags"] = SequenceLabelField(tags, sequence)
    if domain:
        fields["domain"] = LabelField(domain, label_namespace="domain_labels")
    if intent:
        fields["intent"] = LabelField(intent, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
    return Instance(fields)
Example #2
Source File: dataset_reader.py From magnitude with MIT License | 6 votes |
def text_to_instance(self, *inputs):
    u"""
    Does whatever tokenization or processing is necessary to go from textual input to an
    ``Instance``.  The primary intended use for this is with a
    :class:`~allennlp.service.predictors.predictor.Predictor`, which gets text input as a JSON
    object and needs to process it to be input to a model.

    The intent here is to share code between :func:`_read` and what happens at model serving
    time, or any other time you want to make a prediction from new data.  We need to process
    the data in the same way it was done at training time.  Allowing the ``DatasetReader`` to
    process new text lets us accomplish this, as we can just call
    ``DatasetReader.text_to_instance`` when serving predictions.

    The input type here is rather vaguely specified, unfortunately.  The ``Predictor`` will
    have to make some assumptions about the kind of ``DatasetReader`` that it's using, in
    order to pass it the right information.
    """
    raise NotImplementedError
Example #3
Source File: snli.py From magnitude with MIT License | 6 votes |
def text_to_instance(self,  # type: ignore
                     premise,
                     hypothesis,
                     label=None):
    # pylint: disable=arguments-differ
    fields = {}
    premise_tokens = self._tokenizer.tokenize(premise)
    hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
    fields[u'premise'] = TextField(premise_tokens, self._token_indexers)
    fields[u'hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    if label:
        fields[u'label'] = LabelField(label)
    metadata = {u"premise_tokens": [x.text for x in premise_tokens],
                u"hypothesis_tokens": [x.text for x in hypothesis_tokens]}
    fields[u"metadata"] = MetadataField(metadata)
    return Instance(fields)
Example #4
Source File: semantic_role_labeling.py From magnitude with MIT License | 6 votes |
def text_to_instance(self,  # type: ignore
                     tokens,
                     verb_label,
                     tags=None):
    u"""
    We take `pre-tokenized` input here, along with a verb label.  The verb label should be a
    one-hot binary vector, the same length as the tokens, indicating the position of the verb
    to find arguments for.
    """
    # pylint: disable=arguments-differ
    fields = {}
    text_field = TextField(tokens, token_indexers=self._token_indexers)
    fields[u'tokens'] = text_field
    fields[u'verb_indicator'] = SequenceLabelField(verb_label, text_field)
    if tags:
        fields[u'tags'] = SequenceLabelField(tags, text_field)

    if all([x == 0 for x in verb_label]):
        verb = None
    else:
        verb = tokens[verb_label.index(1)].text
    fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens],
                                         u"verb": verb})
    return Instance(fields)
Example #5
Source File: data_loading.py From teaching with GNU General Public License v3.0 | 6 votes |
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_id_field = LabelField(int(query_id), skip_indexing=True)
    doc_id_field = LabelField(int(doc_id), skip_indexing=True)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]
    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]
    doc_field = TextField(doc_tokenized, self._token_indexers)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field})
Example #6
Source File: data_loading.py From teaching with GNU General Public License v3.0 | 6 votes |
def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]
    query_field = TextField(query_tokenized, self._token_indexers)

    doc_pos_tokenized = self._tokenizer.tokenize(doc_pos_sequence)
    if self.max_doc_length > -1:
        doc_pos_tokenized = doc_pos_tokenized[:self.max_doc_length]
    doc_pos_field = TextField(doc_pos_tokenized, self._token_indexers)

    doc_neg_tokenized = self._tokenizer.tokenize(doc_neg_sequence)
    if self.max_doc_length > -1:
        doc_neg_tokenized = doc_neg_tokenized[:self.max_doc_length]
    doc_neg_field = TextField(doc_neg_tokenized, self._token_indexers)

    return Instance({
        "query_tokens": query_field,
        "doc_pos_tokens": doc_pos_field,
        "doc_neg_tokens": doc_neg_field})
Example #7
Source File: language_modeling.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, u"r") as text_file:
        instance_strings = text_file.readlines()

    if self._tokens_per_instance is not None:
        all_text = u" ".join([x.replace(u"\n", u" ").strip() for x in instance_strings])
        tokenized_text = self._tokenizer.tokenize(all_text)
        num_tokens = self._tokens_per_instance + 1
        tokenized_strings = []
        logger.info(u"Creating dataset from all text in file: %s", file_path)
        for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
            tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
    else:
        tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

    for tokenized_string in tokenized_strings:
        input_field = TextField(tokenized_string[:-1], self._token_indexers)
        output_field = TextField(tokenized_string[1:], self._output_indexer)
        yield Instance({u'input_tokens': input_field,
                        u'output_tokens': output_field})

#overrides
Example #8
Source File: dataset.py From magnitude with MIT License | 6 votes |
def get_padding_lengths(self):
    u"""
    Gets the maximum padding lengths from all ``Instances`` in this batch.  Each ``Instance``
    has multiple ``Fields``, and each ``Field`` could have multiple things that need padding.
    We look at all fields in all instances, and find the max values for each (field_name,
    padding_key) pair, returning them in a dictionary.

    This can then be used to convert this batch into arrays of consistent length, or to set
    model parameters, etc.
    """
    padding_lengths = defaultdict(dict)
    all_instance_lengths = [instance.get_padding_lengths() for instance in self.instances]
    if not all_instance_lengths:
        return padding_lengths
    all_field_lengths = defaultdict(list)
    for instance_lengths in all_instance_lengths:
        for field_name, instance_field_lengths in instance_lengths.items():
            all_field_lengths[field_name].append(instance_field_lengths)
    for field_name, field_lengths in all_field_lengths.items():
        for padding_key in field_lengths[0].keys():
            max_value = max(x[padding_key] if padding_key in x else 0 for x in field_lengths)
            padding_lengths[field_name][padding_key] = max_value
    return padding_lengths
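As a rough usage sketch (not taken from the project above), the dictionary returned by get_padding_lengths is exactly what as_tensor_dict consumes. Batch lives at allennlp.data.dataset in the 0.x code shown here and is re-exported as allennlp.data.Batch in 1.x; the exact padding keys depend on the allennlp version and on the token indexers attached to each field:

from allennlp.data import Batch, Instance, Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

indexers = {"tokens": SingleIdTokenIndexer()}
instances = [
    Instance({"tokens": TextField([Token(t) for t in text.split()], indexers)})
    for text in ("a short sentence", "a somewhat longer example sentence")
]

batch = Batch(instances)
batch.index_instances(Vocabulary.from_instances(instances))
padding_lengths = batch.get_padding_lengths()
# e.g. {"tokens": {"tokens___tokens": 5}} -- the exact keys vary by version
# and by indexer, but the maximum over all instances is taken per key.
tensors = batch.as_tensor_dict(padding_lengths)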
Example #9
Source File: ebmnlp.py From scibert with Apache License 2.0 | 6 votes |
def _read(self, file_path: str) -> Iterable[Instance]:
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if not is_divider:
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                fields = [list(field) for field in zip(*fields)]
                tokens_, _, _, pico_tags = fields
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens_]

                yield self.text_to_instance(tokens, pico_tags)
Example #10
Source File: prolocal_dataset_reader.py From propara with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # type: ignore
                     sentence_tokens: List[str],
                     verb_vector: List[int],
                     entity_vector: List[int],
                     state_change_types: Optional[List[str]] = None,
                     state_change_tags: Optional[List[str]] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    # encode inputs
    token_field = TextField([Token(word) for word in sentence_tokens], self._token_indexers)
    fields['tokens'] = token_field
    fields['verb_span'] = SequenceLabelField(verb_vector, token_field, 'indicator_tags')
    fields['entity_span'] = SequenceLabelField(entity_vector, token_field, 'indicator_tags')

    # encode outputs
    if state_change_types:
        fields['state_change_type_labels'] = LabelField(state_change_types, 'state_change_type_labels')
    if state_change_tags:
        fields['state_change_tags'] = SequenceLabelField(state_change_tags, token_field, 'state_change_tags')

    return Instance(fields)
Example #11
Source File: semeval_2010_task_8_reader.py From DISTRE with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     entity_1: Tuple[int],
                     entity_2: Tuple[int],
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    tokens = [OpenAISplitter._standardize(token) for token in tokens]
    tokens = (['__start__'] + tokens[entity_1[0]:entity_1[1]+1] + ['__del1__']
              + tokens[entity_2[0]:entity_2[1]+1] + ['__del2__'] + tokens + ['__clf__'])

    sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
    fields['sentence'] = sentence
    #fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
    #fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)

    if label:
        fields['label'] = LabelField(label)

    return Instance(fields)
Example #12
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
                     intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    # print([t.text for t in context_tokens])
    fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
    fields["tokens"] = TextField(tokens, self._token_indexers)
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, fields["tokens"])
    if intents is not None:
        fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
    return Instance(fields)
Example #13
Source File: dataset.py From magnitude with MIT License | 6 votes |
def print_statistics(self):
    pass
    # # Make sure it has been indexed first
    # sequence_field_lengths = defaultdict(list)
    # for instance in self.instances:
    #     if not instance.indexed:
    #         raise ConfigurationError(u"Instances must be indexed with vocabulary "
    #                                  u"before asking to print dataset statistics.")
    #     for field, field_padding_lengths in instance.get_padding_lengths().items():
    #         for key, value in field_padding_lengths.items():
    #             sequence_field_lengths["{field}.{key}"].append(value)

    # print(u"\n\n----Dataset Statistics----\n")
    # for name, lengths in sequence_field_lengths.items():
    #     print("Statistics for {name}:")
    #     print("\tLengths: Mean: {numpy.mean(lengths)}, Standard Dev: {numpy.std(lengths)}, "
    #           "Max: {numpy.max(lengths)}, Min: {numpy.min(lengths)}")

    # print(u"\n10 Random instances: ")
    # for i in list(numpy.random.randint(len(self.instances), size=10)):
    #     print("Instance {i}:")
    #     print("\t{self.instances[i]}")
Example #14
Source File: lazy_dataset_reader_test.py From allennlp with Apache License 2.0 | 5 votes |
def _read(self, _: str) -> Iterable[Instance]:
    self.num_reads += 1
    return (instance for instance in self._instances)
Example #15
Source File: ontonotes_ner.py From magnitude with MIT License | 5 votes |
def text_to_instance(self,  # type: ignore
                     tokens,
                     ner_tags=None):
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields = {u'tokens': sequence}
    instance_fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})

    # Add "tag label" to instance
    if ner_tags is not None:
        if self._coding_scheme == u"BIOUL":
            ner_tags = to_bioul(ner_tags, encoding=u"BIO")
        instance_fields[u'tags'] = SequenceLabelField(ner_tags, sequence)
    return Instance(instance_fields)
Example #16
Source File: universal_dependencies.py From magnitude with MIT License | 5 votes |
def text_to_instance(self,  # type: ignore
                     words,
                     upos_tags,
                     dependencies=None):
    # pylint: disable=arguments-differ
    u"""
    Parameters
    ----------
    words : ``List[str]``, required.
        The words in the sentence to be encoded.
    upos_tags : ``List[str]``, required.
        The universal dependencies POS tags for each word.
    dependencies : ``List[Tuple[str, int]]``, optional (default = None)
        A list of (head tag, head index) tuples. Indices are 1 indexed, meaning an index of 0
        corresponds to that word being the root of the dependency tree.

    Returns
    -------
    An instance containing words, upos tags, dependency head tags and head indices as fields.
    """
    fields = {}

    tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields[u"words"] = tokens
    fields[u"pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace=u"pos")
    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields[u"head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                  tokens,
                                                  label_namespace=u"head_tags")
        fields[u"head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                     tokens,
                                                     label_namespace=u"head_index_tags")

    fields[u"metadata"] = MetadataField({u"words": words, u"pos": upos_tags})
    return Instance(fields)
Example #17
Source File: sequence_tagging.py From magnitude with MIT License | 5 votes |
def text_to_instance(self, tokens, tags=None):  # type: ignore
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields = {}
    sequence = TextField(tokens, self._token_indexers)
    fields[u"tokens"] = sequence
    fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
    if tags is not None:
        fields[u"tags"] = SequenceLabelField(tags, sequence)
    return Instance(fields)
Example #18
Source File: language_modeling.py From magnitude with MIT License | 5 votes |
def text_to_instance(self, sentence):  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_string = self._tokenizer.tokenize(sentence)
    input_field = TextField(tokenized_string[:-1], self._token_indexers)
    output_field = TextField(tokenized_string[1:], self._output_indexer)
    return Instance({u'input_tokens': input_field, u'output_tokens': output_field})
Example #19
Source File: decompatt_predictor.py From scitail with Apache License 2.0 | 5 votes |
def _json_to_instance(self,  # type: ignore
                      json_dict: JsonDict) -> Instance:
    # pylint: disable=arguments-differ
    premise_text = json_dict["sentence1"]
    hypothesis_text = json_dict["sentence2"]
    return self._dataset_reader.text_to_instance(premise_text, hypothesis_text)
Example #20
Source File: entailment_tuple_reader.py From scitail with Apache License 2.0 | 5 votes |
def text_to_instance(self,
                     premise: str,
                     hypothesis: str,
                     hypothesis_structure: str,
                     label: str = None) -> Instance:
    fields: Dict[str, Field] = {}
    premise_tokens = self._tokenizer.tokenize(premise)[-self._max_tokens:]
    hypothesis_tokens = self._tokenizer.tokenize(hypothesis)[-self._max_tokens:]

    fields['premise'] = TextField(premise_tokens, self._token_indexers)
    fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    metadata = {
        'premise': premise,
        'hypothesis': hypothesis,
        'premise_tokens': [token.text for token in premise_tokens],
        'hypothesis_tokens': [token.text for token in hypothesis_tokens]
    }
    fields['metadata'] = MetadataField(metadata)
    self._add_structure_to_fields(hypothesis_structure, fields)
    if label:
        fields['label'] = LabelField(label)
    return Instance(fields)
Example #21
Source File: util_test.py From allennlp with Apache License 2.0 | 5 votes |
def train_util_test_reader():
    @DatasetReader.register("train-util-test-reader")
    class TrainUtilTestReader(DatasetReader):
        def _read(self, data_path):
            logger.info("...train-util-test-reader reading from %s", data_path)
            for i in range(10):
                yield self.text_to_instance(i)

        def text_to_instance(self, index: int) -> Instance:  # type: ignore
            return Instance({"index": LabelField(index, skip_indexing=True)})

    yield TrainUtilTestReader

    del DatasetReader._registry[DatasetReader]["train-util-test-reader"]
Example #22
Source File: dataloader_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_loader_uses_all_instances_when_batches_per_epochs_set(lazy):
    NUM_INSTANCES = 20
    BATCH_SIZE = 2
    BATCHES_PER_EPOCH = 3
    EPOCHS = 4

    class FakeDatasetReader(DatasetReader):
        def _read(self, filename: str) -> Iterable[Instance]:
            for i in range(NUM_INSTANCES):
                yield Instance({"index": LabelField(i, skip_indexing=True)})

    reader = FakeDatasetReader(lazy=lazy)
    dataset = reader.read("blah")

    loader = PyTorchDataLoader(dataset, batch_size=BATCH_SIZE, batches_per_epoch=BATCHES_PER_EPOCH)
    epoch_batches = []
    for epoch in range(EPOCHS):
        batches = []
        for batch in loader:
            instances = []
            for index in batch["index"]:
                instances.append(index)
            batches.append(instances)
        epoch_batches.append(batches)

    assert epoch_batches == [
        # Epoch 0.
        [[0, 1], [2, 3], [4, 5]],
        # Epoch 1.
        [[6, 7], [8, 9], [10, 11]],
        # Epoch 2.
        [[12, 13], [14, 15], [16, 17]],
        # Epoch 3.
        [[18, 19], [0, 1], [2, 3]],
    ]
Example #23
Source File: dataloader_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_multi_processing_with_lazy_dataset_warns():
    def fake_instance_generator(file_name: str) -> Iterable[Instance]:
        yield from []

    with pytest.warns(UserWarning, match=r".*deadlocks.*"):
        PyTorchDataLoader(
            AllennlpLazyDataset(fake_instance_generator, "nonexistent_file"), num_workers=1
        )
Example #24
Source File: sharded_dataset_reader_test.py From allennlp with Apache License 2.0 | 5 votes |
def fingerprint(instance: Instance) -> Tuple[str, ...]:
    """
    Get a hashable representation of a sequence tagging instance
    that can be put in a Counter.
    """
    text_tuple = tuple(t.text for t in instance.fields["tokens"].tokens)  # type: ignore
    labels_tuple = tuple(instance.fields["tags"].labels)  # type: ignore
    return text_tuple + labels_tuple
Example #25
Source File: lazy_dataset_reader_test.py From allennlp with Apache License 2.0 | 5 votes |
def __init__(self, instances: List[Instance], lazy: bool) -> None:
    super().__init__()
    self.lazy = lazy
    self._instances = instances
    self.num_reads = 0
Example #26
Source File: dataloader.py From allennlp with Apache License 2.0 | 5 votes |
def allennlp_collate(instances: List[Instance]) -> TensorDict:
    batch = Batch(instances)
    return batch.as_tensor_dict(batch.get_padding_lengths())
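A hedged usage sketch (not from the project above): allennlp_collate is meant to be handed to a plain torch.utils.data.DataLoader as its collate_fn. The dataset here is an AllennlpDataset built directly from instances and a vocabulary, purely for illustration; in practice it usually comes from DatasetReader.read followed by dataset.index_with(vocab):

import torch.utils.data

from allennlp.data import AllennlpDataset, Instance, Token, Vocabulary
from allennlp.data.dataloader import allennlp_collate
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

indexers = {"tokens": SingleIdTokenIndexer()}
instances = [
    Instance({
        "tokens": TextField([Token(t) for t in text.split()], indexers),
        "label": LabelField(label),
    })
    for text, label in [("a good movie", "pos"), ("a dull movie", "neg")]
]

vocab = Vocabulary.from_instances(instances)
dataset = AllennlpDataset(instances, vocab)  # instances are indexed against `vocab` on access

# allennlp_collate pads each batch to a consistent shape, exactly as Batch does above.
loader = torch.utils.data.DataLoader(dataset, batch_size=2, collate_fn=allennlp_collate)
for tensor_dict in loader:
    pass  # tensor_dict maps field names to padded tensors, ready for a Model's forward()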
Example #27
Source File: span_pred_reader.py From semanticRetrievalMRS with MIT License | 5 votes |
def text_to_instance(self,  # type: ignore
                     example) -> Instance:
    fields: Dict[str, Field] = {}

    joint_tokens_seq = ['[CLS]'] + example['query_c_tokens'] + ['[SEP]'] + example['context_c_tokens'] + ['[SEP]']
    assert len(joint_tokens_seq) < 512

    text1_len = len(example['query_c_tokens']) + 2
    text2_len = len(example['context_c_tokens']) + 1
    segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

    joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
    assert len(joint_tokens_ids) == len(segments_ids)

    fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
    fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

    # This text span is begin inclusive and end exclusive.
    text1_span = (1, 1 + len(example['query_c_tokens']))  # End is exclusive (important for later use)
    text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(example['context_c_tokens']))

    fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
    fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])
    # fields['bert_s2_span'] = SpanField(text2_span)
    # fields['bert_s1_span'] = MetadataField(text1_span)
    # fields['bert_s2_span'] = MetadataField(text2_span)

    # However, the ground truth span is begin and end both inclusive
    fields['gt_span'] = SpanField(example['start_position'], example['end_position'], fields['paired_sequence'])

    fields['fid'] = IdField(example['fid'])
    fields['uid'] = IdField(example['uid'])

    return Instance(fields)
Example #28
Source File: interleaving_dataset_reader_test.py From allennlp with Apache License 2.0 | 5 votes |
def text_to_instance(self, line: str) -> Instance:  # type: ignore
    tokens = self._tokenizer.tokenize(line)
    return Instance({"line": TextField(tokens, self._token_indexers)})
Example #29
Source File: interleaving_dataset_reader_test.py From allennlp with Apache License 2.0 | 5 votes |
def _read(self, file_path: str) -> Iterable[Instance]:
    with open(file_path) as input_file:
        for line in input_file:
            yield self.text_to_instance(line)
Example #30
Source File: pretrained_transformer_mismatched_embedder_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_token_without_wordpieces(self):
    token_indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")

    sentence1 = ["A", "", "AllenNLP", "sentence", "."]
    sentence2 = ["AllenNLP", "", "great"]
    tokens1 = [Token(word) for word in sentence1]
    tokens2 = [Token(word) for word in sentence2]
    vocab = Vocabulary()

    params = Params(
        {
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer_mismatched",
                    "model_name": "bert-base-uncased",
                }
            }
        }
    )
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    assert tokens["bert"]["offsets"].tolist() == [
        [[1, 1], [-1, -1], [2, 4], [5, 5], [6, 6]],
        [[1, 3], [-1, -1], [4, 4], [0, 0], [0, 0]],
    ]

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, max(len(sentence1), len(sentence2)), 768)
    assert not torch.isnan(bert_vectors).any()
    assert all(bert_vectors[0, 1] == 0)
    assert all(bert_vectors[1, 1] == 0)