Python allennlp.data.token_indexers.TokenIndexer() Examples
The following are 30 code examples of allennlp.data.token_indexers.TokenIndexer(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.data.token_indexers, or try the search function.
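Before the project-specific examples, the sketch below shows the basic pattern they all build on: a dictionary mapping names to TokenIndexer objects is handed to a TextField (usually inside a DatasetReader), and each indexer converts the same tokens into ids in its own vocabulary namespace. This is a minimal, illustrative sketch assuming an allennlp 1.x-style API (SpacyTokenizer, SingleIdTokenIndexer, TokenCharactersIndexer); it is not taken from any of the projects below.

from allennlp.data import Instance
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.tokenizers import SpacyTokenizer

# Illustrative sketch: one TextField, two indexers, two vocabulary namespaces.
tokenizer = SpacyTokenizer()
token_indexers = {
    "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True),
    "token_characters": TokenCharactersIndexer(min_padding_length=3),
}

tokens = tokenizer.tokenize("AllenNLP token indexers map tokens to ids.")
instance = Instance({"sentence": TextField(tokens, token_indexers)})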
Example #1
Source File: wordnet.py From kb with Apache License 2.0 | 6 votes |
def __init__(self,
             wordnet_entity_file: str,
             token_indexers: Dict[str, TokenIndexer],
             entity_indexer: TokenIndexer,
             is_training: bool,
             use_surface_form: bool = False,
             should_remap_span_indices: bool = True,
             extra_candidate_generators: Dict[str, MentionGenerator] = None):

    super().__init__(False)

    self.mention_generator = WordNetCandidateMentionGenerator(
        wordnet_entity_file, use_surface_form=use_surface_form
    )

    self.token_indexers = token_indexers
    self.entity_indexer = {"ids": entity_indexer}
    self.is_training = is_training
    self.should_remap_span_indices = should_remap_span_indices
    self.extra_candidate_generators = extra_candidate_generators
Example #2
Source File: datareader.py From NLP_Toolkit with Apache License 2.0 | 6 votes |
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             delimeters: dict = SEQ_DELIMETERS,
             skip_correct: bool = False,
             skip_complex: int = 0,
             lazy: bool = False,
             max_len: int = None,
             test_mode: bool = False,
             tag_strategy: str = "keep_one",
             tn_prob: float = 0,
             tp_prob: float = 0,
             broken_dot_strategy: str = "keep") -> None:
    super().__init__(lazy)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._delimeters = delimeters
    self._max_len = max_len
    self._skip_correct = skip_correct
    self._skip_complex = skip_complex
    self._tag_strategy = tag_strategy
    self._broken_dot_strategy = broken_dot_strategy
    self._test_mode = test_mode
    self._tn_prob = tn_prob
    self._tp_prob = tp_prob
Example #3
Source File: bert_triple_loader.py From transformer-kernel-ranking with Apache License 2.0 | 6 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             min_doc_length: int = -1,
             min_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer
    self._token_indexers = token_indexers
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
    self.min_doc_length = min_doc_length
    self.min_query_length = min_query_length
    self.padding_value = Token(text="[PAD]", text_id=0)
    self.sep_value = Token(text="[SEP]")
Example #4
Source File: text_classification_json.py From allennlp with Apache License 2.0 | 6 votes |
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    tokenizer: Tokenizer = None,
    segment_sentences: bool = False,
    max_sequence_length: int = None,
    skip_label_indexing: bool = False,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._segment_sentences = segment_sentences
    self._max_sequence_length = max_sequence_length
    self._skip_label_indexing = skip_label_indexing
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    if self._segment_sentences:
        self._sentence_segmenter = SpacySentenceSplitter()
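Example #4 comes from the AllenNLP library itself and uses the common fallback idiom token_indexers or {"tokens": SingleIdTokenIndexer()}. As a hedged usage sketch (the JSONL path is hypothetical, and the snippet assumes an allennlp 1.x-style install), a caller can either rely on that default or inject a custom indexer configuration:

from allennlp.data.dataset_readers import TextClassificationJsonReader
from allennlp.data.token_indexers import SingleIdTokenIndexer

# Default: the reader falls back to {"tokens": SingleIdTokenIndexer()}.
default_reader = TextClassificationJsonReader()

# Custom: lowercased single-id tokens plus a capped sequence length.
custom_reader = TextClassificationJsonReader(
    token_indexers={"tokens": SingleIdTokenIndexer(lowercase_tokens=True)},
    max_sequence_length=512,
)
# instances = custom_reader.read("data/train.jsonl")  # hypothetical path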
Example #5
Source File: summarization_sentence_tagger_reader.py From summarus with Apache License 2.0 | 6 votes |
def __init__(self,
             tokenizer: Tokenizer,
             language: str,
             source_token_indexers: Dict[str, TokenIndexer] = None,
             max_sentences_count: int = 100,
             sentence_max_tokens: int = 100,
             lowercase: bool = True,
             lazy: bool = True) -> None:
    super().__init__(lazy=lazy)
    self._tokenizer = tokenizer
    self._lowercase = lowercase
    self._language = language
    self._max_sentences_count = max_sentences_count
    self._sentence_max_tokens = sentence_max_tokens
    self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()}
Example #6
Source File: semisupervised_text_classification_json.py From vampire with Apache License 2.0 | 6 votes |
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             tokenizer: Tokenizer = None,
             max_sequence_length: int = None,
             ignore_labels: bool = False,
             sample: int = None,
             skip_label_indexing: bool = False,
             lazy: bool = False) -> None:
    super().__init__(lazy=lazy,
                     token_indexers=token_indexers,
                     tokenizer=tokenizer,
                     max_sequence_length=max_sequence_length,
                     skip_label_indexing=skip_label_indexing)
    self._tokenizer = tokenizer or WordTokenizer()
    self._sample = sample
    self._max_sequence_length = max_sequence_length
    self._ignore_labels = ignore_labels
    self._skip_label_indexing = skip_label_indexing
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    if self._segment_sentences:
        self._sentence_segmenter = SpacySentenceSplitter()
Example #7
Source File: transition_eds_reader.py From HIT-SCIR-CoNLL2019 with Apache License 2.0 | 6 votes |
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             lemma_indexers: Dict[str, TokenIndexer] = None,
             action_indexers: Dict[str, TokenIndexer] = None,
             arc_tag_indexers: Dict[str, TokenIndexer] = None,
             concept_label_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._lemma_indexers = None
    if lemma_indexers is not None and len(lemma_indexers) > 0:
        self._lemma_indexers = lemma_indexers
    self._action_indexers = None
    if action_indexers is not None and len(action_indexers) > 0:
        self._action_indexers = action_indexers
    self._arc_tag_indexers = None
    if arc_tag_indexers is not None and len(arc_tag_indexers) > 0:
        self._arc_tag_indexers = arc_tag_indexers
    self._concept_label_indexers = concept_label_indexers or {
        'concept_label': SingleIdTokenIndexer(namespace='concept_label')}
Example #8
Source File: transition_sdp_reader.py From HIT-SCIR-CoNLL2019 with Apache License 2.0 | 6 votes |
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             lemma_indexers: Dict[str, TokenIndexer] = None,
             action_indexers: Dict[str, TokenIndexer] = None,
             arc_tag_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._lemma_indexers = None
    if lemma_indexers is not None and len(lemma_indexers) > 0:
        self._lemma_indexers = lemma_indexers
    self._action_indexers = None
    if action_indexers is not None and len(action_indexers) > 0:
        self._action_indexers = action_indexers
    self._arc_tag_indexers = None
    if arc_tag_indexers is not None and len(arc_tag_indexers) > 0:
        self._arc_tag_indexers = arc_tag_indexers
Example #9
Source File: transition_ucca_reader.py From HIT-SCIR-CoNLL2019 with Apache License 2.0 | 6 votes |
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             lemma_indexers: Dict[str, TokenIndexer] = None,
             action_indexers: Dict[str, TokenIndexer] = None,
             arc_tag_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._lemma_indexers = None
    if lemma_indexers is not None and len(lemma_indexers) > 0:
        self._lemma_indexers = lemma_indexers
    self._action_indexers = None
    if action_indexers is not None and len(action_indexers) > 0:
        self._action_indexers = action_indexers
    self._arc_tag_indexers = None
    if arc_tag_indexers is not None and len(arc_tag_indexers) > 0:
        self._arc_tag_indexers = arc_tag_indexers
Example #10
Source File: conll2003.py From allennlp with Apache License 2.0 | 6 votes |
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    tag_label: str = "ner",
    feature_labels: Sequence[str] = (),
    coding_scheme: str = "IOB1",
    label_namespace: str = "labels",
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    if tag_label is not None and tag_label not in self._VALID_LABELS:
        raise ConfigurationError("unknown tag label type: {}".format(tag_label))
    for label in feature_labels:
        if label not in self._VALID_LABELS:
            raise ConfigurationError("unknown feature label type: {}".format(label))
    if coding_scheme not in ("IOB1", "BIOUL"):
        raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme))
    self.tag_label = tag_label
    self.feature_labels = set(feature_labels)
    self.coding_scheme = coding_scheme
    self.label_namespace = label_namespace
    self._original_coding_scheme = "IOB1"
Example #11
Source File: bert_labeled_tuple_loader.py From transformer-kernel-ranking with Apache License 2.0 | 6 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             min_doc_length: int = -1,
             min_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer
    self._token_indexers = token_indexers
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
    self.min_doc_length = min_doc_length
    self.min_query_length = min_query_length
    self.padding_value = Token(text="[PAD]", text_id=0)
    self.sep_value = Token(text="[SEP]")
Example #12
Source File: ir_triple_loader.py From transformer-kernel-ranking with Apache License 2.0 | 6 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             min_doc_length: int = -1,
             min_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # little bit faster, useful for multicore proc. word_splitter=SimpleWordSplitter()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
    self.min_doc_length = min_doc_length
    self.min_query_length = min_query_length
    self.padding_value = Token(text="@@PADDING@@", text_id=0)
Example #13
Source File: ir_labeled_tuple_loader.py From transformer-kernel-ranking with Apache License 2.0 | 6 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length: int = -1,
             max_query_length: int = -1,
             min_doc_length: int = -1,
             min_query_length: int = -1,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()  # little bit faster, useful for multicore proc. word_splitter=SimpleWordSplitter()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._source_add_start_token = source_add_start_token
    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
    self.min_doc_length = min_doc_length
    self.min_query_length = min_query_length
    self.padding_value = Token(text="@@PADDING@@", text_id=0)
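The inline comment in Examples #12 and #13 hints at swapping in a rule-based word splitter for speed. Under the older allennlp 0.x API these loaders target, that hint could be spelled roughly as in the sketch below; the commented-out reader class name is hypothetical, and the whole snippet is an assumption rather than the project's actual configuration.

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter

# SimpleWordSplitter is rule-based (no spaCy model), which the comment above
# suggests is a bit faster and friendlier to multi-process preprocessing.
fast_tokenizer = WordTokenizer(word_splitter=SimpleWordSplitter())
lowercased_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
# reader = IrTripleDatasetReader(tokenizer=fast_tokenizer,
#                                token_indexers=lowercased_indexers)  # hypothetical class name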
Example #14
Source File: citation_data_reader_scicite_aux.py From scicite with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             clean_citation: bool = True,
             with_elmo: bool = False
             ) -> None:
    super().__init__(lazy)
    self._clean_citation = clean_citation
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    if with_elmo:
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
Example #15
Source File: copynet.py From nlp-models with MIT License | 6 votes |
def __init__(
    self,
    target_namespace: str,
    source_tokenizer: Tokenizer = None,
    target_tokenizer: Tokenizer = None,
    source_token_indexers: Dict[str, TokenIndexer] = None,
    target_token_indexers: Dict[str, TokenIndexer] = None,
    lazy: bool = False,
) -> None:
    super().__init__(lazy)
    self._target_namespace = target_namespace
    self._source_tokenizer = source_tokenizer or SpacyTokenizer()
    self._target_tokenizer = target_tokenizer or self._source_tokenizer
    self._source_token_indexers = source_token_indexers or {
        "tokens": SingleIdTokenIndexer()
    }
    self._target_token_indexers = (
        target_token_indexers or self._source_token_indexers
    )
    warnings.warn(
        "The 'copynet' dataset reader has been deprecated in favor of the "
        "'copynet_seq2seq' dataset reader (now part of the AllenNLP library).",
        DeprecationWarning,
    )
Example #16
Source File: nl2bash.py From nlp-models with MIT License | 6 votes |
def __init__(
    self,
    target_namespace: str,
    source_tokenizer: Tokenizer = None,
    target_tokenizer: Tokenizer = None,
    source_token_indexers: Dict[str, TokenIndexer] = None,
    target_token_indexers: Dict[str, TokenIndexer] = None,
    lazy: bool = False,
) -> None:
    source_tokenizer = source_tokenizer or NL2BashWordSplitter()
    target_tokenizer = target_tokenizer or source_tokenizer
    super().__init__(
        target_namespace,
        source_tokenizer=source_tokenizer,
        target_tokenizer=target_tokenizer,
        source_token_indexers=source_token_indexers,
        target_token_indexers=target_token_indexers,
        lazy=lazy,
    )
Example #17
Source File: citation_data_reader_aclarc.py From scicite with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             use_lexicon_features: bool = False,
             use_sparse_lexicon_features: bool = False,
             with_elmo: bool = False
             ) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    if with_elmo:
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
    self.use_lexicon_features = use_lexicon_features
    self.use_sparse_lexicon_features = use_sparse_lexicon_features
    if self.use_lexicon_features or self.use_sparse_lexicon_features:
        self.lexicons = {**ALL_ACTION_LEXICONS, **ALL_CONCEPT_LEXICONS}
Example #18
Source File: nlvr.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def __init__(
    self,
    lazy: bool = False,
    tokenizer: Tokenizer = None,
    sentence_token_indexers: Dict[str, TokenIndexer] = None,
    nonterminal_indexers: Dict[str, TokenIndexer] = None,
    terminal_indexers: Dict[str, TokenIndexer] = None,
    output_agendas: bool = True,
) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or SpacyTokenizer()
    self._sentence_token_indexers = sentence_token_indexers or {
        "tokens": SingleIdTokenIndexer()
    }
    self._nonterminal_indexers = nonterminal_indexers or {
        "tokens": SingleIdTokenIndexer("rule_labels")
    }
    self._terminal_indexers = terminal_indexers or {
        "tokens": SingleIdTokenIndexer("rule_labels")
    }
    self._output_agendas = output_agendas
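Example #18 shows the namespace mechanism directly: passing a namespace name to SingleIdTokenIndexer keeps grammar-rule ids separate from sentence-word ids. A tiny standalone illustration of the same idea (not taken from allennlp-semparse):

from allennlp.data.token_indexers import SingleIdTokenIndexer

# The two indexers write into different vocabulary namespaces, so sentence
# words and grammar rule labels get independent id spaces in the Vocabulary.
sentence_indexers = {"tokens": SingleIdTokenIndexer(namespace="tokens")}
rule_indexers = {"tokens": SingleIdTokenIndexer(namespace="rule_labels")}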
Example #19
Source File: citation_data_reader_aclarc_aux.py From scicite with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             clean_citation: bool = True,
             with_elmo: bool = False
             # use_lexicon_features: bool = False,
             # use_sparse_lexicon_features: bool = False
             ) -> None:
    super().__init__(lazy)
    self._clean_citation = clean_citation
    self._tokenizer = tokenizer or WordTokenizer()
    if with_elmo:
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
Example #20
Source File: fever_reader_with_wn.py From combine-FEVER-NSMN with MIT License | 6 votes |
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False,
             example_filter=None,
             wn_p_dict=None,
             wn_feature_list=wn_persistent_api.default_fn_list,
             max_l=None) -> None:
    super().__init__(lazy=lazy)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer(namespace='tokens')}
    self._example_filter = example_filter
    self.wn_p_dict = wn_p_dict
    if wn_p_dict is None:
        raise ValueError("Need to specify WN feature dict for FEVER Reader.")
    self.wn_feature_list = wn_feature_list
    self.wn_feature_size = len(self.wn_feature_list) * 3
    self.max_l = max_l
Example #21
Source File: imdb_review_reader.py From topic-rnn with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             words_per_instance: int = 35
             ) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer(
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL]
    )
    self._token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
    }
    self._words_per_instance = words_per_instance
Example #22
Source File: imdb_review_reader.py From topic-rnn with Apache License 2.0 | 6 votes |
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             words_per_instance: int = 35,
             classification_mode=False
             ) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer(
        start_tokens=[START_SYMBOL],
        end_tokens=[END_SYMBOL]
    )
    self._token_indexers = token_indexers or {
        "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
    }
    self._words_per_instance = words_per_instance
    self._classification_mode = classification_mode
Example #23
Source File: fever_reader_with_wn_simi_doc.py From combine-FEVER-NSMN with MIT License | 6 votes |
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False,
             example_filter=None,
             wn_p_dict=None,
             wn_feature_list=wn_persistent_api.default_fn_list,
             max_l=None,
             num_encoding=True,
             shuffle_sentences=False,
             ablation=None) -> None:
    super().__init__(lazy=lazy)
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer(namespace='tokens')}
    self._example_filter = example_filter
    self.wn_p_dict = wn_p_dict
    if wn_p_dict is None:
        raise ValueError("Need to specify WN feature dict for FEVER Reader.")
    self.wn_feature_list = wn_feature_list
    num_encoding_dim = 5 if num_encoding else 0
    self.wn_feature_size = len(self.wn_feature_list) * 3 + num_encoding_dim + 2
    self.max_l = max_l
    self.shuffle_sentences = shuffle_sentences
    self.ablation = ablation

    if self.ablation is not None and self.ablation['rm_wn']:
        self.wn_feature_size -= (len(self.wn_feature_list) * 3 + num_encoding_dim)
    elif self.ablation is not None and self.ablation['rm_simi']:
        self.wn_feature_size -= 2
Example #24
Source File: clean_coqa_reader.py From SLQA with Apache License 2.0 | 5 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #25
Source File: bert_tokenizer_and_candidate_generator.py From kb with Apache License 2.0 | 5 votes |
def __init__(self,
             entity_candidate_generators: Dict[str, MentionGenerator],
             entity_indexers: Dict[str, TokenIndexer],
             bert_model_type: str,
             do_lower_case: bool,
             whitespace_tokenize: bool = True,
             max_word_piece_sequence_length: int = 512) -> None:
    """
    Note: the fields need to be used with a pre-generated allennlp vocabulary
    that contains the entity id namespaces and the bert name space.

    entity_indexers = {'wordnet': indexer for wordnet entities,
                       'wiki': indexer for wiki entities}
    """
    # load BertTokenizer from huggingface
    self.candidate_generators = entity_candidate_generators
    self.bert_tokenizer = BertTokenizer.from_pretrained(
        bert_model_type, do_lower_case=do_lower_case
    )
    self.bert_word_tokenizer = BasicTokenizer(do_lower_case=False)
    # Target length should include start and end token
    self.max_word_piece_sequence_length = max_word_piece_sequence_length

    self._entity_indexers = entity_indexers

    # for bert, we'll give an empty token indexer with empty name space
    # and do the indexing directly with the bert vocab to bypass
    # indexing in the indexer
    self._bert_single_id_indexer = {'tokens': SingleIdTokenIndexer('__bert__')}
    self.do_lowercase = do_lower_case
    self.whitespace_tokenize = whitespace_tokenize
    self.dtype = np.float32
Example #26
Source File: coca_reader.py From SLQA with Apache License 2.0 | 5 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #27
Source File: cls_data.py From glyce with Apache License 2.0 | 5 votes |
def __init__(self,
             max_sentence_length: int,
             tokenizer: Tokenizer,
             max_instance: int,
             token_indexers: Dict[str, TokenIndexer] = None,
             ) -> None:
    if max_instance > 100000:
        super().__init__(False)
    else:
        super().__init__(False)
    self._tokenizer = tokenizer
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self.max_sentence_length = max_sentence_length
    self.trimmed_count = 0
    self.max_instance = max_instance
Example #28
Source File: coca_reader.py From SLQA with Apache License 2.0 | 5 votes |
def make_reading_comprehension_instance_quac(self,
                                             question_list_tokens: List[List[Token]],
                                             passage_tokens: List[Token],
                                             token_indexers: Dict[str, TokenIndexer],
                                             passage_text: str,
                                             token_span_lists: List[List[Tuple[int, int]]] = None,
                                             yesno_list: List[int] = None,
                                             additional_metadata: Dict[str, Any] = None) -> Instance:
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]

    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = ListField([TextField(q_tokens, token_indexers)
                                    for q_tokens in question_list_tokens])
    metadata = {'original_passage': passage_text,
                'token_offsets': passage_offsets,
                'question_tokens': [[token.text for token in question_tokens]
                                    for question_tokens in question_list_tokens],
                'passage_tokens': [token.text for token in passage_tokens],
                }
    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        for question_index, answer_span_lists in enumerate(token_span_lists):
            span_start, span_end = min(answer_span_lists, key=lambda x: x[1] - x[0])
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))

        fields['span_start'] = ListField(span_start_list)
        fields['span_end'] = ListField(span_end_list)
        fields['yesno_list'] = ListField(
            [LabelField(yesno, label_namespace="yesno_labels") for yesno in yesno_list])
    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
Example #29
Source File: squad_reader.py From SLQA with Apache License 2.0 | 5 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self._tokenizer = tokenizer or WordTokenizer()
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
Example #30
Source File: clean_coqa_reader.py From SLQA with Apache License 2.0 | 5 votes |
def make_reading_comprehension_instance_quac(self,
                                             question_list_tokens: List[List[Token]],
                                             passage_tokens: List[Token],
                                             token_indexers: Dict[str, TokenIndexer],
                                             passage_text: str,
                                             token_span_lists: List[List[Tuple[int, int]]] = None,
                                             yesno_list: List[int] = None,
                                             additional_metadata: Dict[str, Any] = None) -> Instance:
    additional_metadata = additional_metadata or {}
    fields: Dict[str, Field] = {}
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]

    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = ListField([TextField(q_tokens, token_indexers)
                                    for q_tokens in question_list_tokens])
    metadata = {'original_passage': passage_text,
                'token_offsets': passage_offsets,
                'question_tokens': [[token.text for token in question_tokens]
                                    for question_tokens in question_list_tokens],
                'passage_tokens': [token.text for token in passage_tokens],
                }
    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        for question_index, answer_span_lists in enumerate(token_span_lists):
            span_start, span_end = min(answer_span_lists, key=lambda x: x[1] - x[0])
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))

        fields['span_start'] = ListField(span_start_list)
        fields['span_end'] = ListField(span_end_list)
        fields['yesno_list'] = ListField(
            [LabelField(yesno, label_namespace="yesno_labels") for yesno in yesno_list])
    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)