Python allennlp.data.token_indexers.TokenIndexer() Examples

The following are 30 code examples of allennlp.data.token_indexers.TokenIndexer(), collected from open-source projects. The source file, project, and license are listed above each example. You may also want to check out the other available functions and classes of the allennlp.data.token_indexers module.
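Before the examples, here is a minimal, hedged sketch of how a Dict[str, TokenIndexer] is typically used with a TextField. It assumes an allennlp 1.x install (several snippets below target the older 0.9 API, which uses WordTokenizer instead of SpacyTokenizer); none of it is taken from the projects listed here. Each key in the dict produces a separate representation of the same tokens in the resulting tensor dict.

# Minimal sketch, assuming allennlp >= 1.0; not from any of the projects below.
from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer

# Two indexing schemes over the same tokens: word ids and character ids.
token_indexers = {
    "tokens": SingleIdTokenIndexer(lowercase_tokens=True),
    "token_characters": TokenCharactersIndexer(min_padding_length=3),
}

tokens = [Token(t) for t in "The quick brown fox".split()]
instance = Instance({"sentence": TextField(tokens, token_indexers)})

vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)
tensors = instance.as_tensor_dict()  # {"sentence": {"tokens": ..., "token_characters": ...}}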
Example #1
Source File: wordnet.py    From kb with Apache License 2.0
def __init__(self,
                 wordnet_entity_file: str,
                 token_indexers: Dict[str, TokenIndexer],
                 entity_indexer: TokenIndexer,
                 is_training: bool,
                 use_surface_form: bool = False,
                 should_remap_span_indices: bool = True,
                 extra_candidate_generators: Dict[str, MentionGenerator] = None):

        super().__init__(False)

        self.mention_generator = WordNetCandidateMentionGenerator(
                wordnet_entity_file, use_surface_form=use_surface_form
        )

        self.token_indexers = token_indexers
        self.entity_indexer = {"ids": entity_indexer}
        self.is_training = is_training
        self.should_remap_span_indices = should_remap_span_indices

        self.extra_candidate_generators = extra_candidate_generators 
Example #2
Source File: datareader.py    From NLP_Toolkit with Apache License 2.0
def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 delimeters: dict = SEQ_DELIMETERS,
                 skip_correct: bool = False,
                 skip_complex: int = 0,
                 lazy: bool = False,
                 max_len: int = None,
                 test_mode: bool = False,
                 tag_strategy: str = "keep_one",
                 tn_prob: float = 0,
                 tp_prob: float = 0,
                 broken_dot_strategy: str = "keep") -> None:
        super().__init__(lazy)
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self._delimeters = delimeters
        self._max_len = max_len
        self._skip_correct = skip_correct
        self._skip_complex = skip_complex
        self._tag_strategy = tag_strategy
        self._broken_dot_strategy = broken_dot_strategy
        self._test_mode = test_mode
        self._tn_prob = tn_prob
        self._tp_prob = tp_prob 
Example #3
Source File: bert_triple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length:int = -1,
                 max_query_length:int = -1,
                 min_doc_length:int = -1,
                 min_query_length:int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer
        self._token_indexers = token_indexers
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length
        self.min_doc_length = min_doc_length
        self.min_query_length = min_query_length

        self.padding_value = Token(text = "[PAD]",text_id=0)
        self.sep_value = Token(text = "[SEP]") 
Example #4
Source File: text_classification_json.py    From allennlp with Apache License 2.0
def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tokenizer: Tokenizer = None,
        segment_sentences: bool = False,
        max_sequence_length: int = None,
        skip_label_indexing: bool = False,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._segment_sentences = segment_sentences
        self._max_sequence_length = max_sequence_length
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter() 
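The reader above is allennlp's own TextClassificationJsonReader. A typical instantiation overrides the default single-id indexer, for example to add ELMo character ids alongside word ids. This is a hedged usage sketch, assuming allennlp >= 1.0; the data path is a placeholder, and the reader expects one JSON object with "text" and "label" keys per line.

# Usage sketch (not from the allennlp source); "data/train.jsonl" is a hypothetical path.
from allennlp.data.dataset_readers import TextClassificationJsonReader
from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer

reader = TextClassificationJsonReader(
    token_indexers={
        "tokens": SingleIdTokenIndexer(lowercase_tokens=True),
        "elmo": ELMoTokenCharactersIndexer(),
    },
    max_sequence_length=400,
)
instances = list(reader.read("data/train.jsonl"))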
Example #5
Source File: summarization_sentence_tagger_reader.py    From summarus with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer,
                 language: str,
                 source_token_indexers: Dict[str, TokenIndexer] = None,
                 max_sentences_count: int = 100,
                 sentence_max_tokens: int = 100,
                 lowercase: bool = True,
                 lazy: bool = True) -> None:
        super().__init__(lazy=lazy)

        self._tokenizer = tokenizer
        self._lowercase = lowercase
        self._language = language
        self._max_sentences_count = max_sentences_count
        self._sentence_max_tokens = sentence_max_tokens
        self._source_token_indexers = source_token_indexers or {"tokens": SingleIdTokenIndexer()} 
Example #6
Source File: semisupervised_text_classification_json.py    From vampire with Apache License 2.0
def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 tokenizer: Tokenizer = None,
                 max_sequence_length: int = None,
                 ignore_labels: bool = False,
                 sample: int = None,
                 skip_label_indexing: bool = False,
                 lazy: bool = False) -> None:
        super().__init__(lazy=lazy,
                         token_indexers=token_indexers,
                         tokenizer=tokenizer,
                         max_sequence_length=max_sequence_length,
                         skip_label_indexing=skip_label_indexing)
        self._tokenizer = tokenizer or WordTokenizer()
        self._sample = sample
        self._max_sequence_length = max_sequence_length
        self._ignore_labels = ignore_labels
        self._skip_label_indexing = skip_label_indexing
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        if self._segment_sentences:
            self._sentence_segmenter = SpacySentenceSplitter() 
Example #7
Source File: transition_eds_reader.py    From HIT-SCIR-CoNLL2019 with Apache License 2.0
def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lemma_indexers: Dict[str, TokenIndexer] = None,
                 action_indexers: Dict[str, TokenIndexer] = None,
                 arc_tag_indexers: Dict[str, TokenIndexer] = None,
                 concept_label_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

        self._lemma_indexers = None
        if lemma_indexers is not None and len(lemma_indexers) > 0:
            self._lemma_indexers = lemma_indexers

        self._action_indexers = None
        if action_indexers is not None and len(action_indexers) > 0:
            self._action_indexers = action_indexers

        self._arc_tag_indexers = None
        if arc_tag_indexers is not None and len(arc_tag_indexers) > 0:
            self._arc_tag_indexers = arc_tag_indexers

        self._concept_label_indexers = concept_label_indexers or {
            'concept_label': SingleIdTokenIndexer(namespace='concept_label')} 
Example #8
Source File: transition_sdp_reader.py    From HIT-SCIR-CoNLL2019 with Apache License 2.0
def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lemma_indexers: Dict[str, TokenIndexer] = None,
                 action_indexers: Dict[str, TokenIndexer] = None,
                 arc_tag_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self._lemma_indexers = None
        if lemma_indexers is not None and len(lemma_indexers) > 0:
            self._lemma_indexers = lemma_indexers
        self._action_indexers = None
        if action_indexers is not None and len(action_indexers) > 0:
            self._action_indexers = action_indexers
        self._arc_tag_indexers = None
        if arc_tag_indexers is not None and len(arc_tag_indexers) > 0:
            self._arc_tag_indexers = arc_tag_indexers 
Example #9
Source File: transition_ucca_reader.py    From HIT-SCIR-CoNLL2019 with Apache License 2.0
def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lemma_indexers: Dict[str, TokenIndexer] = None,
                 action_indexers: Dict[str, TokenIndexer] = None,
                 arc_tag_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self._lemma_indexers = None
        if lemma_indexers is not None and len(lemma_indexers) > 0:
            self._lemma_indexers = lemma_indexers
        self._action_indexers = None
        if action_indexers is not None and len(action_indexers) > 0:
            self._action_indexers = action_indexers
        self._arc_tag_indexers = None
        if arc_tag_indexers is not None and len(arc_tag_indexers) > 0:
            self._arc_tag_indexers = arc_tag_indexers 
Example #10
Source File: conll2003.py    From allennlp with Apache License 2.0
def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        tag_label: str = "ner",
        feature_labels: Sequence[str] = (),
        coding_scheme: str = "IOB1",
        label_namespace: str = "labels",
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        if tag_label is not None and tag_label not in self._VALID_LABELS:
            raise ConfigurationError("unknown tag label type: {}".format(tag_label))
        for label in feature_labels:
            if label not in self._VALID_LABELS:
                raise ConfigurationError("unknown feature label type: {}".format(label))
        if coding_scheme not in ("IOB1", "BIOUL"):
            raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme))

        self.tag_label = tag_label
        self.feature_labels = set(feature_labels)
        self.coding_scheme = coding_scheme
        self.label_namespace = label_namespace
        self._original_coding_scheme = "IOB1" 
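The reader above is allennlp's Conll2003DatasetReader, and its constructor arguments map directly onto a typical NER setup. Below is a hedged configuration sketch, assuming an allennlp 1.x install where this reader still ships in core (in later releases it lives in allennlp-models); with coding_scheme="BIOUL" the original IOB1 tags are converted on the fly.

# Usage sketch, not taken from the allennlp source.
from allennlp.data.dataset_readers import Conll2003DatasetReader
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer

reader = Conll2003DatasetReader(
    token_indexers={
        "tokens": SingleIdTokenIndexer(lowercase_tokens=True),
        "token_characters": TokenCharactersIndexer(min_padding_length=3),
    },
    tag_label="ner",                  # predict the NER column
    feature_labels=["pos", "chunk"],  # expose POS and chunk tags as extra features
    coding_scheme="BIOUL",
)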
Example #11
Source File: bert_labeled_tuple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length:int = -1,
                 max_query_length:int = -1,
                 min_doc_length:int = -1,
                 min_query_length:int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer
        self._token_indexers = token_indexers
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length
        self.min_doc_length = min_doc_length
        self.min_query_length = min_query_length

        self.padding_value = Token(text = "[PAD]",text_id=0)
        self.sep_value = Token(text = "[SEP]") 
Example #12
Source File: ir_triple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length:int = -1,
                 max_query_length:int = -1,
                 min_doc_length:int = -1,
                 min_query_length:int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer() # little bit faster, useful for multicore proc. word_splitter=SimpleWordSplitter()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length
        self.min_doc_length = min_doc_length
        self.min_query_length = min_query_length

        self.padding_value = Token(text = "@@PADDING@@",text_id=0) 
Example #13
Source File: ir_labeled_tuple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length:int = -1,
                 max_query_length:int = -1,
                 min_doc_length:int = -1,
                 min_query_length:int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer() # little bit faster, useful for multicore proc. word_splitter=SimpleWordSplitter()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length
        self.min_doc_length = min_doc_length
        self.min_query_length = min_query_length

        self.padding_value = Token(text = "@@PADDING@@",text_id=0) 
Example #14
Source File: citation_data_reader_scicite_aux.py    From scicite with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 clean_citation: bool = True,
                 with_elmo: bool = False
                 ) -> None:
        super().__init__(lazy)
        self._clean_citation = clean_citation
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        if with_elmo:
            self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                    "tokens": SingleIdTokenIndexer()}
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()} 
Example #15
Source File: copynet.py    From nlp-models with MIT License
def __init__(
        self,
        target_namespace: str,
        source_tokenizer: Tokenizer = None,
        target_tokenizer: Tokenizer = None,
        source_token_indexers: Dict[str, TokenIndexer] = None,
        target_token_indexers: Dict[str, TokenIndexer] = None,
        lazy: bool = False,
    ) -> None:
        super().__init__(lazy)
        self._target_namespace = target_namespace
        self._source_tokenizer = source_tokenizer or SpacyTokenizer()
        self._target_tokenizer = target_tokenizer or self._source_tokenizer
        self._source_token_indexers = source_token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._target_token_indexers = (
            target_token_indexers or self._source_token_indexers
        )
        warnings.warn(
            "The 'copynet' dataset reader has been deprecated in favor of the "
            "'copynet_seq2seq' dataset reader (now part of the AllenNLP library).",
            DeprecationWarning,
        ) 
Example #16
Source File: nl2bash.py    From nlp-models with MIT License
def __init__(
        self,
        target_namespace: str,
        source_tokenizer: Tokenizer = None,
        target_tokenizer: Tokenizer = None,
        source_token_indexers: Dict[str, TokenIndexer] = None,
        target_token_indexers: Dict[str, TokenIndexer] = None,
        lazy: bool = False,
    ) -> None:
        source_tokenizer = source_tokenizer or NL2BashWordSplitter()
        target_tokenizer = target_tokenizer or source_tokenizer
        super().__init__(
            target_namespace,
            source_tokenizer=source_tokenizer,
            target_tokenizer=target_tokenizer,
            source_token_indexers=source_token_indexers,
            target_token_indexers=target_token_indexers,
            lazy=lazy,
        ) 
Example #17
Source File: citation_data_reader_aclarc.py    From scicite with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 use_lexicon_features: bool = False,
                 use_sparse_lexicon_features: bool = False,
                 with_elmo: bool = False
                 ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        if with_elmo:
            self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                    "tokens": SingleIdTokenIndexer()}
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()}
        self.use_lexicon_features = use_lexicon_features
        self.use_sparse_lexicon_features = use_sparse_lexicon_features
        if self.use_lexicon_features or self.use_sparse_lexicon_features:
            self.lexicons = {**ALL_ACTION_LEXICONS, **ALL_CONCEPT_LEXICONS} 
Example #18
Source File: nlvr.py    From allennlp-semparse with Apache License 2.0
def __init__(
        self,
        lazy: bool = False,
        tokenizer: Tokenizer = None,
        sentence_token_indexers: Dict[str, TokenIndexer] = None,
        nonterminal_indexers: Dict[str, TokenIndexer] = None,
        terminal_indexers: Dict[str, TokenIndexer] = None,
        output_agendas: bool = True,
    ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._sentence_token_indexers = sentence_token_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
        self._nonterminal_indexers = nonterminal_indexers or {
            "tokens": SingleIdTokenIndexer("rule_labels")
        }
        self._terminal_indexers = terminal_indexers or {
            "tokens": SingleIdTokenIndexer("rule_labels")
        }
        self._output_agendas = output_agendas 
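Note how the reader above gives the nonterminal and terminal indexers their own "rule_labels" vocabulary namespace (the first positional argument of SingleIdTokenIndexer), so grammar productions do not share id space with natural-language tokens. A minimal sketch of that effect follows, assuming allennlp >= 1.0; the example tokens are made up.

# Sketch only: two fields indexed into separate vocabulary namespaces.
from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

sentence = TextField([Token(t) for t in "there is a yellow box".split()],
                     {"tokens": SingleIdTokenIndexer()})                 # default "tokens" namespace
rules = TextField([Token("S -> [<o,o>, o]")],
                  {"tokens": SingleIdTokenIndexer("rule_labels")})       # separate "rule_labels" namespace

vocab = Vocabulary.from_instances([Instance({"sentence": sentence, "rules": rules})])
print(vocab.get_vocab_size("tokens"), vocab.get_vocab_size("rule_labels"))  # independent vocabulary sizes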
Example #19
Source File: citation_data_reader_aclarc_aux.py    From scicite with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 clean_citation: bool = True,
                 with_elmo: bool = False
                 # use_lexicon_features: bool = False,
                 # use_sparse_lexicon_features: bool = False
                 ) -> None:
        super().__init__(lazy)
        self._clean_citation = clean_citation
        self._tokenizer = tokenizer or WordTokenizer()
        if with_elmo:
            self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                    "tokens": SingleIdTokenIndexer()}
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()} 
Example #20
Source File: fever_reader_with_wn.py    From combine-FEVER-NSMN with MIT License
def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False,
                 example_filter=None,
                 wn_p_dict=None, wn_feature_list=wn_persistent_api.default_fn_list,
                 max_l=None) -> None:

        super().__init__(lazy=lazy)
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer(namespace='tokens')}
        self._example_filter = example_filter
        self.wn_p_dict = wn_p_dict
        if wn_p_dict is None:
            raise ValueError("Need to specify WN feature dict for FEVER Reader.")
        self.wn_feature_list = wn_feature_list
        self.wn_feature_size = len(self.wn_feature_list) * 3
        self.max_l = max_l 
Example #21
Source File: imdb_review_reader.py    From topic-rnn with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 words_per_instance: int = 35
                ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(
            start_tokens=[START_SYMBOL],
            end_tokens=[END_SYMBOL]
        )
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
        }

        self._words_per_instance = words_per_instance 
Example #22
Source File: imdb_review_reader.py    From topic-rnn with Apache License 2.0
def __init__(self,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 words_per_instance: int = 35,
                 classification_mode=False
                ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(
            start_tokens=[START_SYMBOL],
            end_tokens=[END_SYMBOL]
        )
        self._token_indexers = token_indexers or {
            "tokens": SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
        }

        self._words_per_instance = words_per_instance
        self._classification_mode = classification_mode 
Example #23
Source File: fever_reader_with_wn_simi_doc.py    From combine-FEVER-NSMN with MIT License
def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False,
                 example_filter=None,
                 wn_p_dict=None, wn_feature_list=wn_persistent_api.default_fn_list,
                 max_l=None, num_encoding=True, shuffle_sentences=False, ablation=None) -> None:

        super().__init__(lazy=lazy)
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer(namespace='tokens')}
        self._example_filter = example_filter
        self.wn_p_dict = wn_p_dict
        if wn_p_dict is None:
            raise ValueError("Need to specify WN feature dict for FEVER Reader.")
        self.wn_feature_list = wn_feature_list
        num_encoding_dim = 5 if num_encoding else 0
        self.wn_feature_size = len(self.wn_feature_list) * 3 + num_encoding_dim + 2
        self.max_l = max_l
        self.shuffle_sentences = shuffle_sentences
        self.ablation = ablation

        if self.ablation is not None and self.ablation['rm_wn']:
            self.wn_feature_size -= (len(self.wn_feature_list) * 3 + num_encoding_dim)
        elif self.ablation is not None and self.ablation['rm_simi']:
            self.wn_feature_size -= 2 
Example #24
Source File: clean_coqa_reader.py    From SLQA with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} 
Example #25
Source File: bert_tokenizer_and_candidate_generator.py    From kb with Apache License 2.0
def __init__(self,
                 entity_candidate_generators: Dict[str, MentionGenerator],
                 entity_indexers: Dict[str, TokenIndexer],
                 bert_model_type: str,
                 do_lower_case: bool,
                 whitespace_tokenize: bool = True,
                 max_word_piece_sequence_length: int = 512) -> None:
        """
        Note: the fields need to be used with a pre-generated allennlp vocabulary
        that contains the entity id namespaces and the bert name space.
        entity_indexers = {'wordnet': indexer for wordnet entities,
                          'wiki': indexer for wiki entities}
        """
        # load BertTokenizer from huggingface
        self.candidate_generators = entity_candidate_generators
        self.bert_tokenizer = BertTokenizer.from_pretrained(
            bert_model_type, do_lower_case=do_lower_case
        )
        self.bert_word_tokenizer = BasicTokenizer(do_lower_case=False)
        # Target length should include start and end token
        self.max_word_piece_sequence_length = max_word_piece_sequence_length

        self._entity_indexers = entity_indexers
        # for bert, we'll give an empty token indexer with empty name space
        # and do the indexing directly with the bert vocab to bypass
        # indexing in the indexer
        self._bert_single_id_indexer = {'tokens': SingleIdTokenIndexer('__bert__')}
        self.do_lowercase = do_lower_case
        self.whitespace_tokenize = whitespace_tokenize
        self.dtype = np.float32 
Example #26
Source File: coca_reader.py    From SLQA with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} 
Example #27
Source File: cls_data.py    From glyce with Apache License 2.0
def __init__(self,
                 max_sentence_length: int,
                 tokenizer: Tokenizer,
                 max_instance: int,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 ) -> None:
        if max_instance > 100000:
            super().__init__(False)
        else:
            super().__init__(False)
        self._tokenizer = tokenizer
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_sentence_length = max_sentence_length
        self.trimmed_count = 0
        self.max_instance = max_instance 
Example #28
Source File: coca_reader.py    From SLQA with Apache License 2.0
def make_reading_comprehension_instance_quac(self,
                                                 question_list_tokens: List[List[Token]],
                                                 passage_tokens: List[Token],
                                                 token_indexers: Dict[str, TokenIndexer],
                                                 passage_text: str,
                                                 token_span_lists: List[List[Tuple[int, int]]] = None,
                                                 yesno_list: List[int] = None,
                                                 additional_metadata: Dict[str, Any] = None) -> Instance:
        additional_metadata = additional_metadata or {}
        fields: Dict[str, Field] = {}
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        # This is separate so we can reference it later with a known type.
        passage_field = TextField(passage_tokens, token_indexers)
        fields['passage'] = passage_field
        fields['question'] = ListField([TextField(q_tokens, token_indexers) for q_tokens in question_list_tokens])
        metadata = {'original_passage': passage_text,
                    'token_offsets': passage_offsets,
                    'question_tokens': [[token.text for token in question_tokens] \
                                        for question_tokens in question_list_tokens],
                    'passage_tokens': [token.text for token in passage_tokens], }
        if token_span_lists:
            span_start_list: List[Field] = []
            span_end_list: List[Field] = []
            for question_index, answer_span_lists in enumerate(token_span_lists):
                span_start, span_end = min(answer_span_lists, key=lambda x: x[1] - x[0])
                span_start_list.append(IndexField(span_start, passage_field))
                span_end_list.append(IndexField(span_end, passage_field))

            fields['span_start'] = ListField(span_start_list)
            fields['span_end'] = ListField(span_end_list)
            fields['yesno_list'] = ListField(
                [LabelField(yesno, label_namespace="yesno_labels") for yesno in yesno_list])
        metadata.update(additional_metadata)
        fields['metadata'] = MetadataField(metadata)
        return Instance(fields) 
Example #29
Source File: squad_reader.py    From SLQA with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} 
Example #30
Source File: clean_coqa_reader.py    From SLQA with Apache License 2.0
def make_reading_comprehension_instance_quac(self,
                                                 question_list_tokens: List[List[Token]],
                                                 passage_tokens: List[Token],
                                                 token_indexers: Dict[str, TokenIndexer],
                                                 passage_text: str,
                                                 token_span_lists: List[List[Tuple[int, int]]] = None,
                                                 yesno_list: List[int] = None,
                                                 additional_metadata: Dict[str, Any] = None) -> Instance:
        additional_metadata = additional_metadata or {}
        fields: Dict[str, Field] = {}
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        # This is separate so we can reference it later with a known type.
        passage_field = TextField(passage_tokens, token_indexers)
        fields['passage'] = passage_field
        fields['question'] = ListField([TextField(q_tokens, token_indexers) for q_tokens in question_list_tokens])
        metadata = {'original_passage': passage_text,
                    'token_offsets': passage_offsets,
                    'question_tokens': [[token.text for token in question_tokens] \
                                        for question_tokens in question_list_tokens],
                    'passage_tokens': [token.text for token in passage_tokens], }
        if token_span_lists:
            span_start_list: List[Field] = []
            span_end_list: List[Field] = []
            for question_index, answer_span_lists in enumerate(token_span_lists):
                span_start, span_end = min(answer_span_lists, key=lambda x: x[1] - x[0])
                span_start_list.append(IndexField(span_start, passage_field))
                span_end_list.append(IndexField(span_end, passage_field))

            fields['span_start'] = ListField(span_start_list)
            fields['span_end'] = ListField(span_end_list)
            fields['yesno_list'] = ListField(
                [LabelField(yesno, label_namespace="yesno_labels") for yesno in yesno_list])
        metadata.update(additional_metadata)
        fields['metadata'] = MetadataField(metadata)
        return Instance(fields)