Python allennlp.data.tokenizers.Token() Examples
The following are 30
code examples of allennlp.data.tokenizers.Token().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module allennlp.data.tokenizers, or try the search function.
Example #1
Source File: knowledge_graph_field.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def _span_lemma_overlap_fraction(
    self,
    entity: str,
    entity_text: List[Token],
    token: Token,
    token_index: int,
    tokens: List[Token],
) -> float:
    """
    Return the fraction of the entity's distinct lemmas covered by the span around a token.

    The span is the maximal contiguous run of ``tokens`` that contains ``token_index`` and in
    which every token's ``lemma_`` occurs among the lemmas of ``entity_text``.  ``entity`` and
    ``token`` are not used by this method.

    Returns ``0.0`` when ``entity_text`` is empty, and also when the token at ``token_index``
    is not itself one of the entity's lemmas (the span is then empty).
    """
    entity_lemmas = {entity_token.lemma_ for entity_token in entity_text}
    if not entity_lemmas:
        # Some tables have empty cells.  Return a float to honour the annotation.
        return 0.0

    seen_entity_lemmas = set()
    token_index_left = token_index
    # Grow the span to the right, starting at the token itself.
    while token_index < len(tokens) and tokens[token_index].lemma_ in entity_lemmas:
        seen_entity_lemmas.add(tokens[token_index].lemma_)
        token_index += 1
    # Grow the span to the left; the starting token is visited again, which is harmless
    # because lemmas are collected in a set.
    while token_index_left >= 0 and tokens[token_index_left].lemma_ in entity_lemmas:
        seen_entity_lemmas.add(tokens[token_index_left].lemma_)
        token_index_left -= 1
    return len(seen_entity_lemmas) / len(entity_lemmas)
Example #2
Source File: sequence_tagging.py From allennlp with Apache License 2.0 | 6 votes |
def _read(self, file_path):
    """
    Lazily yield one instance per non-blank line of ``file_path``.

    A line is split into pairs on ``self._token_delimiter``; each pair is split into a word
    and its tag at the last occurrence of ``self._word_tag_delimiter``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for raw_line in data_file:
            stripped = raw_line.strip("\n")
            if not stripped:
                # skip blank lines
                continue

            tokens, tags = [], []
            for pair in stripped.split(self._token_delimiter):
                word, tag = pair.rsplit(self._word_tag_delimiter, 1)
                tokens.append(Token(word))
                tags.append(tag)
            yield self.text_to_instance(tokens, tags)
Example #3
Source File: ir_labeled_tuple_loader.py From transformer-kernel-ranking with Apache License 2.0 | 6 votes |
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length:int = -1,
             max_query_length:int = -1,
             min_doc_length:int = -1,
             min_query_length:int = -1,
             lazy: bool = False) -> None:
    """
    Store the reader configuration.

    ``tokenizer`` falls back to a ``WordTokenizer`` and ``token_indexers`` to a single
    lower-casing ``SingleIdTokenIndexer`` under the key ``"tokens"`` when they are falsy.
    The four length limits and ``source_add_start_token`` are stored unchanged.
    """
    super().__init__(lazy)

    # Original author's note on the default tokenizer: "little bit faster, useful for
    # multicore proc. word_splitter=SimpleWordSplitter()".
    self._tokenizer = tokenizer if tokenizer else WordTokenizer()
    if token_indexers:
        self._token_indexers = token_indexers
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._source_add_start_token = source_add_start_token

    # Length limits for documents and queries.
    self.max_doc_length, self.min_doc_length = max_doc_length, min_doc_length
    self.max_query_length, self.min_query_length = max_query_length, min_query_length

    # Token kept on the reader as its padding value.
    self.padding_value = Token(text = "@@PADDING@@",text_id=0)
Example #4
Source File: entailment_pair.py From multee with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # pylint: disable=arguments-differ
                     premise: str,
                     hypothesis: str,
                     label: str = None) -> Instance:
    """
    Build an ``Instance`` holding ``premise`` and ``hypothesis`` text fields.

    Each text is tokenized with ``self._tokenizer`` and sliced with ``[-self._max_tokens:]``;
    the surviving tokens are re-wrapped as ``Token`` objects carrying only their text.
    A ``label`` field is added only when ``label`` is truthy.
    """
    def _as_text_field(text):
        kept = self._tokenizer.tokenize(text)[-self._max_tokens:]
        return TextField([Token(tok.text) for tok in kept], self._token_indexers)

    fields: Dict[str, Field] = {
        'premise': _as_text_field(premise),
        'hypothesis': _as_text_field(hypothesis),
    }
    if label:
        fields['label'] = LabelField(label)
    return Instance(fields)
Example #5
Source File: ebmnlp.py From scibert with Apache License 2.0 | 6 votes |
def _read(self, file_path: str) -> Iterable[Instance]:
    """
    Lazily yield one instance per sentence chunk of ``file_path``.

    Lines are grouped by ``_is_divider``; every non-divider group must split into exactly
    four whitespace-separated columns, of which the first (words) and the last (PICO tags)
    are passed to ``self.text_to_instance``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            if is_divider:
                # Divider chunks are ignored; only sentence chunks produce instances.
                continue

            rows = [line.strip().split() for line in lines]
            # Transpose rows into columns; zip returns tuples, but our Fields need lists.
            words, _, _, pico_tags = (list(column) for column in zip(*rows))
            # TextField requires ``Token`` objects
            yield self.text_to_instance([Token(word) for word in words], pico_tags)
Example #6
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
                     intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    Builds an ``Instance`` with ``context_tokens``, ``tokens`` and ``metadata`` fields, plus
    ``tags`` and ``intents`` fields when those arguments are not ``None``.  The metadata always
    contains ``words`` (the token texts) and ``dialog_act`` (an empty dict when none is given).
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
    fields["tokens"] = TextField(tokens, self._token_indexers)
    # Built exactly once, at the position where the key was first inserted before, so the
    # field order of the instance is unchanged.  Previously a words-only MetadataField was
    # created here and then unconditionally overwritten further down.
    fields["metadata"] = MetadataField({
        "words": [x.text for x in tokens],
        'dialog_act': dialog_act if dialog_act is not None else {},
    })
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, fields["tokens"])
    if intents is not None:
        fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
    return Instance(fields)
Example #7
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
                     intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.

    ``tags``, ``domain`` and ``intent`` each add a field only when truthy.  The metadata field
    always carries the token texts under ``words`` and the dialog act (``{}`` when ``None``).
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    fields: Dict[str, Field] = {"tokens": sequence}

    if tags:
        fields["tags"] = SequenceLabelField(tags, sequence)
    if domain:
        fields["domain"] = LabelField(domain, label_namespace="domain_labels")
    if intent:
        fields["intent"] = LabelField(intent, label_namespace="intent_labels")

    fields["metadata"] = MetadataField({
        "words": [x.text for x in tokens],
        'dialog_act': {} if dialog_act is None else dialog_act,
    })
    return Instance(fields)
Example #8
Source File: conll2003.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    """
    Lazily yield one instance per sentence chunk of ``file_path``.

    Lines are grouped by ``_is_divider``; each non-divider group must split into exactly four
    whitespace-separated columns: words, POS tags, chunk tags and NER tags.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, u"r") as data_file:
        logger.info(u"Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            if is_divider:
                # Only sentence chunks produce instances.
                continue

            rows = [line.strip().split() for line in lines]
            # Transpose rows into columns; the result holds tuples, but our Fields need lists.
            columns = [list(column) for column in izip(*rows)]
            words, pos_tags, chunk_tags, ner_tags = columns
            # TextField requires ``Token`` objects
            word_tokens = [Token(word) for word in words]
            yield self.text_to_instance(word_tokens, pos_tags, chunk_tags, ner_tags)
Example #9
Source File: sequence_tagging.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    """
    Lazily yield one instance per non-blank line of ``file_path``.

    A line is split into pairs on ``self._token_delimiter``; each pair is split into a word
    and its tag at the last occurrence of ``self._word_tag_delimiter``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, u"r") as data_file:
        logger.info(u"Reading instances from lines in file at: %s", file_path)
        for raw_line in data_file:
            stripped = raw_line.strip(u"\n")
            if not stripped:
                # skip blank lines
                continue

            tokens, tags = [], []
            for pair in stripped.split(self._token_delimiter):
                word, tag = pair.rsplit(self._word_tag_delimiter, 1)
                tokens.append(Token(word))
                tags.append(tag)
            yield self.text_to_instance(tokens, tags)
Example #10
Source File: knowledge_graph_field.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def _related_column_lemma(
    self,
    entity: str,
    entity_text: List[Token],
    token: Token,
    token_index: int,
    tokens: List[Token],
) -> float:
    """
    Return ``1.0`` if ``token`` matches the text of any neighbor of a column entity.

    Only entities whose name starts with ``"fb:row.row"`` or contains ``"_column:"`` are
    considered; every other entity scores ``0.0``.  A neighbor matches when the token's text
    is in ``self._entity_text_exact_text[neighbor]`` or its lemma is in
    ``self._entity_text_lemmas[neighbor]``.
    """
    # Check if the entity is a column name in one of the two WikiTables languages.
    is_column = entity.startswith("fb:row.row") or "_column:" in entity
    if not is_column:
        return 0.0

    def _mentioned_by(neighbor) -> bool:
        return (
            token.text in self._entity_text_exact_text[neighbor]
            or token.lemma_ in self._entity_text_lemmas[neighbor]
        )

    neighbors = self.knowledge_graph.neighbors[entity]
    return 1.0 if any(_mentioned_by(neighbor) for neighbor in neighbors) else 0.0
Example #11
Source File: semantic_role_labeling.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    """
    Lazily yield SRL instances for the sentences selected by ``self._ontonotes_subset``.

    A sentence without SRL frames yields a single instance tagged all ``"O"`` with an all-zero
    verb indicator.  Otherwise one instance is yielded per frame, with the verb indicator set
    to 1 wherever the frame's label ends in ``"-V"``.
    """
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info(u"Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info(u"Filtering to only include file paths containing the %s domain", self._domain_identifier)

    sentences = self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier)
    for sentence in sentences:
        tokens = [Token(word) for word in sentence.words]
        if sentence.srl_frames:
            for _, frame_tags in sentence.srl_frames:
                verb_indicator = [int(label[-2:] == u"-V") for label in frame_tags]
                yield self.text_to_instance(tokens, verb_indicator, frame_tags)
        else:
            # Sentence contains no predicates.
            sentence_length = len(tokens)
            yield self.text_to_instance(tokens, [0] * sentence_length, [u"O"] * sentence_length)
Example #12
Source File: knowledge_graph_field.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def _number_token_match(
    self,
    entity: str,
    entity_text: List[Token],
    token: Token,
    token_index: int,
    tokens: List[Token],
) -> float:
    """
    Exact-token-match feature restricted to number entities.

    Entities whose name contains ``":"`` score ``0.0``; all others are delegated to
    ``self._contains_exact_token_match``.
    """
    # PNP had a "spanFeatures" function that said whether an entity was a-priori known to link
    # to a token or set of tokens in the question.  This was only used for numbers, and it's
    # not totally clear to me how this number feature overlapped with the token match features
    # in the original implementation (I think in most cases it was the same, except for things
    # like "four million", because the token match is derived from the entity name, which would
    # be 4000000, and wouldn't match "four million").
    #
    # Our implementation basically just adds a duplicate token match feature that's specific to
    # numbers.  It'll break in some rare cases (e.g., "Which four had four million ..."), but
    # those shouldn't be a big deal.
    #
    # The ":" check works because numbers are the only entities that don't contain ":".  All
    # others in both WikiTables languages do (e.g.: fb:row.row.column_name,
    # date_column:year, string:usl_a_league etc.).
    if ":" not in entity:
        return self._contains_exact_token_match(entity, entity_text, token, token_index, tokens)
    return 0.0
Example #13
Source File: template_text2sql.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def text_to_instance(
    self,  # type: ignore
    query: List[str],
    slot_tags: List[str] = None,
    sql_template: str = None,
) -> Instance:
    """
    Build an ``Instance`` from a pre-split ``query``.

    The ``slot_tags`` and ``template`` fields are added only when both ``slot_tags`` and
    ``sql_template`` are given; otherwise the instance holds just the ``tokens`` field.
    """
    token_field = TextField([Token(word) for word in query], self._token_indexers)
    fields: Dict[str, Field] = {"tokens": token_field}

    if slot_tags is None or sql_template is None:
        return Instance(fields)

    fields["slot_tags"] = SequenceLabelField(slot_tags, token_field, label_namespace="slot_tags")
    fields["template"] = LabelField(sql_template, label_namespace="template_labels")
    return Instance(fields)
Example #14
Source File: wikitables_world_test.py From magnitude with MIT License | 6 votes |
def test_world_adds_numbers_from_question(self):
    """Numbers mentioned in the question must show up as ``n -> ...`` actions of the world."""
    question_words = [u'what', u'2007', u'2,107', u'0.2', u'1800s', u'1950s', u'?']
    question_tokens = [Token(word) for word in question_words]
    table_path = self.FIXTURES_ROOT / u"data" / u"wikitables" / u"sample_table.tsv"
    table_kg = TableQuestionKnowledgeGraph.read_from_file(table_path, question_tokens)
    valid_actions = WikiTablesWorld(table_kg).get_valid_actions()

    expected_actions = [
        u'n -> 2007',
        u'n -> 2107',
        # It appears that sempre normalizes floating point numbers.
        u'n -> 0.200',
        # We want to add the end-points to things like "1800s": 1800 and 1900.
        u'n -> 1800',
        u'n -> 1900',
        u'n -> 1950',
        u'n -> 1960',
    ]
    for expected_action in expected_actions:
        assert expected_action in valid_actions[u'n']
Example #15
Source File: atis_tables.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def get_flight_numbers_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    """
    Map every flight number found in the utterance to the token indices where it occurs.

    An all-digit token counts as a flight number when the previous token is ``"flight"``,
    ``"number"`` or an airline code/name, or when the next token is ``"flight"``.  If both
    conditions hold, the index is recorded twice.  ``utterance`` itself is not used.
    """
    def _may_precede_number(text: str) -> bool:
        return (
            text in {"flight", "number"}
            or text.upper() in AIRLINE_CODE_LIST
            or text.lower() in AIRLINE_CODES.keys()
        )

    texts = [token.text for token in tokenized_utterance]
    indices_words_preceding_flight_number = {
        index for index, text in enumerate(texts) if _may_precede_number(text)
    }
    indices_words_succeeding_flight_number = {
        index for index, text in enumerate(texts) if text == "flight"
    }

    flight_numbers_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for position, text in enumerate(texts):
        if not text.isdigit():
            continue
        if position - 1 in indices_words_preceding_flight_number:
            flight_numbers_linking_dict[text].append(position)
        if position + 1 in indices_words_succeeding_flight_number:
            flight_numbers_linking_dict[text].append(position)
    return flight_numbers_linking_dict
Example #16
Source File: atis_tables.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def get_time_range_end_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    """
    Map each time-range end triggered by the utterance to the token indices that trigger it.

    Single tokens are looked up in ``TIME_RANGE_END_DICT`` unless the preceding token is
    ``"early"``; space-joined bigrams are looked up unconditionally and contribute both of
    their token indices.  ``utterance`` itself is not used.
    """
    texts = [token.text for token in tokenized_utterance]
    time_range_end_linking_dict: Dict[str, List[int]] = defaultdict(list)

    previous_is_early = False
    for position, text in enumerate(texts):
        if not previous_is_early:
            for time in TIME_RANGE_END_DICT.get(text, []):
                time_range_end_linking_dict[str(time)].append(position)
        previous_is_early = text == "early"

    for position, bigram in enumerate(ngrams(texts, 2)):
        for time in TIME_RANGE_END_DICT.get(" ".join(bigram), []):
            time_range_end_linking_dict[str(time)].extend([position, position + 1])

    return time_range_end_linking_dict
Example #17
Source File: atis_world.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.

    Lower-cased unigrams, bigrams and trigrams are looked up in
    ``atis_tables.ATIS_TRIGGER_DICT``.  A trigram whose first token is ``"st"`` is looked up
    as ``"st. <third token>"`` instead of as the space-joined trigram.
    """
    texts = [token.text for token in tokenized_utterance]
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, text in enumerate(texts):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(text.lower(), []):
            string_linking_scores[string].append(index)

    for index, token_bigram in enumerate(bigrams(texts)):
        bigram_key = " ".join(token_bigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(bigram_key, []):
            string_linking_scores[string].extend([index, index + 1])

    for index, trigram in enumerate(ngrams(texts, 3)):
        if trigram[0] == "st":
            natural_language_key = f"st. {trigram[2]}".lower()
        else:
            natural_language_key = " ".join(trigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index, index + 1, index + 2])

    return string_linking_scores
Example #18
Source File: dataset_reader.py From nanigonet with MIT License | 6 votes |
def text_to_instance(self, tokens: List[Token], tags: List[str]=None) -> Instance:
    """
    Build an ``Instance`` with a ``tokens`` field and, when ``tags`` is truthy, a ``tags`` field.

    Inputs longer than ``self._max_token_len`` are truncated to that length (a notice is
    printed), and the tags are truncated alongside the tokens.
    """
    too_long = len(tokens) > self._max_token_len
    if too_long:
        tokens = tokens[:self._max_token_len]
        print(f'Length of tokens exceeded the limit {self._max_token_len}. Truncating...')
        if tags:
            tags = tags[:self._max_token_len]

    text_field = TextField(tokens, self._token_indexers)
    fields = {'tokens': text_field}
    if tags:
        fields['tags'] = SequenceLabelField(tags, text_field)
    return Instance(fields)
Example #19
Source File: drop_utils.py From MTMSN with Apache License 2.0 | 6 votes |
def split_tokens_by_hyphen(tokens: List[Token]) -> List[Token]:
    """
    Split every token whose text contains ``-``, ``–`` or ``~`` on those characters.

    Tokens without any of the hyphen characters are passed through untouched.  For the
    others, ``split_token_by_delimiter`` is applied to all current pieces once per hyphen
    character that occurs in the *original* token's text.
    """
    hyphens = ["-", "–", "~"]
    new_tokens: List[Token] = []

    for token in tokens:
        if not any(hyphen in token.text for hyphen in hyphens):
            new_tokens.append(token)
            continue

        pieces = [token]
        for hyphen in hyphens:
            # The membership test is against the original token, not the individual pieces.
            if hyphen in token.text:
                next_pieces: List[Token] = []
                for piece in pieces:
                    next_pieces += split_token_by_delimiter(piece, hyphen)
                pieces = next_pieces
        new_tokens += pieces

    return new_tokens
Example #20
Source File: single_correct_mcq_entailment.py From multee with Apache License 2.0 | 5 votes |
def text_to_instance(self,  # pylint: disable=arguments-differ
                     premises: List[str],
                     hypotheses: List[str],
                     answer_index: int = None,
                     relevant_sentence_idxs: List[int] = None) -> Instance:
    """
    Build a multiple-choice entailment ``Instance``.

    Fields produced: ``premises`` and ``hypotheses`` (lists of text fields), ``paragraph``
    (all premise tokens concatenated), plus ``relevance_presence_mask`` when
    ``relevant_sentence_idxs`` is given and ``answer_index`` when ``answer_index`` is given.

    Raises ``ConfigurationError`` if ``answer_index`` is not a valid index into ``hypotheses``.
    """
    fields = {}
    premises_tokens = [self._tokenizer.tokenize(premise)[-self._premise_max_tokens:]
                       for premise in premises]
    hypotheses_tokens = [self._tokenizer.tokenize(hypothesis)[-self._hypothesis_max_tokens:]
                         for hypothesis in hypotheses]

    if premises:
        premises_text_fields = [TextField(premise_tokens, self._token_indexers)
                                for premise_tokens in premises_tokens]
        premises_field = ListField(premises_text_fields)
    else:
        # With no premises, derive an empty list field from a one-token stub.
        empty_stub = ListField([TextField([Token('dummy')], self._token_indexers)])
        premises_field = empty_stub.empty_field()
    fields['premises'] = premises_field

    hypotheses_text_fields = [TextField(hypothesis_tokens, self._token_indexers)
                              for hypothesis_tokens in hypotheses_tokens]
    fields['hypotheses'] = ListField(hypotheses_text_fields)

    # If sentence relevance is available
    if relevant_sentence_idxs is not None:
        relevance_presence_mask = np.zeros(len(premises))
        for idx in relevant_sentence_idxs:
            relevance_presence_mask[idx] = 1
        fields['relevance_presence_mask'] = ArrayField(relevance_presence_mask)

    # If entailment labels are available
    if answer_index is not None:
        if answer_index not in range(len(hypotheses)):
            # Report the inclusive upper bound; the old message named an invalid index.
            raise ConfigurationError("Provided label must be in 0 to {}".format(len(hypotheses) - 1))
        # ``np.long`` was removed in NumPy 1.24; ``np.int64`` is the explicit equivalent.
        fields['answer_index'] = ArrayField(np.array(answer_index), padding_value=-1, dtype=np.int64)

    paragraph_tokens = [token for premise_tokens in premises_tokens for token in premise_tokens]
    fields['paragraph'] = TextField(paragraph_tokens, self._token_indexers)
    return Instance(fields)
Example #21
Source File: wikitables_language_test.py From allennlp-semparse with Apache License 2.0 | 5 votes |
def _get_world_with_question_tokens(self, tokens: List[Token]) -> WikiTablesLanguage:
    """Return a ``WikiTablesLanguage`` built from ``self.table_file`` and the question ``tokens``."""
    return WikiTablesLanguage(TableQuestionContext.read_from_file(self.table_file, tokens))
Example #22
Source File: clean_coqa_reader.py From SLQA with Apache License 2.0 | 5 votes |
def make_reading_comprehension_instance_quac(self,
                                             question_list_tokens: List[List[Token]],
                                             passage_tokens: List[Token],
                                             token_indexers: Dict[str, TokenIndexer],
                                             passage_text: str,
                                             token_span_lists: List[List[Tuple[int, int]]] = None,
                                             yesno_list: List[int] = None,
                                             additional_metadata: Dict[str, Any] = None) -> Instance:
    """
    Assemble an ``Instance`` for one passage and its list of questions.

    Always produces ``passage``, ``question`` and ``metadata`` fields.  When
    ``token_span_lists`` is non-empty, ``span_start``/``span_end`` (the shortest candidate span
    per question, first one on ties) and ``yesno_list`` fields are added as well; in that case
    ``yesno_list`` is iterated and therefore must be provided.
    """
    fields: Dict[str, Field] = {}

    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = ListField([TextField(q_tokens, token_indexers)
                                    for q_tokens in question_list_tokens])

    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        for candidate_spans in token_span_lists:
            span_start, span_end = min(candidate_spans, key=lambda span: span[1] - span[0])
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))
        fields['span_start'] = ListField(span_start_list)
        fields['span_end'] = ListField(span_end_list)
        fields['yesno_list'] = ListField(
            [LabelField(yesno, label_namespace="yesno_labels") for yesno in yesno_list])

    metadata = {
        'original_passage': passage_text,
        'token_offsets': [(token.idx, token.idx + len(token.text)) for token in passage_tokens],
        'question_tokens': [[token.text for token in question_tokens]
                            for question_tokens in question_list_tokens],
        'passage_tokens': [token.text for token in passage_tokens],
    }
    # Caller-supplied entries win over the defaults computed above.
    metadata.update(additional_metadata or {})
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
Example #23
Source File: table_question_context.py From allennlp-semparse with Apache License 2.0 | 5 votes |
def read_from_file(cls, filename: str, question_tokens: List[Token]) -> "TableQuestionContext":
    """
    Read a tab-separated table from ``filename`` and delegate to ``cls.read_from_lines``.

    The file is parsed with the ``csv`` module using a tab delimiter and no quote handling,
    so quote characters are kept as ordinary data.
    """
    with open(filename, "r") as file_pointer:
        rows = list(csv.reader(file_pointer, delimiter="\t", quoting=csv.QUOTE_NONE))
        return cls.read_from_lines(rows, question_tokens)
Example #24
Source File: coca_reader.py From SLQA with Apache License 2.0 | 5 votes |
def text_to_instance(self,  # type: ignore
                     question_text_list: List[str],
                     passage_text: str,
                     start_span_list: List[List[int]] = None,
                     end_span_list: List[List[int]] = None,
                     passage_tokens: List[Token] = None,
                     yesno_list: List[str] = None,
                     additional_metadata: Dict[str, Any] = None) -> Instance:
    """
    Convert character-level answer spans to token spans and build the instance.

    Despite their ``None`` defaults, ``start_span_list``, ``end_span_list``, ``passage_tokens``
    and ``additional_metadata`` are all used unconditionally and must be provided.
    ``additional_metadata['answer_texts_list']`` is rewritten in place through
    ``util.handle_cannot``.
    """
    # pylint: disable=arguments-differ
    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]

    answer_token_span_list = []
    for start_list, end_list in zip(start_span_list, end_span_list):
        token_spans: List[Tuple[int, int]] = []
        for char_span in zip(start_list, end_list):
            token_span, error = util.char_span_to_token_span(passage_offsets, char_span)
            span_start, span_end = token_span
            if error:
                char_span_start, char_span_end = char_span
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))
        answer_token_span_list.append(token_spans)

    question_list_tokens = [self._tokenizer.tokenize(question) for question in question_text_list]

    # Map answer texts to "CANNOTANSWER" if more than half of them marked as so.
    additional_metadata['answer_texts_list'] = [
        util.handle_cannot(ans_list) for ans_list in additional_metadata['answer_texts_list']
    ]
    return self.make_reading_comprehension_instance_quac(question_list_tokens,
                                                         passage_tokens,
                                                         self._token_indexers,
                                                         passage_text,
                                                         answer_token_span_list,
                                                         yesno_list,
                                                         additional_metadata)
Example #25
Source File: coca_reader.py From SLQA with Apache License 2.0 | 5 votes |
def make_reading_comprehension_instance_quac(self,
                                             question_list_tokens: List[List[Token]],
                                             passage_tokens: List[Token],
                                             token_indexers: Dict[str, TokenIndexer],
                                             passage_text: str,
                                             token_span_lists: List[List[Tuple[int, int]]] = None,
                                             yesno_list: List[int] = None,
                                             additional_metadata: Dict[str, Any] = None) -> Instance:
    """
    Assemble an ``Instance`` for one passage and its list of questions.

    Always produces ``passage``, ``question`` and ``metadata`` fields.  When
    ``token_span_lists`` is non-empty, ``span_start``/``span_end`` (the shortest candidate span
    per question, first one on ties) and ``yesno_list`` fields are added as well; in that case
    ``yesno_list`` is iterated and therefore must be provided.
    """
    fields: Dict[str, Field] = {}

    # This is separate so we can reference it later with a known type.
    passage_field = TextField(passage_tokens, token_indexers)
    fields['passage'] = passage_field
    fields['question'] = ListField([TextField(q_tokens, token_indexers)
                                    for q_tokens in question_list_tokens])

    if token_span_lists:
        span_start_list: List[Field] = []
        span_end_list: List[Field] = []
        for candidate_spans in token_span_lists:
            span_start, span_end = min(candidate_spans, key=lambda span: span[1] - span[0])
            span_start_list.append(IndexField(span_start, passage_field))
            span_end_list.append(IndexField(span_end, passage_field))
        fields['span_start'] = ListField(span_start_list)
        fields['span_end'] = ListField(span_end_list)
        fields['yesno_list'] = ListField(
            [LabelField(yesno, label_namespace="yesno_labels") for yesno in yesno_list])

    metadata = {
        'original_passage': passage_text,
        'token_offsets': [(token.idx, token.idx + len(token.text)) for token in passage_tokens],
        'question_tokens': [[token.text for token in question_tokens]
                            for question_tokens in question_list_tokens],
        'passage_tokens': [token.text for token in passage_tokens],
    }
    # Caller-supplied entries win over the defaults computed above.
    metadata.update(additional_metadata or {})
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
Example #26
Source File: squad_reader.py From SLQA with Apache License 2.0 | 5 votes |
def text_to_instance(self,  # type: ignore
                     question_text_list: List[str],
                     passage_text: str,
                     start_span_list: List[List[int]] = None,
                     end_span_list: List[List[int]] = None,
                     passage_tokens: List[Token] = None,
                     yesno_list: List[str] = None,
                     additional_metadata: Dict[str, Any] = None) -> Instance:
    """
    Convert character-level answer spans to token spans and build the instance.

    Despite their ``None`` defaults, ``start_span_list``, ``end_span_list``, ``passage_tokens``
    and ``additional_metadata`` are all used unconditionally and must be provided.
    ``additional_metadata['answer_texts_list']`` is rewritten in place through
    ``util.handle_cannot``.
    """
    # pylint: disable=arguments-differ
    # We need to convert character indices in `passage_text` to token indices in
    # `passage_tokens`, as the latter is what we'll actually use for supervision.
    passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]

    answer_token_span_list = []
    for start_list, end_list in zip(start_span_list, end_span_list):
        token_spans: List[Tuple[int, int]] = []
        for char_span in zip(start_list, end_list):
            token_span, error = util.char_span_to_token_span(passage_offsets, char_span)
            span_start, span_end = token_span
            if error:
                char_span_start, char_span_end = char_span
                logger.debug("Passage: %s", passage_text)
                logger.debug("Passage tokens: %s", passage_tokens)
                logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                logger.debug("Token span: (%d, %d)", span_start, span_end)
                logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
            token_spans.append((span_start, span_end))
        answer_token_span_list.append(token_spans)

    question_list_tokens = [self._tokenizer.tokenize(question) for question in question_text_list]

    # Map answer texts to "CANNOTANSWER" if more than half of them marked as so.
    additional_metadata['answer_texts_list'] = [
        util.handle_cannot(ans_list) for ans_list in additional_metadata['answer_texts_list']
    ]
    return self.make_reading_comprehension_instance_quac(question_list_tokens,
                                                         passage_tokens,
                                                         self._token_indexers,
                                                         passage_text,
                                                         answer_token_span_list,
                                                         yesno_list,
                                                         additional_metadata)
Example #27
Source File: atis_tables.py From allennlp-semparse with Apache License 2.0 | 5 votes |
def get_costs_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    """
    Map each cost mentioned in the utterance to the token indices where it occurs.

    A cost is an all-digit token immediately followed by ``"dollars"`` or ``"dollar"``.
    ``utterance`` itself is not used.
    """
    texts = [token.text for token in tokenized_utterance]
    costs_linking_dict: Dict[str, List[int]] = defaultdict(list)

    # Walk over (token, following token) pairs; the last token has no follower and so can
    # never be a cost.
    for position, (text, following) in enumerate(zip(texts, texts[1:])):
        if following in ("dollars", "dollar") and text.isdigit():
            costs_linking_dict[text].append(position)
    return costs_linking_dict
Example #28
Source File: atis_world.py From allennlp-semparse with Apache License 2.0 | 5 votes |
def add_to_number_linking_scores(
    self,
    all_numbers: Set[str],
    number_linking_scores: Dict[str, Tuple[str, str, List[int]]],
    get_number_linking_dict: Callable[[str, List[Token]], Dict[str, List[int]]],
    current_tokenized_utterance: List[Token],
    nonterminal: str,
) -> None:
    """
    This is a helper method for adding different types of numbers (eg. starting time ranges)
    as entities.  We first go through all utterances in the interaction and find the numbers
    of a certain type and add them to the set ``all_numbers``, which is initialized with
    default values.  We want to add all numbers that occur in the interaction, and not just
    the current turn because the query could contain numbers that were triggered before the
    current turn.  For each entity, we then check if it is triggered by tokens in the current
    utterance and construct the linking score.

    Both ``all_numbers`` and ``number_linking_scores`` are updated in place.
    """
    number_linking_dict: Dict[str, List[int]] = {}
    for utterance, tokenized_utterance in zip(self.utterances, self.tokenized_utterances):
        number_linking_dict = get_number_linking_dict(utterance, tokenized_utterance)
        all_numbers.update(number_linking_dict.keys())

    utterance_length = len(current_tokenized_utterance)
    for number in sorted(all_numbers, reverse=True):
        entity_linking = [0] * utterance_length
        # ``number_linking_dict`` is for the last utterance here.  If the number was triggered
        # before the last utterance, then it will have linking scores of 0's.
        for token_index in number_linking_dict.get(number, []):
            if token_index >= utterance_length:
                continue
            entity_linking[token_index] = 1
        action = format_action(
            nonterminal, number, is_number=True, keywords_to_uppercase=KEYWORDS
        )
        number_linking_scores[action] = (nonterminal, number, entity_linking)
Example #29
Source File: drop_utils.py From MTMSN with Apache License 2.0 | 5 votes |
def find_valid_spans(passage_tokens: List[Token],
                     answer_texts: List[List[Token]]) -> List[Tuple[int, int]]:
    """
    Find every passage span that spells out one of the tokenized answers.

    Tokens are compared after lower-casing and stripping ``STRIPPED_CHARACTERS``.  While
    matching, passage tokens found in ``IGNORED_TOKENS`` may be skipped inside a span.

    Returns a list of ``(span_start, span_end)`` token indices with ``span_end`` inclusive.
    Answers with no tokens are skipped.
    """
    normalized_tokens = [token.text.lower().strip(STRIPPED_CHARACTERS) for token in passage_tokens]
    # Index of every position at which each normalized passage token occurs.
    word_positions: Dict[str, List[int]] = defaultdict(list)
    for i, token in enumerate(normalized_tokens):
        word_positions[token].append(i)

    spans = []
    for answer_text in answer_texts:
        answer_tokens = [token.text.lower().strip(STRIPPED_CHARACTERS) for token in answer_text]
        # An empty answer cannot match anything; indexing it below used to raise IndexError.
        if not answer_tokens or answer_tokens[0] not in word_positions:
            continue
        num_answer_tokens = len(answer_tokens)
        for span_start in word_positions[answer_tokens[0]]:
            span_end = span_start  # span_end is _inclusive_
            answer_index = 1
            while answer_index < num_answer_tokens and span_end + 1 < len(normalized_tokens):
                token = normalized_tokens[span_end + 1]
                # Answer tokens are already normalized above, so no second strip is needed.
                if answer_tokens[answer_index] == token:
                    answer_index += 1
                    span_end += 1
                elif token in IGNORED_TOKENS:
                    span_end += 1
                else:
                    break
            if num_answer_tokens == answer_index:
                spans.append((span_start, span_end))
    return spans
Example #30
Source File: knowledge_graph_field.py From allennlp-semparse with Apache License 2.0 | 5 votes |
def _lemma_match(
    self,
    entity: str,
    entity_text: List[Token],
    token: Token,
    token_index: int,
    tokens: List[Token],
) -> float:
    """
    Lemma-match feature for single-token entities.

    Delegates to ``self._contains_lemma_match`` when ``entity_text`` holds exactly one token;
    any other length scores ``0.0``.
    """
    if len(entity_text) == 1:
        return self._contains_lemma_match(entity, entity_text, token, token_index, tokens)
    return 0.0