Python allennlp.data.tokenizers.Token() Examples

The following are 30 code examples of allennlp.data.tokenizers.Token(), drawn from open-source projects. The source file and originating project are noted above each example. You may also want to check out all available functions and classes of the allennlp.data.tokenizers module.
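Before looking at the examples, it may help to see how a Token is typically constructed and consumed. The sketch below is a minimal illustration written for this page rather than code taken from any of the projects listed; it assumes a standard AllenNLP install where Token, TextField, and SingleIdTokenIndexer are importable from allennlp.data submodules (exact import paths can vary slightly between versions).

from allennlp.data.tokenizers import Token
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

# Wrap plain strings in Token objects, as most of the examples below do.
tokens = [Token(word) for word in "show me flights under 100 dollars".split()]

# Keyword arguments such as `text_id` attach an explicit vocabulary id,
# e.g. for a dedicated padding token (compare Example #3).
padding_token = Token(text="@@PADDING@@", text_id=0)

# TextField requires Token objects plus one or more token indexers.
text_field = TextField(tokens, {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)})
print([token.text for token in tokens])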
Example #1
Source File: knowledge_graph_field.py    From allennlp-semparse with Apache License 2.0
def _span_lemma_overlap_fraction(
        self,
        entity: str,
        entity_text: List[Token],
        token: Token,
        token_index: int,
        tokens: List[Token],
    ) -> float:
        entity_lemmas = set(entity_token.lemma_ for entity_token in entity_text)
        if not entity_lemmas:
            # Some tables have empty cells.
            return 0
        seen_entity_lemmas = set()
        token_index_left = token_index
        while token_index < len(tokens) and tokens[token_index].lemma_ in entity_lemmas:
            seen_entity_lemmas.add(tokens[token_index].lemma_)
            token_index += 1
        while token_index_left >= 0 and tokens[token_index_left].lemma_ in entity_lemmas:
            seen_entity_lemmas.add(tokens[token_index_left].lemma_)
            token_index_left -= 1
        return len(seen_entity_lemmas) / len(entity_lemmas) 
Example #2
Source File: sequence_tagging.py    From allennlp with Apache License 2.0
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:

            logger.info("Reading instances from lines in file at: %s", file_path)
            for line in data_file:
                line = line.strip("\n")

                # skip blank lines
                if not line:
                    continue

                tokens_and_tags = [
                    pair.rsplit(self._word_tag_delimiter, 1)
                    for pair in line.split(self._token_delimiter)
                ]
                tokens = [Token(token) for token, tag in tokens_and_tags]
                tags = [tag for token, tag in tokens_and_tags]
                yield self.text_to_instance(tokens, tags) 
Example #3
Source File: ir_labeled_tuple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 source_add_start_token: bool = True,
                 max_doc_length: int = -1,
                 max_query_length: int = -1,
                 min_doc_length: int = -1,
                 min_query_length: int = -1,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer() # little bit faster, useful for multicore proc. word_splitter=SimpleWordSplitter()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        self._source_add_start_token = source_add_start_token
        self.max_doc_length = max_doc_length
        self.max_query_length = max_query_length
        self.min_doc_length = min_doc_length
        self.min_query_length = min_query_length

        self.padding_value = Token(text="@@PADDING@@", text_id=0) 
Example #4
Source File: entailment_pair.py    From multee with Apache License 2.0
def text_to_instance(self, # pylint: disable=arguments-differ
                         premise: str,
                         hypothesis: str,
                         label: str = None) -> Instance:
        fields: Dict[str, Field] = {}
        premise_tokens = [Token(token.text)
                          for token in self._tokenizer.tokenize(premise)[-self._max_tokens:]]
        hypothesis_tokens = [Token(token.text)
                             for token in self._tokenizer.tokenize(hypothesis)[-self._max_tokens:]]

        fields['premise'] = TextField(premise_tokens, self._token_indexers)
        fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)

        if label:
            fields['label'] = LabelField(label)

        # metadata = {"premise_tokens": [x.text for x in premise_tokens],
        #             "hypothesis_tokens": [x.text for x in hypothesis_tokens]}
        # fields["metadata"] = MetadataField(metadata)
        return Instance(fields) 
Example #5
Source File: ebmnlp.py    From scibert with Apache License 2.0
def _read(self, file_path: str) -> Iterable[Instance]:
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)

            # Group into alternative divider / sentence chunks.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    fields = [list(field) for field in zip(*fields)]
                    tokens_, _, _, pico_tags = fields
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens_]

                    yield self.text_to_instance(tokens, pico_tags) 
Example #6
Source File: dataset_reader.py    From ConvLab with MIT License
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
        intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        # print([t.text for t in context_tokens])
        fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
        fields["tokens"] = TextField(tokens, self._token_indexers)
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
        if tags is not None:
            fields["tags"] = SequenceLabelField(tags, fields["tokens"])
        if intents is not None:
            fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
        if dialog_act is not None:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
            'dialog_act': dialog_act})
        else:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
        return Instance(fields) 
Example #7
Source File: dataset_reader.py    From ConvLab with MIT License
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
        intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        sequence = TextField(tokens, self._token_indexers)
        fields["tokens"] = sequence
        if tags:
            fields["tags"] = SequenceLabelField(tags, sequence)
        if domain:
            fields["domain"] = LabelField(domain, label_namespace="domain_labels")
        if intent:
            fields["intent"] = LabelField(intent, label_namespace="intent_labels")
        if dialog_act is not None:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
            'dialog_act': dialog_act})
        else:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
        return Instance(fields) 
Example #8
Source File: conll2003.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, u"r") as data_file:
            logger.info(u"Reading instances from lines in file at: %s", file_path)

            # Group into alternative divider / sentence chunks.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    tokens, pos_tags, chunk_tags, ner_tags = [list(field) for field in izip(*fields)]
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens]

                    yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags) 
Example #9
Source File: sequence_tagging.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, u"r") as data_file:

            logger.info(u"Reading instances from lines in file at: %s", file_path)
            for line in data_file:
                line = line.strip(u"\n")

                # skip blank lines
                if not line:
                    continue

                tokens_and_tags = [pair.rsplit(self._word_tag_delimiter, 1)
                                   for pair in line.split(self._token_delimiter)]
                tokens = [Token(token) for token, tag in tokens_and_tags]
                tags = [tag for token, tag in tokens_and_tags]
                yield self.text_to_instance(tokens, tags) 
Example #10
Source File: knowledge_graph_field.py    From allennlp-semparse with Apache License 2.0
def _related_column_lemma(
        self,
        entity: str,
        entity_text: List[Token],
        token: Token,
        token_index: int,
        tokens: List[Token],
    ) -> float:
        # Check if the entity is a column name in one of the two WikiTables languages.
        if not entity.startswith("fb:row.row") and "_column:" not in entity:
            return 0.0
        for neighbor in self.knowledge_graph.neighbors[entity]:
            if token.text in self._entity_text_exact_text[neighbor]:
                return 1.0
            if token.lemma_ in self._entity_text_lemmas[neighbor]:
                return 1.0
        return 0.0 
Example #11
Source File: semantic_role_labeling.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info(u"Reading SRL instances from dataset files at: %s", file_path)
        if self._domain_identifier is not None:
            logger.info(u"Filtering to only include file paths containing the %s domain", self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = [u"O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [1 if label[-2:] == u"-V" else 0 for label in tags]
                    yield self.text_to_instance(tokens, verb_indicator, tags) 
Example #12
Source File: knowledge_graph_field.py    From allennlp-semparse with Apache License 2.0
def _number_token_match(
        self,
        entity: str,
        entity_text: List[Token],
        token: Token,
        token_index: int,
        tokens: List[Token],
    ) -> float:
        # PNP had a "spanFeatures" function that said whether an entity was a-priori known to link
        # to a token or set of tokens in the question.  This was only used for numbers, and it's
        # not totally clear to me how this number feature overlapped with the token match features
        # in the original implementation (I think in most cases it was the same, except for things
        # like "four million", because the token match is derived from the entity name, which would
        # be 4000000, and wouldn't match "four million").
        #
        # Our implementation basically just adds a duplicate token match feature that's specific to
        # numbers.  It'll break in some rare cases (e.g., "Which four had four million ..."), but
        # those shouldn't be a big deal.
        if ":" in entity:
            # This check works because numbers are the only entities that don't contain ":". All
            # others in both WikiTables languages do (e.g.: fb:row.row.column_name,
            # date_column:year, string:usl_a_league etc.).
            return 0.0
        return self._contains_exact_token_match(entity, entity_text, token, token_index, tokens) 
Example #13
Source File: template_text2sql.py    From allennlp-semparse with Apache License 2.0
def text_to_instance(
        self,  # type: ignore
        query: List[str],
        slot_tags: List[str] = None,
        sql_template: str = None,
    ) -> Instance:
        fields: Dict[str, Field] = {}
        tokens = TextField([Token(t) for t in query], self._token_indexers)
        fields["tokens"] = tokens

        if slot_tags is not None and sql_template is not None:
            slot_field = SequenceLabelField(slot_tags, tokens, label_namespace="slot_tags")
            template = LabelField(sql_template, label_namespace="template_labels")
            fields["slot_tags"] = slot_field
            fields["template"] = template

        return Instance(fields) 
Example #14
Source File: wikitables_world_test.py    From magnitude with MIT License
def test_world_adds_numbers_from_question(self):
        question_tokens = [Token(x) for x in [u'what', u'2007', u'2,107', u'0.2', u'1800s', u'1950s', u'?']]
        table_kg = TableQuestionKnowledgeGraph.read_from_file(
                self.FIXTURES_ROOT / u"data" / u"wikitables" / u"sample_table.tsv", question_tokens)
        world = WikiTablesWorld(table_kg)
        valid_actions = world.get_valid_actions()
        assert u'n -> 2007' in valid_actions[u'n']
        assert u'n -> 2107' in valid_actions[u'n']

        # It appears that sempre normalizes floating point numbers.
        assert u'n -> 0.200' in valid_actions[u'n']

        # We want to add the end-points to things like "1800s": 1800 and 1900.
        assert u'n -> 1800' in valid_actions[u'n']
        assert u'n -> 1900' in valid_actions[u'n']
        assert u'n -> 1950' in valid_actions[u'n']
        assert u'n -> 1960' in valid_actions[u'n'] 
Example #15
Source File: atis_tables.py    From allennlp-semparse with Apache License 2.0
def get_flight_numbers_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    indices_words_preceding_flight_number = {
        index
        for index, token in enumerate(tokenized_utterance)
        if token.text in {"flight", "number"}
        or token.text.upper() in AIRLINE_CODE_LIST
        or token.text.lower() in AIRLINE_CODES.keys()
    }

    indices_words_succeeding_flight_number = {
        index for index, token in enumerate(tokenized_utterance) if token.text == "flight"
    }

    flight_numbers_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        if token.text.isdigit():
            if token_index - 1 in indices_words_preceding_flight_number:
                flight_numbers_linking_dict[token.text].append(token_index)
            if token_index + 1 in indices_words_succeeding_flight_number:
                flight_numbers_linking_dict[token.text].append(token_index)
    return flight_numbers_linking_dict 
Example #16
Source File: atis_tables.py    From allennlp-semparse with Apache License 2.0
def get_time_range_end_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    early_indices = {
        index for index, token in enumerate(tokenized_utterance) if token.text == "early"
    }

    time_range_end_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_END_DICT.get(token.text, []):
            if token_index - 1 not in early_indices:
                time_range_end_linking_dict[str(time)].append(token_index)

    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_END_DICT.get(" ".join(bigram), []):
            time_range_end_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])

    return time_range_end_linking_dict 
Example #17
Source File: atis_world.py    From allennlp-semparse with Apache License 2.0
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, token in enumerate(tokenized_utterance):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)

    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(" ".join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index, index + 1])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == "st":
            natural_language_key = f"st. {trigram[2]}".lower()
        else:
            natural_language_key = " ".join(trigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index, index + 1, index + 2])
    return string_linking_scores 
Example #18
Source File: dataset_reader.py    From nanigonet with MIT License
def text_to_instance(self, tokens: List[Token], tags: List[str]=None) -> Instance:

        if len(tokens) > self._max_token_len:
            tokens = tokens[:self._max_token_len]
            print(f'Length of tokens exceeded the limit {self._max_token_len}. Truncating...')
            if tags:
                tags = tags[:self._max_token_len]

        fields = {}

        text_field = TextField(tokens, self._token_indexers)
        fields['tokens'] = text_field
        if tags:
            fields['tags'] = SequenceLabelField(tags, text_field)

        return Instance(fields) 
Example #19
Source File: drop_utils.py    From MTMSN with Apache License 2.0
def split_tokens_by_hyphen(tokens: List[Token]) -> List[Token]:
    hyphens = ["-", "–", "~"]
    new_tokens: List[Token] = []

    for token in tokens:
        if any(hyphen in token.text for hyphen in hyphens):
            unsplit_tokens = [token]
            split_tokens: List[Token] = []
            for hyphen in hyphens:
                for unsplit_token in unsplit_tokens:
                    if hyphen in token.text:
                        split_tokens += split_token_by_delimiter(unsplit_token, hyphen)
                    else:
                        split_tokens.append(unsplit_token)
                unsplit_tokens, split_tokens = split_tokens, []
            new_tokens += unsplit_tokens
        else:
            new_tokens.append(token)

    return new_tokens 
Example #20
Source File: single_correct_mcq_entailment.py    From multee with Apache License 2.0
def text_to_instance(self, # pylint: disable=arguments-differ
                         premises: List[str],
                         hypotheses: List[str],
                         answer_index: int = None,
                         relevant_sentence_idxs: List[int] = None) -> Instance:
        fields = {}
        premises_tokens = [self._tokenizer.tokenize(premise)[-self._premise_max_tokens:]
                           for premise in premises]
        hypotheses_tokens = [self._tokenizer.tokenize(hypothesis)[-self._hypothesis_max_tokens:]
                             for hypothesis in hypotheses]
        if premises:
            premises_text_fields = [TextField(premise_tokens, self._token_indexers)
                                    for premise_tokens in premises_tokens]
            premises_field = ListField(premises_text_fields)
        else:
            empty_stub = ListField([TextField([Token('dummy')], self._token_indexers)])
            premises_field = empty_stub.empty_field()
        fields['premises'] = premises_field

        hypotheses_text_fields = [TextField(hypothesis_tokens, self._token_indexers)
                                for hypothesis_tokens in hypotheses_tokens]
        hypotheses_field = ListField(hypotheses_text_fields)
        fields['hypotheses'] = hypotheses_field

        # If sentence relevance is available
        if relevant_sentence_idxs is not None:
            relevance_presence_mask = np.zeros(len(premises))
            for idx in relevant_sentence_idxs:
                relevance_presence_mask[idx] = 1
            fields['relevance_presence_mask'] = ArrayField(np.array(relevance_presence_mask))

        # If entailment labels are available
        if answer_index is not None:
            if answer_index not in range(0, len(hypotheses)):
                raise ConfigurationError("Provided label must be in 0 to {}".format(len(hypotheses)))
            fields['answer_index'] = ArrayField(np.array(answer_index), padding_value=-1, dtype=np.long)

        paragraph_tokens = [token for premise_tokens in premises_tokens for token in premise_tokens]
        paragraph_text_field = TextField(paragraph_tokens, self._token_indexers)
        fields['paragraph'] = paragraph_text_field
        return Instance(fields) 
Example #21
Source File: wikitables_language_test.py    From allennlp-semparse with Apache License 2.0
def _get_world_with_question_tokens(self, tokens: List[Token]) -> WikiTablesLanguage:
        table_context = TableQuestionContext.read_from_file(self.table_file, tokens)
        world = WikiTablesLanguage(table_context)
        return world 
Example #22
Source File: clean_coqa_reader.py    From SLQA with Apache License 2.0
def make_reading_comprehension_instance_quac(self,
                                                 question_list_tokens: List[List[Token]],
                                                 passage_tokens: List[Token],
                                                 token_indexers: Dict[str, TokenIndexer],
                                                 passage_text: str,
                                                 token_span_lists: List[List[Tuple[int, int]]] = None,
                                                 yesno_list: List[int] = None,
                                                 additional_metadata: Dict[str, Any] = None) -> Instance:
        additional_metadata = additional_metadata or {}
        fields: Dict[str, Field] = {}
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        # This is separate so we can reference it later with a known type.
        passage_field = TextField(passage_tokens, token_indexers)
        fields['passage'] = passage_field
        fields['question'] = ListField([TextField(q_tokens, token_indexers) for q_tokens in question_list_tokens])
        metadata = {'original_passage': passage_text,
                    'token_offsets': passage_offsets,
                    'question_tokens': [[token.text for token in question_tokens] \
                                        for question_tokens in question_list_tokens],
                    'passage_tokens': [token.text for token in passage_tokens], }
        if token_span_lists:
            span_start_list: List[Field] = []
            span_end_list: List[Field] = []
            for question_index, answer_span_lists in enumerate(token_span_lists):
                span_start, span_end = min(answer_span_lists, key=lambda x: x[1] - x[0])
                span_start_list.append(IndexField(span_start, passage_field))
                span_end_list.append(IndexField(span_end, passage_field))

            fields['span_start'] = ListField(span_start_list)
            fields['span_end'] = ListField(span_end_list)
            fields['yesno_list'] = ListField(
                [LabelField(yesno, label_namespace="yesno_labels") for yesno in yesno_list])
        metadata.update(additional_metadata)
        fields['metadata'] = MetadataField(metadata)
        return Instance(fields) 
Example #23
Source File: table_question_context.py    From allennlp-semparse with Apache License 2.0
def read_from_file(cls, filename: str, question_tokens: List[Token]) -> "TableQuestionContext":
        with open(filename, "r") as file_pointer:
            reader = csv.reader(file_pointer, delimiter="\t", quoting=csv.QUOTE_NONE)
            lines = [line for line in reader]
            return cls.read_from_lines(lines, question_tokens) 
Example #24
Source File: coca_reader.py    From SLQA with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         question_text_list: List[str],
                         passage_text: str,
                         start_span_list: List[List[int]] = None,
                         end_span_list: List[List[int]] = None,
                         passage_tokens: List[Token] = None,
                         yesno_list: List[str] = None,
                         additional_metadata: Dict[str, Any] = None) -> Instance:
        # pylint: disable=arguments-differ
        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        answer_token_span_list = []
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        for start_list, end_list in zip(start_span_list, end_span_list):
            token_spans: List[Tuple[int, int]] = []
            for char_span_start, char_span_end in zip(start_list, end_list):
                (span_start, span_end), error = util.char_span_to_token_span(passage_offsets,
                                                                             (char_span_start, char_span_end))
                if error:
                    logger.debug("Passage: %s", passage_text)
                    logger.debug("Passage tokens: %s", passage_tokens)
                    logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                    logger.debug("Token span: (%d, %d)", span_start, span_end)
                    logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                    logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
                token_spans.append((span_start, span_end))
            answer_token_span_list.append(token_spans)
        question_list_tokens = [self._tokenizer.tokenize(q) for q in question_text_list]
        # Map answer texts to "CANNOTANSWER" if more than half of them marked as so.
        additional_metadata['answer_texts_list'] = [util.handle_cannot(ans_list) for ans_list \
                                                    in additional_metadata['answer_texts_list']]
        return self.make_reading_comprehension_instance_quac(question_list_tokens,
                                                             passage_tokens,
                                                             self._token_indexers,
                                                             passage_text,
                                                             answer_token_span_list,
                                                             yesno_list,
                                                             additional_metadata) 
Example #25
Source File: coca_reader.py    From SLQA with Apache License 2.0
def make_reading_comprehension_instance_quac(self,
                                                 question_list_tokens: List[List[Token]],
                                                 passage_tokens: List[Token],
                                                 token_indexers: Dict[str, TokenIndexer],
                                                 passage_text: str,
                                                 token_span_lists: List[List[Tuple[int, int]]] = None,
                                                 yesno_list: List[int] = None,
                                                 additional_metadata: Dict[str, Any] = None) -> Instance:
        additional_metadata = additional_metadata or {}
        fields: Dict[str, Field] = {}
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        # This is separate so we can reference it later with a known type.
        passage_field = TextField(passage_tokens, token_indexers)
        fields['passage'] = passage_field
        fields['question'] = ListField([TextField(q_tokens, token_indexers) for q_tokens in question_list_tokens])
        metadata = {'original_passage': passage_text,
                    'token_offsets': passage_offsets,
                    'question_tokens': [[token.text for token in question_tokens] \
                                        for question_tokens in question_list_tokens],
                    'passage_tokens': [token.text for token in passage_tokens], }
        if token_span_lists:
            span_start_list: List[Field] = []
            span_end_list: List[Field] = []
            for question_index, answer_span_lists in enumerate(token_span_lists):
                span_start, span_end = min(answer_span_lists, key=lambda x: x[1] - x[0])
                span_start_list.append(IndexField(span_start, passage_field))
                span_end_list.append(IndexField(span_end, passage_field))

            fields['span_start'] = ListField(span_start_list)
            fields['span_end'] = ListField(span_end_list)
            fields['yesno_list'] = ListField(
                [LabelField(yesno, label_namespace="yesno_labels") for yesno in yesno_list])
        metadata.update(additional_metadata)
        fields['metadata'] = MetadataField(metadata)
        return Instance(fields) 
Example #26
Source File: squad_reader.py    From SLQA with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         question_text_list: List[str],
                         passage_text: str,
                         start_span_list: List[List[int]] = None,
                         end_span_list: List[List[int]] = None,
                         passage_tokens: List[Token] = None,
                         yesno_list: List[str] = None,
                         additional_metadata: Dict[str, Any] = None) -> Instance:
        # pylint: disable=arguments-differ
        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        answer_token_span_list = []
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
        for start_list, end_list in zip(start_span_list, end_span_list):
            token_spans: List[Tuple[int, int]] = []
            for char_span_start, char_span_end in zip(start_list, end_list):
                (span_start, span_end), error = util.char_span_to_token_span(passage_offsets,
                                                                             (char_span_start, char_span_end))
                if error:
                    logger.debug("Passage: %s", passage_text)
                    logger.debug("Passage tokens: %s", passage_tokens)
                    logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                    logger.debug("Token span: (%d, %d)", span_start, span_end)
                    logger.debug("Tokens in answer: %s", passage_tokens[span_start:span_end + 1])
                    logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
                token_spans.append((span_start, span_end))
            answer_token_span_list.append(token_spans)
        question_list_tokens = [self._tokenizer.tokenize(q) for q in question_text_list]
        # Map answer texts to "CANNOTANSWER" if more than half of them marked as so.
        additional_metadata['answer_texts_list'] = [util.handle_cannot(ans_list) for ans_list \
                                                    in additional_metadata['answer_texts_list']]
        return self.make_reading_comprehension_instance_quac(question_list_tokens,
                                                             passage_tokens,
                                                             self._token_indexers,
                                                             passage_text,
                                                             answer_token_span_list,
                                                             yesno_list,
                                                             additional_metadata) 
Example #27
Source File: atis_tables.py    From allennlp-semparse with Apache License 2.0
def get_costs_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    dollars_indices = {
        index
        for index, token in enumerate(tokenized_utterance)
        if token.text == "dollars" or token.text == "dollar"
    }

    costs_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        if token_index + 1 in dollars_indices and token.text.isdigit():
            costs_linking_dict[token.text].append(token_index)
    return costs_linking_dict 
Example #28
Source File: atis_world.py    From allennlp-semparse with Apache License 2.0
def add_to_number_linking_scores(
        self,
        all_numbers: Set[str],
        number_linking_scores: Dict[str, Tuple[str, str, List[int]]],
        get_number_linking_dict: Callable[[str, List[Token]], Dict[str, List[int]]],
        current_tokenized_utterance: List[Token],
        nonterminal: str,
    ) -> None:
        """
        This is a helper method for adding different types of numbers (eg. starting time ranges) as entities.
        We first go through all utterances in the interaction and find the numbers of a certain type and add
        them to the set ``all_numbers``, which is initialized with default values. We want to add all numbers
        that occur in the interaction, and not just the current turn because the query could contain numbers
        that were triggered before the current turn. For each entity, we then check if it is triggered by tokens
        in the current utterance and construct the linking score.
        """
        number_linking_dict: Dict[str, List[int]] = {}
        for utterance, tokenized_utterance in zip(self.utterances, self.tokenized_utterances):
            number_linking_dict = get_number_linking_dict(utterance, tokenized_utterance)
            all_numbers.update(number_linking_dict.keys())
        all_numbers_list: List[str] = sorted(all_numbers, reverse=True)
        for number in all_numbers_list:
            entity_linking = [0 for token in current_tokenized_utterance]
            # ``number_linking_dict`` is for the last utterance here. If the number was triggered
            # before the last utterance, then it will have linking scores of 0's.
            for token_index in number_linking_dict.get(number, []):
                if token_index < len(entity_linking):
                    entity_linking[token_index] = 1
            action = format_action(
                nonterminal, number, is_number=True, keywords_to_uppercase=KEYWORDS
            )
            number_linking_scores[action] = (nonterminal, number, entity_linking) 
Example #29
Source File: drop_utils.py    From MTMSN with Apache License 2.0
def find_valid_spans(passage_tokens: List[Token],
                         answer_texts: List[List[Token]]) -> List[Tuple[int, int]]:
        normalized_tokens = [token.text.lower().strip(STRIPPED_CHARACTERS) for token in passage_tokens]
        word_positions: Dict[str, List[int]] = defaultdict(list)
        for i, token in enumerate(normalized_tokens):
            word_positions[token].append(i)
        spans = []
        for answer_text in answer_texts:
            answer_tokens = [token.text.lower().strip(STRIPPED_CHARACTERS) for token in answer_text]
            num_answer_tokens = len(answer_tokens)
            if answer_tokens[0] not in word_positions:
                continue
            for span_start in word_positions[answer_tokens[0]]:
                span_end = span_start  # span_end is _inclusive_
                answer_index = 1
                while answer_index < num_answer_tokens and span_end + 1 < len(normalized_tokens):
                    token = normalized_tokens[span_end + 1]
                    if answer_tokens[answer_index].strip(STRIPPED_CHARACTERS) == token:
                        answer_index += 1
                        span_end += 1
                    elif token in IGNORED_TOKENS:
                        span_end += 1
                    else:
                        break
                if num_answer_tokens == answer_index:
                    spans.append((span_start, span_end))
        return spans 
Example #30
Source File: knowledge_graph_field.py    From allennlp-semparse with Apache License 2.0
def _lemma_match(
        self,
        entity: str,
        entity_text: List[Token],
        token: Token,
        token_index: int,
        tokens: List[Token],
    ) -> float:
        if len(entity_text) != 1:
            return 0.0
        return self._contains_lemma_match(entity, entity_text, token, token_index, tokens)