Python allennlp.common.file_utils.cached_path() Examples

The following are 30 code examples of allennlp.common.file_utils.cached_path(). You can go to the original project or source file by following the links above each example, or check out all other available functions and classes of the allennlp.common.file_utils module.
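cached_path() accepts either a local filesystem path or a URL. A local path is returned unchanged, while a remote file is downloaded once into the AllenNLP cache and the path to the cached copy is returned, which is why the dataset readers below call it before open(). A minimal sketch of that pattern (the URL is only a placeholder for illustration):

from allennlp.common.file_utils import cached_path

# A URL is downloaded to the local cache on first use and the cached
# path is returned; a plain local path is passed through unchanged.
# The URL below is only a placeholder.
local_path = cached_path("https://example.com/data/train.jsonl")
with open(local_path, "r") as data_file:
    for line in data_file:
        ...  # parse each line, as in the readers below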
Example #1
Source File: entailment_pair.py    From multee with Apache License 2.0
def _read(self, file_path: str):
        file_path = cached_path(file_path)
        with open(file_path, 'r') as entailment_file:
            logger.info("Reading entailment instances from jsonl dataset at: %s", file_path)
            for line in entailment_file:
                if line.strip():
                    instance_json = json.loads(line.strip())
                    premise = instance_json.get("sentence1", None) or instance_json.get("premise", None)
                    hypothesis = instance_json.get("sentence2", None) or instance_json.get("hypothesis", None)
                    label = instance_json.get("gold_label", None) or instance_json.get("label", None) # entails or neutral
                    if label == '-':
                        # These were cases where the annotators disagreed; we'll just skip them.
                        # It's like 800 out of 500k examples in the training data.
                        continue
                    if label in ["entails", "entailment"]:
                        label = "entailment"
                    yield self.text_to_instance(premise, hypothesis, label) 
Example #2
Source File: ebmnlp.py    From scibert with Apache License 2.0
def _read(self, file_path: str) -> Iterable[Instance]:
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)

            # Group into alternative divider / sentence chunks.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    fields = [list(field) for field in zip(*fields)]
                    tokens_, _, _, pico_tags = fields
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens_]

                    yield self.text_to_instance(tokens, pico_tags) 
Example #3
Source File: elmo.py    From allennlp with Apache License 2.0
def __init__(self, options_file: str, weight_file: str, requires_grad: bool = False) -> None:
        super().__init__()

        with open(cached_path(options_file), "r") as fin:
            self._options = json.load(fin)
        self._weight_file = weight_file

        self.output_dim = self._options["lstm"]["projection_dim"]
        self.requires_grad = requires_grad

        self._load_weights()

        # Cache the arrays for use in forward -- +1 due to masking.
        self._beginning_of_sentence_characters = torch.from_numpy(
            numpy.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1
        )
        self._end_of_sentence_characters = torch.from_numpy(
            numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1
        ) 
Example #4
Source File: text_classification_json.py    From allennlp with Apache License 2.0
def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            for line in data_file.readlines():
                if not line:
                    continue
                items = json.loads(line)
                text = items["text"]
                label = items.get("label")
                if label is not None:
                    if self._skip_label_indexing:
                        try:
                            label = int(label)
                        except ValueError:
                            raise ValueError(
                                "Labels must be integers if skip_label_indexing is True."
                            )
                    else:
                        label = str(label)
                instance = self.text_to_instance(text=text, label=label)
                if instance is not None:
                    yield instance 
Example #5
Source File: entailment_tuple_reader.py    From scitail with Apache License 2.0
def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        instances = []
        with open(file_path, 'r') as entailment_file:
            logger.info("Reading entailment instances from TSV dataset at: %s", file_path)
            for line in tqdm.tqdm(entailment_file):
                fields = line.split("\t")
                if len(fields) != 4:
                    raise ValueError("Expected four fields: "
                                     "premise   hypothesis  label   hypothesis_structure. "
                                     "Found {} fields in {}".format(len(fields), line))
                premise, hypothesis, label, hypothesis_structure = fields
                instances.append(self.text_to_instance(premise, hypothesis, hypothesis_structure,
                                                       label))
        if not instances:
            raise ConfigurationError("No instances were read from the given filepath {}. "
                                     "Is the path correct?".format(file_path))
        return Dataset(instances) 
Example #6
Source File: elmo.py    From magnitude with MIT License
def __init__(self,
                 options_file,
                 weight_file,
                 requires_grad=False):
        super(_ElmoCharacterEncoder, self).__init__()

        with open(cached_path(options_file), u'r') as fin:
            self._options = json.load(fin)
        self._weight_file = weight_file

        self.output_dim = self._options[u'lstm'][u'projection_dim']
        self.requires_grad = requires_grad

        self._load_weights()

        # Cache the arrays for use in forward -- +1 due to masking.
        self._beginning_of_sentence_characters = torch.from_numpy(
                numpy.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1
        )
        self._end_of_sentence_characters = torch.from_numpy(
                numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1
        ) 
Example #7
Source File: sequence_tagging.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, u"r") as data_file:

            logger.info(u"Reading instances from lines in file at: %s", file_path)
            for line in data_file:
                line = line.strip(u"\n")

                # skip blank lines
                if not line:
                    continue

                tokens_and_tags = [pair.rsplit(self._word_tag_delimiter, 1)
                                   for pair in line.split(self._token_delimiter)]
                tokens = [Token(token) for token, tag in tokens_and_tags]
                tags = [tag for token, tag in tokens_and_tags]
                yield self.text_to_instance(tokens, tags) 
Example #8
Source File: conll2003.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, u"r") as data_file:
            logger.info(u"Reading instances from lines in file at: %s", file_path)

            # Group into alternative divider / sentence chunks.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    tokens, pos_tags, chunk_tags, ner_tags = [list(field) for field in izip(*fields)]
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens]

                    yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags) 
Example #9
Source File: penn_tree_bank.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        directory, filename = os.path.split(file_path)
        logger.info(u"Reading instances from lines in file at: %s", file_path)
        for parse in BracketParseCorpusReader(root=directory, fileids=[filename]).parsed_sents():

            self._strip_functional_tags(parse)
            # This is un-needed and clutters the label space.
            # All the trees also contain a root S node.
            if parse.label() == u"VROOT":
                parse = parse[0]
            pos_tags = [x[1] for x in parse.pos()] if self._use_pos_tags else None
            yield self.text_to_instance(parse.leaves(), pos_tags, parse)

Example #10
Source File: atis.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path) as atis_file:
            logger.info(u"Reading ATIS instances from dataset at : %s", file_path)
            for line in _lazy_parse(atis_file.read()):
                utterances = []
                for current_interaction in line[u'interaction']:
                    if not current_interaction[u'utterance']:
                        continue
                    utterances.append(current_interaction[u'utterance'])
                    instance = self.text_to_instance(utterances, current_interaction[u'sql'])
                    if not instance:
                        continue
                    yield instance

Example #11
Source File: semantic_role_labeling.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info(u"Reading SRL instances from dataset files at: %s", file_path)
        if self._domain_identifier is not None:
            logger.info(u"Filtering to only include file paths containing the %s domain", self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = [u"O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [1 if label[-2:] == u"-V" else 0 for label in tags]
                    yield self.text_to_instance(tokens, verb_indicator, tags) 
Example #12
Source File: conll.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens,
                                              end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            yield self.text_to_instance([s.words for s in sentences], canonical_clusters)

Example #13
Source File: language_modeling.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, u"r") as text_file:
            instance_strings = text_file.readlines()

        if self._tokens_per_instance is not None:
            all_text = u" ".join([x.replace(u"\n", u" ").strip() for x in instance_strings])
            tokenized_text = self._tokenizer.tokenize(all_text)
            num_tokens = self._tokens_per_instance + 1
            tokenized_strings = []
            logger.info(u"Creating dataset from all text in file: %s", file_path)
            for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
                tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
        else:
            tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1], self._token_indexers)
            output_field = TextField(tokenized_string[1:], self._output_indexer)
            yield Instance({u'input_tokens': input_field,
                            u'output_tokens': output_field})

Example #14
Source File: atis_sql_table_context.py    From allennlp-semparse with Apache License 2.0
def __init__(
        self,
        all_tables: Dict[str, List[str]] = None,
        tables_with_strings: Dict[str, List[str]] = None,
        database_file: str = None,
    ) -> None:
        self.all_tables = all_tables
        self.tables_with_strings = tables_with_strings
        if database_file:
            self.database_file = cached_path(database_file)
            self.connection = sqlite3.connect(self.database_file)
            self.cursor = self.connection.cursor()

        grammar_dictionary, strings_list = self.create_grammar_dict_and_strings()
        self.grammar_dictionary: Dict[str, List[str]] = grammar_dictionary
        self.strings_list: List[Tuple[str, str]] = strings_list

        self.grammar_string: str = self.get_grammar_string()
        self.grammar: Grammar = Grammar(self.grammar_string)
        self.valid_actions: Dict[str, List[str]] = initialize_valid_actions(self.grammar, KEYWORDS)
        if database_file:
            self.connection.close() 
Example #15
Source File: atis.py    From allennlp-semparse with Apache License 2.0
def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path) as atis_file:
            logger.info("Reading ATIS instances from dataset at : %s", file_path)
            for line in _lazy_parse(atis_file.read()):
                utterances = []
                for current_interaction in line["interaction"]:
                    if not current_interaction["utterance"] or not current_interaction["sql"]:
                        continue
                    utterances.append(current_interaction["utterance"])
                    sql_query_labels = [
                        query for query in current_interaction["sql"].split("\n") if query
                    ]
                    instance = self.text_to_instance(deepcopy(utterances), sql_query_labels)
                    if not instance:
                        continue
                    yield instance 
Example #16
Source File: stanford_sentiment_tree_bank.py    From magnitude with MIT License
def _read(self, file_path):
        with open(cached_path(file_path), u"r") as data_file:
            logger.info(u"Reading instances from lines in file at: %s", file_path)
            for line in data_file.readlines():
                line = line.strip(u"\n")
                if not line:
                    continue
                parsed_line = Tree.fromstring(line)
                if self._use_subtrees:
                    for subtree in parsed_line.subtrees():
                        instance = self.text_to_instance(subtree.leaves(), subtree.label())
                        if instance is not None:
                            yield instance
                else:
                    instance = self.text_to_instance(parsed_line.leaves(), parsed_line.label())
                    if instance is not None:
                        yield instance

Example #17
Source File: multiple_correct_mcq_entailment.py    From multee with Apache License 2.0
def _read(self, file_path: str):
        file_path = cached_path(file_path)
        with open(file_path, 'r') as entailment_file:
            logger.info("Reading entailment instances from jsonl dataset at: %s", file_path)
            for line in entailment_file:
                if line.strip():
                    instances_json = json.loads(line.strip())
                    premises = instances_json["premises"]
                    hypotheses = instances_json["hypotheses"]
                    entailments = instances_json.get("entailments", None)
                    if entailments is None:
                        answer_indices = None
                    else:
                        answer_indices = [index for index, entailment in enumerate(entailments) if entailment]
                    relevant_sentence_idxs = instances_json.get("relevant_sentence_idxs", None)
                    yield self.text_to_instance(premises,
                                                hypotheses,
                                                answer_indices,
                                                relevant_sentence_idxs) 
Example #18
Source File: semeval_2010_task_8_reader.py    From DISTRE with Apache License 2.0
def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, 'r') as semeval_file:
            logger.info("Reading SemEval 2010 Task 8 instances from jsonl dataset at: %s", file_path)
            for line in semeval_file:
                example = json.loads(line)

                tokens = example["tokens"]
                label = example["label"]
                entity_indices = example["entities"]
                
                start_e1, end_e1 = entity_indices[0]
                start_e2, end_e2 = entity_indices[1]
                entity_1 = (start_e1, end_e1 - 1)
                entity_2 = (start_e2, end_e2 - 1)

                yield self.text_to_instance(tokens, entity_1, entity_2, label) 
Example #19
Source File: prolocal_dataset_reader.py    From propara with Apache License 2.0
def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []
        with open(file_path, 'r') as state_change_file:
            logger.info("Reading state change instances from TSV dataset at: %s", file_path)
            for line in tqdm.tqdm(state_change_file):
                parts: List[str] = line.split()
                # parse input
                sentence_tokens = parts[0].split("####")
                verb_span = parts[1].split(",")
                verb_vector = [int(i) for i in verb_span]
                entity_span = parts[2].split(",")
                entity_vector = [int(i) for i in entity_span]

                # parse labels
                state_change_types = parts[3]
                state_change_tags = parts[4].split(",")

                # create instance
                yield self.text_to_instance(sentence_tokens=sentence_tokens,
                                            verb_vector=verb_vector,
                                            entity_vector=entity_vector,
                                            state_change_types=state_change_types,
                                            state_change_tags=state_change_tags) 
Example #20
Source File: datareader.py    From NLP_Toolkit with Apache License 2.0
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            for line in data_file:
                line = line.strip("\n")
                # skip blank and broken lines
                if not line or (not self._test_mode and self._broken_dot_strategy == 'skip'
                                and self.BROKEN_SENTENCES_REGEXP.search(line) is not None):
                    continue

                tokens_and_tags = [pair.rsplit(self._delimeters['labels'], 1)
                                   for pair in line.split(self._delimeters['tokens'])]
                try:
                    tokens = [Token(token) for token, tag in tokens_and_tags]
                    tags = [tag for token, tag in tokens_and_tags]
                except ValueError:
                    tokens = [Token(token[0]) for token in tokens_and_tags]
                    tags = None

                if tokens and tokens[0] != Token(START_TOKEN):
                    tokens = [Token(START_TOKEN)] + tokens

                words = [x.text for x in tokens]
                if self._max_len is not None:
                    tokens = tokens[:self._max_len]
                    tags = None if tags is None else tags[:self._max_len]
                instance = self.text_to_instance(tokens, tags, words)
                if instance:
                    yield instance 
Example #21
Source File: embedding.py    From allennlp with Apache License 2.0
def _open_inside_tar(self, archive_path: str, member_path: Optional[str] = None) -> None:
        cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
        archive = tarfile.open(cached_archive_path, "r")
        if member_path is None:
            members_list = archive.getnames()
            member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
        member_path = cast(str, member_path)
        member = archive.getmember(member_path)  # raises exception if not present
        member_file = cast(BinaryIO, archive.extractfile(member))
        self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
        self._archive_handle = archive 
Example #22
Source File: predict.py    From allennlp with Apache License 2.0
def _get_json_data(self) -> Iterator[JsonDict]:
        if self._input_file == "-":
            for line in sys.stdin:
                if not line.isspace():
                    yield self._predictor.load_line(line)
        else:
            input_file = cached_path(self._input_file)
            with open(input_file, "r") as file_input:
                for line in file_input:
                    if not line.isspace():
                        yield self._predictor.load_line(line) 
Example #23
Source File: arc_multichoice_json_reader.py    From OpenBookQA with Apache License 2.0
def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, 'r') as data_file:
            logger.info("Reading Multi-choice QA instances in ARC format from jsonl dataset at: %s", file_path)
            for curr_line_id, line in enumerate(data_file):
                item_json = json.loads(line.strip())

                item_id = item_json["id"]
                question_text = self.get_question_text_from_item(item_json, self._question_value_type)

                choice_label_to_id = {}
                choice_text_list = []

                for choice_id, choice_item in enumerate(item_json["question"]["choices"]):
                    choice_label = choice_item["label"]
                    choice_label_to_id[choice_label] = choice_id

                    choice_text = self.get_choice_text_from_item(item_json, choice_id, self._choice_value_type)

                    choice_text_list.append(choice_text)

                answer_id = choice_label_to_id[item_json["answerKey"]]

                yield self.text_to_instance(item_id, question_text, choice_text_list, answer_id) 
Example #24
Source File: data_loading.py    From teaching with GNU General Public License v3.0
def _read(self, file_path):
        with open(cached_path(file_path), "r", encoding="utf8") as data_file:
            #logger.info("Reading instances from lines in file at: %s", file_path)
            for line_num, line in enumerate(data_file):
                line = line.strip("\n")

                if not line:
                    continue

                line_parts = line.split('\t')
                if len(line_parts) != 3:
                    raise ConfigurationError("Invalid line format: %s (line number %d)" % (line, line_num + 1))
                query_sequence, doc_pos_sequence, doc_neg_sequence = line_parts
                yield self.text_to_instance(query_sequence, doc_pos_sequence, doc_neg_sequence) 
Example #25
Source File: wikitables_accuracy.py    From magnitude with MIT License
def _create_sempre_executor(self):
        u"""
        Creates a server running SEMPRE that we can send logical forms to for evaluation.  This
        uses inter-process communication, because SEMPRE is java code.  We also need to be careful
        to clean up the process when our program exits.
        """
        if self._executor_process:
            return

        # It'd be much nicer to just use `cached_path` for these files.  However, the SEMPRE jar
        # that we're using expects to find these files in a particular location, so we need to make
        # sure we put the files in that location.
        os.makedirs(SEMPRE_DIR, exist_ok=True)
        abbreviations_path = os.path.join(SEMPRE_DIR, u'abbreviations.tsv')
        if not os.path.exists(abbreviations_path):
            subprocess.run('wget {}'.format(ABBREVIATIONS_FILE), shell=True)
            subprocess.run('mv wikitables-abbreviations.tsv {}'.format(abbreviations_path), shell=True)

        grammar_path = os.path.join(SEMPRE_DIR, u'grow.grammar')
        if not os.path.exists(grammar_path):
            subprocess.run('wget {}'.format(GROW_FILE), shell=True)
            subprocess.run('mv wikitables-grow.grammar {}'.format(grammar_path), shell=True)

        args = [u'java', u'-jar', cached_path(SEMPRE_EXECUTOR_JAR), u'serve', self._table_directory]
        self._executor_process = subprocess.Popen(args,
                                                  stdin=subprocess.PIPE,
                                                  stdout=subprocess.PIPE,
                                                  bufsize=1)

        lines = []
        for _ in range(6):
            # SEMPRE outputs six lines of stuff when it loads that I can't disable.  So, we clear
            # that here.
            lines.append(unicode(self._executor_process.stdout.readline()))
        assert u'Parser' in lines[-1], u"SEMPRE server output unexpected; the server may have changed"
        logger.info(u"Started SEMPRE server for evaluating logical forms")

        # This is supposed to ensure that the subprocess gets killed when python exits.
        atexit.register(self._stop_sempre_executor) 
Example #26
Source File: params.py    From magnitude with MIT License
def from_file(params_file, params_overrides=u""):
        u"""
        Load a `Params` object from a configuration file.
        """
        # redirect to cache, if necessary
        params_file = cached_path(params_file)
        ext_vars = dict(os.environ)

        file_dict = json.loads(evaluate_file(params_file, ext_vars=ext_vars))

        overrides_dict = parse_overrides(params_overrides)
        param_dict = with_fallback(preferred=overrides_dict, fallback=file_dict)

        return Params(param_dict) 
Example #27
Source File: copynet.py    From nlp-models with MIT License
def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            for line_num, line in enumerate(data_file):
                source_sequence, target_sequence = self._read_line(line_num, line)
                if not source_sequence:
                    continue
                yield self.text_to_instance(source_sequence, target_sequence) 
Example #28
Source File: nl2bash.py    From nlp-models with MIT License
def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            for line_num, line in enumerate(data_file):
                source_sequence, target_sequence = self._read_line(line_num, line)
                if not source_sequence:
                    continue
                target_sequence = self._preprocess_target(target_sequence)
                yield self.text_to_instance(source_sequence, target_sequence) 
Example #29
Source File: dataset_reader.py    From nanigonet with MIT License
def _read(self, file_path: str) -> Iterable[Instance]:
        file_path = cached_path(file_path)

        with open(file_path) as f:
            for line in f:
                data = json.loads(line)
                tokens = self._tokenizer.tokenize(data['text'])
                tags = data.get('labels')

                yield self.text_to_instance(tokens, tags) 
Example #30
Source File: winobias.py    From magnitude with MIT License
def _read(self, file_path):

        for sentence in open(cached_path(file_path), u"r"):
            tokens = sentence.strip().split(u" ")
            clusters = collections.defaultdict(list)
            words = []
            for index, token in enumerate(tokens):
                # Coreference is annotated using [square brackets]
                # or (round brackets) around coreferent phrases.
                if u"[" in token and u"]" in token:
                    clusters[0].append((index, index))
                elif u"[" in token:
                    clusters[0].append((index, index))
                elif u"]" in token:
                    old_span = clusters[0][-1]
                    clusters[0][-1] = (old_span[0], index)

                if u"(" in token and u")" in token:
                    clusters[1].append((index, index))
                elif u"(" in token:
                    clusters[1].append((index, index))
                elif u")" in token:
                    old_span = clusters[1][-1]
                    clusters[1][-1] = (old_span[0], index)

                if token.endswith(u"."):
                    # Winobias is tokenised, but not for full stops.
                    # We'll just special case them here.
                    token = token[:-1]
                    words.append(token.strip(u"[]()"))
                    words.append(u".")
                else:
                    words.append(token.strip(u"[]()"))

            yield self.text_to_instance([Token(x) for x in words], [x for x in list(clusters.values())])
