Python allennlp.common.file_utils.cached_path() Examples

The following are 30 code examples of allennlp.common.file_utils.cached_path(). You can go to the original project or source file by following the links above each example, or check out all other available functions and classes of the allennlp.common.file_utils module.
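cached_path() accepts either a local filesystem path or a URL. A local path is returned unchanged, while a remote file is downloaded once into the AllenNLP cache and the path to the cached copy is returned, which is why the dataset readers below call it before open(). A minimal sketch of that pattern (the URL is only a placeholder for illustration):

from allennlp.common.file_utils import cached_path

# A URL is downloaded to the local cache on first use and the cached
# path is returned; a plain local path is passed through unchanged.
# The URL below is only a placeholder.
local_path = cached_path("https://example.com/data/train.jsonl")
with open(local_path, "r") as data_file:
    for line in data_file:
        ...  # parse each line, as in the readers below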
Example #1
Source File: entailment_pair.py    From multee with Apache License 2.0
def _read(self, file_path: str):
        file_path = cached_path(file_path)
        with open(file_path, 'r') as entailment_file:
            logger.info("Reading entailment instances from jsonl dataset at: %s", file_path)
            for line in entailment_file:
                if line.strip():
                    instance_json = json.loads(line.strip())
                    premise = instance_json.get("sentence1", None) or instance_json.get("premise", None)
                    hypothesis = instance_json.get("sentence2", None) or instance_json.get("hypothesis", None)
                    label = instance_json.get("gold_label", None) or instance_json.get("label", None) # entails or neutral
                    if label == '-':
                        # These were cases where the annotators disagreed; we'll just skip them.
                        # It's like 800 out of 500k examples in the training data.
                        continue
                    if label in ["entails", "entailment"]:
                        label = "entailment"
                    yield self.text_to_instance(premise, hypothesis, label) 
Example #2
Source File: ebmnlp.py    From scibert with Apache License 2.0
def _read(self, file_path: str) -> Iterable[Instance]:
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)

            # Group into alternative divider / sentence chunks.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    fields = [list(field) for field in zip(*fields)]
                    tokens_, _, _, pico_tags = fields
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens_]

                    yield self.text_to_instance(tokens, pico_tags) 
Example #3
Source File: elmo.py    From allennlp with Apache License 2.0
def __init__(self, options_file: str, weight_file: str, requires_grad: bool = False) -> None:
        super().__init__()

        with open(cached_path(options_file), "r") as fin:
            self._options = json.load(fin)
        self._weight_file = weight_file

        self.output_dim = self._options["lstm"]["projection_dim"]
        self.requires_grad = requires_grad

        self._load_weights()

        # Cache the arrays for use in forward -- +1 due to masking.
        self._beginning_of_sentence_characters = torch.from_numpy(
            numpy.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1
        )
        self._end_of_sentence_characters = torch.from_numpy(
            numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1
        ) 
Example #4
Source File: text_classification_json.py    From allennlp with Apache License 2.0
def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            for line in data_file.readlines():
                if not line:
                    continue
                items = json.loads(line)
                text = items["text"]
                label = items.get("label")
                if label is not None:
                    if self._skip_label_indexing:
                        try:
                            label = int(label)
                        except ValueError:
                            raise ValueError(
                                "Labels must be integers if skip_label_indexing is True."
                            )
                    else:
                        label = str(label)
                instance = self.text_to_instance(text=text, label=label)
                if instance is not None:
                    yield instance 
Example #5
Source File: entailment_tuple_reader.py    From scitail with Apache License 2.0
def read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        instances = []
        with open(file_path, 'r') as entailment_file:
            logger.info("Reading entailment instances from TSV dataset at: %s", file_path)
            for line in tqdm.tqdm(entailment_file):
                fields = line.split("\t")
                if len(fields) != 4:
                    raise ValueError("Expected four fields: "
                                     "premise   hypothesis  label   hypothesis_structure. "
                                     "Found {} fields in {}".format(len(fields), line))
                premise, hypothesis, label, hypothesis_structure = fields
                instances.append(self.text_to_instance(premise, hypothesis, hypothesis_structure,
                                                       label))
        if not instances:
            raise ConfigurationError("No instances were read from the given filepath {}. "
                                     "Is the path correct?".format(file_path))
        return Dataset(instances) 
Example #6
Source File: elmo.py    From magnitude with MIT License
def __init__(self,
                 options_file,
                 weight_file,
                 requires_grad=False):
        super(_ElmoCharacterEncoder, self).__init__()

        with open(cached_path(options_file), u'r') as fin:
            self._options = json.load(fin)
        self._weight_file = weight_file

        self.output_dim = self._options[u'lstm'][u'projection_dim']
        self.requires_grad = requires_grad

        self._load_weights()

        # Cache the arrays for use in forward -- +1 due to masking.
        self._beginning_of_sentence_characters = torch.from_numpy(
                numpy.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1
        )
        self._end_of_sentence_characters = torch.from_numpy(
                numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1
        ) 
Example #7
Source File: sequence_tagging.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, u"r") as data_file:

            logger.info(u"Reading instances from lines in file at: %s", file_path)
            for line in data_file:
                line = line.strip(u"\n")

                # skip blank lines
                if not line:
                    continue

                tokens_and_tags = [pair.rsplit(self._word_tag_delimiter, 1)
                                   for pair in line.split(self._token_delimiter)]
                tokens = [Token(token) for token, tag in tokens_and_tags]
                tags = [tag for token, tag in tokens_and_tags]
                yield self.text_to_instance(tokens, tags) 
Example #8
Source File: conll2003.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, u"r") as data_file:
            logger.info(u"Reading instances from lines in file at: %s", file_path)

            # Group into alternative divider / sentence chunks.
            for is_divider, lines in itertools.groupby(data_file, _is_divider):
                # Ignore the divider chunks, so that `lines` corresponds to the words
                # of a single sentence.
                if not is_divider:
                    fields = [line.strip().split() for line in lines]
                    # unzipping trick returns tuples, but our Fields need lists
                    tokens, pos_tags, chunk_tags, ner_tags = [list(field) for field in izip(*fields)]
                    # TextField requires ``Token`` objects
                    tokens = [Token(token) for token in tokens]

                    yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags) 
Example #9
Source File: penn_tree_bank.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        directory, filename = os.path.split(file_path)
        logger.info(u"Reading instances from lines in file at: %s", file_path)
        for parse in BracketParseCorpusReader(root=directory, fileids=[filename]).parsed_sents():

            self._strip_functional_tags(parse)
            # This is un-needed and clutters the label space.
            # All the trees also contain a root S node.
            if parse.label() == u"VROOT":
                parse = parse[0]
            pos_tags = [x[1] for x in parse.pos()] if self._use_pos_tags else None
            yield self.text_to_instance(parse.leaves(), pos_tags, parse)

Example #10
Source File: atis.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path) as atis_file:
            logger.info(u"Reading ATIS instances from dataset at : %s", file_path)
            for line in _lazy_parse(atis_file.read()):
                utterances = []
                for current_interaction in line[u'interaction']:
                    if not current_interaction[u'utterance']:
                        continue
                    utterances.append(current_interaction[u'utterance'])
                    instance = self.text_to_instance(utterances, current_interaction[u'sql'])
                    if not instance:
                        continue
                    yield instance

Example #11
Source File: semantic_role_labeling.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        ontonotes_reader = Ontonotes()
        logger.info(u"Reading SRL instances from dataset files at: %s", file_path)
        if self._domain_identifier is not None:
            logger.info(u"Filtering to only include file paths containing the %s domain", self._domain_identifier)

        for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.srl_frames:
                # Sentence contains no predicates.
                tags = [u"O" for _ in tokens]
                verb_label = [0 for _ in tokens]
                yield self.text_to_instance(tokens, verb_label, tags)
            else:
                for (_, tags) in sentence.srl_frames:
                    verb_indicator = [1 if label[-2:] == u"-V" else 0 for label in tags]
                    yield self.text_to_instance(tokens, verb_indicator, tags) 
Example #12
Source File: conll.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        ontonotes_reader = Ontonotes()
        for sentences in ontonotes_reader.dataset_document_iterator(file_path):
            clusters = collections.defaultdict(list)

            total_tokens = 0
            for sentence in sentences:
                for typed_span in sentence.coref_spans:
                    # Coref annotations are on a _per sentence_
                    # basis, so we need to adjust them to be relative
                    # to the length of the document.
                    span_id, (start, end) = typed_span
                    clusters[span_id].append((start + total_tokens,
                                              end + total_tokens))
                total_tokens += len(sentence.words)

            canonical_clusters = canonicalize_clusters(clusters)
            yield self.text_to_instance([s.words for s in sentences], canonical_clusters)

Example #13
Source File: language_modeling.py    From magnitude with MIT License
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, u"r") as text_file:
            instance_strings = text_file.readlines()

        if self._tokens_per_instance is not None:
            all_text = u" ".join([x.replace(u"\n", u" ").strip() for x in instance_strings])
            tokenized_text = self._tokenizer.tokenize(all_text)
            num_tokens = self._tokens_per_instance + 1
            tokenized_strings = []
            logger.info(u"Creating dataset from all text in file: %s", file_path)
            for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
                tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
        else:
            tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1], self._token_indexers)
            output_field = TextField(tokenized_string[1:], self._output_indexer)
            yield Instance({u'input_tokens': input_field,
                            u'output_tokens': output_field})

Example #14
Source File: atis_sql_table_context.py    From allennlp-semparse with Apache License 2.0
def __init__(
        self,
        all_tables: Dict[str, List[str]] = None,
        tables_with_strings: Dict[str, List[str]] = None,
        database_file: str = None,
    ) -> None:
        self.all_tables = all_tables
        self.tables_with_strings = tables_with_strings
        if database_file:
            self.database_file = cached_path(database_file)
            self.connection = sqlite3.connect(self.database_file)
            self.cursor = self.connection.cursor()

        grammar_dictionary, strings_list = self.create_grammar_dict_and_strings()
        self.grammar_dictionary: Dict[str, List[str]] = grammar_dictionary
        self.strings_list: List[Tuple[str, str]] = strings_list

        self.grammar_string: str = self.get_grammar_string()
        self.grammar: Grammar = Grammar(self.grammar_string)
        self.valid_actions: Dict[str, List[str]] = initialize_valid_actions(self.grammar, KEYWORDS)
        if database_file:
            self.connection.close() 
Example #15
Source File: atis.py    From allennlp-semparse with Apache License 2.0
def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path) as atis_file:
            logger.info("Reading ATIS instances from dataset at : %s", file_path)
            for line in _lazy_parse(atis_file.read()):
                utterances = []
                for current_interaction in line["interaction"]:
                    if not current_interaction["utterance"] or not current_interaction["sql"]:
                        continue
                    utterances.append(current_interaction["utterance"])
                    sql_query_labels = [
                        query for query in current_interaction["sql"].split("\n") if query
                    ]
                    instance = self.text_to_instance(deepcopy(utterances), sql_query_labels)
                    if not instance:
                        continue
                    yield instance 
Example #16
Source File: stanford_sentiment_tree_bank.py    From magnitude with MIT License
def _read(self, file_path):
        with open(cached_path(file_path), u"r") as data_file:
            logger.info(u"Reading instances from lines in file at: %s", file_path)
            for line in data_file.readlines():
                line = line.strip(u"\n")
                if not line:
                    continue
                parsed_line = Tree.fromstring(line)
                if self._use_subtrees:
                    for subtree in parsed_line.subtrees():
                        instance = self.text_to_instance(subtree.leaves(), subtree.label())
                        if instance is not None:
                            yield instance
                else:
                    instance = self.text_to_instance(parsed_line.leaves(), parsed_line.label())
                    if instance is not None:
                        yield instance

Example #17
Source File: multiple_correct_mcq_entailment.py    From multee with Apache License 2.0
def _read(self, file_path: str):
        file_path = cached_path(file_path)
        with open(file_path, 'r') as entailment_file:
            logger.info("Reading entailment instances from jsonl dataset at: %s", file_path)
            for line in entailment_file:
                if line.strip():
                    instances_json = json.loads(line.strip())
                    premises = instances_json["premises"]
                    hypotheses = instances_json["hypotheses"]
                    entailments = instances_json.get("entailments", None)
                    if entailments is None:
                        answer_indices = None
                    else:
                        answer_indices = [index for index, entailment in enumerate(entailments) if entailment]
                    relevant_sentence_idxs = instances_json.get("relevant_sentence_idxs", None)
                    yield self.text_to_instance(premises,
                                                hypotheses,
                                                answer_indices,
                                                relevant_sentence_idxs) 
Example #18
Source File: semeval_2010_task_8_reader.py    From DISTRE with Apache License 2.0
def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, 'r') as semeval_file:
            logger.info("Reading SemEval 2010 Task 8 instances from jsonl dataset at: %s", file_path)
            for line in semeval_file:
                example = json.loads(line)

                tokens = example["tokens"]
                label = example["label"]
                entity_indices = example["entities"]
                
                start_e1, end_e1 = entity_indices[0]
                start_e2, end_e2 = entity_indices[1]
                entity_1 = (start_e1, end_e1 - 1)
                entity_2 = (start_e2, end_e2 - 1)

                yield self.text_to_instance(tokens, entity_1, entity_2, label) 
Example #19
Source File: prolocal_dataset_reader.py    From propara with Apache License 2.0
def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        instances = []
        with open(file_path, 'r') as state_change_file:
            logger.info("Reading state change instances from TSV dataset at: %s", file_path)
            for line in tqdm.tqdm(state_change_file):
                parts: List[str] = line.split()
                # parse input
                sentence_tokens = parts[0].split("####")
                verb_span = parts[1].split(",")
                verb_vector = [int(i) for i in verb_span]
                entity_span = parts[2].split(",")
                entity_vector = [int(i) for i in entity_span]

                # parse labels
                state_change_types = parts[3]
                state_change_tags = parts[4].split(",")

                # create instance
                yield self.text_to_instance(sentence_tokens=sentence_tokens,
                                            verb_vector=verb_vector,
                                            entity_vector=entity_vector,
                                            state_change_types=state_change_types,
                                            state_change_tags=state_change_tags) 
Example #20
Source File: datareader.py    From NLP_Toolkit with Apache License 2.0
def _read(self, file_path):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        with open(file_path, "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            for line in data_file:
                line = line.strip("\n")
                # skip blank and broken lines
                if not line or (not self._test_mode and self._broken_dot_strategy == 'skip'
                                and self.BROKEN_SENTENCES_REGEXP.search(line) is not None):
                    continue

                tokens_and_tags = [pair.rsplit(self._delimeters['labels'], 1)
                                   for pair in line.split(self._delimeters['tokens'])]
                try:
                    tokens = [Token(token) for token, tag in tokens_and_tags]
                    tags = [tag for token, tag in tokens_and_tags]
                except ValueError:
                    tokens = [Token(token[0]) for token in tokens_and_tags]
                    tags = None

                if tokens and tokens[0] != Token(START_TOKEN):
                    tokens = [Token(START_TOKEN)] + tokens

                words = [x.text for x in tokens]
                if self._max_len is not None:
                    tokens = tokens[:self._max_len]
                    tags = None if tags is None else tags[:self._max_len]
                instance = self.text_to_instance(tokens, tags, words)
                if instance:
                    yield instance 
Example #21
Source File: embedding.py    From allennlp with Apache License 2.0
def _open_inside_tar(self, archive_path: str, member_path: Optional[str] = None) -> None:
        cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
        archive = tarfile.open(cached_archive_path, "r")
        if member_path is None:
            members_list = archive.getnames()
            member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
        member_path = cast(str, member_path)
        member = archive.getmember(member_path)  # raises exception if not present
        member_file = cast(BinaryIO, archive.extractfile(member))
        self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
        self._archive_handle = archive 
Example #22
Source File: predict.py    From allennlp with Apache License 2.0
def _get_json_data(self) -> Iterator[JsonDict]:
        if self._input_file == "-":
            for line in sys.stdin:
                if not line.isspace():
                    yield self._predictor.load_line(line)
        else:
            input_file = cached_path(self._input_file)
            with open(input_file, "r") as file_input:
                for line in file_input:
                    if not line.isspace():
                        yield self._predictor.load_line(line) 
Example #23
Source File: arc_multichoice_json_reader.py    From OpenBookQA with Apache License 2.0
def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, 'r') as data_file:
            logger.info("Reading Multi-choice QA instances in ARC format from jsonl dataset at: %s", file_path)
            for curr_line_id, line in enumerate(data_file):
                item_json = json.loads(line.strip())

                item_id = item_json["id"]
                question_text = self.get_question_text_from_item(item_json, self._question_value_type)

                choice_label_to_id = {}
                choice_text_list = []

                for choice_id, choice_item in enumerate(item_json["question"]["choices"]):
                    choice_label = choice_item["label"]
                    choice_label_to_id[choice_label] = choice_id

                    choice_text = self.get_choice_text_from_item(item_json, choice_id, self._choice_value_type)

                    choice_text_list.append(choice_text)

                answer_id = choice_label_to_id[item_json["answerKey"]]

                yield self.text_to_instance(item_id, question_text, choice_text_list, answer_id) 
Example #24
Source File: data_loading.py    From teaching with GNU General Public License v3.0
def _read(self, file_path):
        with open(cached_path(file_path), "r", encoding="utf8") as data_file:
            #logger.info("Reading instances from lines in file at: %s", file_path)
            for line_num, line in enumerate(data_file):
                line = line.strip("\n")

                if not line:
                    continue

                line_parts = line.split('\t')
                if len(line_parts) != 3:
                    raise ConfigurationError("Invalid line format: %s (line number %d)" % (line, line_num + 1))
                query_sequence, doc_pos_sequence, doc_neg_sequence = line_parts
                yield self.text_to_instance(query_sequence, doc_pos_sequence, doc_neg_sequence) 
Example #25
Source File: wikitables_accuracy.py    From magnitude with MIT License
def _create_sempre_executor(self):
        u"""
        Creates a server running SEMPRE that we can send logical forms to for evaluation.  This
        uses inter-process communication, because SEMPRE is java code.  We also need to be careful
        to clean up the process when our program exits.
        """
        if self._executor_process:
            return

        # It'd be much nicer to just use `cached_path` for these files.  However, the SEMPRE jar
        # that we're using expects to find these files in a particular location, so we need to make
        # sure we put the files in that location.
        os.makedirs(SEMPRE_DIR, exist_ok=True)
        abbreviations_path = os.path.join(SEMPRE_DIR, u'abbreviations.tsv')
        if not os.path.exists(abbreviations_path):
            subprocess.run('wget {}'.format(ABBREVIATIONS_FILE), shell=True)
            subprocess.run('mv wikitables-abbreviations.tsv {}'.format(abbreviations_path), shell=True)

        grammar_path = os.path.join(SEMPRE_DIR, u'grow.grammar')
        if not os.path.exists(grammar_path):
            subprocess.run('wget {}'.format(GROW_FILE), shell=True)
            subprocess.run('mv wikitables-grow.grammar {}'.format(grammar_path), shell=True)

        args = [u'java', u'-jar', cached_path(SEMPRE_EXECUTOR_JAR), u'serve', self._table_directory]
        self._executor_process = subprocess.Popen(args,
                                                  stdin=subprocess.PIPE,
                                                  stdout=subprocess.PIPE,
                                                  bufsize=1)

        lines = []
        for _ in range(6):
            # SEMPRE outputs six lines of stuff when it loads that I can't disable.  So, we clear
            # that here.
            lines.append(unicode(self._executor_process.stdout.readline()))
        assert u'Parser' in lines[-1], u"SEMPRE server output unexpected; the server may have changed"
        logger.info(u"Started SEMPRE server for evaluating logical forms")

        # This is supposed to ensure that the subprocess gets killed when python exits.
        atexit.register(self._stop_sempre_executor) 
Example #26
Source File: params.py    From magnitude with MIT License
def from_file(params_file, params_overrides=u""):
        u"""
        Load a `Params` object from a configuration file.
        """
        # redirect to cache, if necessary
        params_file = cached_path(params_file)
        ext_vars = dict(os.environ)

        file_dict = json.loads(evaluate_file(params_file, ext_vars=ext_vars))

        overrides_dict = parse_overrides(params_overrides)
        param_dict = with_fallback(preferred=overrides_dict, fallback=file_dict)

        return Params(param_dict) 
Example #27
Source File: copynet.py    From nlp-models with MIT License
def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            for line_num, line in enumerate(data_file):
                source_sequence, target_sequence = self._read_line(line_num, line)
                if not source_sequence:
                    continue
                yield self.text_to_instance(source_sequence, target_sequence) 
Example #28
Source File: nl2bash.py    From nlp-models with MIT License
def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            for line_num, line in enumerate(data_file):
                source_sequence, target_sequence = self._read_line(line_num, line)
                if not source_sequence:
                    continue
                target_sequence = self._preprocess_target(target_sequence)
                yield self.text_to_instance(source_sequence, target_sequence) 
Example #29
Source File: dataset_reader.py    From nanigonet with MIT License
def _read(self, file_path: str) -> Iterable[Instance]:
        file_path = cached_path(file_path)

        with open(file_path) as f:
            for line in f:
                data = json.loads(line)
                tokens = self._tokenizer.tokenize(data['text'])
                tags = data.get('labels')

                yield self.text_to_instance(tokens, tags) 
Example #30
Source File: winobias.py    From magnitude with MIT License
def _read(self, file_path):

        for sentence in open(cached_path(file_path), u"r"):
            tokens = sentence.strip().split(u" ")
            clusters = collections.defaultdict(list)
            words = []
            for index, token in enumerate(tokens):
                # Coreference is annotated using [square brackets]
                # or (round brackets) around coreferent phrases.
                if u"[" in token and u"]" in token:
                    clusters[0].append((index, index))
                elif u"[" in token:
                    clusters[0].append((index, index))
                elif u"]" in token:
                    old_span = clusters[0][-1]
                    clusters[0][-1] = (old_span[0], index)

                if u"(" in token and u")" in token:
                    clusters[1].append((index, index))
                elif u"(" in token:
                    clusters[1].append((index, index))
                elif u")" in token:
                    old_span = clusters[1][-1]
                    clusters[1][-1] = (old_span[0], index)

                if token.endswith(u"."):
                    # Winobias is tokenised, but not for full stops.
                    # We'll just special case them here.
                    token = token[:-1]
                    words.append(token.strip(u"[]()"))
                    words.append(u".")
                else:
                    words.append(token.strip(u"[]()"))

            yield self.text_to_instance([Token(x) for x in words], [x for x in list(clusters.values())])
