Python allennlp.common.file_utils.cached_path() Examples
The following are 30 code examples of allennlp.common.file_utils.cached_path(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.common.file_utils, or try the search function.
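
Before diving into the examples, here is a minimal usage sketch of cached_path() itself (the file names, URL, and cache directory below are illustrative placeholders, not real resources): given a local path, the function verifies the file exists and returns the path unchanged; given a URL, it downloads the resource once into a local cache and returns the path of the cached copy, so the result can be passed to open() either way.

from allennlp.common.file_utils import cached_path

# A local path is returned as-is (after an existence check).
local_path = cached_path("data/train.jsonl")  # hypothetical local file

# A URL is downloaded once and cached; later calls reuse the cached copy.
# The URL below is only illustrative.
remote_path = cached_path(
    "https://example.com/datasets/train.jsonl",
    cache_dir="/tmp/allennlp_cache",  # optional; defaults to a per-user cache directory
)

with open(remote_path, "r") as data_file:
    first_line = data_file.readline()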
Example #1
Source File: entailment_pair.py From multee with Apache License 2.0 | 6 votes |
def _read(self, file_path: str):
    file_path = cached_path(file_path)
    with open(file_path, 'r') as entailment_file:
        logger.info("Reading entailment instances from jsonl dataset at: %s", file_path)
        for line in entailment_file:
            if line.strip():
                instance_json = json.loads(line.strip())
                premise = instance_json.get("sentence1", None) or instance_json.get("premise", None)
                hypothesis = instance_json.get("sentence2", None) or instance_json.get("hypothesis", None)
                label = instance_json.get("gold_label", None) or instance_json.get("label", None)  # entails or neutral
                if label == '-':
                    # These were cases where the annotators disagreed; we'll just skip them.
                    # It's like 800 out of 500k examples in the training data.
                    continue
                if label in ["entails", "entailment"]:
                    label = "entailment"
                yield self.text_to_instance(premise, hypothesis, label)
Example #2
Source File: ebmnlp.py From scibert with Apache License 2.0 | 6 votes |
def _read(self, file_path: str) -> Iterable[Instance]:
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if not is_divider:
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                fields = [list(field) for field in zip(*fields)]
                tokens_, _, _, pico_tags = fields
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens_]

                yield self.text_to_instance(tokens, pico_tags)
Example #3
Source File: elmo.py From allennlp with Apache License 2.0 | 6 votes |
def __init__(self, options_file: str, weight_file: str, requires_grad: bool = False) -> None:
    super().__init__()

    with open(cached_path(options_file), "r") as fin:
        self._options = json.load(fin)
    self._weight_file = weight_file

    self.output_dim = self._options["lstm"]["projection_dim"]
    self.requires_grad = requires_grad

    self._load_weights()

    # Cache the arrays for use in forward -- +1 due to masking.
    self._beginning_of_sentence_characters = torch.from_numpy(
        numpy.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1
    )
    self._end_of_sentence_characters = torch.from_numpy(
        numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1
    )
Example #4
Source File: text_classification_json.py From allennlp with Apache License 2.0 | 6 votes |
def _read(self, file_path):
    with open(cached_path(file_path), "r") as data_file:
        for line in data_file.readlines():
            if not line:
                continue
            items = json.loads(line)
            text = items["text"]
            label = items.get("label")
            if label is not None:
                if self._skip_label_indexing:
                    try:
                        label = int(label)
                    except ValueError:
                        raise ValueError(
                            "Labels must be integers if skip_label_indexing is True."
                        )
                else:
                    label = str(label)
            instance = self.text_to_instance(text=text, label=label)
            if instance is not None:
                yield instance
Example #5
Source File: entailment_tuple_reader.py From scitail with Apache License 2.0 | 6 votes |
def read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    instances = []
    with open(file_path, 'r') as entailment_file:
        logger.info("Reading entailment instances from TSV dataset at: %s", file_path)
        for line in tqdm.tqdm(entailment_file):
            fields = line.split("\t")
            if len(fields) != 4:
                raise ValueError("Expected four fields: "
                                 "premise hypothesis label hypothesis_structure. "
                                 "Found {} fields in {}".format(len(fields), line))
            premise, hypothesis, label, hypothesis_structure = fields
            instances.append(self.text_to_instance(premise, hypothesis,
                                                   hypothesis_structure, label))
    if not instances:
        raise ConfigurationError("No instances were read from the given filepath {}. "
                                 "Is the path correct?".format(file_path))
    return Dataset(instances)
Example #6
Source File: elmo.py From magnitude with MIT License | 6 votes |
def __init__(self, options_file, weight_file, requires_grad=False):
    super(_ElmoCharacterEncoder, self).__init__()

    with open(cached_path(options_file), u'r') as fin:
        self._options = json.load(fin)
    self._weight_file = weight_file

    self.output_dim = self._options[u'lstm'][u'projection_dim']
    self.requires_grad = requires_grad

    self._load_weights()

    # Cache the arrays for use in forward -- +1 due to masking.
    self._beginning_of_sentence_characters = torch.from_numpy(
        numpy.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1
    )
    self._end_of_sentence_characters = torch.from_numpy(
        numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1
    )
Example #7
Source File: sequence_tagging.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, u"r") as data_file:
        logger.info(u"Reading instances from lines in file at: %s", file_path)
        for line in data_file:
            line = line.strip(u"\n")

            # skip blank lines
            if not line:
                continue

            tokens_and_tags = [pair.rsplit(self._word_tag_delimiter, 1)
                               for pair in line.split(self._token_delimiter)]
            tokens = [Token(token) for token, tag in tokens_and_tags]
            tags = [tag for token, tag in tokens_and_tags]
            yield self.text_to_instance(tokens, tags)
Example #8
Source File: conll2003.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, u"r") as data_file:
        logger.info(u"Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if not is_divider:
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                tokens, pos_tags, chunk_tags, ner_tags = [list(field) for field in izip(*fields)]
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens]

                yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags)
Example #9
Source File: penn_tree_bank.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    directory, filename = os.path.split(file_path)
    logger.info(u"Reading instances from lines in file at: %s", file_path)

    for parse in BracketParseCorpusReader(root=directory, fileids=[filename]).parsed_sents():
        self._strip_functional_tags(parse)
        # This is un-needed and clutters the label space.
        # All the trees also contain a root S node.
        if parse.label() == u"VROOT":
            parse = parse[0]
        pos_tags = [x[1] for x in parse.pos()] if self._use_pos_tags else None
        yield self.text_to_instance(parse.leaves(), pos_tags, parse)

#overrides
Example #10
Source File: atis.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path) as atis_file:
        logger.info(u"Reading ATIS instances from dataset at : %s", file_path)
        for line in _lazy_parse(atis_file.read()):
            utterances = []
            for current_interaction in line[u'interaction']:
                if not current_interaction[u'utterance']:
                    continue
                utterances.append(current_interaction[u'utterance'])
                instance = self.text_to_instance(utterances, current_interaction[u'sql'])
                if not instance:
                    continue
                yield instance

#overrides
Example #11
Source File: semantic_role_labeling.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    ontonotes_reader = Ontonotes()
    logger.info(u"Reading SRL instances from dataset files at: %s", file_path)
    if self._domain_identifier is not None:
        logger.info(u"Filtering to only include file paths containing the %s domain",
                    self._domain_identifier)

    for sentence in self._ontonotes_subset(ontonotes_reader, file_path, self._domain_identifier):
        tokens = [Token(t) for t in sentence.words]
        if not sentence.srl_frames:
            # Sentence contains no predicates.
            tags = [u"O" for _ in tokens]
            verb_label = [0 for _ in tokens]
            yield self.text_to_instance(tokens, verb_label, tags)
        else:
            for (_, tags) in sentence.srl_frames:
                verb_indicator = [1 if label[-2:] == u"-V" else 0 for label in tags]
                yield self.text_to_instance(tokens, verb_indicator, tags)
Example #12
Source File: conll.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    ontonotes_reader = Ontonotes()
    for sentences in ontonotes_reader.dataset_document_iterator(file_path):
        clusters = collections.defaultdict(list)

        total_tokens = 0
        for sentence in sentences:
            for typed_span in sentence.coref_spans:
                # Coref annotations are on a _per sentence_
                # basis, so we need to adjust them to be relative
                # to the length of the document.
                span_id, (start, end) = typed_span
                clusters[span_id].append((start + total_tokens, end + total_tokens))
            total_tokens += len(sentence.words)

        canonical_clusters = canonicalize_clusters(clusters)
        yield self.text_to_instance([s.words for s in sentences], canonical_clusters)

#overrides
Example #13
Source File: language_modeling.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, u"r") as text_file:
        instance_strings = text_file.readlines()

    if self._tokens_per_instance is not None:
        all_text = u" ".join([x.replace(u"\n", u" ").strip() for x in instance_strings])
        tokenized_text = self._tokenizer.tokenize(all_text)
        num_tokens = self._tokens_per_instance + 1
        tokenized_strings = []
        logger.info(u"Creating dataset from all text in file: %s", file_path)
        for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
            tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
    else:
        tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

    for tokenized_string in tokenized_strings:
        input_field = TextField(tokenized_string[:-1], self._token_indexers)
        output_field = TextField(tokenized_string[1:], self._output_indexer)
        yield Instance({u'input_tokens': input_field,
                        u'output_tokens': output_field})

#overrides
Example #14
Source File: atis_sql_table_context.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def __init__(
    self,
    all_tables: Dict[str, List[str]] = None,
    tables_with_strings: Dict[str, List[str]] = None,
    database_file: str = None,
) -> None:
    self.all_tables = all_tables
    self.tables_with_strings = tables_with_strings

    if database_file:
        self.database_file = cached_path(database_file)
        self.connection = sqlite3.connect(self.database_file)
        self.cursor = self.connection.cursor()

    grammar_dictionary, strings_list = self.create_grammar_dict_and_strings()
    self.grammar_dictionary: Dict[str, List[str]] = grammar_dictionary
    self.strings_list: List[Tuple[str, str]] = strings_list

    self.grammar_string: str = self.get_grammar_string()
    self.grammar: Grammar = Grammar(self.grammar_string)
    self.valid_actions: Dict[str, List[str]] = initialize_valid_actions(self.grammar, KEYWORDS)

    if database_file:
        self.connection.close()
Example #15
Source File: atis.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path) as atis_file:
        logger.info("Reading ATIS instances from dataset at : %s", file_path)
        for line in _lazy_parse(atis_file.read()):
            utterances = []
            for current_interaction in line["interaction"]:
                if not current_interaction["utterance"] or not current_interaction["sql"]:
                    continue
                utterances.append(current_interaction["utterance"])
                sql_query_labels = [
                    query for query in current_interaction["sql"].split("\n") if query
                ]
                instance = self.text_to_instance(deepcopy(utterances), sql_query_labels)
                if not instance:
                    continue
                yield instance
Example #16
Source File: stanford_sentiment_tree_bank.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    with open(cached_path(file_path), u"r") as data_file:
        logger.info(u"Reading instances from lines in file at: %s", file_path)
        for line in data_file.readlines():
            line = line.strip(u"\n")
            if not line:
                continue
            parsed_line = Tree.fromstring(line)
            if self._use_subtrees:
                for subtree in parsed_line.subtrees():
                    instance = self.text_to_instance(subtree.leaves(), subtree.label())
                    if instance is not None:
                        yield instance
            else:
                instance = self.text_to_instance(parsed_line.leaves(), parsed_line.label())
                if instance is not None:
                    yield instance

#overrides
Example #17
Source File: multiple_correct_mcq_entailment.py From multee with Apache License 2.0 | 6 votes |
def _read(self, file_path: str):
    file_path = cached_path(file_path)
    with open(file_path, 'r') as entailment_file:
        logger.info("Reading entailment instances from jsonl dataset at: %s", file_path)
        for line in entailment_file:
            if line.strip():
                instances_json = json.loads(line.strip())
                premises = instances_json["premises"]
                hypotheses = instances_json["hypotheses"]
                entailments = instances_json.get("entailments", None)
                if entailments is None:
                    answer_indices = None
                else:
                    answer_indices = [index for index, entailment in enumerate(entailments) if entailment]
                relevant_sentence_idxs = instances_json.get("relevant_sentence_idxs", None)
                yield self.text_to_instance(premises, hypotheses,
                                            answer_indices, relevant_sentence_idxs)
Example #18
Source File: semeval_2010_task_8_reader.py From DISTRE with Apache License 2.0 | 6 votes |
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, 'r') as semeval_file:
        logger.info("Reading SemEval 2010 Task 8 instances from jsonl dataset at: %s", file_path)
        for line in semeval_file:
            example = json.loads(line)
            tokens = example["tokens"]
            label = example["label"]
            entity_indices = example["entities"]

            start_e1, end_e1 = entity_indices[0]
            start_e2, end_e2 = entity_indices[1]
            entity_1 = (start_e1, end_e1 - 1)
            entity_2 = (start_e2, end_e2 - 1)

            yield self.text_to_instance(tokens, entity_1, entity_2, label)
Example #19
Source File: prolocal_dataset_reader.py From propara with Apache License 2.0 | 5 votes |
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    instances = []
    with open(file_path, 'r') as state_change_file:
        logger.info("Reading state change instances from TSV dataset at: %s", file_path)
        for line in tqdm.tqdm(state_change_file):
            parts: List[str] = line.split()

            # parse input
            sentence_tokens = parts[0].split("####")
            verb_span = parts[1].split(",")
            verb_vector = [int(i) for i in verb_span]
            entity_span = parts[2].split(",")
            entity_vector = [int(i) for i in entity_span]

            # parse labels
            state_change_types = parts[3]
            state_change_tags = parts[4].split(",")

            # create instance
            yield self.text_to_instance(sentence_tokens=sentence_tokens,
                                        verb_vector=verb_vector,
                                        entity_vector=entity_vector,
                                        state_change_types=state_change_types,
                                        state_change_tags=state_change_tags)
Example #20
Source File: datareader.py From NLP_Toolkit with Apache License 2.0 | 5 votes |
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line in data_file:
            line = line.strip("\n")
            # skip blank and broken lines
            if not line or (not self._test_mode and self._broken_dot_strategy == 'skip'
                            and self.BROKEN_SENTENCES_REGEXP.search(line) is not None):
                continue

            tokens_and_tags = [pair.rsplit(self._delimeters['labels'], 1)
                               for pair in line.split(self._delimeters['tokens'])]
            try:
                tokens = [Token(token) for token, tag in tokens_and_tags]
                tags = [tag for token, tag in tokens_and_tags]
            except ValueError:
                tokens = [Token(token[0]) for token in tokens_and_tags]
                tags = None

            if tokens and tokens[0] != Token(START_TOKEN):
                tokens = [Token(START_TOKEN)] + tokens

            words = [x.text for x in tokens]
            if self._max_len is not None:
                tokens = tokens[:self._max_len]
                tags = None if tags is None else tags[:self._max_len]
            instance = self.text_to_instance(tokens, tags, words)
            if instance:
                yield instance
Example #21
Source File: embedding.py From allennlp with Apache License 2.0 | 5 votes |
def _open_inside_tar(self, archive_path: str, member_path: Optional[str] = None) -> None:
    cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
    archive = tarfile.open(cached_archive_path, "r")
    if member_path is None:
        members_list = archive.getnames()
        member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
    member_path = cast(str, member_path)
    member = archive.getmember(member_path)  # raises exception if not present
    member_file = cast(BinaryIO, archive.extractfile(member))
    self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
    self._archive_handle = archive
Example #22
Source File: predict.py From allennlp with Apache License 2.0 | 5 votes |
def _get_json_data(self) -> Iterator[JsonDict]:
    if self._input_file == "-":
        for line in sys.stdin:
            if not line.isspace():
                yield self._predictor.load_line(line)
    else:
        input_file = cached_path(self._input_file)
        with open(input_file, "r") as file_input:
            for line in file_input:
                if not line.isspace():
                    yield self._predictor.load_line(line)
Example #23
Source File: arc_multichoice_json_reader.py From OpenBookQA with Apache License 2.0 | 5 votes |
def _read(self, file_path: str):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, 'r') as data_file:
        logger.info("Reading Multi-choice QA instances in ARC format from jsonl dataset at: %s",
                    file_path)
        for curr_line_id, line in enumerate(data_file):
            item_json = json.loads(line.strip())

            item_id = item_json["id"]
            question_text = self.get_question_text_from_item(item_json, self._question_value_type)

            choice_label_to_id = {}
            choice_text_list = []

            for choice_id, choice_item in enumerate(item_json["question"]["choices"]):
                choice_label = choice_item["label"]
                choice_label_to_id[choice_label] = choice_id

                choice_text = self.get_choice_text_from_item(item_json, choice_id, self._choice_value_type)
                choice_text_list.append(choice_text)

            answer_id = choice_label_to_id[item_json["answerKey"]]

            yield self.text_to_instance(item_id, question_text, choice_text_list, answer_id)
Example #24
Source File: data_loading.py From teaching with GNU General Public License v3.0 | 5 votes |
def _read(self, file_path):
    with open(cached_path(file_path), "r", encoding="utf8") as data_file:
        #logger.info("Reading instances from lines in file at: %s", file_path)
        for line_num, line in enumerate(data_file):
            line = line.strip("\n")

            if not line:
                continue

            line_parts = line.split('\t')
            if len(line_parts) != 3:
                raise ConfigurationError("Invalid line format: %s (line number %d)" % (line, line_num + 1))

            query_sequence, doc_pos_sequence, doc_neg_sequence = line_parts

            yield self.text_to_instance(query_sequence, doc_pos_sequence, doc_neg_sequence)
Example #25
Source File: wikitables_accuracy.py From magnitude with MIT License | 5 votes |
def _create_sempre_executor(self):
    u"""
    Creates a server running SEMPRE that we can send logical forms to for evaluation.
    This uses inter-process communication, because SEMPRE is java code.  We also need
    to be careful to clean up the process when our program exits.
    """
    if self._executor_process:
        return
    # It'd be much nicer to just use `cached_path` for these files.  However, the SEMPRE jar
    # that we're using expects to find these files in a particular location, so we need to make
    # sure we put the files in that location.
    os.makedirs(SEMPRE_DIR, exist_ok=True)
    abbreviations_path = os.path.join(SEMPRE_DIR, u'abbreviations.tsv')
    if not os.path.exists(abbreviations_path):
        subprocess.run(u'wget {}'.format(ABBREVIATIONS_FILE), shell=True)
        subprocess.run(u'mv wikitables-abbreviations.tsv {}'.format(abbreviations_path), shell=True)

    grammar_path = os.path.join(SEMPRE_DIR, u'grow.grammar')
    if not os.path.exists(grammar_path):
        subprocess.run(u'wget {}'.format(GROW_FILE), shell=True)
        subprocess.run(u'mv wikitables-grow.grammar {}'.format(grammar_path), shell=True)

    args = [u'java', u'-jar', cached_path(SEMPRE_EXECUTOR_JAR), u'serve', self._table_directory]
    self._executor_process = subprocess.Popen(args,
                                              stdin=subprocess.PIPE,
                                              stdout=subprocess.PIPE,
                                              bufsize=1)

    lines = []
    for _ in range(6):
        # SEMPRE outputs six lines of stuff when it loads that I can't disable.  So, we clear
        # that here.
        lines.append(unicode(self._executor_process.stdout.readline()))
    assert u'Parser' in lines[-1], u"SEMPRE server output unexpected; the server may have changed"

    logger.info(u"Started SEMPRE server for evaluating logical forms")

    # This is supposed to ensure that the subprocess gets killed when python exits.
    atexit.register(self._stop_sempre_executor)
Example #26
Source File: params.py From magnitude with MIT License | 5 votes |
def from_file(params_file, params_overrides=u""):
    u"""
    Load a `Params` object from a configuration file.
    """
    # redirect to cache, if necessary
    params_file = cached_path(params_file)

    ext_vars = dict(os.environ)
    file_dict = json.loads(evaluate_file(params_file, ext_vars=ext_vars))

    overrides_dict = parse_overrides(params_overrides)
    param_dict = with_fallback(preferred=overrides_dict, fallback=file_dict)

    return Params(param_dict)
Example #27
Source File: copynet.py From nlp-models with MIT License | 5 votes |
def _read(self, file_path):
    with open(cached_path(file_path), "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line_num, line in enumerate(data_file):
            source_sequence, target_sequence = self._read_line(line_num, line)
            if not source_sequence:
                continue
            yield self.text_to_instance(source_sequence, target_sequence)
Example #28
Source File: nl2bash.py From nlp-models with MIT License | 5 votes |
def _read(self, file_path):
    with open(cached_path(file_path), "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)
        for line_num, line in enumerate(data_file):
            source_sequence, target_sequence = self._read_line(line_num, line)
            if not source_sequence:
                continue
            target_sequence = self._preprocess_target(target_sequence)
            yield self.text_to_instance(source_sequence, target_sequence)
Example #29
Source File: dataset_reader.py From nanigonet with MIT License | 5 votes |
def _read(self, file_path: str) -> Iterable[Instance]:
    file_path = cached_path(file_path)

    with open(file_path) as f:
        for line in f:
            data = json.loads(line)
            tokens = self._tokenizer.tokenize(data['text'])
            tags = data.get('labels')

            yield self.text_to_instance(tokens, tags)
Example #30
Source File: winobias.py From magnitude with MIT License | 5 votes |
def _read(self, file_path):
    for sentence in open(cached_path(file_path), u"r"):
        tokens = sentence.strip().split(u" ")
        clusters = collections.defaultdict(list)
        words = []
        for index, token in enumerate(tokens):
            # Coreference is annotated using [square brackets]
            # or (round brackets) around coreferent phrases.
            if u"[" in token and u"]" in token:
                clusters[0].append((index, index))
            elif u"[" in token:
                clusters[0].append((index, index))
            elif u"]" in token:
                old_span = clusters[0][-1]
                clusters[0][-1] = (old_span[0], index)

            if u"(" in token and u")" in token:
                clusters[1].append((index, index))
            elif u"(" in token:
                clusters[1].append((index, index))
            elif u")" in token:
                old_span = clusters[1][-1]
                clusters[1][-1] = (old_span[0], index)

            if token.endswith(u"."):
                # Winobias is tokenised, but not for full stops.
                # We'll just special case them here.
                token = token[:-1]
                words.append(token.strip(u"[]()"))
                words.append(u".")
            else:
                words.append(token.strip(u"[]()"))

        yield self.text_to_instance([Token(x) for x in words],
                                    [x for x in list(clusters.values())])

#overrides