Python tensorflow.compat.v1.gfile() Examples
The following are 15 code examples of tensorflow.compat.v1.gfile(), drawn from open-source projects. The project, source file, and license are noted above each example. You may also want to check out all available functions and classes of the module tensorflow.compat.v1.
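Before diving into the examples: most of them rely on only a handful of gfile calls, namely gfile.Open / gfile.GFile for reading and writing, gfile.Exists, and gfile.ListDirectory. The short sketch below illustrates that pattern; the path /tmp/gfile_demo.json is a placeholder chosen for this sketch, not something taken from the examples.

import json

from tensorflow.compat.v1 import gfile

# gfile.Open is an alias of gfile.GFile; it works with local paths as well as
# remote filesystems (e.g. gs://...) supported by TensorFlow's file system layer.
demo_path = '/tmp/gfile_demo.json'  # placeholder path for this sketch

# Write a small JSON file.
with gfile.Open(demo_path, 'w') as f:
  f.write(json.dumps({'hello': 'world'}))

# Read it back, mirroring the json.load(gfile.Open(...)) pattern used in the
# SQuAD evaluation examples below.
with gfile.Open(demo_path, 'r') as f:
  data = json.load(f)
print(data['hello'])

# Existence checks and directory listings, as used in the replay-buffer example.
print(gfile.Exists(demo_path))
print(gfile.ListDirectory('/tmp')[:5])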
Example #1
Source File: configurable_ops.py From morph-net with Apache License 2.0 | 6 votes
def decorator_from_parameterization_file(
    filename, fallback_rule=FallbackRule.pass_through, **kwargs):
  """Create a ConfigurableOps from a parameterization file.

  Loads a json parameterization file from disk (as saved by
  tools.structure_exporter) and creates a ConfigurableOps from it.

  Args:
    filename: Path to a parameterization file in json format.
    fallback_rule: A `FallbackRule` enum which controls fallback behavior
      (see __init__ for more detail).
    **kwargs: Miscellaneous args for ConfigurableOps.

  Returns:
    A ConfigurableOps instance with the parameterization from `filename`.
  """
  with tf.gfile.Open(filename, 'r') as f:
    parameterization = json.loads(f.read())
  return ConfigurableOps(
      parameterization=parameterization, fallback_rule=fallback_rule, **kwargs)
Example #2
Source File: evaluate_squad_watermark.py From language with Apache License 2.0 | 6 votes |
def main(_):

  def load_dataset_file(dataset_file):
    with gfile.Open(dataset_file) as df:
      dataset_json = json.load(df)
      data = dataset_json['data']
    return data

  def load_preds_file(prediction_file):
    with gfile.Open(prediction_file) as pf:
      preds = json.load(pf)
    return preds

  dataset = load_dataset_file(FLAGS.watermark_file)
  preds = load_preds_file(FLAGS.watermark_output_file)

  logging.info('Watermark Label Accuracy =')
  logging.info(
      json.dumps(evaluate_dataset_preds(dataset, preds, ans_key='answers')))

  logging.info('Victim Label Accuracy =')
  logging.info(
      json.dumps(
          evaluate_dataset_preds(dataset, preds, ans_key='original_answers')))
Example #3
Source File: corpus.py From lamb with Apache License 2.0 | 5 votes |
def read_character_based_corpus(filename, encoding='utf-8'):
  with codecs.getreader(encoding)(tf.gfile.GFile(filename, mode='rb')) as f:
    return Corpus([line.rstrip('\n') for line in f])
Example #4
Source File: corpus.py From lamb with Apache License 2.0 | 5 votes |
def read_word_based_corpus(filename, encoding='utf-8'):
  with codecs.getreader(encoding)(tf.gfile.GFile(filename, mode='rb')) as f:
    return Corpus([line.split() for line in f])
Example #5
Source File: fixed_replay_buffer.py From batch_rl with Apache License 2.0 | 5 votes |
def _load_replay_buffers(self, num_buffers=None):
  """Loads multiple checkpoints into a list of replay buffers."""
  if not self._loaded_buffers:  # pytype: disable=attribute-error
    ckpts = gfile.ListDirectory(self._data_dir)  # pytype: disable=attribute-error
    # Assumes that the checkpoints are saved in a format CKPT_NAME.{SUFFIX}.gz
    ckpt_counters = collections.Counter(
        [name.split('.')[-2] for name in ckpts])
    # Should contain the files for add_count, action, observation, reward,
    # terminal and invalid_range
    ckpt_suffixes = [x for x in ckpt_counters if ckpt_counters[x] in [6, 7]]
    if num_buffers is not None:
      ckpt_suffixes = np.random.choice(
          ckpt_suffixes, num_buffers, replace=False)
    self._replay_buffers = []
    # Load the replay buffers in parallel
    with futures.ThreadPoolExecutor(
        max_workers=num_buffers) as thread_pool_executor:
      replay_futures = [thread_pool_executor.submit(
          self._load_buffer, suffix) for suffix in ckpt_suffixes]
    for f in replay_futures:
      replay_buffer = f.result()
      if replay_buffer is not None:
        self._replay_buffers.append(replay_buffer)
        self.add_count = max(replay_buffer.add_count, self.add_count)
    self._num_replay_buffers = len(self._replay_buffers)
    if self._num_replay_buffers:
      self._loaded_buffers = True
Example #6
Source File: merge_datasets_simple.py From language with Apache License 2.0 | 5 votes |
def main(_):
  output_data = []

  dataset_paths = FLAGS.dataset_paths.split(",")

  for dp in dataset_paths:
    with gfile.Open(dp, "r") as f:
      base_dataset = f.read().strip().split("\n")

    base_dataset_header = base_dataset[0]
    base_dataset = base_dataset[1:]

    indices_base_dataset = [
        base_dataset_header.split("\t").index(x)
        for x in relevant_headers[FLAGS.task_name]
    ]

    for point in base_dataset:
      input_shards = [
          point.split("\t")[index] for index in indices_base_dataset
      ]
      output_data.append(("%d\t" % len(output_data)) + "\t".join(input_shards))

  logging.info("Final dataset size = %d", len(output_data))

  final_header = "index\t" + "\t".join(relevant_headers[FLAGS.task_name])
  output_data = [final_header] + output_data

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write("\n".join(output_data) + "\n")
Example #7
Source File: dataset_analysis.py From language with Apache License 2.0 | 5 votes |
def main(_):
  with gfile.Open(FLAGS.input_path, "r") as f:
    sents_data = f.read().strip().split("\n")[1:]

  classes = [0 for _ in range(num_labels[FLAGS.task_name])]
  entropies = []

  # Assume that the last three columns are probability information
  for x in tqdm(sents_data):
    probs = (x.split("\t"))[-num_labels[FLAGS.task_name]:]
    probs = [float(x1) for x1 in probs]
    entropies.append(stats.entropy(probs))
    classes[np.argmax(probs)] += 1

  class_distro = []
  for i, cls1 in enumerate(classes):
    class_distro.append(float(cls1) / len(sents_data))
    logging.info("Class %d = %.6f (%d / %d)", i,
                 float(cls1) / len(sents_data), cls1, len(sents_data))

  class_entropy = stats.entropy(class_distro)
  logging.info("Class distribution self-entropy = %.8f", class_entropy)

  logging.info("Average per-instance self-entropy = %.8f", np.mean(entropies))
  logging.info("Max per-instance self-entropy = %.8f", np.max(entropies))
  logging.info("Min per-instance self-entropy = %.8f", np.min(entropies))
  logging.info("Std per-instance self-entropy = %.8f", np.std(entropies))
  return
Example #8
Source File: preprocess_edit_distance_one.py From language with Apache License 2.0 | 5 votes |
def main(_):
  random.seed(FLAGS.random_seed)

  with gfile.Open(FLAGS.input_path, "r") as f:
    sents_data = f.read().strip().split("\n")

  header = sents_data[0]
  sents_data = sents_data[1:]

  vocab = build_vocab(sents_data)
  subset_sents_data = build_subset(sents_data)

  output_data = []

  for sent in subset_sents_data:
    output_data.append(sent)
    data_point_parts = sent.split("\t")
    original_sent = data_point_parts[0].split()
    if FLAGS.keep_only_original:
      continue
    # For each perturbation, construct a new sentence by randomly replacing
    # one word
    for _ in range(FLAGS.num_pertubations):
      pertubed = [x for x in original_sent]
      pertubed[random.randint(0, len(original_sent) - 1)] = random.choice(vocab)
      output_data.append(" ".join(pertubed) + " \t" +
                         "\t".join(data_point_parts[1:]))

  output_data = [header] + output_data

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write("\n".join(output_data) + "\n")

  return
Example #9
Source File: evaluate_squad_2.py From language with Apache License 2.0 | 5 votes |
def main(_):
  with gfile.Open(FLAGS.data_file) as f:
    dataset_json = json.load(f)
    dataset = dataset_json['data']
  with gfile.Open(FLAGS.pred_file) as f:
    preds = json.load(f)

  qid_to_has_ans = make_qid_to_has_ans(dataset)  # maps qid to True/False
  has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
  no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]

  exact_raw, f1_raw = get_raw_scores(dataset, preds)
  na_probs = {k: 0.0 for k in preds}
  exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
                                        1.0)
  f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, 1.0)
  out_eval = make_eval_dict(exact_thresh, f1_thresh)

  if has_ans_qids:
    has_ans_eval = make_eval_dict(
        exact_thresh, f1_thresh, qid_list=has_ans_qids)
    merge_eval(out_eval, has_ans_eval, 'HasAns')
  if no_ans_qids:
    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
    merge_eval(out_eval, no_ans_eval, 'NoAns')

  if FLAGS.out_file:
    with gfile.Open(FLAGS.out_file, 'w') as f:
      json.dump(out_eval, f)
  else:
    print(json.dumps(out_eval, indent=2))
Example #10
Source File: evaluate_squad.py From language with Apache License 2.0 | 5 votes |
def main(_):

  def load_dataset_file(dataset_file):
    with gfile.Open(dataset_file) as df:
      dataset_json = json.load(df)
      data = dataset_json['data']
    return data

  def load_preds_file(prediction_file):
    with gfile.Open(prediction_file) as pf:
      preds = json.load(pf)
    return preds

  if FLAGS.dataset_file and FLAGS.dataset_file2:
    dataset1 = load_dataset_file(FLAGS.dataset_file)
    dataset2 = load_dataset_file(FLAGS.dataset_file2)
    print(json.dumps(evaluate_dataset_dataset(dataset1, dataset2)))

  elif FLAGS.prediction_file and FLAGS.prediction_file2:
    preds1 = load_preds_file(FLAGS.prediction_file)
    preds2 = load_preds_file(FLAGS.prediction_file2)
    print(json.dumps(evaluate_preds_preds(preds1, preds2)))

  else:
    dataset = load_dataset_file(FLAGS.dataset_file)
    preds = load_preds_file(FLAGS.prediction_file)
    print(json.dumps(evaluate_dataset_preds(dataset, preds)))
Example #11
Source File: local_mnist.py From magenta with Apache License 2.0 | 4 votes |
def read_data_sets(
    train_dir,
    fake_data=False,  # pylint:disable=unused-argument
    one_hot=False,
    dtype=np.float32,
    reshape=True,
    validation_size=5000,
    seed=None,
):
  """Read multiple datasets."""
  # pylint:disable=invalid-name
  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

  local_file = os.path.join(train_dir, TRAIN_IMAGES)
  with gfile.Open(local_file, 'rb') as f:
    train_images = extract_images(f)

  local_file = os.path.join(train_dir, TRAIN_LABELS)
  with gfile.Open(local_file, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)

  local_file = os.path.join(train_dir, TEST_IMAGES)
  with gfile.Open(local_file, 'rb') as f:
    test_images = extract_images(f)

  local_file = os.path.join(train_dir, TEST_LABELS)
  with gfile.Open(local_file, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'.format(
            len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  options = dict(dtype=dtype, reshape=reshape, seed=seed)

  train = DataSet(train_images, train_labels, **options)
  validation = DataSet(validation_images, validation_labels, **options)
  test = DataSet(test_images, test_labels, **options)

  return train, validation, test
Example #12
Source File: preprocess_distill_input.py From language with Apache License 2.0 | 4 votes |
def main(_):
  task_name = FLAGS.task_name.lower()

  with gfile.Open(FLAGS.sents_path, "r") as f:
    sents_data = f.read().strip().split("\n")

  header = sents_data[0] + "".join(
      ["\tlabel%d_prob" % i for i in range(num_labels[task_name])])
  sents_data = sents_data[1:]

  if FLAGS.probs_path:
    with gfile.Open(FLAGS.probs_path, "r") as f:
      probs_data = f.read().strip().split("\n")
  else:
    probs_data = None

  if FLAGS.split_type == "train":
    assert len(sents_data) == len(probs_data)
    output_data = [
        x.strip() + "\t" + y.strip() for x, y in zip(sents_data, probs_data)
    ]

  elif FLAGS.split_type == "train_argmax":
    assert len(sents_data) == len(probs_data)
    # Round the probability vectors before adding them to file
    output_data = []
    for x, y in zip(sents_data, probs_data):
      # Convert tsv probability vector to numpy style array
      prob_vector = np.array([float(yy) for yy in y.split("\t")])
      # initialize a vector with zeros
      argmax_prob_vector = np.zeros_like(prob_vector)
      # keep only the argmax prediction
      argmax_prob_vector[np.argmax(prob_vector)] = 1.0
      argmax_prob_str = "\t".join([str(yy) for yy in argmax_prob_vector])
      output_data.append(x.strip() + "\t" + argmax_prob_str.strip())

  elif FLAGS.split_type == "dev":
    if task_name == "sst-2":
      output_data = [
          x.strip() + "\t1\t0" if x.split("\t")[1] == "0" else
          x.strip() + "\t0\t1" for x in sents_data
      ]
    elif task_name == "mnli":
      output_data = [
          x.strip() + mnli_map[x.split("\t")[-1]] for x in sents_data
      ]

  output_data = [header] + output_data

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write("\n".join(output_data) + "\n")

  return
Example #13
Source File: preprocess_util.py From language with Apache License 2.0 | 4 votes |
def build_vocab(sents_data,
                task_name,
                vocab_mode="downstream_vocab",
                vocab_path=None):
  """find all words in input corpus to build a vocabulary."""
  if vocab_mode == "bert_vocab":
    # Use a custom vocab to carry out filtering (such as BERT's word piece)
    with gfile.Open(vocab_path, "r") as f:
      vocab = f.read().strip().split("\n")
    # Filter out special tokens
    vocab = [x for x in vocab if x[0] != "[" and x[-1] != "]"]
    probs = [1.0 / len(vocab) for _ in vocab]

  elif vocab_mode == "full_corpus":
    # Use all words in a corpus of text to find out the vocabulary
    vocab = collections.Counter("\n".join(sents_data).split())
    vocab = [(k, v) for k, v in vocab.items()]
    vocab.sort(key=lambda x: x[1], reverse=True)
    vocab_total = sum([x[1] for x in vocab])
    probs = [float(x[1]) / vocab_total for x in vocab]
    vocab = [x[0] for x in vocab]

  elif "full_corpus_top_" in vocab_mode:
    full_vocab = collections.defaultdict(int)
    for sent in sents_data:
      for word in sent.split():
        full_vocab[word] += 1
    # Sort the vocabulary words according to their frequency
    full_vocab = sorted([(k, v) for k, v in full_vocab.items()],
                        key=lambda x: x[1],
                        reverse=True)
    # Take the top-k values from the vocabulary for the final list
    top_k_val = int(vocab_mode[len("full_corpus_top_"):])
    vocab = [x[0] for x in full_vocab[:top_k_val]]
    probs = [1.0 / len(vocab) for _ in vocab]

  elif vocab_mode == "downstream_vocab":
    vocab = collections.defaultdict(int)
    for sent in sents_data:
      for index in task_input_indices[task_name]:
        original_sent = sent.split("\t")[index].split()
        for word in original_sent:
          vocab[word] += 1
    vocab = [(k, v) for k, v in vocab.items()]
    vocab.sort(key=lambda x: x[1], reverse=True)
    vocab_total = sum([x[1] for x in vocab])
    probs = [float(x[1]) / vocab_total for x in vocab]
    vocab = [x[0] for x in vocab]

  else:
    probs = None
    vocab = None

  return vocab, probs
Example #14
Source File: combine_qa.py From language with Apache License 2.0 | 4 votes |
def main(_):
  with gfile.Open(FLAGS.questions_path, "r") as f:
    questions_data = json.loads(f.read())

  with gfile.Open(FLAGS.predictions_path, "r") as f:
    predictions_data = json.loads(f.read())

  counter = 0
  unanswerable = 0
  total = 0

  for instance in tqdm.tqdm(questions_data["data"]):
    for para in instance["paragraphs"]:
      para_text = para["context"]
      for qa in para["qas"]:
        answer_text = predictions_data[qa["id"]]
        total += 1
        if answer_text.strip():
          qa["is_impossible"] = False
          # Due to minor data processing issues, there are a few cases where
          # the predicted answer does not exist exactly in the paragraph text.
          # In this case, check if the first word of the answer is present in
          # the paragraph and approximate the answer_start using it.
          if answer_text not in para_text:
            counter += 1
            # If even the first word is not in the paragraph, ignore this QA
            if answer_text.split()[0] not in para_text:
              continue
            else:
              # Approximate answer_start by the position of the first word
              qa["answers"] = [{
                  "text": answer_text,
                  "answer_start": para_text.index(answer_text.split()[0])
              }]
            continue
          # The usual case where answer_text is exactly present in para_text
          qa["answers"] = [{
              "text": answer_text,
              "answer_start": para_text.index(answer_text)
          }]
        else:
          # This makes the output compatible with SQuAD 2.0
          unanswerable += 1
          qa["answers"] = []
          qa["is_impossible"] = True

  logging.info("%d / %d answers were unanswerable", unanswerable, total)
  logging.info("%d / %d answers didn't have an exact match", counter, total)

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write(json.dumps(questions_data))
Example #15
Source File: preprocess_fraction_squad.py From language with Apache License 2.0 | 4 votes |
def main(_):
  random.seed(FLAGS.random_seed)

  with gfile.Open(FLAGS.input_path, "r") as f:
    sents_data = json.loads(f.read())

  output_data = {"data": [], "version": FLAGS.version}

  # Find all the question IDs in the SQuAD dataset
  question_ids = []
  for instance in sents_data["data"]:
    for para in instance["paragraphs"]:
      for qa in para["qas"]:
        question_ids.append(qa["id"])

  # Randomly shuffle the question IDs, and choose FLAGS.fraction percent of them
  random.shuffle(question_ids)
  num_final_questions = int(round(len(question_ids) * FLAGS.fraction))
  question_ids = {x: 1 for x in question_ids[:num_final_questions]}

  # Preserve the original dataset size and paragraphs, choose random questions
  # based on the question IDs which survived the filtering.
  for instance in tqdm.tqdm(sents_data["data"]):
    instance_data = {"title": instance["title"], "paragraphs": []}
    for para in instance["paragraphs"]:
      para_instance = {"context": para["context"], "qas": []}
      for qa in para["qas"]:
        # Only choose those questions which survived the filtering.
        if qa["id"] in question_ids:
          para_instance["qas"].append(qa)
      # Don't append paras with no QAs
      if para_instance["qas"]:
        instance_data["paragraphs"].append(para_instance)
    # Don't append instances with no paragraphs.
    if instance_data["paragraphs"]:
      output_data["data"].append(instance_data)

  # Count the total number of questions in the final, smaller dataset.
  total_questions = 0
  for instance in output_data["data"]:
    for para in instance["paragraphs"]:
      for qa in para["qas"]:
        total_questions += 1

  logging.info("Final dataset size = %d", total_questions)

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write(json.dumps(output_data))

  return