Python tensorflow.compat.v1.gfile() Examples

The following are 15 code examples of tensorflow.compat.v1.gfile(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tensorflow.compat.v1, or try the search function.
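Before the project-specific examples, here is a minimal, self-contained sketch of the gfile API itself (the paths are illustrative): gfile mirrors ordinary Python file I/O but also works with remote filesystems such as GCS.

import json

from tensorflow.compat.v1 import gfile

path = '/tmp/example.json'  # Illustrative path; gs:// and other filesystem prefixes also work.

# Write a small JSON file.
with gfile.Open(path, 'w') as f:
  f.write(json.dumps({'hello': 'world'}))

# Read it back; gfile.Open and gfile.GFile are interchangeable.
with gfile.GFile(path, 'r') as f:
  data = json.loads(f.read())

print(data)                         # {'hello': 'world'}
print(gfile.Exists(path))           # True
print(gfile.ListDirectory('/tmp'))  # Names of entries in the directory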
Example #1
Source File: configurable_ops.py    From morph-net with Apache License 2.0
def decorator_from_parameterization_file(
    filename, fallback_rule=FallbackRule.pass_through, **kwargs):
  """Create a ConfigurableOps from a parameterization file.

  Loads a json parameterization file from disk
  (as saved by tools.structure_exporter) and creates a ConfigurableOps from it.

  Args:
    filename: Path to a parameterization file in json format.
    fallback_rule: A `FallbackRule` enum which controls fallback behavior
      (see __init__ for more detail.)
    **kwargs: Miscellaneous args for ConfigurableOps.

  Returns:
    A ConfigurableOps instance with the parameterization from `filename`.
  """
  with tf.gfile.Open(filename, 'r') as f:
    parameterization = json.loads(f.read())
    return ConfigurableOps(
        parameterization=parameterization, fallback_rule=fallback_rule,
        **kwargs) 
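Since the parameterization consumed above is plain JSON, the same gfile API can produce one. A minimal sketch, assuming a parameterization that simply maps op names to output sizes (the path and keys below are hypothetical, not taken from the morph-net project):

import json

import tensorflow.compat.v1 as tf

# Hypothetical parameterization: op names mapped to output sizes.
parameterization = {'conv1/Conv2D': 13, 'conv2/Conv2D': 21}
with tf.gfile.Open('/tmp/parameterization.json', 'w') as f:
  f.write(json.dumps(parameterization))

# The resulting file can be loaded by decorator_from_parameterization_file above.
ops = decorator_from_parameterization_file('/tmp/parameterization.json')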
Example #2
Source File: evaluate_squad_watermark.py    From language with Apache License 2.0
def main(_):

  def load_dataset_file(dataset_file):
    with gfile.Open(dataset_file) as df:
      dataset_json = json.load(df)
    data = dataset_json['data']
    return data

  def load_preds_file(prediction_file):
    with gfile.Open(prediction_file) as pf:
      preds = json.load(pf)
    return preds

  dataset = load_dataset_file(FLAGS.watermark_file)
  preds = load_preds_file(FLAGS.watermark_output_file)
  logging.info('Watermark Label Accuracy =')
  logging.info(
      json.dumps(evaluate_dataset_preds(dataset, preds, ans_key='answers')))
  logging.info('Victim Label Accuracy =')
  logging.info(
      json.dumps(
          evaluate_dataset_preds(dataset, preds, ans_key='original_answers'))) 
Example #3
Source File: corpus.py    From lamb with Apache License 2.0
def read_character_based_corpus(filename, encoding='utf-8'):
  with codecs.getreader(encoding)(tf.gfile.GFile(filename, mode='rb')) as f:
    return Corpus([line.rstrip('\n') for line in f]) 
Example #4
Source File: corpus.py    From lamb with Apache License 2.0
def read_word_based_corpus(filename, encoding='utf-8'):
  with codecs.getreader(encoding)(tf.gfile.GFile(filename, mode='rb')) as f:
    return Corpus([line.split() for line in f]) 
Example #5
Source File: fixed_replay_buffer.py    From batch_rl with Apache License 2.0
def _load_replay_buffers(self, num_buffers=None):
    """Loads multiple checkpoints into a list of replay buffers."""
    if not self._loaded_buffers:  # pytype: disable=attribute-error
      ckpts = gfile.ListDirectory(self._data_dir)  # pytype: disable=attribute-error
      # Assumes that the checkpoints are saved in the format CKPT_NAME.{SUFFIX}.gz
      ckpt_counters = collections.Counter(
          [name.split('.')[-2] for name in ckpts])
      # Should contain the files for add_count, action, observation, reward,
      # terminal and invalid_range
      ckpt_suffixes = [x for x in ckpt_counters if ckpt_counters[x] in [6, 7]]
      if num_buffers is not None:
        ckpt_suffixes = np.random.choice(
            ckpt_suffixes, num_buffers, replace=False)
      self._replay_buffers = []
      # Load the replay buffers in parallel
      with futures.ThreadPoolExecutor(
          max_workers=num_buffers) as thread_pool_executor:
        replay_futures = [thread_pool_executor.submit(
            self._load_buffer, suffix) for suffix in ckpt_suffixes]
      for f in replay_futures:
        replay_buffer = f.result()
        if replay_buffer is not None:
          self._replay_buffers.append(replay_buffer)
          self.add_count = max(replay_buffer.add_count, self.add_count)
      self._num_replay_buffers = len(self._replay_buffers)
      if self._num_replay_buffers:
        self._loaded_buffers = True 
Example #6
Source File: merge_datasets_simple.py    From language with Apache License 2.0
def main(_):

  output_data = []
  dataset_paths = FLAGS.dataset_paths.split(",")

  for dp in dataset_paths:
    with gfile.Open(dp, "r") as f:
      base_dataset = f.read().strip().split("\n")
      base_dataset_header = base_dataset[0]
      base_dataset = base_dataset[1:]

    indices_base_dataset = [
        base_dataset_header.split("\t").index(x)
        for x in relevant_headers[FLAGS.task_name]
    ]

    for point in base_dataset:
      input_shards = [
          point.split("\t")[index] for index in indices_base_dataset
      ]
      output_data.append(("%d\t" % len(output_data)) + "\t".join(input_shards))

    logging.info("Final dataset size = %d", len(output_data))

  final_header = "index\t" + "\t".join(relevant_headers[FLAGS.task_name])
  output_data = [final_header] + output_data

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write("\n".join(output_data) + "\n") 
Example #7
Source File: dataset_analysis.py    From language with Apache License 2.0
def main(_):

  with gfile.Open(FLAGS.input_path, "r") as f:
    sents_data = f.read().strip().split("\n")[1:]

  classes = [0 for _ in range(num_labels[FLAGS.task_name])]
  entropies = []

  # Assume that the last num_labels[FLAGS.task_name] columns are probability information
  for x in tqdm(sents_data):
    probs = (x.split("\t"))[-num_labels[FLAGS.task_name]:]
    probs = [float(x1) for x1 in probs]
    entropies.append(stats.entropy(probs))
    classes[np.argmax(probs)] += 1

  class_distro = []
  for i, cls1 in enumerate(classes):
    class_distro.append(float(cls1) / len(sents_data))
    logging.info("Class %d = %.6f (%d / %d)", i,
                 float(cls1) / len(sents_data), cls1, len(sents_data))

  class_entropy = stats.entropy(class_distro)
  logging.info("Class distribution self-entropy = %.8f", class_entropy)
  logging.info("Average per-instance self-entropy = %.8f", np.mean(entropies))
  logging.info("Max per-instance self-entropy = %.8f", np.max(entropies))
  logging.info("Min per-instance self-entropy = %.8f", np.min(entropies))
  logging.info("Std per-instance self-entropy = %.8f", np.std(entropies))
  return 
Example #8
Source File: preprocess_edit_distance_one.py    From language with Apache License 2.0
def main(_):
  random.seed(FLAGS.random_seed)

  with gfile.Open(FLAGS.input_path, "r") as f:
    sents_data = f.read().strip().split("\n")

  header = sents_data[0]
  sents_data = sents_data[1:]

  vocab = build_vocab(sents_data)
  subset_sents_data = build_subset(sents_data)

  output_data = []

  for sent in subset_sents_data:
    output_data.append(sent)
    data_point_parts = sent.split("\t")
    original_sent = data_point_parts[0].split()

    if FLAGS.keep_only_original:
      continue

    # For each perturbation, construct a new sentence and randomly replace a word
    for _ in range(FLAGS.num_pertubations):
      pertubed = [x for x in original_sent]
      pertubed[random.randint(0, len(original_sent) - 1)] = random.choice(vocab)
      output_data.append(" ".join(pertubed) + " \t" +
                         "\t".join(data_point_parts[1:]))

  output_data = [header] + output_data

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write("\n".join(output_data) + "\n")

  return 
Example #9
Source File: evaluate_squad_2.py    From language with Apache License 2.0
def main(_):
  with gfile.Open(FLAGS.data_file) as f:
    dataset_json = json.load(f)
    dataset = dataset_json['data']
  with gfile.Open(FLAGS.pred_file) as f:
    preds = json.load(f)

  qid_to_has_ans = make_qid_to_has_ans(dataset)  # maps qid to True/False
  has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
  no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
  exact_raw, f1_raw = get_raw_scores(dataset, preds)
  na_probs = {k: 0.0 for k in preds}
  exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
                                        1.0)
  f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, 1.0)
  out_eval = make_eval_dict(exact_thresh, f1_thresh)
  if has_ans_qids:
    has_ans_eval = make_eval_dict(
        exact_thresh, f1_thresh, qid_list=has_ans_qids)
    merge_eval(out_eval, has_ans_eval, 'HasAns')
  if no_ans_qids:
    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
    merge_eval(out_eval, no_ans_eval, 'NoAns')

  if FLAGS.out_file:
    with gfile.Open(FLAGS.out_file, 'w') as f:
      json.dump(out_eval, f)
  else:
    print(json.dumps(out_eval, indent=2)) 
Example #10
Source File: evaluate_squad.py    From language with Apache License 2.0
def main(_):

  def load_dataset_file(dataset_file):
    with gfile.Open(dataset_file) as df:
      dataset_json = json.load(df)
    data = dataset_json['data']
    return data

  def load_preds_file(prediction_file):
    with gfile.Open(prediction_file) as pf:
      preds = json.load(pf)
    return preds

  if FLAGS.dataset_file and FLAGS.dataset_file2:
    dataset1 = load_dataset_file(FLAGS.dataset_file)
    dataset2 = load_dataset_file(FLAGS.dataset_file2)
    print(json.dumps(evaluate_dataset_dataset(dataset1, dataset2)))

  elif FLAGS.prediction_file and FLAGS.prediction_file2:
    preds1 = load_preds_file(FLAGS.prediction_file)
    preds2 = load_preds_file(FLAGS.prediction_file2)
    print(json.dumps(evaluate_preds_preds(preds1, preds2)))

  else:
    dataset = load_dataset_file(FLAGS.dataset_file)
    preds = load_preds_file(FLAGS.prediction_file)
    print(json.dumps(evaluate_dataset_preds(dataset, preds))) 
Example #11
Source File: local_mnist.py    From magenta with Apache License 2.0
def read_data_sets(
    train_dir,
    fake_data=False,  # pylint:disable=unused-argument
    one_hot=False,
    dtype=np.float32,
    reshape=True,
    validation_size=5000,
    seed=None,
):
  """Read multiple datasets."""
  # pylint:disable=invalid-name
  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

  local_file = os.path.join(train_dir, TRAIN_IMAGES)
  with gfile.Open(local_file, 'rb') as f:
    train_images = extract_images(f)

  local_file = os.path.join(train_dir, TRAIN_LABELS)
  with gfile.Open(local_file, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)

  local_file = os.path.join(train_dir, TEST_IMAGES)
  with gfile.Open(local_file, 'rb') as f:
    test_images = extract_images(f)

  local_file = os.path.join(train_dir, TEST_LABELS)
  with gfile.Open(local_file, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'.format(
            len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  options = dict(dtype=dtype, reshape=reshape, seed=seed)

  train = DataSet(train_images, train_labels, **options)
  validation = DataSet(validation_images, validation_labels, **options)
  test = DataSet(test_images, test_labels, **options)

  return train, validation, test 
Example #12
Source File: preprocess_distill_input.py    From language with Apache License 2.0
def main(_):
  task_name = FLAGS.task_name.lower()

  with gfile.Open(FLAGS.sents_path, "r") as f:
    sents_data = f.read().strip().split("\n")

  header = sents_data[0] + "".join(
      ["\tlabel%d_prob" % i for i in range(num_labels[task_name])])
  sents_data = sents_data[1:]

  if FLAGS.probs_path:
    with gfile.Open(FLAGS.probs_path, "r") as f:
      probs_data = f.read().strip().split("\n")
  else:
    probs_data = None

  if FLAGS.split_type == "train":
    assert len(sents_data) == len(probs_data)
    output_data = [
        x.strip() + "\t" + y.strip() for x, y in zip(sents_data, probs_data)
    ]

  elif FLAGS.split_type == "train_argmax":
    assert len(sents_data) == len(probs_data)
    # Convert each probability vector to a one-hot argmax vector before adding it to the file
    output_data = []
    for x, y in zip(sents_data, probs_data):
      # Convert the tsv probability vector to a numpy array
      prob_vector = np.array([float(yy) for yy in y.split("\t")])
      # Initialize a vector of zeros
      argmax_prob_vector = np.zeros_like(prob_vector)
      # Keep only the argmax prediction
      argmax_prob_vector[np.argmax(prob_vector)] = 1.0
      argmax_prob_str = "\t".join([str(yy) for yy in argmax_prob_vector])
      output_data.append(x.strip() + "\t" + argmax_prob_str.strip())

  elif FLAGS.split_type == "dev":
    if task_name == "sst-2":
      output_data = [
          x.strip() + "\t1\t0" if x.split("\t")[1] == "0" else x.strip() +
          "\t0\t1" for x in sents_data
      ]
    elif task_name == "mnli":
      output_data = [
          x.strip() + mnli_map[x.split("\t")[-1]] for x in sents_data
      ]

  output_data = [header] + output_data

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write("\n".join(output_data) + "\n")

  return 
Example #13
Source File: preprocess_util.py    From language with Apache License 2.0
def build_vocab(sents_data,
                task_name,
                vocab_mode="downstream_vocab",
                vocab_path=None):
  """find all words in input corpus to build a vocabulary."""
  if vocab_mode == "bert_vocab":
    # Use a custom vocab to carry out filtering (such as BERT's word piece)
    with gfile.Open(vocab_path, "r") as f:
      vocab = f.read().strip().split("\n")
    # Filter out special tokens
    vocab = [x for x in vocab if x[0] != "[" and x[-1] != "]"]
    probs = [1.0 / len(vocab) for _ in vocab]

  elif vocab_mode == "full_corpus":
    # Use all words in a corpus of text to find out the vocabulary
    vocab = collections.Counter("\n".join(sents_data).split())
    vocab = [(k, v) for k, v in vocab.items()]
    vocab.sort(key=lambda x: x[1], reverse=True)
    vocab_total = sum([x[1] for x in vocab])

    probs = [float(x[1]) / vocab_total for x in vocab]
    vocab = [x[0] for x in vocab]

  elif "full_corpus_top_" in vocab_mode:
    full_vocab = collections.defaultdict(int)
    for sent in sents_data:
      for word in sent.split():
        full_vocab[word] += 1
    # Sort the vocabulary words according to their frequency
    full_vocab = sorted([(k, v) for k, v in full_vocab.items()],
                        key=lambda x: x[1],
                        reverse=True)
    # Take the top-k values from the vocabulary for the final list
    top_k_val = int(vocab_mode[len("full_corpus_top_"):])
    vocab = [x[0] for x in full_vocab[:top_k_val]]
    probs = [1.0 / len(vocab) for _ in vocab]

  elif vocab_mode == "downstream_vocab":
    vocab = collections.defaultdict(int)
    for sent in sents_data:
      for index in task_input_indices[task_name]:
        original_sent = sent.split("\t")[index].split()
        for word in original_sent:
          vocab[word] += 1

    vocab = [(k, v) for k, v in vocab.items()]
    vocab.sort(key=lambda x: x[1], reverse=True)
    vocab_total = sum([x[1] for x in vocab])

    probs = [float(x[1]) / vocab_total for x in vocab]
    vocab = [x[0] for x in vocab]

  else:
    probs = None
    vocab = None

  return vocab, probs 
Example #14
Source File: combine_qa.py    From language with Apache License 2.0
def main(_):
  with gfile.Open(FLAGS.questions_path, "r") as f:
    questions_data = json.loads(f.read())

  with gfile.Open(FLAGS.predictions_path, "r") as f:
    predictions_data = json.loads(f.read())

  counter = 0
  unanswerable = 0
  total = 0

  for instance in tqdm.tqdm(questions_data["data"]):

    for para in instance["paragraphs"]:
      para_text = para["context"]

      for qa in para["qas"]:
        answer_text = predictions_data[qa["id"]]
        total += 1

        if answer_text.strip():
          qa["is_impossible"] = False
          # due to minor data processing issues, there are a few cases where the
          # predicted answer does not exist exactly in the paragraph text.
          # In this case, check if the first word of the answer is present in
          # the paragraph and approximate the answer_start using it.
          if answer_text not in para_text:
            counter += 1
            # If even the first word is not in the paragraph, ignore this QA
            if answer_text.split()[0] not in para_text:
              continue
            else:
              # approximate answer_start by the position of the first word
              qa["answers"] = [{
                  "text": answer_text,
                  "answer_start": para_text.index(answer_text.split()[0])
              }]
              continue
          # the usual case where answer_text is exactly present in para_text
          qa["answers"] = [{
              "text": answer_text,
              "answer_start": para_text.index(answer_text)
          }]

        else:
          # This makes the output compatible with SQuAD 2.0
          unanswerable += 1
          qa["answers"] = []
          qa["is_impossible"] = True

  logging.info("%d / %d answers were unanswerable", unanswerable, total)
  logging.info("%d / %d answers didn't have an exact match", counter, total)

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write(json.dumps(questions_data)) 
Example #15
Source File: preprocess_fraction_squad.py    From language with Apache License 2.0
def main(_):
  random.seed(FLAGS.random_seed)

  with gfile.Open(FLAGS.input_path, "r") as f:
    sents_data = json.loads(f.read())

  output_data = {"data": [], "version": FLAGS.version}

  # Find all the question IDs in the SQuAD dataset
  question_ids = []
  for instance in sents_data["data"]:
    for para in instance["paragraphs"]:
      for qa in para["qas"]:
        question_ids.append(qa["id"])

  # Randomly shuffle the question IDs and keep a FLAGS.fraction proportion of them
  random.shuffle(question_ids)
  num_final_questions = int(round(len(question_ids) * FLAGS.fraction))
  question_ids = {x: 1 for x in question_ids[:num_final_questions]}

  # Preserve the original dataset structure and paragraphs, keeping only the
  # questions whose IDs survived the filtering.
  for instance in tqdm.tqdm(sents_data["data"]):
    instance_data = {"title": instance["title"], "paragraphs": []}
    for para in instance["paragraphs"]:
      para_instance = {"context": para["context"], "qas": []}
      for qa in para["qas"]:
        # Only choose those questions which survived the filtering.
        if qa["id"] in question_ids:
          para_instance["qas"].append(qa)
      # Don't append paras with no QAs
      if para_instance["qas"]:
        instance_data["paragraphs"].append(para_instance)
    # Don't append instances with no paragraphs.
    if instance_data["paragraphs"]:
      output_data["data"].append(instance_data)

  # Count the total number of questions in the final, smaller dataset.
  total_questions = 0
  for instance in output_data["data"]:
    for para in instance["paragraphs"]:
      for qa in para["qas"]:
        total_questions += 1

  logging.info("Final dataset size = %d", total_questions)

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write(json.dumps(output_data))

  return