Python tensorflow.compat.v1.gfile() Examples
The following are 15 code examples of tensorflow.compat.v1.gfile(), drawn from open-source projects. The project, source file, and license are noted above each example. You may also want to check out all available functions and classes of the module tensorflow.compat.v1.
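Before diving into the examples: most of them rely on only a handful of gfile calls, namely gfile.Open / gfile.GFile for reading and writing, gfile.Exists, and gfile.ListDirectory. The short sketch below illustrates that pattern; the path /tmp/gfile_demo.json is a placeholder chosen for this sketch, not something taken from the examples.

import json

from tensorflow.compat.v1 import gfile

# gfile.Open is an alias of gfile.GFile; it works with local paths as well as
# remote filesystems (e.g. gs://...) supported by TensorFlow's file system layer.
demo_path = '/tmp/gfile_demo.json'  # placeholder path for this sketch

# Write a small JSON file.
with gfile.Open(demo_path, 'w') as f:
  f.write(json.dumps({'hello': 'world'}))

# Read it back, mirroring the json.load(gfile.Open(...)) pattern used in the
# SQuAD evaluation examples below.
with gfile.Open(demo_path, 'r') as f:
  data = json.load(f)
print(data['hello'])

# Existence checks and directory listings, as used in the replay-buffer example.
print(gfile.Exists(demo_path))
print(gfile.ListDirectory('/tmp')[:5])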
Example #1
Source File: configurable_ops.py From morph-net with Apache License 2.0 | 6 votes
def decorator_from_parameterization_file(
    filename, fallback_rule=FallbackRule.pass_through, **kwargs):
  """Create a ConfigurableOps from a parameterization file.

  Loads a json parameterization file from disk (as saved by
  tools.structure_exporter) and creates a ConfigurableOps from it.

  Args:
    filename: Path to a parameterization file in json format.
    fallback_rule: A `FallbackRule` enum which controls fallback behavior
      (see __init__ for more detail).
    **kwargs: Miscellaneous args for ConfigurableOps.

  Returns:
    A ConfigurableOps instance with the parameterization from `filename`.
  """
  with tf.gfile.Open(filename, 'r') as f:
    parameterization = json.loads(f.read())
  return ConfigurableOps(
      parameterization=parameterization, fallback_rule=fallback_rule, **kwargs)
Example #2
Source File: evaluate_squad_watermark.py From language with Apache License 2.0 | 6 votes |
def main(_):

  def load_dataset_file(dataset_file):
    with gfile.Open(dataset_file) as df:
      dataset_json = json.load(df)
      data = dataset_json['data']
    return data

  def load_preds_file(prediction_file):
    with gfile.Open(prediction_file) as pf:
      preds = json.load(pf)
    return preds

  dataset = load_dataset_file(FLAGS.watermark_file)
  preds = load_preds_file(FLAGS.watermark_output_file)

  logging.info('Watermark Label Accuracy =')
  logging.info(
      json.dumps(evaluate_dataset_preds(dataset, preds, ans_key='answers')))

  logging.info('Victim Label Accuracy =')
  logging.info(
      json.dumps(
          evaluate_dataset_preds(dataset, preds, ans_key='original_answers')))
Example #3
Source File: corpus.py From lamb with Apache License 2.0 | 5 votes |
def read_character_based_corpus(filename, encoding='utf-8'):
  with codecs.getreader(encoding)(tf.gfile.GFile(filename, mode='rb')) as f:
    return Corpus([line.rstrip('\n') for line in f])
Example #4
Source File: corpus.py From lamb with Apache License 2.0 | 5 votes |
def read_word_based_corpus(filename, encoding='utf-8'):
  with codecs.getreader(encoding)(tf.gfile.GFile(filename, mode='rb')) as f:
    return Corpus([line.split() for line in f])
Example #5
Source File: fixed_replay_buffer.py From batch_rl with Apache License 2.0 | 5 votes |
def _load_replay_buffers(self, num_buffers=None):
  """Loads multiple checkpoints into a list of replay buffers."""
  if not self._loaded_buffers:  # pytype: disable=attribute-error
    ckpts = gfile.ListDirectory(self._data_dir)  # pytype: disable=attribute-error
    # Assumes that the checkpoints are saved in a format CKPT_NAME.{SUFFIX}.gz
    ckpt_counters = collections.Counter(
        [name.split('.')[-2] for name in ckpts])
    # Should contain the files for add_count, action, observation, reward,
    # terminal and invalid_range
    ckpt_suffixes = [x for x in ckpt_counters if ckpt_counters[x] in [6, 7]]
    if num_buffers is not None:
      ckpt_suffixes = np.random.choice(
          ckpt_suffixes, num_buffers, replace=False)
    self._replay_buffers = []
    # Load the replay buffers in parallel
    with futures.ThreadPoolExecutor(
        max_workers=num_buffers) as thread_pool_executor:
      replay_futures = [thread_pool_executor.submit(
          self._load_buffer, suffix) for suffix in ckpt_suffixes]
    for f in replay_futures:
      replay_buffer = f.result()
      if replay_buffer is not None:
        self._replay_buffers.append(replay_buffer)
        self.add_count = max(replay_buffer.add_count, self.add_count)
    self._num_replay_buffers = len(self._replay_buffers)
    if self._num_replay_buffers:
      self._loaded_buffers = True
Example #6
Source File: merge_datasets_simple.py From language with Apache License 2.0 | 5 votes |
def main(_):
  output_data = []

  dataset_paths = FLAGS.dataset_paths.split(",")

  for dp in dataset_paths:
    with gfile.Open(dp, "r") as f:
      base_dataset = f.read().strip().split("\n")

    base_dataset_header = base_dataset[0]
    base_dataset = base_dataset[1:]

    indices_base_dataset = [
        base_dataset_header.split("\t").index(x)
        for x in relevant_headers[FLAGS.task_name]
    ]

    for point in base_dataset:
      input_shards = [
          point.split("\t")[index] for index in indices_base_dataset
      ]
      output_data.append(("%d\t" % len(output_data)) + "\t".join(input_shards))

  logging.info("Final dataset size = %d", len(output_data))

  final_header = "index\t" + "\t".join(relevant_headers[FLAGS.task_name])
  output_data = [final_header] + output_data

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write("\n".join(output_data) + "\n")
Example #7
Source File: dataset_analysis.py From language with Apache License 2.0 | 5 votes |
def main(_):
  with gfile.Open(FLAGS.input_path, "r") as f:
    sents_data = f.read().strip().split("\n")[1:]

  classes = [0 for _ in range(num_labels[FLAGS.task_name])]
  entropies = []

  # Assume that the last three columns are probability information
  for x in tqdm(sents_data):
    probs = (x.split("\t"))[-num_labels[FLAGS.task_name]:]
    probs = [float(x1) for x1 in probs]
    entropies.append(stats.entropy(probs))
    classes[np.argmax(probs)] += 1

  class_distro = []
  for i, cls1 in enumerate(classes):
    class_distro.append(float(cls1) / len(sents_data))
    logging.info("Class %d = %.6f (%d / %d)", i,
                 float(cls1) / len(sents_data), cls1, len(sents_data))

  class_entropy = stats.entropy(class_distro)
  logging.info("Class distribution self-entropy = %.8f", class_entropy)

  logging.info("Average per-instance self-entropy = %.8f", np.mean(entropies))
  logging.info("Max per-instance self-entropy = %.8f", np.max(entropies))
  logging.info("Min per-instance self-entropy = %.8f", np.min(entropies))
  logging.info("Std per-instance self-entropy = %.8f", np.std(entropies))
  return
Example #8
Source File: preprocess_edit_distance_one.py From language with Apache License 2.0 | 5 votes |
def main(_):
  random.seed(FLAGS.random_seed)

  with gfile.Open(FLAGS.input_path, "r") as f:
    sents_data = f.read().strip().split("\n")

  header = sents_data[0]
  sents_data = sents_data[1:]

  vocab = build_vocab(sents_data)
  subset_sents_data = build_subset(sents_data)

  output_data = []

  for sent in subset_sents_data:
    output_data.append(sent)
    data_point_parts = sent.split("\t")
    original_sent = data_point_parts[0].split()
    if FLAGS.keep_only_original:
      continue
    # For each perturbation, construct a new sentence by randomly replacing
    # one word
    for _ in range(FLAGS.num_pertubations):
      pertubed = [x for x in original_sent]
      pertubed[random.randint(0, len(original_sent) - 1)] = random.choice(vocab)
      output_data.append(" ".join(pertubed) + " \t" +
                         "\t".join(data_point_parts[1:]))

  output_data = [header] + output_data

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write("\n".join(output_data) + "\n")

  return
Example #9
Source File: evaluate_squad_2.py From language with Apache License 2.0 | 5 votes |
def main(_):
  with gfile.Open(FLAGS.data_file) as f:
    dataset_json = json.load(f)
    dataset = dataset_json['data']
  with gfile.Open(FLAGS.pred_file) as f:
    preds = json.load(f)

  qid_to_has_ans = make_qid_to_has_ans(dataset)  # maps qid to True/False
  has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
  no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]

  exact_raw, f1_raw = get_raw_scores(dataset, preds)
  na_probs = {k: 0.0 for k in preds}
  exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
                                        1.0)
  f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, 1.0)
  out_eval = make_eval_dict(exact_thresh, f1_thresh)

  if has_ans_qids:
    has_ans_eval = make_eval_dict(
        exact_thresh, f1_thresh, qid_list=has_ans_qids)
    merge_eval(out_eval, has_ans_eval, 'HasAns')
  if no_ans_qids:
    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
    merge_eval(out_eval, no_ans_eval, 'NoAns')

  if FLAGS.out_file:
    with gfile.Open(FLAGS.out_file, 'w') as f:
      json.dump(out_eval, f)
  else:
    print(json.dumps(out_eval, indent=2))
Example #10
Source File: evaluate_squad.py From language with Apache License 2.0 | 5 votes |
def main(_):

  def load_dataset_file(dataset_file):
    with gfile.Open(dataset_file) as df:
      dataset_json = json.load(df)
      data = dataset_json['data']
    return data

  def load_preds_file(prediction_file):
    with gfile.Open(prediction_file) as pf:
      preds = json.load(pf)
    return preds

  if FLAGS.dataset_file and FLAGS.dataset_file2:
    dataset1 = load_dataset_file(FLAGS.dataset_file)
    dataset2 = load_dataset_file(FLAGS.dataset_file2)
    print(json.dumps(evaluate_dataset_dataset(dataset1, dataset2)))

  elif FLAGS.prediction_file and FLAGS.prediction_file2:
    preds1 = load_preds_file(FLAGS.prediction_file)
    preds2 = load_preds_file(FLAGS.prediction_file2)
    print(json.dumps(evaluate_preds_preds(preds1, preds2)))

  else:
    dataset = load_dataset_file(FLAGS.dataset_file)
    preds = load_preds_file(FLAGS.prediction_file)
    print(json.dumps(evaluate_dataset_preds(dataset, preds)))
Example #11
Source File: local_mnist.py From magenta with Apache License 2.0 | 4 votes |
def read_data_sets(
    train_dir,
    fake_data=False,  # pylint:disable=unused-argument
    one_hot=False,
    dtype=np.float32,
    reshape=True,
    validation_size=5000,
    seed=None,
):
  """Read multiple datasets."""
  # pylint:disable=invalid-name
  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
  TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

  local_file = os.path.join(train_dir, TRAIN_IMAGES)
  with gfile.Open(local_file, 'rb') as f:
    train_images = extract_images(f)

  local_file = os.path.join(train_dir, TRAIN_LABELS)
  with gfile.Open(local_file, 'rb') as f:
    train_labels = extract_labels(f, one_hot=one_hot)

  local_file = os.path.join(train_dir, TEST_IMAGES)
  with gfile.Open(local_file, 'rb') as f:
    test_images = extract_images(f)

  local_file = os.path.join(train_dir, TEST_LABELS)
  with gfile.Open(local_file, 'rb') as f:
    test_labels = extract_labels(f, one_hot=one_hot)

  if not 0 <= validation_size <= len(train_images):
    raise ValueError(
        'Validation size should be between 0 and {}. Received: {}.'.format(
            len(train_images), validation_size))

  validation_images = train_images[:validation_size]
  validation_labels = train_labels[:validation_size]
  train_images = train_images[validation_size:]
  train_labels = train_labels[validation_size:]

  options = dict(dtype=dtype, reshape=reshape, seed=seed)

  train = DataSet(train_images, train_labels, **options)
  validation = DataSet(validation_images, validation_labels, **options)
  test = DataSet(test_images, test_labels, **options)

  return train, validation, test
Example #12
Source File: preprocess_distill_input.py From language with Apache License 2.0 | 4 votes |
def main(_):
  task_name = FLAGS.task_name.lower()

  with gfile.Open(FLAGS.sents_path, "r") as f:
    sents_data = f.read().strip().split("\n")

  header = sents_data[0] + "".join(
      ["\tlabel%d_prob" % i for i in range(num_labels[task_name])])
  sents_data = sents_data[1:]

  if FLAGS.probs_path:
    with gfile.Open(FLAGS.probs_path, "r") as f:
      probs_data = f.read().strip().split("\n")
  else:
    probs_data = None

  if FLAGS.split_type == "train":
    assert len(sents_data) == len(probs_data)
    output_data = [
        x.strip() + "\t" + y.strip() for x, y in zip(sents_data, probs_data)
    ]

  elif FLAGS.split_type == "train_argmax":
    assert len(sents_data) == len(probs_data)
    # Round the probability vectors before adding them to file
    output_data = []
    for x, y in zip(sents_data, probs_data):
      # Convert tsv probability vector to numpy style array
      prob_vector = np.array([float(yy) for yy in y.split("\t")])
      # initialize a vector with zeros
      argmax_prob_vector = np.zeros_like(prob_vector)
      # keep only the argmax prediction
      argmax_prob_vector[np.argmax(prob_vector)] = 1.0
      argmax_prob_str = "\t".join([str(yy) for yy in argmax_prob_vector])
      output_data.append(x.strip() + "\t" + argmax_prob_str.strip())

  elif FLAGS.split_type == "dev":
    if task_name == "sst-2":
      output_data = [
          x.strip() + "\t1\t0" if x.split("\t")[1] == "0" else
          x.strip() + "\t0\t1" for x in sents_data
      ]
    elif task_name == "mnli":
      output_data = [
          x.strip() + mnli_map[x.split("\t")[-1]] for x in sents_data
      ]

  output_data = [header] + output_data

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write("\n".join(output_data) + "\n")

  return
Example #13
Source File: preprocess_util.py From language with Apache License 2.0 | 4 votes |
def build_vocab(sents_data,
                task_name,
                vocab_mode="downstream_vocab",
                vocab_path=None):
  """find all words in input corpus to build a vocabulary."""
  if vocab_mode == "bert_vocab":
    # Use a custom vocab to carry out filtering (such as BERT's word piece)
    with gfile.Open(vocab_path, "r") as f:
      vocab = f.read().strip().split("\n")
    # Filter out special tokens
    vocab = [x for x in vocab if x[0] != "[" and x[-1] != "]"]
    probs = [1.0 / len(vocab) for _ in vocab]

  elif vocab_mode == "full_corpus":
    # Use all words in a corpus of text to find out the vocabulary
    vocab = collections.Counter("\n".join(sents_data).split())
    vocab = [(k, v) for k, v in vocab.items()]
    vocab.sort(key=lambda x: x[1], reverse=True)
    vocab_total = sum([x[1] for x in vocab])
    probs = [float(x[1]) / vocab_total for x in vocab]
    vocab = [x[0] for x in vocab]

  elif "full_corpus_top_" in vocab_mode:
    full_vocab = collections.defaultdict(int)
    for sent in sents_data:
      for word in sent.split():
        full_vocab[word] += 1
    # Sort the vocabulary words according to their frequency
    full_vocab = sorted([(k, v) for k, v in full_vocab.items()],
                        key=lambda x: x[1],
                        reverse=True)
    # Take the top-k values from the vocabulary for the final list
    top_k_val = int(vocab_mode[len("full_corpus_top_"):])
    vocab = [x[0] for x in full_vocab[:top_k_val]]
    probs = [1.0 / len(vocab) for _ in vocab]

  elif vocab_mode == "downstream_vocab":
    vocab = collections.defaultdict(int)
    for sent in sents_data:
      for index in task_input_indices[task_name]:
        original_sent = sent.split("\t")[index].split()
        for word in original_sent:
          vocab[word] += 1
    vocab = [(k, v) for k, v in vocab.items()]
    vocab.sort(key=lambda x: x[1], reverse=True)
    vocab_total = sum([x[1] for x in vocab])
    probs = [float(x[1]) / vocab_total for x in vocab]
    vocab = [x[0] for x in vocab]

  else:
    probs = None
    vocab = None

  return vocab, probs
Example #14
Source File: combine_qa.py From language with Apache License 2.0 | 4 votes |
def main(_):
  with gfile.Open(FLAGS.questions_path, "r") as f:
    questions_data = json.loads(f.read())

  with gfile.Open(FLAGS.predictions_path, "r") as f:
    predictions_data = json.loads(f.read())

  counter = 0
  unanswerable = 0
  total = 0

  for instance in tqdm.tqdm(questions_data["data"]):
    for para in instance["paragraphs"]:
      para_text = para["context"]
      for qa in para["qas"]:
        answer_text = predictions_data[qa["id"]]
        total += 1
        if answer_text.strip():
          qa["is_impossible"] = False
          # Due to minor data processing issues, there are a few cases where
          # the predicted answer does not exist exactly in the paragraph text.
          # In this case, check if the first word of the answer is present in
          # the paragraph and approximate the answer_start using it.
          if answer_text not in para_text:
            counter += 1
            # If even the first word is not in the paragraph, ignore this QA
            if answer_text.split()[0] not in para_text:
              continue
            else:
              # Approximate answer_start by the position of the first word
              qa["answers"] = [{
                  "text": answer_text,
                  "answer_start": para_text.index(answer_text.split()[0])
              }]
            continue
          # The usual case where answer_text is exactly present in para_text
          qa["answers"] = [{
              "text": answer_text,
              "answer_start": para_text.index(answer_text)
          }]
        else:
          # This makes the output compatible with SQuAD 2.0
          unanswerable += 1
          qa["answers"] = []
          qa["is_impossible"] = True

  logging.info("%d / %d answers were unanswerable", unanswerable, total)
  logging.info("%d / %d answers didn't have an exact match", counter, total)

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write(json.dumps(questions_data))
Example #15
Source File: preprocess_fraction_squad.py From language with Apache License 2.0 | 4 votes |
def main(_):
  random.seed(FLAGS.random_seed)

  with gfile.Open(FLAGS.input_path, "r") as f:
    sents_data = json.loads(f.read())

  output_data = {"data": [], "version": FLAGS.version}

  # Find all the question IDs in the SQuAD dataset
  question_ids = []
  for instance in sents_data["data"]:
    for para in instance["paragraphs"]:
      for qa in para["qas"]:
        question_ids.append(qa["id"])

  # Randomly shuffle the question IDs, and choose FLAGS.fraction percent of them
  random.shuffle(question_ids)
  num_final_questions = int(round(len(question_ids) * FLAGS.fraction))
  question_ids = {x: 1 for x in question_ids[:num_final_questions]}

  # Preserve the original dataset size and paragraphs, choose random questions
  # based on the question IDs which survived the filtering.
  for instance in tqdm.tqdm(sents_data["data"]):
    instance_data = {"title": instance["title"], "paragraphs": []}
    for para in instance["paragraphs"]:
      para_instance = {"context": para["context"], "qas": []}
      for qa in para["qas"]:
        # Only choose those questions which survived the filtering.
        if qa["id"] in question_ids:
          para_instance["qas"].append(qa)
      # Don't append paras with no QAs
      if para_instance["qas"]:
        instance_data["paragraphs"].append(para_instance)
    # Don't append instances with no paragraphs.
    if instance_data["paragraphs"]:
      output_data["data"].append(instance_data)

  # Count the total number of questions in the final, smaller dataset.
  total_questions = 0
  for instance in output_data["data"]:
    for para in instance["paragraphs"]:
      for qa in para["qas"]:
        total_questions += 1

  logging.info("Final dataset size = %d", total_questions)

  with gfile.Open(FLAGS.output_path, "w") as f:
    f.write(json.dumps(output_data))

  return