Python data_utils.print_out() Examples
The following are 30 code examples of data_utils.print_out(). You can vote up the ones you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module data_utils, or try the search function.
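All of the examples below call the helper through data.print_out(...). The helper's own implementation is not shown on this page; as a rough point of reference, here is a minimal stand-in sketch (an assumption, not the project's actual code) that echoes a message to stdout and optionally appends it to a log file, so the snippets can be read against something concrete. The module-level log_filename variable is hypothetical.

import sys

# Hypothetical module-level log path; the real data_utils sets up its own logging.
log_filename = None


def print_out(s, newline=True):
  """Minimal stand-in for data_utils.print_out (assumed behavior):
  write the message to stdout and, if log_filename is set, append it there."""
  out = s + ("\n" if newline else "")
  if log_filename:
    with open(log_filename, "a") as f:
      f.write(out)
  sys.stdout.write(out)
  sys.stdout.flush()


# Example call, mirroring how the snippets below use it:
print_out("Printing 3 word vectors from embedding to vectors.txt.")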
Example #1
Source File: neural_gpu_trainer.py From multilabel-image-classification-tensorflow with MIT License | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #2
Source File: neural_gpu_trainer.py From ECO-pytorch with BSD 2-Clause "Simplified" License | 6 votes |
def multi_test(l, model, sess, task, nprint, batch_size, offset=None,
               ensemble=None):
  """Run multiple tests at lower batch size to save memory."""
  errors, seq_err = 0.0, 0.0
  to_print = nprint
  low_batch = FLAGS.low_batch_size
  low_batch = min(low_batch, batch_size)
  for mstep in xrange(batch_size / low_batch):
    cur_offset = None if offset is None else offset + mstep * low_batch
    err, sq_err, _ = single_test(l, model, sess, task, to_print, low_batch,
                                 False, cur_offset, ensemble=ensemble)
    to_print = max(0, to_print - low_batch)
    errors += err
    seq_err += sq_err
    if FLAGS.mode > 0:
      cur_errors = float(low_batch * errors) / ((mstep+1) * low_batch)
      cur_seq_err = float(low_batch * seq_err) / ((mstep+1) * low_batch)
      data.print_out(" %s multitest current errors %.2f sequence-errors %.2f"
                     % (task, 100*cur_errors, 100*cur_seq_err))
  errors = float(low_batch) * float(errors) / batch_size
  seq_err = float(low_batch) * float(seq_err) / batch_size
  data.print_out(" %s len %d errors %.2f sequence-errors %.2f"
                 % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err
Example #3
Source File: neural_gpu_trainer.py From HumanRecognition with MIT License | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #4
Source File: neural_gpu_trainer.py From Action_Recognition_Zoo with MIT License | 6 votes |
def multi_test(l, model, sess, task, nprint, batch_size, offset=None,
               ensemble=None):
  """Run multiple tests at lower batch size to save memory."""
  errors, seq_err = 0.0, 0.0
  to_print = nprint
  low_batch = FLAGS.low_batch_size
  low_batch = min(low_batch, batch_size)
  for mstep in xrange(batch_size / low_batch):
    cur_offset = None if offset is None else offset + mstep * low_batch
    err, sq_err, _ = single_test(l, model, sess, task, to_print, low_batch,
                                 False, cur_offset, ensemble=ensemble)
    to_print = max(0, to_print - low_batch)
    errors += err
    seq_err += sq_err
    if FLAGS.mode > 0:
      cur_errors = float(low_batch * errors) / ((mstep+1) * low_batch)
      cur_seq_err = float(low_batch * seq_err) / ((mstep+1) * low_batch)
      data.print_out(" %s multitest current errors %.2f sequence-errors %.2f"
                     % (task, 100*cur_errors, 100*cur_seq_err))
  errors = float(low_batch) * float(errors) / batch_size
  seq_err = float(low_batch) * float(seq_err) / batch_size
  data.print_out(" %s len %d errors %.2f sequence-errors %.2f"
                 % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err
Example #5
Source File: neural_gpu_trainer.py From Gun-Detector with Apache License 2.0 | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #6
Source File: neural_gpu_trainer.py From g-tensorflow-models with Apache License 2.0 | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #7
Source File: neural_gpu_trainer.py From hands-detection with MIT License | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #8
Source File: neural_gpu_trainer.py From yolo_v2 with Apache License 2.0 | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #9
Source File: neural_gpu_trainer.py From AI_Reader with Apache License 2.0 | 6 votes |
def multi_test(l, model, sess, task, nprint, batch_size, offset=None,
               ensemble=None):
  """Run multiple tests at lower batch size to save memory."""
  errors, seq_err = 0.0, 0.0
  to_print = nprint
  low_batch = FLAGS.low_batch_size
  low_batch = min(low_batch, batch_size)
  for mstep in xrange(batch_size / low_batch):
    cur_offset = None if offset is None else offset + mstep * low_batch
    err, sq_err, _ = single_test(l, model, sess, task, to_print, low_batch,
                                 False, cur_offset, ensemble=ensemble)
    to_print = max(0, to_print - low_batch)
    errors += err
    seq_err += sq_err
    if FLAGS.mode > 0:
      cur_errors = float(low_batch * errors) / ((mstep+1) * low_batch)
      cur_seq_err = float(low_batch * seq_err) / ((mstep+1) * low_batch)
      data.print_out(" %s multitest current errors %.2f sequence-errors %.2f"
                     % (task, 100*cur_errors, 100*cur_seq_err))
  errors = float(low_batch) * float(errors) / batch_size
  seq_err = float(low_batch) * float(seq_err) / batch_size
  data.print_out(" %s len %d errors %.2f sequence-errors %.2f"
                 % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err
Example #10
Source File: neural_gpu_trainer.py From models with Apache License 2.0 | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #11
Source File: neural_gpu_trainer.py From object_detection_kitti with Apache License 2.0 | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #12
Source File: neural_gpu_trainer.py From DOTA_models with Apache License 2.0 | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #13
Source File: neural_gpu_trainer.py From object_detection_with_tensorflow with MIT License | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #14
Source File: neural_gpu_trainer.py From HumanRecognition with MIT License | 5 votes |
def read_data(source_path, target_path, buckets, max_size=None, print_out=True):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the files with token-ids for the source language.
    target_path: path to the file with token-ids for the target language;
      it must be aligned with the source file: n-th line contains the desired
      output for n-th line from the source_path.
    buckets: the buckets to use.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).
      If set to 1, no data will be returned (empty lists of the right form).
    print_out: whether to print out status or not.

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in buckets]
  counter = 0
  if max_size != 1:
    with tf.gfile.GFile(source_path, mode="r") as source_file:
      with tf.gfile.GFile(target_path, mode="r") as target_file:
        source, target = source_file.readline(), target_file.readline()
        while source and target and (not max_size or counter < max_size):
          counter += 1
          if counter % 100000 == 0 and print_out:
            print " reading data line %d" % counter
            sys.stdout.flush()
          source_ids = [int(x) for x in source.split()]
          target_ids = [int(x) for x in target.split()]
          source_ids, source_len = zero_split(source_ids)
          target_ids, target_len = zero_split(target_ids, append=wmt.EOS_ID)
          for bucket_id, size in enumerate(buckets):
            if source_len <= size and target_len <= size:
              data_set[bucket_id].append([source_ids, target_ids])
              break
          source, target = source_file.readline(), target_file.readline()
  return data_set
Example #15
Source File: neural_gpu_trainer.py From HumanRecognition with MIT License | 5 votes |
def read_data_into_global(source_path, target_path, buckets,
                          max_size=None, print_out=True):
  """Read data into the global variables (can be in a separate thread)."""
  # pylint: disable=global-variable-not-assigned
  global global_train_set, train_buckets_scale
  # pylint: enable=global-variable-not-assigned
  data_set = read_data(source_path, target_path, buckets, max_size, print_out)
  global_train_set["wmt"].append(data_set)
  train_total_size = calculate_buckets_scale(data_set, buckets, "wmt")
  if print_out:
    print " Finished global data reading (%d)." % train_total_size
Example #16
Source File: neural_gpu_trainer.py From AI_Reader with Apache License 2.0 | 5 votes |
def single_test(l, model, sess, task, nprint, batch_size, print_out=True,
                offset=None, ensemble=None, get_steps=False):
  """Test model on test data of length l using the given session."""
  inpt, target = data.get_batch(l, batch_size, False, task, offset)
  _, res, _, steps = model.step(sess, inpt, target, False, get_steps=get_steps)
  errors, total, seq_err = data.accuracy(inpt, res, target, batch_size, nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out(" %s len %d errors %.2f sequence-errors %.2f"
                   % (task, l, 100*errors, 100*seq_err))
  # Ensemble eval.
  if ensemble:
    results = []
    for m in ensemble:
      model.saver.restore(sess, m)
      _, result, _, _ = model.step(sess, inpt, target, False)
      m_errors, m_total, m_seq_err = data.accuracy(inpt, result, target,
                                                   batch_size, nprint)
      m_seq_err = float(m_seq_err) / batch_size
      if total > 0:
        m_errors = float(m_errors) / m_total
      data.print_out(" %s len %d m-errors %.2f m-sequence-errors %.2f"
                     % (task, l, 100*m_errors, 100*m_seq_err))
      results.append(result)
    ens = [sum(o) for o in zip(*results)]
    errors, total, seq_err = data.accuracy(inpt, ens, target,
                                           batch_size, nprint)
    seq_err = float(seq_err) / batch_size
    if total > 0:
      errors = float(errors) / total
    if print_out:
      data.print_out(" %s len %d ens-errors %.2f ens-sequence-errors %.2f"
                     % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err, (steps, inpt, [np.argmax(o, axis=1) for o in res])
Example #17
Source File: neural_gpu_trainer.py From HumanRecognition with MIT License | 5 votes |
def single_test(bin_id, model, sess, nprint, batch_size, dev, p, print_out=True,
                offset=None, beam_model=None):
  """Test model on test data of length l using the given session."""
  if not dev[p][bin_id]:
    data.print_out(" bin %d (%d)\t%s\tppl NA errors NA seq-errors NA"
                   % (bin_id, data.bins[bin_id], p))
    return 1.0, 1.0, 0.0
  inpt, target = data.get_batch(
      bin_id, batch_size, dev[p], FLAGS.height, offset)
  if FLAGS.beam_size > 1 and beam_model:
    loss, res, new_tgt, scores = m_step(
        model, beam_model, sess, batch_size, inpt, target, bin_id,
        FLAGS.eval_beam_steps, p)
    score_avgs = [sum(s) / float(len(s)) for s in scores]
    score_maxs = [max(s) for s in scores]
    score_str = ["(%.2f, %.2f)" % (score_avgs[i], score_maxs[i])
                 for i in xrange(FLAGS.eval_beam_steps)]
    data.print_out(" == scores (avg, max): %s" % "; ".join(score_str))
    errors, total, seq_err = data.accuracy(inpt, res, target, batch_size,
                                           nprint, new_tgt, scores[-1])
  else:
    loss, res, _, _ = model.step(sess, inpt, target, False)
    errors, total, seq_err = data.accuracy(inpt, res, target,
                                           batch_size, nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out(" bin %d (%d)\t%s\tppl %.2f errors %.2f seq-errors %.2f"
                   % (bin_id, data.bins[bin_id], p, data.safe_exp(loss),
                      100 * errors, 100 * seq_err))
  return (errors, seq_err, loss)
Example #18
Source File: neural_gpu_trainer.py From HumanRecognition with MIT License | 5 votes |
def assign_vectors(word_vector_file, embedding_key, vocab_path, sess):
  """Assign the embedding_key variable from the given word vectors file."""
  # For words in the word vector file, set their embedding at start.
  if not tf.gfile.Exists(word_vector_file):
    data.print_out("Word vector file does not exist: %s" % word_vector_file)
    sys.exit(1)
  vocab, _ = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  data.print_out("Pre-setting word vectors from %s" % word_vector_file)
  with tf.gfile.GFile(word_vector_file, mode="r") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for line in f:
      line_parts = line.split()
      # The first part is the word.
      word = line_parts[0]
      if word in vocab:
        # Remaining parts are components of the vector.
        word_vector = np.array(map(float, line_parts[1:]))
        if len(word_vector) != FLAGS.vec_size:
          data.print_out("Warn: Word '%s', Expecting vector size %d, "
                         "found %d" % (word, FLAGS.vec_size,
                                       len(word_vector)))
        else:
          vectors[vocab[word]] = word_vector
  # Assign the modified vectors to the vectors_variable in the graph.
  sess.run([vectors_variable.initializer],
           {vectors_variable.initializer.inputs[1]: vectors})
Example #19
Source File: neural_gpu_trainer.py From DOTA_models with Apache License 2.0 | 5 votes |
def read_data_into_global(source_path, target_path, buckets,
                          max_size=None, print_out=True):
  """Read data into the global variables (can be in a separate thread)."""
  # pylint: disable=global-variable-not-assigned
  global global_train_set, train_buckets_scale
  # pylint: enable=global-variable-not-assigned
  data_set = read_data(source_path, target_path, buckets, max_size, print_out)
  global_train_set["wmt"].append(data_set)
  train_total_size = calculate_buckets_scale(data_set, buckets, "wmt")
  if print_out:
    print " Finished global data reading (%d)." % train_total_size
Example #20
Source File: neural_gpu_trainer.py From object_detection_with_tensorflow with MIT License | 5 votes |
def assign_vectors(word_vector_file, embedding_key, vocab_path, sess):
  """Assign the embedding_key variable from the given word vectors file."""
  # For words in the word vector file, set their embedding at start.
  if not tf.gfile.Exists(word_vector_file):
    data.print_out("Word vector file does not exist: %s" % word_vector_file)
    sys.exit(1)
  vocab, _ = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  data.print_out("Pre-setting word vectors from %s" % word_vector_file)
  with tf.gfile.GFile(word_vector_file, mode="r") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for line in f:
      line_parts = line.split()
      # The first part is the word.
      word = line_parts[0]
      if word in vocab:
        # Remaining parts are components of the vector.
        word_vector = np.array(map(float, line_parts[1:]))
        if len(word_vector) != FLAGS.vec_size:
          data.print_out("Warn: Word '%s', Expecting vector size %d, "
                         "found %d" % (word, FLAGS.vec_size,
                                       len(word_vector)))
        else:
          vectors[vocab[word]] = word_vector
  # Assign the modified vectors to the vectors_variable in the graph.
  sess.run([vectors_variable.initializer],
           {vectors_variable.initializer.inputs[1]: vectors})
Example #21
Source File: neural_gpu_trainer.py From g-tensorflow-models with Apache License 2.0 | 5 votes |
def read_data(source_path, target_path, buckets, max_size=None, print_out=True):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the files with token-ids for the source language.
    target_path: path to the file with token-ids for the target language;
      it must be aligned with the source file: n-th line contains the desired
      output for n-th line from the source_path.
    buckets: the buckets to use.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).
      If set to 1, no data will be returned (empty lists of the right form).
    print_out: whether to print out status or not.

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in buckets]
  counter = 0
  if max_size != 1:
    with tf.gfile.GFile(source_path, mode="r") as source_file:
      with tf.gfile.GFile(target_path, mode="r") as target_file:
        source, target = source_file.readline(), target_file.readline()
        while source and target and (not max_size or counter < max_size):
          counter += 1
          if counter % 100000 == 0 and print_out:
            print(" reading data line %d" % counter)
            sys.stdout.flush()
          source_ids = [int(x) for x in source.split()]
          target_ids = [int(x) for x in target.split()]
          source_ids, source_len = zero_split(source_ids)
          target_ids, target_len = zero_split(target_ids, append=wmt.EOS_ID)
          for bucket_id, size in enumerate(buckets):
            if source_len <= size and target_len <= size:
              data_set[bucket_id].append([source_ids, target_ids])
              break
          source, target = source_file.readline(), target_file.readline()
  return data_set
Example #22
Source File: neural_gpu_trainer.py From g-tensorflow-models with Apache License 2.0 | 5 votes |
def read_data_into_global(source_path, target_path, buckets,
                          max_size=None, print_out=True):
  """Read data into the global variables (can be in a separate thread)."""
  # pylint: disable=global-variable-not-assigned
  global global_train_set, train_buckets_scale
  # pylint: enable=global-variable-not-assigned
  data_set = read_data(source_path, target_path, buckets, max_size, print_out)
  global_train_set["wmt"].append(data_set)
  train_total_size = calculate_buckets_scale(data_set, buckets, "wmt")
  if print_out:
    print(" Finished global data reading (%d)." % train_total_size)
Example #23
Source File: neural_gpu_trainer.py From g-tensorflow-models with Apache License 2.0 | 5 votes |
def single_test(bin_id, model, sess, nprint, batch_size, dev, p, print_out=True,
                offset=None, beam_model=None):
  """Test model on test data of length l using the given session."""
  if not dev[p][bin_id]:
    data.print_out(" bin %d (%d)\t%s\tppl NA errors NA seq-errors NA"
                   % (bin_id, data.bins[bin_id], p))
    return 1.0, 1.0, 0.0
  inpt, target = data.get_batch(
      bin_id, batch_size, dev[p], FLAGS.height, offset)
  if FLAGS.beam_size > 1 and beam_model:
    loss, res, new_tgt, scores = m_step(
        model, beam_model, sess, batch_size, inpt, target, bin_id,
        FLAGS.eval_beam_steps, p)
    score_avgs = [sum(s) / float(len(s)) for s in scores]
    score_maxs = [max(s) for s in scores]
    score_str = ["(%.2f, %.2f)" % (score_avgs[i], score_maxs[i])
                 for i in xrange(FLAGS.eval_beam_steps)]
    data.print_out(" == scores (avg, max): %s" % "; ".join(score_str))
    errors, total, seq_err = data.accuracy(inpt, res, target, batch_size,
                                           nprint, new_tgt, scores[-1])
  else:
    loss, res, _, _ = model.step(sess, inpt, target, False)
    errors, total, seq_err = data.accuracy(inpt, res, target,
                                           batch_size, nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out(" bin %d (%d)\t%s\tppl %.2f errors %.2f seq-errors %.2f"
                   % (bin_id, data.bins[bin_id], p, data.safe_exp(loss),
                      100 * errors, 100 * seq_err))
  return (errors, seq_err, loss)
Example #24
Source File: neural_gpu_trainer.py From g-tensorflow-models with Apache License 2.0 | 5 votes |
def assign_vectors(word_vector_file, embedding_key, vocab_path, sess):
  """Assign the embedding_key variable from the given word vectors file."""
  # For words in the word vector file, set their embedding at start.
  if not tf.gfile.Exists(word_vector_file):
    data.print_out("Word vector file does not exist: %s" % word_vector_file)
    sys.exit(1)
  vocab, _ = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  data.print_out("Pre-setting word vectors from %s" % word_vector_file)
  with tf.gfile.GFile(word_vector_file, mode="r") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for line in f:
      line_parts = line.split()
      # The first part is the word.
      word = line_parts[0]
      if word in vocab:
        # Remaining parts are components of the vector.
        word_vector = np.array(map(float, line_parts[1:]))
        if len(word_vector) != FLAGS.vec_size:
          data.print_out("Warn: Word '%s', Expecting vector size %d, "
                         "found %d" % (word, FLAGS.vec_size,
                                       len(word_vector)))
        else:
          vectors[vocab[word]] = word_vector
  # Assign the modified vectors to the vectors_variable in the graph.
  sess.run([vectors_variable.initializer],
           {vectors_variable.initializer.inputs[1]: vectors})
Example #25
Source File: neural_gpu_trainer.py From models with Apache License 2.0 | 5 votes |
def read_data(source_path, target_path, buckets, max_size=None, print_out=True):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the files with token-ids for the source language.
    target_path: path to the file with token-ids for the target language;
      it must be aligned with the source file: n-th line contains the desired
      output for n-th line from the source_path.
    buckets: the buckets to use.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).
      If set to 1, no data will be returned (empty lists of the right form).
    print_out: whether to print out status or not.

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in buckets]
  counter = 0
  if max_size != 1:
    with tf.gfile.GFile(source_path, mode="r") as source_file:
      with tf.gfile.GFile(target_path, mode="r") as target_file:
        source, target = source_file.readline(), target_file.readline()
        while source and target and (not max_size or counter < max_size):
          counter += 1
          if counter % 100000 == 0 and print_out:
            print(" reading data line %d" % counter)
            sys.stdout.flush()
          source_ids = [int(x) for x in source.split()]
          target_ids = [int(x) for x in target.split()]
          source_ids, source_len = zero_split(source_ids)
          target_ids, target_len = zero_split(target_ids, append=wmt.EOS_ID)
          for bucket_id, size in enumerate(buckets):
            if source_len <= size and target_len <= size:
              data_set[bucket_id].append([source_ids, target_ids])
              break
          source, target = source_file.readline(), target_file.readline()
  return data_set
Example #26
Source File: neural_gpu_trainer.py From models with Apache License 2.0 | 5 votes |
def read_data_into_global(source_path, target_path, buckets,
                          max_size=None, print_out=True):
  """Read data into the global variables (can be in a separate thread)."""
  # pylint: disable=global-variable-not-assigned
  global global_train_set, train_buckets_scale
  # pylint: enable=global-variable-not-assigned
  data_set = read_data(source_path, target_path, buckets, max_size, print_out)
  global_train_set["wmt"].append(data_set)
  train_total_size = calculate_buckets_scale(data_set, buckets, "wmt")
  if print_out:
    print(" Finished global data reading (%d)." % train_total_size)
Example #27
Source File: neural_gpu_trainer.py From models with Apache License 2.0 | 5 votes |
def single_test(bin_id, model, sess, nprint, batch_size, dev, p, print_out=True,
                offset=None, beam_model=None):
  """Test model on test data of length l using the given session."""
  if not dev[p][bin_id]:
    data.print_out(" bin %d (%d)\t%s\tppl NA errors NA seq-errors NA"
                   % (bin_id, data.bins[bin_id], p))
    return 1.0, 1.0, 0.0
  inpt, target = data.get_batch(
      bin_id, batch_size, dev[p], FLAGS.height, offset)
  if FLAGS.beam_size > 1 and beam_model:
    loss, res, new_tgt, scores = m_step(
        model, beam_model, sess, batch_size, inpt, target, bin_id,
        FLAGS.eval_beam_steps, p)
    score_avgs = [sum(s) / float(len(s)) for s in scores]
    score_maxs = [max(s) for s in scores]
    score_str = ["(%.2f, %.2f)" % (score_avgs[i], score_maxs[i])
                 for i in xrange(FLAGS.eval_beam_steps)]
    data.print_out(" == scores (avg, max): %s" % "; ".join(score_str))
    errors, total, seq_err = data.accuracy(inpt, res, target, batch_size,
                                           nprint, new_tgt, scores[-1])
  else:
    loss, res, _, _ = model.step(sess, inpt, target, False)
    errors, total, seq_err = data.accuracy(inpt, res, target,
                                           batch_size, nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out(" bin %d (%d)\t%s\tppl %.2f errors %.2f seq-errors %.2f"
                   % (bin_id, data.bins[bin_id], p, data.safe_exp(loss),
                      100 * errors, 100 * seq_err))
  return (errors, seq_err, loss)
Example #28
Source File: neural_gpu_trainer.py From models with Apache License 2.0 | 5 votes |
def assign_vectors(word_vector_file, embedding_key, vocab_path, sess):
  """Assign the embedding_key variable from the given word vectors file."""
  # For words in the word vector file, set their embedding at start.
  if not tf.gfile.Exists(word_vector_file):
    data.print_out("Word vector file does not exist: %s" % word_vector_file)
    sys.exit(1)
  vocab, _ = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  data.print_out("Pre-setting word vectors from %s" % word_vector_file)
  with tf.gfile.GFile(word_vector_file, mode="r") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for line in f:
      line_parts = line.split()
      # The first part is the word.
      word = line_parts[0]
      if word in vocab:
        # Remaining parts are components of the vector.
        word_vector = np.array(map(float, line_parts[1:]))
        if len(word_vector) != FLAGS.vec_size:
          data.print_out("Warn: Word '%s', Expecting vector size %d, "
                         "found %d" % (word, FLAGS.vec_size,
                                       len(word_vector)))
        else:
          vectors[vocab[word]] = word_vector
  # Assign the modified vectors to the vectors_variable in the graph.
  sess.run([vectors_variable.initializer],
           {vectors_variable.initializer.inputs[1]: vectors})
Example #29
Source File: neural_gpu_trainer.py From multilabel-image-classification-tensorflow with MIT License | 5 votes |
def read_data(source_path, target_path, buckets, max_size=None, print_out=True):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the files with token-ids for the source language.
    target_path: path to the file with token-ids for the target language;
      it must be aligned with the source file: n-th line contains the desired
      output for n-th line from the source_path.
    buckets: the buckets to use.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).
      If set to 1, no data will be returned (empty lists of the right form).
    print_out: whether to print out status or not.

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in buckets]
  counter = 0
  if max_size != 1:
    with tf.gfile.GFile(source_path, mode="r") as source_file:
      with tf.gfile.GFile(target_path, mode="r") as target_file:
        source, target = source_file.readline(), target_file.readline()
        while source and target and (not max_size or counter < max_size):
          counter += 1
          if counter % 100000 == 0 and print_out:
            print(" reading data line %d" % counter)
            sys.stdout.flush()
          source_ids = [int(x) for x in source.split()]
          target_ids = [int(x) for x in target.split()]
          source_ids, source_len = zero_split(source_ids)
          target_ids, target_len = zero_split(target_ids, append=wmt.EOS_ID)
          for bucket_id, size in enumerate(buckets):
            if source_len <= size and target_len <= size:
              data_set[bucket_id].append([source_ids, target_ids])
              break
          source, target = source_file.readline(), target_file.readline()
  return data_set
Example #30
Source File: neural_gpu_trainer.py From multilabel-image-classification-tensorflow with MIT License | 5 votes |
def read_data_into_global(source_path, target_path, buckets,
                          max_size=None, print_out=True):
  """Read data into the global variables (can be in a separate thread)."""
  # pylint: disable=global-variable-not-assigned
  global global_train_set, train_buckets_scale
  # pylint: enable=global-variable-not-assigned
  data_set = read_data(source_path, target_path, buckets, max_size, print_out)
  global_train_set["wmt"].append(data_set)
  train_total_size = calculate_buckets_scale(data_set, buckets, "wmt")
  if print_out:
    print(" Finished global data reading (%d)." % train_total_size)