Python data_utils.print_out() Examples

The following code examples show how data_utils.print_out() is used. They are all drawn from neural_gpu_trainer.py, which refers to the data_utils module by the shorter alias data, so the calls appear as data.print_out(...). The source project and license for each excerpt are noted above it. You may also want to look at the other functions and classes that data_utils provides.
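Before the examples, a minimal stand-in for print_out itself, so the logging calls below have a concrete shape. This is a sketch under an assumption, not the module's actual code: it only assumes that print_out writes a message to stdout and flushes it; the real helper may also append to a log file.

import sys

def print_out(s, newline=True):
  """Minimal stand-in: write the message to stdout and flush immediately."""
  sys.stdout.write(s + ("\n" if newline else ""))
  sys.stdout.flush()

print_out("  wmt len 8 errors 1.25 sequence-errors 3.12")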
Example #1
Source File: neural_gpu_trainer.py    From multilabel-image-classification-tensorflow with MIT License
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in range(l):
      f.write(rev_vocab[i])
      for j in range(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n") 
Example #2
Source File: neural_gpu_trainer.py    From ECO-pytorch with BSD 2-Clause "Simplified" License
def multi_test(l, model, sess, task, nprint, batch_size, offset=None,
               ensemble=None):
  """Run multiple tests at lower batch size to save memory."""
  errors, seq_err = 0.0, 0.0
  to_print = nprint
  low_batch = FLAGS.low_batch_size
  low_batch = min(low_batch, batch_size)
  for mstep in range(batch_size // low_batch):
    cur_offset = None if offset is None else offset + mstep * low_batch
    err, sq_err, _ = single_test(l, model, sess, task, to_print, low_batch,
                                 False, cur_offset, ensemble=ensemble)
    to_print = max(0, to_print - low_batch)
    errors += err
    seq_err += sq_err
    if FLAGS.mode > 0:
      cur_errors = float(low_batch * errors) / ((mstep+1) * low_batch)
      cur_seq_err = float(low_batch * seq_err) / ((mstep+1) * low_batch)
      data.print_out("    %s multitest current errors %.2f sequence-errors %.2f"
                     % (task, 100*cur_errors, 100*cur_seq_err))
  errors = float(low_batch) * float(errors) / batch_size
  seq_err = float(low_batch) * float(seq_err) / batch_size
  data.print_out("  %s len %d errors %.2f sequence-errors %.2f"
                 % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err 
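multi_test splits one evaluation batch into batch_size // low_batch sub-batches, accumulates the per-sub-batch error fractions returned by single_test, and then rescales the sums by low_batch / batch_size, which is simply the mean over sub-batches. A quick check with made-up numbers:

# Hypothetical per-sub-batch error fractions returned by single_test.
sub_batch_errs = [0.10, 0.20, 0.00, 0.30]
low_batch, batch_size = 256, 1024            # 4 sub-batches
overall = float(low_batch) * sum(sub_batch_errs) / batch_size
# The rescaling equals the plain mean of the sub-batch fractions.
assert abs(overall - sum(sub_batch_errs) / len(sub_batch_errs)) < 1e-9
print("overall error fraction: %.4f" % overall)   # 0.1500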
Example #3
Source File: neural_gpu_trainer.py    From HumanRecognition with MIT License
def read_data(source_path, target_path, buckets, max_size=None, print_out=True):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the files with token-ids for the source language.
    target_path: path to the file with token-ids for the target language;
      it must be aligned with the source file: n-th line contains the desired
      output for n-th line from the source_path.
    buckets: the buckets to use.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).
      If set to 1, no data will be returned (empty lists of the right form).
    print_out: whether to print out status or not.

  Returns:
    data_set: a list of length len(buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) <= buckets[n] and
      len(target) <= buckets[n]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in buckets]
  counter = 0
  if max_size != 1:
    with tf.gfile.GFile(source_path, mode="r") as source_file:
      with tf.gfile.GFile(target_path, mode="r") as target_file:
        source, target = source_file.readline(), target_file.readline()
        while source and target and (not max_size or counter < max_size):
          counter += 1
          if counter % 100000 == 0 and print_out:
            print "  reading data line %d" % counter
            sys.stdout.flush()
          source_ids = [int(x) for x in source.split()]
          target_ids = [int(x) for x in target.split()]
          source_ids, source_len = zero_split(source_ids)
          target_ids, target_len = zero_split(target_ids, append=wmt.EOS_ID)
          for bucket_id, size in enumerate(buckets):
            if source_len <= size and target_len <= size:
              data_set[bucket_id].append([source_ids, target_ids])
              break
          source, target = source_file.readline(), target_file.readline()
  return data_set 
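The bucketing rule is simple: each (source, target) pair goes into the first bucket whose size bounds both sequences. A standalone sketch of just that rule, with made-up token ids and bucket sizes (zero_split and the EOS handling from the module are omitted here):

buckets = [8, 16, 32]                        # hypothetical bucket sizes
pairs = [([3, 7, 7], [9, 4]),                # fits bucket 0
         ([3] * 12, [9] * 10),               # fits bucket 1
         ([3] * 20, [9] * 30)]               # fits bucket 2
data_set = [[] for _ in buckets]
for source_ids, target_ids in pairs:
  for bucket_id, size in enumerate(buckets):
    if len(source_ids) <= size and len(target_ids) <= size:
      data_set[bucket_id].append([source_ids, target_ids])
      break
print([len(b) for b in data_set])            # [1, 1, 1]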
Example #4
Source File: neural_gpu_trainer.py    From HumanRecognition with MIT License
def read_data_into_global(source_path, target_path, buckets,
                          max_size=None, print_out=True):
  """Read data into the global variables (can be in a separate thread)."""
  # pylint: disable=global-variable-not-assigned
  global global_train_set, train_buckets_scale
  # pylint: enable=global-variable-not-assigned
  data_set = read_data(source_path, target_path, buckets, max_size, print_out)
  global_train_set["wmt"].append(data_set)
  train_total_size = calculate_buckets_scale(data_set, buckets, "wmt")
  if print_out:
    print "  Finished global data reading (%d)." % train_total_size 
Example #5
Source File: neural_gpu_trainer.py    From AI_Reader with Apache License 2.0
def single_test(l, model, sess, task, nprint, batch_size, print_out=True,
                offset=None, ensemble=None, get_steps=False):
  """Test model on test data of length l using the given session."""
  inpt, target = data.get_batch(l, batch_size, False, task, offset)
  _, res, _, steps = model.step(sess, inpt, target, False, get_steps=get_steps)
  errors, total, seq_err = data.accuracy(inpt, res, target, batch_size, nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out("  %s len %d errors %.2f sequence-errors %.2f"
                   % (task, l, 100*errors, 100*seq_err))
  # Ensemble eval.
  if ensemble:
    results = []
    for m in ensemble:
      model.saver.restore(sess, m)
      _, result, _, _ = model.step(sess, inpt, target, False)
      m_errors, m_total, m_seq_err = data.accuracy(inpt, result, target,
                                                   batch_size, nprint)
      m_seq_err = float(m_seq_err) / batch_size
      if m_total > 0:
        m_errors = float(m_errors) / m_total
      data.print_out("     %s len %d m-errors %.2f m-sequence-errors %.2f"
                     % (task, l, 100*m_errors, 100*m_seq_err))
      results.append(result)
    ens = [sum(o) for o in zip(*results)]
    errors, total, seq_err = data.accuracy(inpt, ens, target,
                                           batch_size, nprint)
    seq_err = float(seq_err) / batch_size
    if total > 0:
      errors = float(errors) / total
    if print_out:
      data.print_out("  %s len %d ens-errors %.2f ens-sequence-errors %.2f"
                     % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err, (steps, inpt, [np.argmax(o, axis=1) for o in res]) 
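The ensemble branch restores each checkpoint in turn, collects its outputs, sums them position-wise (the zip(*results) line), and measures accuracy on the summed scores. A tiny numpy illustration of that combination step, with made-up scores for two positions over a two-symbol vocabulary:

import numpy as np

model_a = np.array([[0.1, 0.9], [0.8, 0.2]])   # hypothetical scores: 2 positions x 2 symbols
model_b = np.array([[0.4, 0.6], [0.3, 0.7]])
ens = model_a + model_b                        # position-wise sum, as in zip(*results)
print(np.argmax(ens, axis=1))                  # [1 0]: ensemble picks symbol 1, then symbol 0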
Example #6
Source File: neural_gpu_trainer.py    From HumanRecognition with MIT License
def single_test(bin_id, model, sess, nprint, batch_size, dev, p, print_out=True,
                offset=None, beam_model=None):
  """Test model on test data of length l using the given session."""
  if not dev[p][bin_id]:
    data.print_out("  bin %d (%d)\t%s\tppl NA errors NA seq-errors NA"
                   % (bin_id, data.bins[bin_id], p))
    return 1.0, 1.0, 0.0
  inpt, target = data.get_batch(
      bin_id, batch_size, dev[p], FLAGS.height, offset)
  if FLAGS.beam_size > 1 and beam_model:
    loss, res, new_tgt, scores = m_step(
        model, beam_model, sess, batch_size, inpt, target, bin_id,
        FLAGS.eval_beam_steps, p)
    score_avgs = [sum(s) / float(len(s)) for s in scores]
    score_maxs = [max(s) for s in scores]
    score_str = ["(%.2f, %.2f)" % (score_avgs[i], score_maxs[i])
                 for i in range(FLAGS.eval_beam_steps)]
    data.print_out("  == scores (avg, max): %s" % "; ".join(score_str))
    errors, total, seq_err = data.accuracy(inpt, res, target, batch_size,
                                           nprint, new_tgt, scores[-1])
  else:
    loss, res, _, _ = model.step(sess, inpt, target, False)
    errors, total, seq_err = data.accuracy(inpt, res, target, batch_size,
                                           nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out("  bin %d (%d)\t%s\tppl %.2f errors %.2f seq-errors %.2f"
                   % (bin_id, data.bins[bin_id], p, data.safe_exp(loss),
                      100 * errors, 100 * seq_err))
  return (errors, seq_err, loss) 
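The ppl column is the per-symbol loss turned into a perplexity. safe_exp is assumed here to be an overflow-guarded exponential, along the lines of this stand-in (the clipping threshold is an assumption for illustration):

import math

def safe_exp(x, cap=100.0):
  """Stand-in: exp with the exponent clipped so a diverging loss stays finite."""
  return math.exp(min(x, cap))

print("ppl %.2f" % safe_exp(1.75))   # ppl 5.75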
Example #7
Source File: neural_gpu_trainer.py    From HumanRecognition with MIT License
def assign_vectors(word_vector_file, embedding_key, vocab_path, sess):
  """Assign the embedding_key variable from the given word vectors file."""
  # For words in the word vector file, set their embedding at start.
  if not tf.gfile.Exists(word_vector_file):
    data.print_out("Word vector file does not exist: %s" % word_vector_file)
    sys.exit(1)
  vocab, _ = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  data.print_out("Pre-setting word vectors from %s" % word_vector_file)
  with tf.gfile.GFile(word_vector_file, mode="r") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for line in f:
      line_parts = line.split()
      # The first part is the word.
      word = line_parts[0]
      if word in vocab:
        # Remaining parts are components of the vector.
        word_vector = np.array([float(x) for x in line_parts[1:]])
        if len(word_vector) != FLAGS.vec_size:
          data.print_out("Warn: Word '%s', Expecting vector size %d, "
                         "found %d" % (word, FLAGS.vec_size,
                                       len(word_vector)))
        else:
          vectors[vocab[word]] = word_vector
  # Assign the modified vectors to the vectors_variable in the graph.
  sess.run([vectors_variable.initializer],
           {vectors_variable.initializer.inputs[1]: vectors}) 
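The final sess.run feeds the modified numpy array into the variable's initializer op, whose second input is the initial value, which overwrites the stored embeddings. In TF 1.x the same effect can be obtained more readably with Variable.load; a one-line sketch, reusing the vectors_variable, vectors and sess names from assign_vectors above:

# Equivalent assignment in TF 1.x, assuming the names from assign_vectors above.
vectors_variable.load(vectors, sess)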