Python data_utils.print_out() Examples
The following are 30 code examples of data_utils.print_out(). You can vote up the ones you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module data_utils, or try the search function.
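All of the examples below call the helper through data.print_out(...). The helper's own implementation is not shown on this page; as a rough point of reference, here is a minimal stand-in sketch (an assumption, not the project's actual code) that echoes a message to stdout and optionally appends it to a log file, so the snippets can be read against something concrete. The module-level log_filename variable is hypothetical.

import sys

# Hypothetical module-level log path; the real data_utils sets up its own logging.
log_filename = None


def print_out(s, newline=True):
  """Minimal stand-in for data_utils.print_out (assumed behavior):
  write the message to stdout and, if log_filename is set, append it there."""
  out = s + ("\n" if newline else "")
  if log_filename:
    with open(log_filename, "a") as f:
      f.write(out)
  sys.stdout.write(out)
  sys.stdout.flush()


# Example call, mirroring how the snippets below use it:
print_out("Printing 3 word vectors from embedding to vectors.txt.")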
Example #1
Source File: neural_gpu_trainer.py From multilabel-image-classification-tensorflow with MIT License | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #2
Source File: neural_gpu_trainer.py From ECO-pytorch with BSD 2-Clause "Simplified" License | 6 votes |
def multi_test(l, model, sess, task, nprint, batch_size, offset=None,
               ensemble=None):
  """Run multiple tests at lower batch size to save memory."""
  errors, seq_err = 0.0, 0.0
  to_print = nprint
  low_batch = FLAGS.low_batch_size
  low_batch = min(low_batch, batch_size)
  for mstep in xrange(batch_size / low_batch):
    cur_offset = None if offset is None else offset + mstep * low_batch
    err, sq_err, _ = single_test(l, model, sess, task, to_print, low_batch,
                                 False, cur_offset, ensemble=ensemble)
    to_print = max(0, to_print - low_batch)
    errors += err
    seq_err += sq_err
    if FLAGS.mode > 0:
      cur_errors = float(low_batch * errors) / ((mstep+1) * low_batch)
      cur_seq_err = float(low_batch * seq_err) / ((mstep+1) * low_batch)
      data.print_out(" %s multitest current errors %.2f sequence-errors %.2f"
                     % (task, 100*cur_errors, 100*cur_seq_err))
  errors = float(low_batch) * float(errors) / batch_size
  seq_err = float(low_batch) * float(seq_err) / batch_size
  data.print_out(" %s len %d errors %.2f sequence-errors %.2f"
                 % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err
Example #3
Source File: neural_gpu_trainer.py From HumanRecognition with MIT License | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #4
Source File: neural_gpu_trainer.py From Action_Recognition_Zoo with MIT License | 6 votes |
def multi_test(l, model, sess, task, nprint, batch_size, offset=None,
               ensemble=None):
  """Run multiple tests at lower batch size to save memory."""
  errors, seq_err = 0.0, 0.0
  to_print = nprint
  low_batch = FLAGS.low_batch_size
  low_batch = min(low_batch, batch_size)
  for mstep in xrange(batch_size / low_batch):
    cur_offset = None if offset is None else offset + mstep * low_batch
    err, sq_err, _ = single_test(l, model, sess, task, to_print, low_batch,
                                 False, cur_offset, ensemble=ensemble)
    to_print = max(0, to_print - low_batch)
    errors += err
    seq_err += sq_err
    if FLAGS.mode > 0:
      cur_errors = float(low_batch * errors) / ((mstep+1) * low_batch)
      cur_seq_err = float(low_batch * seq_err) / ((mstep+1) * low_batch)
      data.print_out(" %s multitest current errors %.2f sequence-errors %.2f"
                     % (task, 100*cur_errors, 100*cur_seq_err))
  errors = float(low_batch) * float(errors) / batch_size
  seq_err = float(low_batch) * float(seq_err) / batch_size
  data.print_out(" %s len %d errors %.2f sequence-errors %.2f"
                 % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err
Example #5
Source File: neural_gpu_trainer.py From Gun-Detector with Apache License 2.0 | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #6
Source File: neural_gpu_trainer.py From g-tensorflow-models with Apache License 2.0 | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #7
Source File: neural_gpu_trainer.py From hands-detection with MIT License | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #8
Source File: neural_gpu_trainer.py From yolo_v2 with Apache License 2.0 | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #9
Source File: neural_gpu_trainer.py From AI_Reader with Apache License 2.0 | 6 votes |
def multi_test(l, model, sess, task, nprint, batch_size, offset=None,
               ensemble=None):
  """Run multiple tests at lower batch size to save memory."""
  errors, seq_err = 0.0, 0.0
  to_print = nprint
  low_batch = FLAGS.low_batch_size
  low_batch = min(low_batch, batch_size)
  for mstep in xrange(batch_size / low_batch):
    cur_offset = None if offset is None else offset + mstep * low_batch
    err, sq_err, _ = single_test(l, model, sess, task, to_print, low_batch,
                                 False, cur_offset, ensemble=ensemble)
    to_print = max(0, to_print - low_batch)
    errors += err
    seq_err += sq_err
    if FLAGS.mode > 0:
      cur_errors = float(low_batch * errors) / ((mstep+1) * low_batch)
      cur_seq_err = float(low_batch * seq_err) / ((mstep+1) * low_batch)
      data.print_out(" %s multitest current errors %.2f sequence-errors %.2f"
                     % (task, 100*cur_errors, 100*cur_seq_err))
  errors = float(low_batch) * float(errors) / batch_size
  seq_err = float(low_batch) * float(seq_err) / batch_size
  data.print_out(" %s len %d errors %.2f sequence-errors %.2f"
                 % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err
Example #10
Source File: neural_gpu_trainer.py From models with Apache License 2.0 | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #11
Source File: neural_gpu_trainer.py From object_detection_kitti with Apache License 2.0 | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #12
Source File: neural_gpu_trainer.py From DOTA_models with Apache License 2.0 | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #13
Source File: neural_gpu_trainer.py From object_detection_with_tensorflow with MIT License | 6 votes |
def print_vectors(embedding_key, vocab_path, word_vector_file):
  """Print vectors from the given variable."""
  _, rev_vocab = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  l, s = vectors.shape[0], vectors.shape[1]
  data.print_out("Printing %d word vectors from %s to %s."
                 % (l, embedding_key, word_vector_file))
  with tf.gfile.GFile(word_vector_file, mode="w") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for i in xrange(l):
      f.write(rev_vocab[i])
      for j in xrange(s):
        f.write(" %.8f" % vectors[i][j])
      f.write("\n")
Example #14
Source File: neural_gpu_trainer.py From HumanRecognition with MIT License | 5 votes |
def read_data(source_path, target_path, buckets, max_size=None, print_out=True):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the files with token-ids for the source language.
    target_path: path to the file with token-ids for the target language;
      it must be aligned with the source file: n-th line contains the desired
      output for n-th line from the source_path.
    buckets: the buckets to use.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).
      If set to 1, no data will be returned (empty lists of the right form).
    print_out: whether to print out status or not.

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in buckets]
  counter = 0
  if max_size != 1:
    with tf.gfile.GFile(source_path, mode="r") as source_file:
      with tf.gfile.GFile(target_path, mode="r") as target_file:
        source, target = source_file.readline(), target_file.readline()
        while source and target and (not max_size or counter < max_size):
          counter += 1
          if counter % 100000 == 0 and print_out:
            print " reading data line %d" % counter
            sys.stdout.flush()
          source_ids = [int(x) for x in source.split()]
          target_ids = [int(x) for x in target.split()]
          source_ids, source_len = zero_split(source_ids)
          target_ids, target_len = zero_split(target_ids, append=wmt.EOS_ID)
          for bucket_id, size in enumerate(buckets):
            if source_len <= size and target_len <= size:
              data_set[bucket_id].append([source_ids, target_ids])
              break
          source, target = source_file.readline(), target_file.readline()
  return data_set
Example #15
Source File: neural_gpu_trainer.py From HumanRecognition with MIT License | 5 votes |
def read_data_into_global(source_path, target_path, buckets,
                          max_size=None, print_out=True):
  """Read data into the global variables (can be in a separate thread)."""
  # pylint: disable=global-variable-not-assigned
  global global_train_set, train_buckets_scale
  # pylint: enable=global-variable-not-assigned
  data_set = read_data(source_path, target_path, buckets, max_size, print_out)
  global_train_set["wmt"].append(data_set)
  train_total_size = calculate_buckets_scale(data_set, buckets, "wmt")
  if print_out:
    print " Finished global data reading (%d)." % train_total_size
Example #16
Source File: neural_gpu_trainer.py From AI_Reader with Apache License 2.0 | 5 votes |
def single_test(l, model, sess, task, nprint, batch_size, print_out=True,
                offset=None, ensemble=None, get_steps=False):
  """Test model on test data of length l using the given session."""
  inpt, target = data.get_batch(l, batch_size, False, task, offset)
  _, res, _, steps = model.step(sess, inpt, target, False, get_steps=get_steps)
  errors, total, seq_err = data.accuracy(inpt, res, target, batch_size, nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out(" %s len %d errors %.2f sequence-errors %.2f"
                   % (task, l, 100*errors, 100*seq_err))
  # Ensemble eval.
  if ensemble:
    results = []
    for m in ensemble:
      model.saver.restore(sess, m)
      _, result, _, _ = model.step(sess, inpt, target, False)
      m_errors, m_total, m_seq_err = data.accuracy(inpt, result, target,
                                                   batch_size, nprint)
      m_seq_err = float(m_seq_err) / batch_size
      if total > 0:
        m_errors = float(m_errors) / m_total
      data.print_out(" %s len %d m-errors %.2f m-sequence-errors %.2f"
                     % (task, l, 100*m_errors, 100*m_seq_err))
      results.append(result)
    ens = [sum(o) for o in zip(*results)]
    errors, total, seq_err = data.accuracy(inpt, ens, target,
                                           batch_size, nprint)
    seq_err = float(seq_err) / batch_size
    if total > 0:
      errors = float(errors) / total
    if print_out:
      data.print_out(" %s len %d ens-errors %.2f ens-sequence-errors %.2f"
                     % (task, l, 100*errors, 100*seq_err))
  return errors, seq_err, (steps, inpt, [np.argmax(o, axis=1) for o in res])
Example #17
Source File: neural_gpu_trainer.py From HumanRecognition with MIT License | 5 votes |
def single_test(bin_id, model, sess, nprint, batch_size, dev, p, print_out=True,
                offset=None, beam_model=None):
  """Test model on test data of length l using the given session."""
  if not dev[p][bin_id]:
    data.print_out(" bin %d (%d)\t%s\tppl NA errors NA seq-errors NA"
                   % (bin_id, data.bins[bin_id], p))
    return 1.0, 1.0, 0.0
  inpt, target = data.get_batch(
      bin_id, batch_size, dev[p], FLAGS.height, offset)
  if FLAGS.beam_size > 1 and beam_model:
    loss, res, new_tgt, scores = m_step(
        model, beam_model, sess, batch_size, inpt, target, bin_id,
        FLAGS.eval_beam_steps, p)
    score_avgs = [sum(s) / float(len(s)) for s in scores]
    score_maxs = [max(s) for s in scores]
    score_str = ["(%.2f, %.2f)" % (score_avgs[i], score_maxs[i])
                 for i in xrange(FLAGS.eval_beam_steps)]
    data.print_out(" == scores (avg, max): %s" % "; ".join(score_str))
    errors, total, seq_err = data.accuracy(inpt, res, target, batch_size,
                                           nprint, new_tgt, scores[-1])
  else:
    loss, res, _, _ = model.step(sess, inpt, target, False)
    errors, total, seq_err = data.accuracy(inpt, res, target,
                                           batch_size, nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out(" bin %d (%d)\t%s\tppl %.2f errors %.2f seq-errors %.2f"
                   % (bin_id, data.bins[bin_id], p, data.safe_exp(loss),
                      100 * errors, 100 * seq_err))
  return (errors, seq_err, loss)
Example #18
Source File: neural_gpu_trainer.py From HumanRecognition with MIT License | 5 votes |
def assign_vectors(word_vector_file, embedding_key, vocab_path, sess):
  """Assign the embedding_key variable from the given word vectors file."""
  # For words in the word vector file, set their embedding at start.
  if not tf.gfile.Exists(word_vector_file):
    data.print_out("Word vector file does not exist: %s" % word_vector_file)
    sys.exit(1)
  vocab, _ = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  data.print_out("Pre-setting word vectors from %s" % word_vector_file)
  with tf.gfile.GFile(word_vector_file, mode="r") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for line in f:
      line_parts = line.split()
      # The first part is the word.
      word = line_parts[0]
      if word in vocab:
        # Remaining parts are components of the vector.
        word_vector = np.array(map(float, line_parts[1:]))
        if len(word_vector) != FLAGS.vec_size:
          data.print_out("Warn: Word '%s', Expecting vector size %d, "
                         "found %d" % (word, FLAGS.vec_size,
                                       len(word_vector)))
        else:
          vectors[vocab[word]] = word_vector
  # Assign the modified vectors to the vectors_variable in the graph.
  sess.run([vectors_variable.initializer],
           {vectors_variable.initializer.inputs[1]: vectors})
Example #19
Source File: neural_gpu_trainer.py From DOTA_models with Apache License 2.0 | 5 votes |
def read_data_into_global(source_path, target_path, buckets,
                          max_size=None, print_out=True):
  """Read data into the global variables (can be in a separate thread)."""
  # pylint: disable=global-variable-not-assigned
  global global_train_set, train_buckets_scale
  # pylint: enable=global-variable-not-assigned
  data_set = read_data(source_path, target_path, buckets, max_size, print_out)
  global_train_set["wmt"].append(data_set)
  train_total_size = calculate_buckets_scale(data_set, buckets, "wmt")
  if print_out:
    print " Finished global data reading (%d)." % train_total_size
Example #20
Source File: neural_gpu_trainer.py From object_detection_with_tensorflow with MIT License | 5 votes |
def assign_vectors(word_vector_file, embedding_key, vocab_path, sess):
  """Assign the embedding_key variable from the given word vectors file."""
  # For words in the word vector file, set their embedding at start.
  if not tf.gfile.Exists(word_vector_file):
    data.print_out("Word vector file does not exist: %s" % word_vector_file)
    sys.exit(1)
  vocab, _ = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  data.print_out("Pre-setting word vectors from %s" % word_vector_file)
  with tf.gfile.GFile(word_vector_file, mode="r") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for line in f:
      line_parts = line.split()
      # The first part is the word.
      word = line_parts[0]
      if word in vocab:
        # Remaining parts are components of the vector.
        word_vector = np.array(map(float, line_parts[1:]))
        if len(word_vector) != FLAGS.vec_size:
          data.print_out("Warn: Word '%s', Expecting vector size %d, "
                         "found %d" % (word, FLAGS.vec_size,
                                       len(word_vector)))
        else:
          vectors[vocab[word]] = word_vector
  # Assign the modified vectors to the vectors_variable in the graph.
  sess.run([vectors_variable.initializer],
           {vectors_variable.initializer.inputs[1]: vectors})
Example #21
Source File: neural_gpu_trainer.py From g-tensorflow-models with Apache License 2.0 | 5 votes |
def read_data(source_path, target_path, buckets, max_size=None, print_out=True):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the files with token-ids for the source language.
    target_path: path to the file with token-ids for the target language;
      it must be aligned with the source file: n-th line contains the desired
      output for n-th line from the source_path.
    buckets: the buckets to use.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).
      If set to 1, no data will be returned (empty lists of the right form).
    print_out: whether to print out status or not.

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in buckets]
  counter = 0
  if max_size != 1:
    with tf.gfile.GFile(source_path, mode="r") as source_file:
      with tf.gfile.GFile(target_path, mode="r") as target_file:
        source, target = source_file.readline(), target_file.readline()
        while source and target and (not max_size or counter < max_size):
          counter += 1
          if counter % 100000 == 0 and print_out:
            print(" reading data line %d" % counter)
            sys.stdout.flush()
          source_ids = [int(x) for x in source.split()]
          target_ids = [int(x) for x in target.split()]
          source_ids, source_len = zero_split(source_ids)
          target_ids, target_len = zero_split(target_ids, append=wmt.EOS_ID)
          for bucket_id, size in enumerate(buckets):
            if source_len <= size and target_len <= size:
              data_set[bucket_id].append([source_ids, target_ids])
              break
          source, target = source_file.readline(), target_file.readline()
  return data_set
Example #22
Source File: neural_gpu_trainer.py From g-tensorflow-models with Apache License 2.0 | 5 votes |
def read_data_into_global(source_path, target_path, buckets,
                          max_size=None, print_out=True):
  """Read data into the global variables (can be in a separate thread)."""
  # pylint: disable=global-variable-not-assigned
  global global_train_set, train_buckets_scale
  # pylint: enable=global-variable-not-assigned
  data_set = read_data(source_path, target_path, buckets, max_size, print_out)
  global_train_set["wmt"].append(data_set)
  train_total_size = calculate_buckets_scale(data_set, buckets, "wmt")
  if print_out:
    print(" Finished global data reading (%d)." % train_total_size)
Example #23
Source File: neural_gpu_trainer.py From g-tensorflow-models with Apache License 2.0 | 5 votes |
def single_test(bin_id, model, sess, nprint, batch_size, dev, p, print_out=True,
                offset=None, beam_model=None):
  """Test model on test data of length l using the given session."""
  if not dev[p][bin_id]:
    data.print_out(" bin %d (%d)\t%s\tppl NA errors NA seq-errors NA"
                   % (bin_id, data.bins[bin_id], p))
    return 1.0, 1.0, 0.0
  inpt, target = data.get_batch(
      bin_id, batch_size, dev[p], FLAGS.height, offset)
  if FLAGS.beam_size > 1 and beam_model:
    loss, res, new_tgt, scores = m_step(
        model, beam_model, sess, batch_size, inpt, target, bin_id,
        FLAGS.eval_beam_steps, p)
    score_avgs = [sum(s) / float(len(s)) for s in scores]
    score_maxs = [max(s) for s in scores]
    score_str = ["(%.2f, %.2f)" % (score_avgs[i], score_maxs[i])
                 for i in xrange(FLAGS.eval_beam_steps)]
    data.print_out(" == scores (avg, max): %s" % "; ".join(score_str))
    errors, total, seq_err = data.accuracy(inpt, res, target, batch_size,
                                           nprint, new_tgt, scores[-1])
  else:
    loss, res, _, _ = model.step(sess, inpt, target, False)
    errors, total, seq_err = data.accuracy(inpt, res, target,
                                           batch_size, nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out(" bin %d (%d)\t%s\tppl %.2f errors %.2f seq-errors %.2f"
                   % (bin_id, data.bins[bin_id], p, data.safe_exp(loss),
                      100 * errors, 100 * seq_err))
  return (errors, seq_err, loss)
Example #24
Source File: neural_gpu_trainer.py From g-tensorflow-models with Apache License 2.0 | 5 votes |
def assign_vectors(word_vector_file, embedding_key, vocab_path, sess):
  """Assign the embedding_key variable from the given word vectors file."""
  # For words in the word vector file, set their embedding at start.
  if not tf.gfile.Exists(word_vector_file):
    data.print_out("Word vector file does not exist: %s" % word_vector_file)
    sys.exit(1)
  vocab, _ = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  data.print_out("Pre-setting word vectors from %s" % word_vector_file)
  with tf.gfile.GFile(word_vector_file, mode="r") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for line in f:
      line_parts = line.split()
      # The first part is the word.
      word = line_parts[0]
      if word in vocab:
        # Remaining parts are components of the vector.
        word_vector = np.array(map(float, line_parts[1:]))
        if len(word_vector) != FLAGS.vec_size:
          data.print_out("Warn: Word '%s', Expecting vector size %d, "
                         "found %d" % (word, FLAGS.vec_size,
                                       len(word_vector)))
        else:
          vectors[vocab[word]] = word_vector
  # Assign the modified vectors to the vectors_variable in the graph.
  sess.run([vectors_variable.initializer],
           {vectors_variable.initializer.inputs[1]: vectors})
Example #25
Source File: neural_gpu_trainer.py From models with Apache License 2.0 | 5 votes |
def read_data(source_path, target_path, buckets, max_size=None, print_out=True):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the files with token-ids for the source language.
    target_path: path to the file with token-ids for the target language;
      it must be aligned with the source file: n-th line contains the desired
      output for n-th line from the source_path.
    buckets: the buckets to use.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).
      If set to 1, no data will be returned (empty lists of the right form).
    print_out: whether to print out status or not.

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in buckets]
  counter = 0
  if max_size != 1:
    with tf.gfile.GFile(source_path, mode="r") as source_file:
      with tf.gfile.GFile(target_path, mode="r") as target_file:
        source, target = source_file.readline(), target_file.readline()
        while source and target and (not max_size or counter < max_size):
          counter += 1
          if counter % 100000 == 0 and print_out:
            print(" reading data line %d" % counter)
            sys.stdout.flush()
          source_ids = [int(x) for x in source.split()]
          target_ids = [int(x) for x in target.split()]
          source_ids, source_len = zero_split(source_ids)
          target_ids, target_len = zero_split(target_ids, append=wmt.EOS_ID)
          for bucket_id, size in enumerate(buckets):
            if source_len <= size and target_len <= size:
              data_set[bucket_id].append([source_ids, target_ids])
              break
          source, target = source_file.readline(), target_file.readline()
  return data_set
Example #26
Source File: neural_gpu_trainer.py From models with Apache License 2.0 | 5 votes |
def read_data_into_global(source_path, target_path, buckets,
                          max_size=None, print_out=True):
  """Read data into the global variables (can be in a separate thread)."""
  # pylint: disable=global-variable-not-assigned
  global global_train_set, train_buckets_scale
  # pylint: enable=global-variable-not-assigned
  data_set = read_data(source_path, target_path, buckets, max_size, print_out)
  global_train_set["wmt"].append(data_set)
  train_total_size = calculate_buckets_scale(data_set, buckets, "wmt")
  if print_out:
    print(" Finished global data reading (%d)." % train_total_size)
Example #27
Source File: neural_gpu_trainer.py From models with Apache License 2.0 | 5 votes |
def single_test(bin_id, model, sess, nprint, batch_size, dev, p, print_out=True,
                offset=None, beam_model=None):
  """Test model on test data of length l using the given session."""
  if not dev[p][bin_id]:
    data.print_out(" bin %d (%d)\t%s\tppl NA errors NA seq-errors NA"
                   % (bin_id, data.bins[bin_id], p))
    return 1.0, 1.0, 0.0
  inpt, target = data.get_batch(
      bin_id, batch_size, dev[p], FLAGS.height, offset)
  if FLAGS.beam_size > 1 and beam_model:
    loss, res, new_tgt, scores = m_step(
        model, beam_model, sess, batch_size, inpt, target, bin_id,
        FLAGS.eval_beam_steps, p)
    score_avgs = [sum(s) / float(len(s)) for s in scores]
    score_maxs = [max(s) for s in scores]
    score_str = ["(%.2f, %.2f)" % (score_avgs[i], score_maxs[i])
                 for i in xrange(FLAGS.eval_beam_steps)]
    data.print_out(" == scores (avg, max): %s" % "; ".join(score_str))
    errors, total, seq_err = data.accuracy(inpt, res, target, batch_size,
                                           nprint, new_tgt, scores[-1])
  else:
    loss, res, _, _ = model.step(sess, inpt, target, False)
    errors, total, seq_err = data.accuracy(inpt, res, target,
                                           batch_size, nprint)
  seq_err = float(seq_err) / batch_size
  if total > 0:
    errors = float(errors) / total
  if print_out:
    data.print_out(" bin %d (%d)\t%s\tppl %.2f errors %.2f seq-errors %.2f"
                   % (bin_id, data.bins[bin_id], p, data.safe_exp(loss),
                      100 * errors, 100 * seq_err))
  return (errors, seq_err, loss)
Example #28
Source File: neural_gpu_trainer.py From models with Apache License 2.0 | 5 votes |
def assign_vectors(word_vector_file, embedding_key, vocab_path, sess):
  """Assign the embedding_key variable from the given word vectors file."""
  # For words in the word vector file, set their embedding at start.
  if not tf.gfile.Exists(word_vector_file):
    data.print_out("Word vector file does not exist: %s" % word_vector_file)
    sys.exit(1)
  vocab, _ = wmt.initialize_vocabulary(vocab_path)
  vectors_variable = [v for v in tf.trainable_variables()
                      if embedding_key == v.name]
  if len(vectors_variable) != 1:
    data.print_out("Word vector variable not found or too many.")
    sys.exit(1)
  vectors_variable = vectors_variable[0]
  vectors = vectors_variable.eval()
  data.print_out("Pre-setting word vectors from %s" % word_vector_file)
  with tf.gfile.GFile(word_vector_file, mode="r") as f:
    # Lines have format: dog 0.045123 -0.61323 0.413667 ...
    for line in f:
      line_parts = line.split()
      # The first part is the word.
      word = line_parts[0]
      if word in vocab:
        # Remaining parts are components of the vector.
        word_vector = np.array(map(float, line_parts[1:]))
        if len(word_vector) != FLAGS.vec_size:
          data.print_out("Warn: Word '%s', Expecting vector size %d, "
                         "found %d" % (word, FLAGS.vec_size,
                                       len(word_vector)))
        else:
          vectors[vocab[word]] = word_vector
  # Assign the modified vectors to the vectors_variable in the graph.
  sess.run([vectors_variable.initializer],
           {vectors_variable.initializer.inputs[1]: vectors})
Example #29
Source File: neural_gpu_trainer.py From multilabel-image-classification-tensorflow with MIT License | 5 votes |
def read_data(source_path, target_path, buckets, max_size=None, print_out=True):
  """Read data from source and target files and put into buckets.

  Args:
    source_path: path to the files with token-ids for the source language.
    target_path: path to the file with token-ids for the target language;
      it must be aligned with the source file: n-th line contains the desired
      output for n-th line from the source_path.
    buckets: the buckets to use.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).
      If set to 1, no data will be returned (empty lists of the right form).
    print_out: whether to print out status or not.

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in buckets]
  counter = 0
  if max_size != 1:
    with tf.gfile.GFile(source_path, mode="r") as source_file:
      with tf.gfile.GFile(target_path, mode="r") as target_file:
        source, target = source_file.readline(), target_file.readline()
        while source and target and (not max_size or counter < max_size):
          counter += 1
          if counter % 100000 == 0 and print_out:
            print(" reading data line %d" % counter)
            sys.stdout.flush()
          source_ids = [int(x) for x in source.split()]
          target_ids = [int(x) for x in target.split()]
          source_ids, source_len = zero_split(source_ids)
          target_ids, target_len = zero_split(target_ids, append=wmt.EOS_ID)
          for bucket_id, size in enumerate(buckets):
            if source_len <= size and target_len <= size:
              data_set[bucket_id].append([source_ids, target_ids])
              break
          source, target = source_file.readline(), target_file.readline()
  return data_set
Example #30
Source File: neural_gpu_trainer.py From multilabel-image-classification-tensorflow with MIT License | 5 votes |
def read_data_into_global(source_path, target_path, buckets,
                          max_size=None, print_out=True):
  """Read data into the global variables (can be in a separate thread)."""
  # pylint: disable=global-variable-not-assigned
  global global_train_set, train_buckets_scale
  # pylint: enable=global-variable-not-assigned
  data_set = read_data(source_path, target_path, buckets, max_size, print_out)
  global_train_set["wmt"].append(data_set)
  train_total_size = calculate_buckets_scale(data_set, buckets, "wmt")
  if print_out:
    print(" Finished global data reading (%d)." % train_total_size)