Python tensorflow.dataset() Examples
The following are 28 code examples of the TensorFlow dataset APIs (`tf.data.Dataset` and related input-pipeline utilities), drawn from open-source projects. You can go to the original project or source file by following the source reference above each example.
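Most of the examples below share the same `tf.data.Dataset` pattern: build a source dataset, chain transformations, then iterate. A minimal self-contained sketch of that pattern (the toy arrays here are made up for illustration; TF 2.x eager mode assumed):

import numpy as np
import tensorflow as tf

# Toy features and labels; the examples below read TFRecords or numpy feeds instead.
x = np.arange(10, dtype=np.float32).reshape(5, 2)
y = np.array([0, 1, 0, 1, 0], dtype=np.int32)

dataset = tf.data.Dataset.from_tensor_slices((x, y))
dataset = dataset.shuffle(buffer_size=5)   # randomize example order
dataset = dataset.batch(2)                 # group into mini-batches
dataset = dataset.prefetch(1)              # overlap input prep with compute

for features, labels in dataset:           # eager iteration
    print(features.shape, labels.numpy())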
Example #1
Source File: eager.py From mead-baseline with Apache License 2.0
def _evaluate(self, es, reporting_fns, **kwargs):
    """Run the model with beam search and report Bleu.

    :param es: `tf.dataset` of input
    :param reporting_fns: Input hooks
    """
    preds = []
    golds = []
    start = time.time()
    for features, tgt in es:
        features['dst'] = tgt[:, :-1]
        tgt_lens = features.pop('tgt_len')
        top_preds = self.model.predict(features, make_input=False, **kwargs)
        preds.extend(convert_seq2seq_preds(top_preds[:, 0, :], self.tgt_rlut))
        golds.extend(convert_seq2seq_golds(tgt, tgt_lens, self.tgt_rlut))
    metrics = {'bleu': bleu(preds, golds, self.bleu_n_grams)[0]}
    self.report(
        0, metrics, start, 'Test', 'EPOCH', reporting_fns
    )
    return metrics
Example #2
Source File: data_wrapper_base.py From MMNet with Apache License 2.0
def setup_dataset(
        self,
        placeholders: Tuple[tf.placeholder, tf.placeholder],
        batch_size: int = None,
):
    self.batch_size = self.args.batch_size if batch_size is None else batch_size

    dataset = tf.data.Dataset.from_tensor_slices(placeholders)
    dataset = dataset.map(
        self._parse_function,
        num_parallel_calls=self.args.num_threads,
    ).prefetch(self.args.prefetch_factor * self.batch_size)
    if self.is_training:
        dataset = dataset.repeat()
        if self.shuffle:
            dataset = dataset.shuffle(buffer_size=self.args.buffer_size)

    self.dataset = dataset.batch(self.batch_size)
    self.iterator = self.dataset.make_initializable_iterator()
    self.next_elem = self.iterator.get_next()
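Because the dataset above is built from placeholders, the iterator must be initialized inside a session with real arrays fed in. A hypothetical TF 1.x driver, assuming `wrapper` is an instance carrying the method above and `images`/`labels` are matching numpy arrays:

import tensorflow as tf

# Hypothetical shapes; must match what _parse_function expects.
images_ph = tf.placeholder(tf.float32, shape=[None, 224, 224, 3])
labels_ph = tf.placeholder(tf.int32, shape=[None])
wrapper.setup_dataset((images_ph, labels_ph))

with tf.Session() as sess:
    # Feed the actual arrays once; batches then come from the iterator.
    sess.run(wrapper.iterator.initializer,
             feed_dict={images_ph: images, labels_ph: labels})
    batch = sess.run(wrapper.next_elem)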
Example #3
Source File: utils.py From mead-baseline with Apache License 2.0
def _test(self, ts, dataset=True):
    """Test an epoch of data using either the input loader or using `tf.dataset`

    In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and
    feed it to the feed dict. When we use `tf.dataset`s under the hood, this function
    simply uses the loader to know how many steps to train.

    :param loader: A data feed
    :param kwargs: See below

    :Keyword Arguments:
      * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
      * *reporting_fns* (`list`) A list of reporting hooks to use
      * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

    :return: Metrics
    """
    return self.evaluator.test(ts, dataset=dataset)
Example #4
Source File: eager.py From mead-baseline with Apache License 2.0
def _test(self, ts, steps=0, **kwargs):
    """Test an epoch of data using either the input loader or using `tf.dataset`

    In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and
    feed it to the feed dict. When we use `tf.dataset`s under the hood, this function
    simply uses the loader to know how many steps to train.

    :param loader: A data feed
    :param kwargs: See below

    :Keyword Arguments:
      * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
      * *reporting_fns* (`list`) A list of reporting hooks to use
      * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

    :return: Metrics
    """
    return self.evaluator.test(ts, steps, **kwargs)
Example #5
Source File: distributed.py From mead-baseline with Apache License 2.0
def distribute(self, dataset):
    return self.strategy.experimental_distribute_dataset(dataset)
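The `strategy` here is a `tf.distribute` strategy. A minimal sketch of how `experimental_distribute_dataset` is typically wired up (strategy choice and toy dataset are illustrative; the per-replica call is named `strategy.run` in newer TF releases):

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()   # one replica per visible device
dataset = tf.data.Dataset.range(8).batch(4)
dist_dataset = strategy.experimental_distribute_dataset(dataset)

@tf.function
def distributed_step(batch):
    # Run on every replica, then sum the per-replica results.
    per_replica = strategy.experimental_run_v2(tf.reduce_sum, args=(batch,))
    return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica, axis=None)

for batch in dist_dataset:
    print(distributed_step(batch).numpy())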
Example #6
Source File: eager.py From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, phase):
    """Run an epoch of testing over the dataset

    If we are using a `tf.dataset`-based `fit_func`, we will just cycle the number
    of steps and let the `dataset` yield new batches.

    If we are using `feed_dict`s, we convert each batch from the `DataFeed` and pass
    that into TF as the `feed_dict`

    :param vs: A validation set
    :param reporting_fns: Reporting hooks
    :param phase: The phase of evaluation (`Test`, `Valid`)
    :param dataset: (`bool`) Are we using `tf.dataset`s
    :return: Metrics
    """
    total_loss = 0.0
    total_toks = 0
    epochs = 0
    if phase == 'Valid':
        self.valid_epochs += 1
        epochs = self.valid_epochs

    SET_TRAIN_FLAG(False)

    start = time.time()
    h = None
    for features, y in vs:
        if self.model.requires_state:
            loss_value, h = loss_with_state(self.model, h, features, y)
        else:
            loss_value = loss_without_state(self.model, features, y)
        loss_value = loss_value.numpy()
        toks = self._num_toks(y)
        total_loss += loss_value * tf.cast(toks, tf.float32).numpy()
        total_toks += toks.numpy()

    metrics = self.calc_metrics(total_loss, total_toks)
    self.report(
        epochs, metrics, start,
        phase, 'EPOCH', reporting_fns
    )
    return metrics
Example #7
Source File: gqn_provider.py From tf-gqn with Apache License 2.0
def _get_dataset_files(dataset_info, mode, root):
    """Generates lists of files for a given dataset version."""
    basepath = dataset_info.basepath
    base = os.path.join(root, basepath, mode)
    if mode == 'train':
        num_files = dataset_info.train_size
    else:
        num_files = dataset_info.test_size

    length = len(str(num_files))
    template = '{:0%d}-of-{:0%d}.tfrecord' % (length, length)
    record_paths = [  # indexing runs from 1 to n
        os.path.join(base, template.format(i, num_files))
        for i in range(1, num_files + 1)]
    return record_paths
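The `'{:0%d}-of-{:0%d}.tfrecord' % (length, length)` line builds a format string that zero-pads the shard index to the width of the total count. A quick worked example:

num_files = 100
length = len(str(num_files))                        # 3
template = '{:0%d}-of-{:0%d}.tfrecord' % (length, length)
print(template.format(1, num_files))                # 001-of-100.tfrecord
print(template.format(42, num_files))               # 042-of-100.tfrecord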
Example #8
Source File: utils.py From mead-baseline with Apache License 2.0
def to_tensors(ts, src_lengths_key, dst=False):
    """Convert a data feed into a tuple of `features` (`dict`) and `y` values

    This method is required to produce `tf.dataset`s from the input data feed.
    Any fields ending with `_lengths` are ignored, unless they match the
    `src_lengths_key` or `tgt_lengths_key`, in which case, they are converted to
    `src_len` and `tgt_len`

    :param ts: The data feed to convert
    :param lengths_key: This is a field passed from the model params specifying source of truth of the temporal lengths
    :param dst: `bool` that says if we should prepare a `dst` tensor. This is needed in distributed mode
    :return: A `tuple` of `features` and `y` (labels)
    """
    keys = ts[0].keys()
    # This is kind of a hack
    keys = [k for k in keys if '_lengths' not in k and k != 'ids'] + [src_lengths_key, "tgt_lengths"]

    features = dict((k, []) for k in keys)

    for sample in ts:
        for k in keys:
            for s in sample[k]:
                features[k].append(s)

    features['src_len'] = features[src_lengths_key]
    del features[src_lengths_key]
    features['tgt_len'] = features['tgt_lengths']
    del features['tgt_lengths']
    features = dict((k, np.stack(v).astype(np.int32)) for k, v in features.items())
    if dst:
        features['dst'] = features['tgt'][:, :-1]
    tgt = features.pop('tgt')
    return features, tgt
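The `(features, tgt)` tuple this returns is shaped so it can be handed straight to `tf.data.Dataset.from_tensor_slices`, which accepts a dict of arrays as the features element. A minimal sketch, assuming `ts` is a data feed in the format described above (buffer and batch sizes are illustrative):

import tensorflow as tf

features, tgt = to_tensors(ts, src_lengths_key='src_lengths', dst=True)
dataset = tf.data.Dataset.from_tensor_slices((features, tgt))
dataset = dataset.shuffle(buffer_size=100).batch(20)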
Example #9
Source File: data_wrapper_base.py From MMNet with Apache License 2.0
def resize_and_padding_before_augmentation(self, image, size):
    # If width > height, resize height to model's input height while preserving aspect ratio
    # If height > width, resize width to model's input width while preserving aspect ratio
    if self.args.debug_augmentation:
        assert size[0] == size[1], "resize_and_padding_before_augmentation only supports square target image"
        image = tf.expand_dims(image, 0)

        image_dims = tf.shape(image)
        height = image_dims[1]
        width = image_dims[2]
        min_size = min(*size)

        width_aspect = tf.maximum(min_size, tf.cast(width * min_size / height, dtype=tf.int32))
        height_aspect = tf.maximum(min_size, tf.cast(height * min_size / width, dtype=tf.int32))

        image = tf.image.resize_bilinear(image, (height_aspect, width_aspect))
        image = image[:, :self.padded_max_size, :self.padded_max_size, :]

        # Pads the image on the bottom and right with zeros until it has
        # dimensions target_height, target_width.
        image = tf.image.pad_to_bounding_box(
            image,
            offset_height=tf.maximum(self.padded_max_size - height_aspect, 0),
            offset_width=tf.maximum(self.padded_max_size - width_aspect, 0),
            target_height=self.padded_max_size,
            target_width=self.padded_max_size,
        )

        image = tf.squeeze(image, 0)
        return image
    else:
        # Have to return some dummy tensor which have .get_shape() to tf.dataset
        return tf.constant(0, shape=self.padded_original_image_dummy_shape, dtype=tf.uint8, name="dummy")
Example #10
Source File: data_wrapper_base.py From MMNet with Apache License 2.0
def add_arguments(parser):
    g_common = parser.add_argument_group("(DataWrapperBase) Common Arguments for all data wrapper.")
    g_common.add_argument("--dataset_path", required=True, type=str,
                          help="The name of the dataset to load.")
    g_common.add_argument("--dataset_split_name", required=True, type=str, nargs="*",
                          help="The name of the train/test split. Support multiple splits")
    g_common.add_argument("--batch_size", default=32, type=utils.positive_int,
                          help="The number of examples in batch.")
    g_common.add_argument("--no-shuffle", dest="shuffle", action="store_false")
    g_common.add_argument("--shuffle", dest="shuffle", action="store_true")
    g_common.set_defaults(shuffle=True)
    g_common.add_argument("--width", required=True, type=int)
    g_common.add_argument("--height", required=True, type=int)
    g_common.add_argument("--no-debug_augmentation", dest="debug_augmentation", action="store_false")
    g_common.add_argument("--debug_augmentation", dest="debug_augmentation", action="store_true")
    g_common.set_defaults(debug_augmentation=False)
    g_common.add_argument("--max_padded_size", default=224, type=int,
                          help=("We will resize & pads the original image "
                                "until it has dimensions (padded_size, padded_size) "
                                "Recommend to set this value as width(or height) * 1.8 ~ 2"))
    g_common.add_argument("--augmentation_method", type=str, required=True,
                          choices=_available_augmentation_methods)
    g_common.add_argument("--num_threads", default=8, type=int)
    g_common.add_argument("--buffer_size", default=1000, type=int)
    g_common.add_argument("--prefetch_factor", default=100, type=int)
    g_common.add_argument("--rotation_range", default=0, type=int,
                          help="Receives maximum angle to be rotated in terms of degree: "
                               "The image is randomly rotated by the angle "
                               "randomly chosen from [-rotation_range, rotation_range], "
                               "and then cropped appropriately to remove dark areas.\n"
                               "So, be aware that the rotation performs certain kind of zooming.")
    g_common.add_argument("--no-has_sub_dataset", dest="has_sub_dataset", action="store_false")
    g_common.add_argument("--has_sub_dataset", dest="has_sub_dataset", action="store_true")
    g_common.set_defaults(has_sub_dataset=False)
Example #11
Source File: utils.py From mead-baseline with Apache License 2.0
def process_batch(self, batch_dict, handle, txts, dataset=True):
    if dataset:
        guess = self.sess.run(self.model.best)
    else:
        feed_dict = self.model.make_input(batch_dict)
        guess = self.sess.run(self.model.best, feed_dict=feed_dict)

    sentence_lengths = batch_dict[self.model.lengths_key]
    ids = batch_dict['ids']
    truth = batch_dict['y']
    correct_labels = 0
    total_labels = 0

    # For fscore
    gold_chunks = []
    pred_chunks = []

    # For each sentence
    for b in range(len(guess)):
        length = sentence_lengths[b]
        sentence = guess[b][:length]
        # truth[b] is padded, cutting at :length gives us back true length
        gold = truth[b][:length]
        valid_guess = sentence[gold != Offsets.PAD]
        valid_gold = gold[gold != Offsets.PAD]
        valid_sentence_length = np.sum(gold != Offsets.PAD)
        correct_labels += np.sum(np.equal(valid_guess, valid_gold))
        total_labels += valid_sentence_length

        gold_chunks.append(set(to_spans(valid_gold, self.idx2label, self.span_type, self.verbose)))
        pred_chunks.append(set(to_spans(valid_guess, self.idx2label, self.span_type, self.verbose)))

        # Should we write a file out? If so, we have to have txts
        if handle is not None:
            id = ids[b]
            txt = txts[id]
            write_sentence_conll(handle, valid_guess, valid_gold, txt, self.idx2label)

    return correct_labels, total_labels, gold_chunks, pred_chunks
Example #12
Source File: utils.py From mead-baseline with Apache License 2.0
def to_tensors(ts, lengths_key):
    """Convert a data feed into a tuple of `features` (`dict`) and `y` values

    This method is required to produce `tf.dataset`s from the input data feed.
    Any fields ending with `_lengths` are ignored, unless they match the
    `lengths_key` name (as are `ids`)

    :param ts: The data feed to convert
    :param lengths_key: This is a field passed from the model params specifying source of truth of the temporal lengths
    :return: A `tuple` of `features` and `y` (labels)
    """
    keys = ts[0].keys()
    # This is kind of a hack
    keys = [k for k in keys if '_lengths' not in k and k != 'ids'] + [lengths_key]

    features = dict((k, []) for k in keys)

    for sample in ts:
        for k in features.keys():
            # add each sample
            for s in sample[k]:
                features[k].append(s)

    features['lengths'] = features[lengths_key]
    del features[lengths_key]
    features = dict((k, np.stack(v)) for k, v in features.items())
    y = features.pop('y')
    return features, y
Example #13
Source File: pipeline.py From ranking with Apache License 2.0
def _make_input_fn(self,
                   input_pattern,
                   batch_size,
                   list_size,
                   randomize_input=True,
                   num_epochs=None):
    """Returns the input function for the ranking model.

    Args:
      input_pattern: (str) File pattern for the input data.
      batch_size: (int) The number of input examples to process per batch.
      list_size: (int) The list size for an ELWC example.
      randomize_input: (bool) If true, randomize input example order. It should
        almost always be true except for unittest/debug purposes.
      num_epochs: (int) The number of times the input dataset must be repeated.
        None to repeat the data indefinitely.

    Returns:
      An `input_fn` for `tf.estimator.Estimator`.
    """

    def _input_fn():
        """`input_fn` for the `Estimator`."""
        return self._make_dataset(
            batch_size=batch_size,
            list_size=list_size,
            input_pattern=input_pattern,
            randomize_input=randomize_input,
            num_epochs=num_epochs)

    return _input_fn
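The returned closure is what the `tf.estimator` train/eval APIs expect. A hedged sketch of how it would typically be consumed (the `pipeline` and `estimator` instances and the file pattern are illustrative):

# Hypothetical wiring; `pipeline` is an instance carrying the method above.
train_input_fn = pipeline._make_input_fn(
    input_pattern='/path/to/train-*.tfrecord',
    batch_size=32,
    list_size=50,
    randomize_input=True,
    num_epochs=None)  # None => repeat indefinitely

estimator.train(input_fn=train_input_fn, max_steps=100000)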
Example #14
Source File: input_fn.py From bert-multitask-learning with MIT License
def train_eval_input_fn(params, mode='train'):
    '''Train and eval input function of estimator.
    This function will write and read tf record for training
    and evaluation.

    Usage:
        def train_input_fn():
            return train_eval_input_fn(params)
        estimator.train(
            train_input_fn, max_steps=params.train_steps, hooks=[train_hook])

    Arguments:
        params {Params} -- Params objects

    Keyword Arguments:
        mode {str} -- ModeKeys (default: {'train'})

    Returns:
        tf Dataset -- Tensorflow dataset
    '''
    write_tfrecord(params=params)

    dataset_dict = read_tfrecord(params=params, mode=mode)

    dataset = tf.data.experimental.sample_from_datasets(
        [ds for _, ds in dataset_dict.items()])

    if mode == 'train':
        dataset = dataset.shuffle(params.shuffle_buffer)

    dataset = dataset.prefetch(params.prefetch)
    if params.dynamic_padding:
        dataset = dataset.apply(
            tf.data.experimental.bucket_by_sequence_length(
                element_length_func=element_length_func,
                bucket_batch_sizes=params.bucket_batch_sizes,
                bucket_boundaries=params.bucket_boundaries,
            ))
    else:
        if mode == 'train':
            dataset = dataset.batch(params.batch_size)
        else:
            dataset = dataset.batch(params.batch_size * 2)

    return dataset
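`tf.data.experimental.sample_from_datasets` is doing the multi-task mixing here: it interleaves elements from several datasets, drawing each element from a source chosen at random. A small self-contained sketch (the toy datasets stand in for the per-problem datasets in `dataset_dict`):

import tensorflow as tf

ds_a = tf.data.Dataset.from_tensors('task_a').repeat()
ds_b = tf.data.Dataset.from_tensors('task_b').repeat()

# Each element is drawn from ds_a or ds_b with the given probabilities.
mixed = tf.data.experimental.sample_from_datasets([ds_a, ds_b], weights=[0.5, 0.5])
for x in mixed.take(4):
    print(x.numpy())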
Example #15
Source File: utils.py From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, phase, dataset=True):
    """Run an epoch of testing over the dataset

    If we are using a `tf.dataset`-based `fit_func`, we will just cycle the number
    of steps and let the `dataset` yield new batches.

    If we are using `feed_dict`s, we convert each batch from the `DataFeed` and pass
    that into TF as the `feed_dict`

    :param vs: A validation set
    :param reporting_fns: Reporting hooks
    :param phase: The phase of evaluation (`Test`, `Valid`)
    :param dataset: (`bool`) Are we using `tf.dataset`s
    :return: Metrics
    """
    total_loss = 0.0
    total_toks = 0
    epochs = 0
    if phase == 'Valid':
        self.valid_epochs += 1
        epochs = self.valid_epochs

    if self.model.requires_state:
        state = self.model.sess.run(self.model.initial_state, self.model.make_input(vs[0], False))

    fetches = {
        "loss": self.test_loss,
    }
    if self.model.requires_state:
        fetches["final_state"] = self.model.final_state

    start = time.time()

    for batch_dict in vs:
        feed_dict = {}
        if not dataset:
            feed_dict = self.model.make_input(batch_dict, False)
        # In Keras LSTM, the order is h first, c second, its the opposite in TF 1, however I dont think it
        # ends up mattering here
        if self.model.requires_state:
            for i, (s1, s2) in enumerate(self.model.initial_state):
                feed_dict[s1] = state[i][0]  # .c  # 0
                feed_dict[s2] = state[i][1]  # .h  # 1

        vals = self.model.sess.run(fetches, feed_dict)
        loss = vals["loss"]
        toks = self._num_toks(batch_dict)
        if self.model.requires_state:
            state = vals["final_state"]
        total_loss += loss * toks
        total_toks += toks

    metrics = self.calc_metrics(total_loss, total_toks)
    self.report(
        epochs, metrics, start,
        phase, 'EPOCH', reporting_fns
    )
    return metrics
Example #16
Source File: input_fn.py From bert-multitask-learning with MIT License
def to_serving_input(input_file_or_list, config, mode=PREDICT, tokenizer=None):
    '''A serving input function that takes input file path or
    list of string and apply BERT preprocessing. This fn will
    return a data dict instead of tf dataset. Used in serving.

    Arguments:
        input_file_or_list {str or list} -- file path of list of str
        config {Params} -- Params

    Keyword Arguments:
        mode {str} -- ModeKeys (default: {PREDICT})
        tokenizer {tokenizer} -- Tokenizer (default: {None})
    '''
    # if is string, treat it as path to file
    if isinstance(input_file_or_list, str):
        inputs = open(input_file_or_list, 'r', encoding='utf8').readlines()
    else:
        inputs = input_file_or_list

    if tokenizer is None:
        tokenizer = FullTokenizer(config.vocab_file)

    data_dict = {}
    for doc in inputs:
        inputs_a = cluster_alphnum(doc)
        tokens, target = tokenize_text_with_seqs(
            tokenizer, inputs_a, None)

        tokens_a, tokens_b, target = truncate_seq_pair(
            tokens, None, target, config.max_seq_len)

        tokens, segment_ids, target = add_special_tokens_with_seqs(
            tokens_a, tokens_b, target)

        input_mask, tokens, segment_ids, target = create_mask_and_padding(
            tokens, segment_ids, target, config.max_seq_len)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        data_dict['input_ids'] = input_ids
        data_dict['input_mask'] = input_mask
        data_dict['segment_ids'] = segment_ids
        yield data_dict
Example #17
Source File: eager.py From mead-baseline with Apache License 2.0
def _test(self, loader, steps=0, **kwargs):
    """Test an epoch of data using either the input loader or using `tf.dataset`

    In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and
    feed it to the feed dict. When we use `tf.dataset`s under the hood, this function
    simply uses the loader to know how many steps to train.

    :param loader: A data feed
    :param kwargs: See below

    :Keyword Arguments:
      * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
      * *reporting_fns* (`list`) A list of reporting hooks to use
      * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

    :return: Metrics
    """
    cm = ConfusionMatrix(self.model.labels)
    total_loss = 0
    total_norm = 0
    verbose = kwargs.get("verbose", None)

    pg = create_progress_bar(steps)
    SET_TRAIN_FLAG(False)
    for features, y in pg(loader):
        logits = self.model(features)
        y_ = tf.argmax(logits, axis=1, output_type=tf.int32)
        # Add each batch to the confusion matrix exactly once
        cm.add_batch(y, y_)
        lossv = tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=y, logits=logits).numpy()
        batchsz = int(y.shape[0])
        assert len(y_) == batchsz
        total_loss += lossv * batchsz
        total_norm += batchsz

    metrics = cm.get_all_metrics()
    metrics['avg_loss'] = total_loss / float(total_norm)
    verbose_output(verbose, cm)

    return metrics
Example #18
Source File: eager.py From mead-baseline with Apache License 2.0
def _train(self, loader, steps=0, **kwargs):
    """Train an epoch of data using either the input loader or using `tf.dataset`

    In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and
    feed it to the feed dict. When we use `tf.dataset`s under the hood, this function
    simply uses the loader to know how many steps to train. We do use a `feed_dict`
    for passing the `TRAIN_FLAG` in either case

    :param loader: A data feed
    :param kwargs: See below

    :Keyword Arguments:
      * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
      * *reporting_fns* (`list`) A list of reporting hooks to use

    :return: Metrics
    """
    SET_TRAIN_FLAG(True)
    reporting_fns = kwargs.get('reporting_fns', [])
    pg = create_progress_bar(steps)

    epoch_loss = tf.Variable(0.0)
    epoch_div = tf.Variable(0, dtype=tf.int32)
    nstep_loss = tf.Variable(0.0)
    nstep_div = tf.Variable(0, dtype=tf.int32)
    self.nstep_start = time.time()

    @tf.function
    def _train_step(inputs):
        """Replicated training step."""
        features, y = inputs
        loss = self.optimizer.update(self.model, features, y)
        batchsz = get_shape_as_list(y)[0]
        report_loss = loss * batchsz
        return report_loss, batchsz

    for inputs in pg(loader):
        step_report_loss, step_batchsz = _train_step(inputs)
        epoch_loss.assign_add(step_report_loss)
        nstep_loss.assign_add(step_report_loss)
        epoch_div.assign_add(step_batchsz)
        nstep_div.assign_add(step_batchsz)

        step = self.optimizer.global_step.numpy() + 1
        if step % self.nsteps == 0:
            metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
            self.report(
                step, metrics, self.nstep_start,
                'Train', 'STEP', reporting_fns, self.nsteps
            )
            nstep_loss.assign(0.0)
            nstep_div.assign(0)
            self.nstep_start = time.time()

    epoch_loss = epoch_loss.numpy()
    epoch_div = epoch_div.numpy()
    metrics = self.calc_metrics(epoch_loss, epoch_div)
    return metrics
Example #19
Source File: eager.py From mead-baseline with Apache License 2.0
def train(self, ts, reporting_fns, dataset=True):
    """Train by looping over the steps

    For a `tf.dataset`-backed `fit_func`, we are using the previously wired `dataset`s
    in the model (and `dataset` is `True`). For `feed_dict`, we convert the ts samples
    to `feed_dict`s and hand them in one-by-one

    :param ts: The training set
    :param reporting_fns: A list of reporting hooks
    :param dataset: (`bool`) Are we using `tf.dataset`s
    :return: Metrics
    """
    SET_TRAIN_FLAG(True)
    epoch_loss = tf.Variable(0.0)
    epoch_div = tf.Variable(0, dtype=tf.int32)
    nstep_loss = tf.Variable(0.0)
    nstep_div = tf.Variable(0, dtype=tf.int32)
    self.nstep_start = time.time()
    start = time.time()

    @tf.function
    def _train_step(features, y):
        """Replicated training step."""
        loss = self.optimizer.update(self.model, features, y)
        toks = self._num_toks(features['tgt_len'])
        report_loss = loss * tf.cast(toks, tf.float32)
        return report_loss, toks

    with autograph_options({"function_optimization": False, "layout_optimizer": False}):
        for features, y in ts:
            features['dst'] = y[:, :-1]
            step_report_loss, step_toks = _train_step(features, y)
            epoch_loss.assign_add(step_report_loss)
            nstep_loss.assign_add(step_report_loss)
            epoch_div.assign_add(step_toks)
            nstep_div.assign_add(step_toks)

            step = self.optimizer.global_step.numpy() + 1
            if step % self.nsteps == 0:
                metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
                self.report(
                    step, metrics, self.nstep_start,
                    'Train', 'STEP', reporting_fns, self.nsteps
                )
                nstep_loss.assign(0.0)
                nstep_div.assign(0)
                self.nstep_start = time.time()

    epoch_loss = epoch_loss.numpy()
    epoch_div = epoch_div.numpy()
    metrics = self.calc_metrics(epoch_loss, epoch_div)
    self.train_epochs += 1
    self.report(
        self.train_epochs, metrics, start,
        'Train', 'EPOCH', reporting_fns
    )
    return metrics
Example #20
Source File: utils.py From mead-baseline with Apache License 2.0
def _test(self, loader, **kwargs):
    """Test an epoch of data using either the input loader or using `tf.dataset`

    In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and
    feed it to the feed dict. When we use `tf.dataset`s under the hood, this function
    simply uses the loader to know how many steps to train.

    :param loader: A data feed
    :param kwargs: See below

    :Keyword Arguments:
      * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
      * *reporting_fns* (`list`) A list of reporting hooks to use
      * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

    :return: Metrics
    """
    if self.ema:
        self.sess.run(self.ema_load)

    use_dataset = kwargs.get('dataset', True)

    cm = ConfusionMatrix(self.model.labels)
    steps = len(loader)
    total_loss = 0
    total_norm = 0
    verbose = kwargs.get("verbose", None)

    pg = create_progress_bar(steps)
    for i, batch_dict in enumerate(pg(loader)):
        y = batch_dict['y']
        if use_dataset:
            guess, lossv = self.sess.run([self.model.best, self.test_loss])
        else:
            feed_dict = self.model.make_input(batch_dict, False)
            guess, lossv = self.sess.run([self.model.best, self.test_loss], feed_dict=feed_dict)
        batchsz = len(guess)
        total_loss += lossv * batchsz
        total_norm += batchsz
        cm.add_batch(y, guess)

    metrics = cm.get_all_metrics()
    metrics['avg_loss'] = total_loss / float(total_norm)
    verbose_output(verbose, cm)

    return metrics
Example #21
Source File: utils.py From mead-baseline with Apache License 2.0
def _train(self, loader, dataset=True, **kwargs):
    """Train an epoch of data using either the input loader or using `tf.dataset`

    In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and
    feed it to the feed dict. When we use `tf.dataset`s under the hood, this function
    simply uses the loader to know how many steps to train. We do use a `feed_dict`
    for passing the `TRAIN_FLAG` in either case

    :param loader: A data feed
    :param kwargs: See below

    :Keyword Arguments:
      * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
      * *reporting_fns* (`list`) A list of reporting hooks to use

    :return: Metrics
    """
    if self.ema:
        self.sess.run(self.ema_restore)

    reporting_fns = kwargs.get('reporting_fns', [])
    epoch_loss = 0
    epoch_div = 0
    steps = len(loader)
    pg = create_progress_bar(steps)
    for batch_dict in pg(loader):
        if dataset:
            _, step, lossv = self.sess.run([self.train_op, self.global_step, self.loss],
                                           feed_dict={TRAIN_FLAG(): 1})
        else:
            feed_dict = self.model.make_input(batch_dict, True)
            _, step, lossv = self.sess.run([self.train_op, self.global_step, self.loss], feed_dict=feed_dict)

        batchsz = self._get_batchsz(batch_dict)
        report_lossv = lossv * batchsz
        epoch_loss += report_lossv
        epoch_div += batchsz
        self.nstep_agg += report_lossv
        self.nstep_div += batchsz

        if (step + 1) % self.nsteps == 0:
            metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
            self.report(
                step + 1, metrics, self.nstep_start,
                'Train', 'STEP', reporting_fns, self.nsteps
            )
            self.reset_nstep()

    metrics = self.calc_metrics(epoch_loss, epoch_div)
    return metrics
Example #22
Source File: eager.py From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, phase='Valid', dataset=True, **kwargs):
    """Run an epoch of testing over the dataset

    If we are using a `tf.dataset`-based `fit_func`, we will just cycle the number
    of steps and let the `dataset` yield new batches.

    If we are using `feed_dict`s, we convert each batch from the `DataFeed` and pass
    that into TF as the `feed_dict`

    :param vs: A validation set
    :param reporting_fns: Reporting hooks
    :param phase: The phase of evaluation (`Test`, `Valid`)
    :param dataset: (`bool`) Are we using `tf.dataset`s
    :return: Metrics
    """
    SET_TRAIN_FLAG(False)
    if phase == 'Test':
        return self._evaluate(vs, reporting_fns, **kwargs)

    self.valid_epochs += 1

    total_loss = 0
    total_toks = 0
    preds = []
    golds = []

    start = time.time()

    for features, tgt in vs:
        features['dst'] = tgt[:, :-1]
        top_preds = self.model.predict(features, beam=1, make_input=False)
        loss_value = loss(self.model, features, tgt).numpy()
        toks = tf.cast(self._num_toks(features['tgt_len']), tf.float32).numpy()
        total_loss += loss_value * toks
        total_toks += toks
        preds.extend(convert_seq2seq_preds(top_preds[:, 0, :], self.tgt_rlut))
        golds.extend(convert_seq2seq_golds(tgt, features['tgt_len'], self.tgt_rlut))

    metrics = self.calc_metrics(total_loss, total_toks)
    metrics['bleu'] = bleu(preds, golds, self.bleu_n_grams)[0]
    self.report(
        self.valid_epochs, metrics, start,
        phase, 'EPOCH', reporting_fns
    )
    return metrics
Example #23
Source File: pipeline.py From ranking with Apache License 2.0
def __init__(self,
             context_feature_columns,
             example_feature_columns,
             hparams,
             estimator,
             label_feature_name="relevance",
             label_feature_type=tf.int64,
             dataset_reader=tf.data.TFRecordDataset,
             best_exporter_metric=None,
             best_exporter_metric_higher_better=True,
             size_feature_name=None):
    """Constructor.

    Args:
      context_feature_columns: (dict) Context (aka, query) feature columns.
      example_feature_columns: (dict) Example (aka, document) feature columns.
      hparams: (dict) A dict containing model hyperparameters.
      estimator: (`Estimator`) An `Estimator` instance for model train and eval.
      label_feature_name: (str) The name of the label feature.
      label_feature_type: (`tf.dtype`) The value type of the label feature.
      dataset_reader: (`tf.Dataset`) The dataset format for the input files.
      best_exporter_metric: (str) Metric key for exporting the best model. If
        None, exports the model with the minimal loss value.
      best_exporter_metric_higher_better: (bool) If a higher metric is better.
        This is only used if `best_exporter_metric` is not None.
      size_feature_name: (str) If set, populates the feature dictionary with
        this name and the corresponding value is a `tf.int32` Tensor of shape
        [batch_size] indicating the actual sizes of the example lists before
        padding and truncation. If None, which is default, this feature is not
        generated.
    """
    self._validate_parameters(estimator, hparams)

    self._context_feature_columns = context_feature_columns
    self._example_feature_columns = example_feature_columns
    self._hparams = hparams
    self._estimator = estimator
    self._label_feature_name = label_feature_name
    self._label_feature_type = label_feature_type
    self._dataset_reader = dataset_reader
    self._best_exporter_metric = best_exporter_metric
    self._best_exporter_metric_higher_better = (
        best_exporter_metric_higher_better)
    self._size_feature_name = size_feature_name
Example #24
Source File: distributed.py From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, steps=0, phase='Valid', **kwargs):
    """Run an epoch of testing over the dataset

    If we are using a `tf.dataset`-based `fit_func`, we will just cycle the number
    of steps and let the `dataset` yield new batches.

    If we are using `feed_dict`s, we convert each batch from the `DataFeed` and pass
    that into TF as the `feed_dict`

    :param vs: A validation set
    :param reporting_fns: Reporting hooks
    :param phase: The phase of evaluation (`Test`, `Valid`)
    :param dataset: (`bool`) Are we using `tf.dataset`s
    :return: Metrics
    """
    def _replicated_valid_step(inputs):
        features, tgt = inputs
        top_preds = self.model.predict(features, beam=1, make_input=False)
        per_replica_loss = loss(self.model, features, tgt)
        per_replica_toks = self._num_toks(features['tgt_len'])
        per_replica_report_loss = per_replica_loss * tf.cast(per_replica_toks, tf.float32)
        return per_replica_report_loss, per_replica_toks, top_preds

    if phase == 'Test':
        SET_TRAIN_FLAG(False)
        return self._evaluate(vs, reporting_fns, **kwargs)

    strategy = self.strategy
    num_replicas = strategy.num_replicas_in_sync

    with strategy.scope():
        SET_TRAIN_FLAG(False)
        self.valid_epochs += 1

        total_loss = tf.Variable(0.0)
        total_toks = tf.Variable(0, dtype=tf.int32)
        preds = []
        golds = []

        start = time.time()
        test_iter = iter(vs)

        for i in range(steps):
            features, tgt = next(test_iter)
            inputs = (features, tgt)
            per_replica_loss, per_replica_toks, _ = strategy.experimental_run_v2(
                _replicated_valid_step, args=(inputs,))
            total_loss.assign_add(strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None))
            total_toks.assign_add(strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_toks, axis=None))
            # Not sure a good way to get top preds merged yet

        metrics = self.calc_metrics(total_loss.numpy(), total_toks.numpy())
        self.report(
            self.valid_epochs, metrics, start,
            phase, 'EPOCH', reporting_fns
        )
        return metrics
Example #25
Source File: utils.py From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, phase='Valid', dataset=True):
    """Run an epoch of testing over the dataset

    If we are using a `tf.dataset`-based `fit_func`, we will just cycle the number
    of steps and let the `dataset` yield new batches.

    If we are using `feed_dict`s, we convert each batch from the `DataFeed` and pass
    that into TF as the `feed_dict`

    :param vs: A validation set
    :param reporting_fns: Reporting hooks
    :param phase: The phase of evaluation (`Test`, `Valid`)
    :param dataset: (`bool`) Are we using `tf.dataset`s
    :return: Metrics
    """
    if phase == 'Test' and not dataset:
        return self._evaluate(vs, reporting_fns)

    self.valid_epochs += 1

    total_loss = 0
    total_toks = 0
    preds = []
    golds = []

    start = time.time()
    pg = create_progress_bar(len(vs))
    for batch_dict in pg(vs):
        if dataset:
            lossv, top_preds = self.model.sess.run([self.test_loss, self.model.decoder.best])
        else:
            feed_dict = self.model.make_input(batch_dict)
            lossv, top_preds = self.model.sess.run([self.test_loss, self.model.decoder.best],
                                                   feed_dict=feed_dict)
        toks = self._num_toks(batch_dict['tgt_lengths'])
        total_loss += lossv * toks
        total_toks += toks

        preds.extend(convert_seq2seq_preds(top_preds.T, self.tgt_rlut))
        golds.extend(convert_seq2seq_golds(batch_dict['tgt'], batch_dict['tgt_lengths'], self.tgt_rlut))

    metrics = self.calc_metrics(total_loss, total_toks)
    metrics['bleu'] = bleu(preds, golds, self.bleu_n_grams)[0]
    self.report(
        self.valid_epochs, metrics, start,
        phase, 'EPOCH', reporting_fns
    )
    return metrics
Example #26
Source File: utils.py From mead-baseline with Apache License 2.0
def train(self, ts, reporting_fns, dataset=True):
    """Train by looping over the steps

    For a `tf.dataset`-backed `fit_func`, we are using the previously wired `dataset`s
    in the model (and `dataset` is `True`). For `feed_dict`, we convert the ts samples
    to `feed_dict`s and hand them in one-by-one

    :param ts: The training set
    :param reporting_fns: A list of reporting hooks
    :param dataset: (`bool`) Are we using `tf.dataset`s
    :return: Metrics
    """
    epoch_loss = 0
    epoch_toks = 0

    start = time.time()
    self.nstep_start = start
    for batch_dict in ts:
        if dataset:
            _, global_step, lossv = self.sess.run([self.train_op, self.global_step, self.loss],
                                                  feed_dict={TRAIN_FLAG(): 1})
        else:
            feed_dict = self.model.make_input(batch_dict, True)
            _, global_step, lossv = self.sess.run([self.train_op, self.global_step, self.loss],
                                                  feed_dict=feed_dict)

        # ?? How to get this cleaner?
        toks = self._num_toks(batch_dict['tgt_lengths'])
        report_loss = lossv * toks

        epoch_loss += report_loss
        epoch_toks += toks
        self.nstep_agg += report_loss
        self.nstep_div += toks

        if (global_step + 1) % self.nsteps == 0:
            metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
            self.report(
                global_step + 1, metrics, self.nstep_start,
                'Train', 'STEP', reporting_fns, self.nsteps
            )
            self.reset_nstep()

    metrics = self.calc_metrics(epoch_loss, epoch_toks)
    self.train_epochs += 1
    self.report(
        self.train_epochs, metrics, start,
        'Train', 'EPOCH', reporting_fns
    )
    return metrics
Example #27
Source File: eager.py From mead-baseline with Apache License 2.0
def _train(self, loader, steps=0, **kwargs):
    """Train an epoch of data using either the input loader or using `tf.dataset`

    In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and
    feed it to the feed dict. When we use `tf.dataset`s under the hood, this function
    simply uses the loader to know how many steps to train. We do use a `feed_dict`
    for passing the `TRAIN_FLAG` in either case

    :param loader: A data feed
    :param kwargs: See below

    :Keyword Arguments:
      * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
      * *reporting_fns* (`list`) A list of reporting hooks to use

    :return: Metrics
    """
    SET_TRAIN_FLAG(True)
    reporting_fns = kwargs.get('reporting_fns', [])
    pg = create_progress_bar(steps)

    epoch_loss = tf.Variable(0.0)
    epoch_div = tf.Variable(0, dtype=tf.int32)
    nstep_loss = tf.Variable(0.0)
    nstep_div = tf.Variable(0, dtype=tf.int32)
    self.nstep_start = time.time()

    @tf.function
    def _train_step(inputs):
        features, y = inputs
        loss = self.optimizer.update(self.model, features, y)
        batchsz = get_shape_as_list(y)[0]
        report_loss = loss * batchsz
        return report_loss, batchsz

    with autograph_options({"function_optimization": False, "layout_optimizer": False}):
        for inputs in pg(loader):
            step_report_loss, step_batchsz = _train_step(inputs)
            epoch_loss.assign_add(step_report_loss)
            nstep_loss.assign_add(step_report_loss)
            epoch_div.assign_add(step_batchsz)
            nstep_div.assign_add(step_batchsz)

            step = self.optimizer.global_step.numpy() + 1
            if step % self.nsteps == 0:
                metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
                self.report(
                    step, metrics, self.nstep_start,
                    'Train', 'STEP', reporting_fns, self.nsteps
                )
                nstep_loss.assign(0.0)
                nstep_div.assign(0)
                self.nstep_start = time.time()

    epoch_loss = epoch_loss.numpy()
    epoch_div = epoch_div.numpy()
    metrics = self.calc_metrics(epoch_loss, epoch_div)
    return metrics
Example #28
Source File: utils.py From mead-baseline with Apache License 2.0
def _train(self, ts, **kwargs):
    """Train an epoch of data using either the input loader or using `tf.dataset`

    In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and
    feed it to the feed dict. When we use `tf.dataset`s under the hood, this function
    simply uses the loader to know how many steps to train. We do use a `feed_dict`
    for passing the `TRAIN_FLAG` in either case

    :param ts: A data feed
    :param kwargs: See below

    :Keyword Arguments:
      * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
      * *reporting_fns* (`list`) A list of reporting hooks to use

    :return: Metrics
    """
    use_dataset = kwargs.get('dataset', True)
    reporting_fns = kwargs.get('reporting_fns', [])
    epoch_loss = 0
    epoch_div = 0
    steps = len(ts)
    pg = create_progress_bar(steps)
    for batch_dict in pg(ts):
        if use_dataset:
            _, step, lossv = self.sess.run([self.train_op, self.global_step, self.loss],
                                           feed_dict={TRAIN_FLAG(): 1})
        else:
            feed_dict = self.model.make_input(batch_dict, True)
            _, step, lossv = self.sess.run([self.train_op, self.global_step, self.loss], feed_dict=feed_dict)

        batchsz = self._get_batchsz(batch_dict)
        report_loss = lossv * batchsz
        epoch_loss += report_loss
        epoch_div += batchsz
        self.nstep_agg += report_loss
        self.nstep_div += batchsz

        if (step + 1) % self.nsteps == 0:
            metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
            self.report(
                step + 1, metrics, self.nstep_start,
                'Train', 'STEP', reporting_fns, self.nsteps
            )
            self.reset_nstep()

    metrics = self.calc_metrics(epoch_loss, epoch_div)
    return metrics