Python tensorflow.dataset() Examples
The following are 30 code examples of tensorflow.dataset().
Example #1
Source File: From mead-baseline with Apache License 2.0 | 6 votes |
def _evaluate(self, es, reporting_fns, **kwargs):
    """Run the model with beam search and report Bleu.

    :param es: `tf.dataset` of input, iterated as `(features, tgt)` pairs
    :param reporting_fns: Input hooks
    :param kwargs: Forwarded unchanged to `self.model.predict`
    :return: A `dict` holding a single `bleu` entry
    """
    preds = []
    golds = []
    start = time.time()
    for features, tgt in es:
        # `dst` is the target with its last position along axis 1 dropped
        features['dst'] = tgt[:, :-1]
        tgt_lens = features.pop('tgt_len')
        top_preds = self.model.predict(features, make_input=False, **kwargs)
        # Only index 0 along axis 1 of the predictions is scored
        preds.extend(convert_seq2seq_preds(top_preds[:, 0, :], self.tgt_rlut))
        golds.extend(convert_seq2seq_golds(tgt, tgt_lens, self.tgt_rlut))
    metrics = {'bleu': bleu(preds, golds, self.bleu_n_grams)[0]}
    # NOTE(review): the callee was missing from the source text (only the
    # argument list survived); `self.report` is reconstructed -- confirm upstream.
    self.report(
        0, metrics, start, 'Test', 'EPOCH', reporting_fns
    )
    return metrics
Example #2
Source File: From MMNet with Apache License 2.0 | 6 votes |
def setup_dataset(
    self,
    placeholders: Tuple[tf.placeholder, tf.placeholder],
    batch_size: int=None,
):
    """Build the input pipeline and expose its iterator and next-element op.

    Sets `self.batch_size`, `self.dataset`, `self.iterator` and `self.next_elem`.

    :param placeholders: Pair of placeholders the dataset is built from
    :param batch_size: Overrides `self.args.batch_size` when not `None`
    """
    self.batch_size = self.args.batch_size if batch_size is None else batch_size

    # NOTE(review): two expressions were missing from the source text here
    # ("dataset = dataset =, num_parallel_calls=..."). The dataset constructor
    # and the mapped function below are reconstructions; in particular the name
    # `self._parse_function` is NOT visible in the source -- confirm upstream.
    dataset = tf.data.Dataset.from_tensor_slices(placeholders)
    dataset = dataset.map(
        self._parse_function, num_parallel_calls=self.args.num_threads).prefetch(
        self.args.prefetch_factor * self.batch_size)
    if self.is_training:
        dataset = dataset.repeat()
    if self.shuffle:
        dataset = dataset.shuffle(buffer_size=self.args.buffer_size)
    self.dataset = dataset.batch(self.batch_size)
    self.iterator = self.dataset.make_initializable_iterator()
    self.next_elem = self.iterator.get_next()
Example #3
Source File: From mead-baseline with Apache License 2.0 | 6 votes |
def _test(self, ts, dataset=True): """Test an epoch of data using either the input loader or using `tf.dataset` In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps to train. :param loader: A data feed :param kwargs: See below :Keyword Arguments: * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True` * *reporting_fns* (`list`) A list of reporting hooks to use * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on :return: Metrics """ return self.evaluator.test(ts, dataset=dataset)
Example #4
Source File: From mead-baseline with Apache License 2.0 | 6 votes |
def _test(self, ts, steps=0, **kwargs): """Test an epoch of data using either the input loader or using `tf.dataset` In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps to train. :param loader: A data feed :param kwargs: See below :Keyword Arguments: * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True` * *reporting_fns* (`list`) A list of reporting hooks to use * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on :return: Metrics """ return self.evaluator.test(ts, steps, **kwargs)
Example #5
Source File: From mead-baseline with Apache License 2.0 | 5 votes |
def distribute(self, dataset):
    """Pass `dataset` to `self.strategy.experimental_distribute_dataset` and return the result."""
    strategy = self.strategy
    return strategy.experimental_distribute_dataset(dataset)
Example #6
Source File: From mead-baseline with Apache License 2.0 | 5 votes |
def distribute(self, dataset):
    """Pass `dataset` to `self.strategy.experimental_distribute_dataset` and return the result."""
    strategy = self.strategy
    return strategy.experimental_distribute_dataset(dataset)
Example #7
Source File: From mead-baseline with Apache License 2.0 | 5 votes |
def distribute(self, dataset):
    """Pass `dataset` to `self.strategy.experimental_distribute_dataset` and return the result."""
    strategy = self.strategy
    return strategy.experimental_distribute_dataset(dataset)
Example #8
Source File: From mead-baseline with Apache License 2.0 | 5 votes |
def test(self, vs, reporting_fns, phase):
    """Run an epoch of testing over the dataset

    Iterates `vs` as `(features, y)` pairs, accumulates the token-weighted loss
    and reports the resulting metrics once at the end.

    :param vs: A validation set
    :param reporting_fns: Reporting hooks
    :param phase: The phase of evaluation (`Test`, `Valid`)
    :return: Metrics
    """
    total_loss = 0.0
    total_toks = 0
    epochs = 0
    # Only the `Valid` phase advances (and reports) the epoch counter
    if phase == 'Valid':
        self.valid_epochs += 1
        epochs = self.valid_epochs
    SET_TRAIN_FLAG(False)

    start = time.time()
    h = None  # state threaded from one batch to the next when the model requires it
    for features, y in vs:
        if self.model.requires_state:
            loss_value, h = loss_with_state(self.model, h, features, y)
        else:
            loss_value = loss_without_state(self.model, features, y)
        loss_value = loss_value.numpy()
        toks = self._num_toks(y)
        total_loss += loss_value * tf.cast(toks, tf.float32).numpy()
        total_toks += toks.numpy()

    metrics = self.calc_metrics(total_loss, total_toks)
    # NOTE(review): the callee was missing from the source text (only the
    # argument list survived); `self.report` is reconstructed -- confirm upstream.
    self.report(
        epochs, metrics, start,
        phase, 'EPOCH', reporting_fns
    )
    return metrics
Example #9
Source File: From tf-gqn with Apache License 2.0 | 5 votes |
def _get_dataset_files(dataset_info, mode, root): """Generates lists of files for a given dataset version.""" basepath = dataset_info.basepath base = os.path.join(root, basepath, mode) if mode == 'train': num_files = dataset_info.train_size else: num_files = dataset_info.test_size length = len(str(num_files)) template = '{:0%d}-of-{:0%d}.tfrecord' % (length, length) record_paths = [ # indexing runs from 1 to n os.path.join(base, template.format(i, num_files)) for i in range(1, num_files + 1)] return record_paths
Example #10
Source File: From mead-baseline with Apache License 2.0 | 5 votes |
def to_tensors(ts, src_lengths_key, dst=False):
    """Flatten a data feed into a `features` dict plus the target array

    Every field of the first batch is kept except `ids` and any field whose
    name contains `_lengths`. The `src_lengths_key` and `tgt_lengths` fields
    are then added back and exposed as `src_len` and `tgt_len`. All batches
    are concatenated sample by sample, stacked, and cast to `int32`.

    :param ts: The data feed to convert (a sequence of batch dicts)
    :param src_lengths_key: Name of the field holding the source lengths
    :param dst: `bool`; when true, add a `dst` entry holding `tgt` without its
        last position along axis 1
    :return: A `tuple` of `features` and `tgt` (`tgt` is removed from `features`)
    """
    # This is kind of a hack
    wanted = [name for name in ts[0].keys() if '_lengths' not in name and name != 'ids']
    wanted = wanted + [src_lengths_key, "tgt_lengths"]

    features = {name: [] for name in wanted}
    for batch in ts:
        for name in wanted:
            features[name].extend(batch[name])

    # Rename the two length fields to their canonical names
    features['src_len'] = features[src_lengths_key]
    del features[src_lengths_key]
    features['tgt_len'] = features['tgt_lengths']
    del features['tgt_lengths']

    features = {name: np.stack(rows).astype(np.int32) for name, rows in features.items()}
    if dst:
        features['dst'] = features['tgt'][:, :-1]
    tgt = features.pop('tgt')
    return features, tgt
Example #11
Source File: From MMNet with Apache License 2.0 | 5 votes |
def resize_and_padding_before_augmentation(self, image, size):
    """Resize `image` keeping its aspect ratio, then crop/pad it to a square.

    Only active when `self.args.debug_augmentation` is set; otherwise a
    constant `uint8` dummy tensor of shape
    `self.padded_original_image_dummy_shape` is returned.

    :param image: Image tensor without a batch axis (one is added and removed here)
    :param size: Target size pair; must be square (`size[0] == size[1]`)
    :return: The resized and padded image, or the dummy tensor
    """
    if not self.args.debug_augmentation:
        # Have to return some dummy tensor which have .get_shape() to tf.dataset
        return tf.constant(0, shape=self.padded_original_image_dummy_shape, dtype=tf.uint8, name="dummy")

    assert size[0] == size[1], "resize_and_padding_before_augmentation only supports square target image"

    batched = tf.expand_dims(image, 0)
    dims = tf.shape(batched)
    height = dims[1]
    width = dims[2]

    # The shorter side is scaled to `short_side`; the longer side follows the
    # aspect ratio and is never allowed to drop below `short_side`.
    short_side = min(*size)
    new_width = tf.maximum(short_side, tf.cast(width * short_side / height, dtype=tf.int32))
    new_height = tf.maximum(short_side, tf.cast(height * short_side / width, dtype=tf.int32))

    batched = tf.image.resize_bilinear(batched, (new_height, new_width))
    limit = self.padded_max_size
    # Crop anything that exceeds the padded square
    batched = batched[:, :limit, :limit, :]

    # Zero-pad up to (limit, limit). The offsets equal the shortfall on each
    # axis, so the padding is inserted before the image rows/columns.
    batched = tf.image.pad_to_bounding_box(
        batched,
        offset_height=tf.maximum(limit-new_height, 0),
        offset_width=tf.maximum(limit-new_width, 0),
        target_height=limit,
        target_width=limit,
    )
    return tf.squeeze(batched, 0)
Example #12
Source File: From MMNet with Apache License 2.0 | 5 votes |
def add_arguments(parser):
    """Register the options shared by every data wrapper on `parser`.

    All options go into one argument group. Boolean options are exposed as a
    `--no-<name>` / `--<name>` pair writing to a single destination.

    :param parser: The argument parser to extend
    """
    group = parser.add_argument_group("(DataWrapperBase) Common Arguments for all data wrapper.")
    add = group.add_argument

    def add_switch(off_flag, on_flag, dest, default):
        # Paired off/on flags sharing one boolean destination with a default.
        add(off_flag, dest=dest, action="store_false")
        add(on_flag, dest=dest, action="store_true")
        group.set_defaults(**{dest: default})

    add("--dataset_path", required=True, type=str, help="The name of the dataset to load.")
    add("--dataset_split_name", required=True, type=str, nargs="*",
        help="The name of the train/test split. Support multiple splits")
    add("--batch_size", default=32, type=utils.positive_int,
        help="The number of examples in batch.")

    add_switch("--no-shuffle", "--shuffle", "shuffle", True)

    add("--width", required=True, type=int)
    add("--height", required=True, type=int)

    add_switch("--no-debug_augmentation", "--debug_augmentation", "debug_augmentation", False)

    add("--max_padded_size", default=224, type=int,
        help=("We will resize & pads the original image "
              "until it has dimensions (padded_size, padded_size)"
              "Recommend to set this value as width(or height) * 1.8 ~ 2"))
    add("--augmentation_method", type=str, required=True,
        choices=_available_augmentation_methods)
    add("--num_threads", default=8, type=int)
    add("--buffer_size", default=1000, type=int)
    add("--prefetch_factor", default=100, type=int)
    add("--rotation_range", default=0, type=int,
        help="Receives maximum angle to be rotated in terms of degree: "
             "The image is randomly rotated by the angle "
             "randomly chosen from [-rotation_range, rotation_range], "
             "and then cropped appropriately to remove dark areas.\n"
             "So, be aware that the rotation performs certain kind of zooming.")

    add_switch("--no-has_sub_dataset", "--has_sub_dataset", "has_sub_dataset", False)
Example #13
Source File: From mead-baseline with Apache License 2.0 | 5 votes |
def process_batch(self, batch_dict, handle, txts, dataset=True):
    """Score one batch of tagger output against the gold labels.

    :param batch_dict: The batch; read for `ids`, `y` and `self.model.lengths_key`
    :param handle: Open CoNLL output handle, or `None` to skip writing
    :param txts: Lookup from sample id to its text; needed when `handle` is given
    :param dataset: (`bool`) When false, the batch is fed through a `feed_dict`
    :return: `(correct_labels, total_labels, gold_chunks, pred_chunks)`
    """
    # NOTE(review): both prediction expressions were missing from the source
    # text ("guess = else:" / "guess =, feed_dict=feed_dict)"). The session
    # attribute and the fetched tensor (`self.sess`, `self.model.best`) are
    # reconstructions and are NOT visible in the source -- confirm upstream.
    if dataset:
        guess = self.sess.run(self.model.best)
    else:
        feed_dict = self.model.make_input(batch_dict)
        guess = self.sess.run(self.model.best, feed_dict=feed_dict)

    sentence_lengths = batch_dict[self.model.lengths_key]
    ids = batch_dict['ids']
    truth = batch_dict['y']
    correct_labels = 0
    total_labels = 0

    # For fscore
    gold_chunks = []
    pred_chunks = []

    # For each sentence
    for b in range(len(guess)):
        length = sentence_lengths[b]
        sentence = guess[b][:length]
        # truth[b] is padded, cutting at :length gives us back true length
        gold = truth[b][:length]

        # Positions whose gold label is PAD are excluded from every count
        keep = gold != Offsets.PAD
        valid_guess = sentence[keep]
        valid_gold = gold[keep]
        valid_sentence_length = np.sum(keep)

        correct_labels += np.sum(np.equal(valid_guess, valid_gold))
        total_labels += valid_sentence_length
        gold_chunks.append(set(to_spans(valid_gold, self.idx2label, self.span_type, self.verbose)))
        pred_chunks.append(set(to_spans(valid_guess, self.idx2label, self.span_type, self.verbose)))

        # Should we write a file out?  If so, we have to have txts
        if handle is not None:
            sample_id = ids[b]
            txt = txts[sample_id]
            write_sentence_conll(handle, valid_guess, valid_gold, txt, self.idx2label)

    return correct_labels, total_labels, gold_chunks, pred_chunks
Example #14
Source File: From mead-baseline with Apache License 2.0 | 5 votes |
def to_tensors(ts, lengths_key):
    """Flatten a data feed into a `features` dict plus the label array

    Every field of the first batch is kept except `ids` and any field whose
    name contains `_lengths`. The `lengths_key` field is then added back and
    exposed as `lengths`. All batches are concatenated sample by sample and
    stacked.

    :param ts: The data feed to convert (a sequence of batch dicts)
    :param lengths_key: Name of the field holding the temporal lengths
    :return: A `tuple` of `features` and `y` (`y` is removed from `features`)
    """
    # This is kind of a hack
    wanted = [name for name in ts[0].keys() if '_lengths' not in name and name != 'ids']
    wanted = wanted + [lengths_key]

    features = {name: [] for name in wanted}
    for batch in ts:
        for name, rows in features.items():
            # add each sample
            rows.extend(batch[name])

    features['lengths'] = features[lengths_key]
    del features[lengths_key]

    features = {name: np.stack(rows) for name, rows in features.items()}
    y = features.pop('y')
    return features, y
Example #15
Source File: From ranking with Apache License 2.0 | 5 votes |
def _make_input_fn(self, input_pattern, batch_size, list_size, randomize_input=True, num_epochs=None): """Returns the input function for the ranking model. Args: input_pattern: (str) File pattern for the input data. batch_size: (int) The number of input examples to process per batch. list_size: (int) The list size for an ELWC example. randomize_input: (bool) If true, randomize input example order. It should almost always be true except for unittest/debug purposes. num_epochs: (int) The number of times the input dataset must be repeated. None to repeat the data indefinitely. Returns: An `input_fn` for `tf.estimator.Estimator`. """ def _input_fn(): """`input_fn` for the `Estimator`.""" return self._make_dataset( batch_size=batch_size, list_size=list_size, input_pattern=input_pattern, randomize_input=randomize_input, num_epochs=num_epochs) return _input_fn
Example #16
Source File: From bert-multitask-learning with MIT License | 4 votes |
def train_eval_input_fn(params, mode='train'):
    '''Train and eval input function of estimator.
    This function will write and read tf record for training
    and evaluation.

    Usage:
        def train_input_fn(): return train_eval_input_fn(params)
        estimator.train(
            train_input_fn, max_steps=params.train_steps, hooks=[train_hook])

    Arguments:
        params {Params} -- Params objects

    Keyword Arguments:
        mode {str} -- ModeKeys (default: {'train'})

    Returns:
        tf Dataset -- Tensorflow dataset
    '''
    write_tfrecord(params=params)

    dataset_dict = read_tfrecord(params=params, mode=mode)

    # NOTE(review): the call wrapping this list was missing from the source
    # text ("dataset = [ds for ...])"). `sample_from_datasets` is a
    # reconstruction and is NOT visible in the source -- confirm upstream.
    dataset = tf.data.experimental.sample_from_datasets(
        list(dataset_dict.values()))

    # Only training data is shuffled
    if mode == 'train':
        dataset = dataset.shuffle(params.shuffle_buffer)

    dataset = dataset.prefetch(params.prefetch)
    if params.dynamic_padding:
        # NOTE(review): the transformation name was missing from the source
        # text; `bucket_by_sequence_length` is reconstructed from the surviving
        # keyword arguments -- confirm upstream.
        dataset = dataset.apply(
            tf.data.experimental.bucket_by_sequence_length(
                element_length_func=element_length_func,
                bucket_batch_sizes=params.bucket_batch_sizes,
                bucket_boundaries=params.bucket_boundaries,
            ))
    else:
        # Evaluation uses twice the training batch size
        if mode == 'train':
            dataset = dataset.batch(params.batch_size)
        else:
            dataset = dataset.batch(params.batch_size*2)
    return dataset
Example #17
Source File: From mead-baseline with Apache License 2.0 | 4 votes |
def test(self, vs, reporting_fns, phase, dataset=True):
    """Run an epoch of testing over the dataset

    If we are using a `tf.dataset`-based `fit_func`, we will just
    cycle the number of steps and let the `dataset` yield new batches.

    If we are using `feed_dict`s, we convert each batch from the `DataFeed`
    and pass that into TF as the `feed_dict`

    :param vs: A validation set
    :param reporting_fns: Reporting hooks
    :param phase: The phase of evaluation (`Test`, `Valid`)
    :param dataset: (`bool`) Are we using `tf.dataset`s
    :return: Metrics
    """
    total_loss = 0.0
    total_toks = 0
    epochs = 0
    # Only the `Valid` phase advances (and reports) the epoch counter
    if phase == 'Valid':
        self.valid_epochs += 1
        epochs = self.valid_epochs

    # NOTE(review): the session call and its first argument were missing from
    # the source text ("state =, self.model.make_input(...))"), as was the
    # later "vals =, feed_dict)". `self.sess.run`, `self.model.initial_state`
    # and `fetches` are reconstructions -- confirm upstream.
    if self.model.requires_state:
        state = self.sess.run(self.model.initial_state, self.model.make_input(vs[0], False))

    fetches = {
        "loss": self.test_loss,
    }
    if self.model.requires_state:
        fetches["final_state"] = self.model.final_state

    start = time.time()

    for batch_dict in vs:
        feed_dict = {}
        if not dataset:
            feed_dict = self.model.make_input(batch_dict, False)
        # In Keras LSTM, the order is h first, c second, its the opposite in TF 1, however I dont think it
        # ends up mattering here
        if self.model.requires_state:
            # Feed the previous batch's final state in as this batch's initial state
            for i, (s1, s2) in enumerate(self.model.initial_state):
                feed_dict[s1] = state[i][0]  # .c  # 0
                feed_dict[s2] = state[i][1]  # .h  # 1

        vals = self.sess.run(fetches, feed_dict)
        loss = vals["loss"]
        toks = self._num_toks(batch_dict)
        if self.model.requires_state:
            state = vals["final_state"]
        total_loss += loss * toks
        total_toks += toks

    metrics = self.calc_metrics(total_loss, total_toks)
    # NOTE(review): the callee was missing from the source text; `self.report`
    # is reconstructed from the surviving argument list -- confirm upstream.
    self.report(
        epochs, metrics, start,
        phase, 'EPOCH', reporting_fns
    )
    return metrics
Example #18
Source File: From bert-multitask-learning with MIT License | 4 votes |
def to_serving_input(input_file_or_list, config, mode=PREDICT, tokenizer=None):
    '''A serving input function that takes input file path or
    list of string and apply BERT preprocessing.

    This is a generator: it yields one data dict per input document
    instead of building a tf dataset. Used in serving.

    Arguments:
        input_file_or_list {str or list} -- file path of list of str
        config {Params} -- Params

    Keyword Arguments:
        mode {str} -- ModeKeys (default: {PREDICT}); not read by this function
        tokenizer {tokenizer} -- Tokenizer (default: {None}); when None a
            FullTokenizer is built from config.vocab_file

    Yields:
        dict -- with keys 'input_ids', 'input_mask' and 'segment_ids'
    '''

    # if is string, treat it as path to file
    if isinstance(input_file_or_list, str):
        # Context manager so the file handle is closed once the lines are read
        with open(input_file_or_list, 'r', encoding='utf8') as input_file:
            inputs = input_file.readlines()
    else:
        inputs = input_file_or_list

    if tokenizer is None:
        tokenizer = FullTokenizer(config.vocab_file)

    for doc in inputs:
        inputs_a = cluster_alphnum(doc)
        tokens, target = tokenize_text_with_seqs(
            tokenizer, inputs_a, None)

        tokens_a, tokens_b, target = truncate_seq_pair(
            tokens, None, target, config.max_seq_len)

        tokens, segment_ids, target = add_special_tokens_with_seqs(
            tokens_a, tokens_b, target)

        input_mask, tokens, segment_ids, target = create_mask_and_padding(
            tokens, segment_ids, target, config.max_seq_len)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # A fresh dict per document: re-yielding one shared dict would make
        # every previously yielded item change when the next one is produced.
        yield {
            'input_ids': input_ids,
            'input_mask': input_mask,
            'segment_ids': segment_ids,
        }
Example #19
Source File: From mead-baseline with Apache License 2.0 | 4 votes |
def _test(self, loader, steps=0, **kwargs):
    """Test an epoch of data, iterating `loader` as `(features, y)` pairs

    The model is called directly on each batch; predictions are the argmax of
    the logits along axis 1. The loss is averaged over all samples seen.

    :param loader: A data feed
    :param steps: Number of steps, used to size the progress bar
    :param kwargs: See below

    :Keyword Arguments:
      * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

    :return: Metrics
    """

    cm = ConfusionMatrix(self.model.labels)
    total_loss = 0
    total_norm = 0
    verbose = kwargs.get("verbose", None)

    pg = create_progress_bar(steps)

    SET_TRAIN_FLAG(False)
    for features, y in pg(loader):
        logits = self.model(features)
        y_ = tf.argmax(logits, axis=1, output_type=tf.int32)
        lossv = tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=y, logits=logits).numpy()
        batchsz = int(y.shape[0])
        assert len(y_) == batchsz
        # Weight the batch loss by batch size so the final average is per sample
        total_loss += lossv * batchsz
        total_norm += batchsz
        # Each batch is added to the confusion matrix exactly once; the
        # original added it twice, double-counting every sample.
        cm.add_batch(y, y_)

    metrics = cm.get_all_metrics()
    metrics['avg_loss'] = total_loss / float(total_norm)
    verbose_output(verbose, cm)

    return metrics
Example #20
Source File: From mead-baseline with Apache License 2.0 | 4 votes |
def _train(self, loader, steps=0, **kwargs):
    """Train an epoch of data, iterating `loader` batch by batch

    Loss is accumulated both for the whole epoch and for the current window
    of `self.nsteps` steps; the window is reported and reset whenever the
    global step reaches a multiple of `self.nsteps`.

    :param loader: A data feed
    :param steps: Number of steps, used to size the progress bar
    :param kwargs: See below

    :Keyword Arguments:
      * *reporting_fns* (`list`) A list of reporting hooks to use

    :return: Metrics
    """
    SET_TRAIN_FLAG(True)
    reporting_fns = kwargs.get('reporting_fns', [])
    pg = create_progress_bar(steps)
    epoch_loss = tf.Variable(0.0)
    epoch_div = tf.Variable(0, dtype=tf.int32)
    nstep_loss = tf.Variable(0.0)
    nstep_div = tf.Variable(0, dtype=tf.int32)
    self.nstep_start = time.time()

    @tf.function
    def _train_step(inputs):
        """Replicated training step."""
        features, y = inputs
        loss = self.optimizer.update(self.model, features, y)
        batchsz = get_shape_as_list(y)[0]
        report_loss = loss * batchsz
        return report_loss, batchsz

    for inputs in pg(loader):
        step_report_loss, step_batchsz = _train_step(inputs)
        epoch_loss.assign_add(step_report_loss)
        nstep_loss.assign_add(step_report_loss)
        epoch_div.assign_add(step_batchsz)
        nstep_div.assign_add(step_batchsz)

        step = self.optimizer.global_step.numpy() + 1
        if step % self.nsteps == 0:
            metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
            # NOTE(review): the callee was missing from the source text;
            # `self.report` is reconstructed from the surviving argument
            # list -- confirm upstream.
            self.report(
                step, metrics, self.nstep_start,
                'Train', 'STEP', reporting_fns, self.nsteps
            )
            # Start a fresh reporting window
            nstep_loss.assign(0.0)
            nstep_div.assign(0)
            self.nstep_start = time.time()

    epoch_loss = epoch_loss.numpy()
    epoch_div = epoch_div.numpy()
    metrics = self.calc_metrics(epoch_loss, epoch_div)
    return metrics
Example #21
Source File: From mead-baseline with Apache License 2.0 | 4 votes |
def train(self, ts, reporting_fns, dataset=True):
    """Train by looping over the steps

    Iterates `ts` as `(features, y)` pairs. Loss is accumulated for the whole
    epoch and for the current window of `self.nsteps` steps; both are reported.

    :param ts: The training set
    :param reporting_fns: A list of reporting hooks
    :param dataset: (`bool`) Are we using `tf.dataset`s (not read by this function)
    :return: Metrics
    """
    SET_TRAIN_FLAG(True)
    epoch_loss = tf.Variable(0.0)
    epoch_div = tf.Variable(0, dtype=tf.int32)
    nstep_loss = tf.Variable(0.0)
    nstep_div = tf.Variable(0, dtype=tf.int32)
    self.nstep_start = time.time()
    start = time.time()

    @tf.function
    def _train_step(features, y):
        """Replicated training step."""
        loss = self.optimizer.update(self.model, features, y)
        toks = self._num_toks(features['tgt_len'])
        report_loss = loss * tf.cast(toks, tf.float32)
        return report_loss, toks

    # NOTE(review): the extent of this `with` block is not recoverable from
    # the flattened source; it is assumed to cover the training loop only.
    with autograph_options({"function_optimization": False, "layout_optimizer": False}):
        for features, y in ts:
            # `dst` is the target with its last position along axis 1 dropped
            features['dst'] = y[:, :-1]
            step_report_loss, step_toks = _train_step(features, y)
            epoch_loss.assign_add(step_report_loss)
            nstep_loss.assign_add(step_report_loss)
            epoch_div.assign_add(step_toks)
            nstep_div.assign_add(step_toks)

            step = self.optimizer.global_step.numpy() + 1
            if step % self.nsteps == 0:
                metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
                # NOTE(review): the callee was missing from the source text;
                # `self.report` is reconstructed -- confirm upstream.
                self.report(
                    step, metrics, self.nstep_start,
                    'Train', 'STEP', reporting_fns, self.nsteps
                )
                # Start a fresh reporting window
                nstep_loss.assign(0.0)
                nstep_div.assign(0)
                self.nstep_start = time.time()

    epoch_loss = epoch_loss.numpy()
    epoch_div = epoch_div.numpy()
    metrics = self.calc_metrics(epoch_loss, epoch_div)
    self.train_epochs += 1
    # NOTE(review): callee reconstructed as above -- confirm upstream.
    self.report(
        self.train_epochs, metrics, start,
        'Train', 'EPOCH', reporting_fns
    )
    return metrics
Example #22
Source File: From mead-baseline with Apache License 2.0 | 4 votes |
def _test(self, loader, **kwargs):
    """Test an epoch of data using either the input loader or using `tf.dataset`

    In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
    When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
    to train.

    :param loader: A data feed
    :param kwargs: See below

    :Keyword Arguments:
      * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
      * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

    :return: Metrics
    """
    if self.ema:
        # NOTE(review): the body of this branch was missing from the source
        # text. `self.sess.run(self.ema_load)` is a reconstruction and is NOT
        # visible in the source -- confirm upstream.
        self.sess.run(self.ema_load)

    use_dataset = kwargs.get('dataset', True)

    cm = ConfusionMatrix(self.model.labels)
    steps = len(loader)
    total_loss = 0
    total_norm = 0
    verbose = kwargs.get("verbose", None)

    pg = create_progress_bar(steps)
    for batch_dict in pg(loader):
        y = batch_dict['y']
        # NOTE(review): the session call and the first fetch were missing from
        # the source text ("guess, lossv =[, self.test_loss])"). `self.sess.run`
        # and `self.model.best` are reconstructions -- confirm upstream.
        if use_dataset:
            guess, lossv = self.sess.run([self.model.best, self.test_loss])
        else:
            feed_dict = self.model.make_input(batch_dict, False)
            guess, lossv = self.sess.run([self.model.best, self.test_loss], feed_dict=feed_dict)

        batchsz = len(guess)
        # Weight the batch loss by batch size so the final average is per sample
        total_loss += lossv * batchsz
        total_norm += batchsz
        cm.add_batch(y, guess)

    metrics = cm.get_all_metrics()
    metrics['avg_loss'] = total_loss / float(total_norm)
    verbose_output(verbose, cm)

    return metrics
Example #23
Source File: From mead-baseline with Apache License 2.0 | 4 votes |
def _train(self, loader, dataset=True, **kwargs):
    """Train an epoch of data using either the input loader or using `tf.dataset`

    In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
    When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
    to train.  We do use a `feed_dict` for passing the `TRAIN_FLAG` in either case

    :param loader: A data feed
    :param dataset: (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
    :param kwargs: See below

    :Keyword Arguments:
      * *reporting_fns* (`list`) A list of reporting hooks to use

    :return: Metrics
    """
    if self.ema:
        # NOTE(review): the body of this branch was missing from the source
        # text. `self.sess.run(self.ema_restore)` is a reconstruction and is
        # NOT visible in the source -- confirm upstream.
        self.sess.run(self.ema_restore)

    reporting_fns = kwargs.get('reporting_fns', [])
    epoch_loss = 0
    epoch_div = 0
    steps = len(loader)
    pg = create_progress_bar(steps)
    for batch_dict in pg(loader):
        # NOTE(review): the session call was missing from the source text
        # ("_, step, lossv =[self.train_op, ...], feed_dict=...)");
        # `self.sess.run` is a reconstruction -- confirm upstream.
        if dataset:
            _, step, lossv = self.sess.run([self.train_op, self.global_step, self.loss],
                                           feed_dict={TRAIN_FLAG(): 1})
        else:
            feed_dict = self.model.make_input(batch_dict, True)
            _, step, lossv = self.sess.run([self.train_op, self.global_step, self.loss],
                                           feed_dict=feed_dict)

        batchsz = self._get_batchsz(batch_dict)
        report_lossv = lossv * batchsz
        epoch_loss += report_lossv
        epoch_div += batchsz
        self.nstep_agg += report_lossv
        self.nstep_div += batchsz

        if (step + 1) % self.nsteps == 0:
            metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
            # NOTE(review): the callee was missing from the source text;
            # `self.report` is reconstructed -- confirm upstream.
            self.report(
                step + 1, metrics, self.nstep_start,
                'Train', 'STEP', reporting_fns, self.nsteps
            )
            self.reset_nstep()

    metrics = self.calc_metrics(epoch_loss, epoch_div)
    return metrics
Example #24
Source File: From mead-baseline with Apache License 2.0 | 4 votes |
def test(self, vs, reporting_fns, phase='Valid', dataset=True, **kwargs):
    """Run an epoch of testing over the dataset

    In the `Test` phase this defers entirely to `self._evaluate`. Otherwise it
    iterates `vs` as `(features, tgt)` pairs, predicts with `beam=1`, and
    reports the token-weighted loss metrics plus Bleu.

    :param vs: A validation set
    :param reporting_fns: Reporting hooks
    :param phase: The phase of evaluation (`Test`, `Valid`)
    :param dataset: (`bool`) Are we using `tf.dataset`s (not read by this function)
    :param kwargs: Forwarded to `self._evaluate` in the `Test` phase
    :return: Metrics
    """
    SET_TRAIN_FLAG(False)
    if phase == 'Test':
        return self._evaluate(vs, reporting_fns, **kwargs)

    self.valid_epochs += 1

    total_loss = 0
    total_toks = 0
    preds = []
    golds = []

    start = time.time()
    for features, tgt in vs:
        # `dst` is the target with its last position along axis 1 dropped
        features['dst'] = tgt[:, :-1]
        top_preds = self.model.predict(features, beam=1, make_input=False)
        loss_value = loss(self.model, features, tgt).numpy()
        toks = tf.cast(self._num_toks(features['tgt_len']), tf.float32).numpy()
        total_loss += loss_value * toks
        total_toks += toks
        # Only index 0 along axis 1 of the predictions is scored
        preds.extend(convert_seq2seq_preds(top_preds[:, 0, :], self.tgt_rlut))
        golds.extend(convert_seq2seq_golds(tgt, features['tgt_len'], self.tgt_rlut))

    metrics = self.calc_metrics(total_loss, total_toks)
    metrics['bleu'] = bleu(preds, golds, self.bleu_n_grams)[0]
    # NOTE(review): the callee was missing from the source text; `self.report`
    # is reconstructed from the surviving argument list -- confirm upstream.
    self.report(
        self.valid_epochs, metrics, start,
        phase, 'EPOCH', reporting_fns
    )
    return metrics
Example #25
Source File: From ranking with Apache License 2.0 | 4 votes |
def __init__(self,
             context_feature_columns,
             example_feature_columns,
             hparams,
             estimator,
             label_feature_name="relevance",
             label_feature_type=tf.int64,
             # NOTE(review): this parameter was missing from the source text
             # ("tf.int64,, best_exporter_metric"). Its name comes from the
             # docstring and body; the default value is a reconstruction --
             # confirm upstream.
             dataset_reader=tf.data.TFRecordDataset,
             best_exporter_metric=None,
             best_exporter_metric_higher_better=True,
             size_feature_name=None):
    """Constructor.

    Args:
      context_feature_columns: (dict) Context (aka, query) feature columns.
      example_feature_columns: (dict) Example (aka, document) feature columns.
      hparams: (dict) A dict containing model hyperparameters.
      estimator: (`Estimator`) An `Estimator` instance for model train and eval.
      label_feature_name: (str) The name of the label feature.
      label_feature_type: (`tf.dtype`) The value type of the label feature.
      dataset_reader: (`tf.Dataset`) The dataset format for the input files.
      best_exporter_metric: (str) Metric key for exporting the best model. If
        None, exports the model with the minimal loss value.
      best_exporter_metric_higher_better: (bool) If a higher metric is better.
        This is only used if `best_exporter_metric` is not None.
      size_feature_name: (str) If set, populates the feature dictionary with
        this name and the corresponding value is a `tf.int32` Tensor of shape
        [batch_size] indicating the actual sizes of the example lists before
        padding and truncation. If None, which is default, this feature is not
        generated.
    """
    # Validation runs before any attribute is stored
    self._validate_parameters(estimator, hparams)

    self._context_feature_columns = context_feature_columns
    self._example_feature_columns = example_feature_columns
    self._hparams = hparams
    self._estimator = estimator
    self._label_feature_name = label_feature_name
    self._label_feature_type = label_feature_type
    self._dataset_reader = dataset_reader
    self._best_exporter_metric = best_exporter_metric
    self._best_exporter_metric_higher_better = (
        best_exporter_metric_higher_better)
    self._size_feature_name = size_feature_name
Example #26
Source File: From mead-baseline with Apache License 2.0 | 4 votes |
def test(self, vs, reporting_fns, steps=0, phase='Valid', **kwargs):
    """Run an epoch of testing over the dataset

    In the `Test` phase this defers entirely to `self._evaluate`. Otherwise it
    pulls `steps` batches from `vs`, runs the validation step through
    `self.strategy`, and sums the per-replica loss and token counts.

    :param vs: A validation set
    :param reporting_fns: Reporting hooks
    :param steps: Number of batches to pull from `vs`
    :param phase: The phase of evaluation (`Test`, `Valid`)
    :param kwargs: Forwarded to `self._evaluate` in the `Test` phase
    :return: Metrics
    """
    def _replicated_valid_step(inputs):
        features, tgt = inputs
        top_preds = self.model.predict(features, beam=1, make_input=False)
        per_replica_loss = loss(self.model, features, tgt)
        per_replica_toks = self._num_toks(features['tgt_len'])
        per_replica_report_loss = per_replica_loss * tf.cast(per_replica_toks, tf.float32)
        return per_replica_report_loss, per_replica_toks, top_preds

    if phase == 'Test':
        SET_TRAIN_FLAG(False)
        return self._evaluate(vs, reporting_fns, **kwargs)

    strategy = self.strategy

    # NOTE(review): the extent of this `with` block is not recoverable from
    # the flattened source; it is assumed to cover the rest of the method.
    with strategy.scope():

        SET_TRAIN_FLAG(False)
        self.valid_epochs += 1

        total_loss = tf.Variable(0.0)
        total_toks = tf.Variable(0, dtype=tf.int32)

        start = time.time()

        test_iter = iter(vs)

        for _ in range(steps):
            features, tgt = next(test_iter)
            inputs = (features, tgt)
            per_replica_loss, per_replica_toks, _ = strategy.experimental_run_v2(
                _replicated_valid_step, args=(inputs,))
            total_loss.assign_add(strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None))
            total_toks.assign_add(strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_toks, axis=None))
            # Not sure a good way to get top preds merged yet

        metrics = self.calc_metrics(total_loss.numpy(), total_toks.numpy())
        # NOTE(review): the callee was missing from the source text;
        # `self.report` is reconstructed -- confirm upstream.
        self.report(
            self.valid_epochs, metrics, start,
            phase, 'EPOCH', reporting_fns
        )
        return metrics
Example #27
Source File: From mead-baseline with Apache License 2.0 | 4 votes |
def test(self, vs, reporting_fns, phase='Valid', dataset=True):
    """Run an epoch of testing over the dataset

    With `dataset=True` the session is run without a `feed_dict`; otherwise
    each batch is converted with `self.model.make_input` and passed in as the
    `feed_dict`. In both cases `vs` is iterated once, and the batch's `tgt` and
    `tgt_lengths` entries are used for token counts and gold sequences.

    :param vs: A validation set
    :param reporting_fns: Reporting hooks
    :param phase: The phase of evaluation (`Test`, `Valid`)
    :param dataset: (`bool`) Are we using `tf.dataset`s
    :return: Metrics, including a `bleu` entry
    """
    if phase == 'Test' and not dataset:
        return self._evaluate(vs, reporting_fns)
    self.valid_epochs += 1

    total_loss = 0
    total_toks = 0
    preds = []
    golds = []

    # NOTE(review): this listing lost the session call target and the second
    # fetch (it read `lossv, top_preds =[self.test_loss,])`). `self.sess.run`
    # and `self.model.decoder.best` are a reconstruction -- confirm both against
    # the original trainer before relying on this.
    fetches = [self.test_loss, self.model.decoder.best]

    start = time.time()
    pg = create_progress_bar(len(vs))
    for batch_dict in pg(vs):
        # No feed is needed when the graph is wired to a `tf.dataset`
        feed_dict = None if dataset else self.model.make_input(batch_dict)
        lossv, top_preds = self.sess.run(fetches, feed_dict=feed_dict)

        toks = self._num_toks(batch_dict['tgt_lengths'])
        total_loss += lossv * toks
        total_toks += toks

        preds.extend(convert_seq2seq_preds(top_preds.T, self.tgt_rlut))
        golds.extend(convert_seq2seq_golds(batch_dict['tgt'], batch_dict['tgt_lengths'], self.tgt_rlut))

    metrics = self.calc_metrics(total_loss, total_toks)
    metrics['bleu'] = bleu(preds, golds, self.bleu_n_grams)[0]
    # NOTE(review): the callee name was missing from this listing; `self.report`
    # is restored here -- confirm against the trainer base class.
    self.report(
        self.valid_epochs, metrics, start,
        phase, 'EPOCH', reporting_fns
    )
    return metrics
Example #28
Source File: From mead-baseline with Apache License 2.0 | 4 votes |
def train(self, ts, reporting_fns, dataset=True):
    """Train by looping over the steps

    For a `tf.dataset`-backed `fit_func` (`dataset` is `True`) the only thing
    fed is the train flag. For `feed_dict` mode, each `ts` sample is converted
    with `self.model.make_input` and handed in one-by-one. Every `self.nsteps`
    global steps an n-step report is issued and the n-step accumulators are
    reset; an epoch report is issued once `ts` is exhausted.

    :param ts: The training set
    :param reporting_fns: A list of reporting hooks
    :param dataset: (`bool`) Are we using `tf.dataset`s
    :return: Metrics
    """
    epoch_loss = 0
    epoch_toks = 0

    # NOTE(review): the session call target was missing from this listing;
    # `self.sess.run` is a reconstruction -- confirm against the original trainer.
    fetches = [self.train_op, self.global_step, self.loss]

    start = time.time()
    self.nstep_start = start
    for batch_dict in ts:
        if dataset:
            feed_dict = {TRAIN_FLAG(): 1}
        else:
            feed_dict = self.model.make_input(batch_dict, True)
        _, global_step, lossv = self.sess.run(fetches, feed_dict=feed_dict)

        # Token counts come from the batch itself, even in `tf.dataset` mode
        toks = self._num_toks(batch_dict['tgt_lengths'])
        report_loss = lossv * toks

        epoch_loss += report_loss
        epoch_toks += toks
        self.nstep_agg += report_loss
        self.nstep_div += toks

        if (global_step + 1) % self.nsteps == 0:
            metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
            # NOTE(review): callee name restored as `self.report` -- confirm
            self.report(
                global_step + 1, metrics, self.nstep_start,
                'Train', 'STEP', reporting_fns, self.nsteps
            )
            self.reset_nstep()

    metrics = self.calc_metrics(epoch_loss, epoch_toks)
    self.train_epochs += 1
    # NOTE(review): callee name restored as `self.report` -- confirm
    self.report(
        self.train_epochs, metrics, start,
        'Train', 'EPOCH', reporting_fns
    )
    return metrics
Example #29
Source File: From mead-baseline with Apache License 2.0 | 4 votes |
def _train(self, loader, steps=0, **kwargs):
    """Train an epoch of data by iterating `loader` in eager mode

    Each batch from `loader` is unpacked as `(features, y)` and passed to
    `self.optimizer.update` inside a `tf.function`. The loss is weighted by the
    leading dimension of `y` and accumulated both for the whole epoch and for
    the current n-step window. Every `self.nsteps` optimizer steps an n-step
    report is issued and the window accumulators are reset.

    :param loader: An iterable of `(features, y)` batches
    :param steps: Passed to `create_progress_bar` to size the progress bar
    :param kwargs: See below

    :Keyword Arguments:
     * *reporting_fns* (`list`) A list of reporting hooks to use

    :return: Metrics
    """
    SET_TRAIN_FLAG(True)
    reporting_fns = kwargs.get('reporting_fns', [])
    pg = create_progress_bar(steps)
    epoch_loss = tf.Variable(0.0)
    epoch_div = tf.Variable(0, dtype=tf.int32)
    nstep_loss = tf.Variable(0.0)
    nstep_div = tf.Variable(0, dtype=tf.int32)
    self.nstep_start = time.time()

    @tf.function
    def _train_step(inputs):
        """Run one optimizer update and return the batch-weighted loss and batch size."""
        features, y = inputs
        step_loss = self.optimizer.update(self.model, features, y)
        batchsz = get_shape_as_list(y)[0]
        report_loss = step_loss * batchsz
        return report_loss, batchsz

    with autograph_options({"function_optimization": False, "layout_optimizer": False}):
        for inputs in pg(loader):
            step_report_loss, step_batchsz = _train_step(inputs)
            epoch_loss.assign_add(step_report_loss)
            nstep_loss.assign_add(step_report_loss)
            epoch_div.assign_add(step_batchsz)
            nstep_div.assign_add(step_batchsz)

            step = self.optimizer.global_step.numpy() + 1
            if step % self.nsteps == 0:
                metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
                # NOTE(review): the callee name was missing from this listing;
                # `self.report` is restored here -- confirm against the trainer base class.
                self.report(
                    step, metrics, self.nstep_start,
                    'Train', 'STEP', reporting_fns, self.nsteps
                )
                # Start a fresh n-step window
                nstep_loss.assign(0.0)
                nstep_div.assign(0)
                self.nstep_start = time.time()

    # Pull the accumulated values out of the variables for metric calculation
    metrics = self.calc_metrics(epoch_loss.numpy(), epoch_div.numpy())
    return metrics
Example #30
Source File: From mead-baseline with Apache License 2.0 | 4 votes |
def _train(self, ts, **kwargs):
    """Train an epoch of data using either the input loader or using `tf.dataset`

    In non-`tf.dataset` mode, each batch from `ts` is converted with
    `self.model.make_input` and passed as the `feed_dict`. In `tf.dataset`
    mode only the train flag is fed. `ts` is iterated in both modes, and the
    batch size used for loss weighting is read from each batch via
    `self._get_batchsz`.

    :param ts: A data feed
    :param kwargs: See below

    :Keyword Arguments:
     * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
     * *reporting_fns* (`list`) A list of reporting hooks to use

    :return: Metrics
    """
    use_dataset = kwargs.get('dataset', True)
    reporting_fns = kwargs.get('reporting_fns', [])
    epoch_loss = 0
    epoch_div = 0

    # NOTE(review): the session call target was missing from this listing;
    # `self.sess.run` is a reconstruction -- confirm against the original trainer.
    fetches = [self.train_op, self.global_step, self.loss]

    pg = create_progress_bar(len(ts))
    for batch_dict in pg(ts):
        if use_dataset:
            feed_dict = {TRAIN_FLAG(): 1}
        else:
            feed_dict = self.model.make_input(batch_dict, True)
        _, step, lossv = self.sess.run(fetches, feed_dict=feed_dict)

        batchsz = self._get_batchsz(batch_dict)
        report_loss = lossv * batchsz
        epoch_loss += report_loss
        epoch_div += batchsz
        self.nstep_agg += report_loss
        self.nstep_div += batchsz

        if (step + 1) % self.nsteps == 0:
            metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
            # NOTE(review): callee name restored as `self.report` -- confirm
            self.report(
                step + 1, metrics, self.nstep_start,
                'Train', 'STEP', reporting_fns, self.nsteps
            )
            self.reset_nstep()

    metrics = self.calc_metrics(epoch_loss, epoch_div)
    return metrics