Python tensorflow.dataset() Examples

The following are 30 code examples of tensorflow.dataset(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out all available functions/classes of the tensorflow module.
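Before diving in, a quick orientation: despite the literal name tensorflow.dataset(), what these snippets exercise is the tf.data.Dataset API. Below is a minimal, self-contained sketch of that API in TF 2.x eager style; the values are illustrative and not taken from any example on this page.

import tensorflow as tf

# Build a tiny pipeline: slice a tensor into elements, shuffle, batch, prefetch.
ds = tf.data.Dataset.from_tensor_slices(tf.range(10))
ds = ds.shuffle(buffer_size=10).batch(4).prefetch(tf.data.experimental.AUTOTUNE)

for batch in ds:  # eager iteration yields tf.Tensor batches
    print(batch.numpy())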
Example #1
Source File: eager.py    From mead-baseline with Apache License 2.0
def _evaluate(self, es, reporting_fns, **kwargs):
        """Run the model with beam search and report Bleu.

        :param es: `tf.dataset` of input
        :param reporting_fns: Input hooks
        """
        preds = []
        golds = []
        start = time.time()

        for features, tgt in es:
            features['dst'] = tgt[:, :-1]
            tgt_lens = features.pop('tgt_len')
            top_preds = self.model.predict(features, make_input=False, **kwargs)
            preds.extend(convert_seq2seq_preds(top_preds[:, 0, :], self.tgt_rlut))
            golds.extend(convert_seq2seq_golds(tgt, tgt_lens, self.tgt_rlut))
        metrics = {'bleu': bleu(preds, golds, self.bleu_n_grams)[0]}
        self.report(
            0, metrics, start, 'Test', 'EPOCH', reporting_fns
        )
        return metrics 
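A note on the indexing in the example above: predict is assumed to return token ids shaped [batch, beam, time], so top_preds[:, 0, :] keeps only the highest-scoring beam hypothesis per example. An illustrative numpy analogue:

import numpy as np

# Dummy beam-search output: [batch=2, beam=4, time=7] token ids (all zeros here).
top_preds = np.zeros((2, 4, 7), dtype=np.int32)
best_hyps = top_preds[:, 0, :]  # best hypothesis per example: shape [2, 7]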
Example #2
Source File: data_wrapper_base.py    From MMNet with Apache License 2.0
def setup_dataset(
        self,
        placeholders: Tuple[tf.placeholder, tf.placeholder],
        batch_size: int = None,
    ):
        self.batch_size = self.args.batch_size if batch_size is None else batch_size

        dataset = tf.data.Dataset.from_tensor_slices(placeholders)
        dataset = dataset.map(self._parse_function, num_parallel_calls=self.args.num_threads).prefetch(
            self.args.prefetch_factor * self.batch_size)
        if self.is_training:
            dataset = dataset.repeat()
        if self.shuffle:
            dataset = dataset.shuffle(buffer_size=self.args.buffer_size)
        self.dataset = dataset.batch(self.batch_size)
        self.iterator = self.dataset.make_initializable_iterator()
        self.next_elem = self.iterator.get_next() 
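Because setup_dataset uses make_initializable_iterator, the iterator has to be initialized with concrete values for the placeholders before any batch can be pulled. A hedged TF 1.x consumption sketch; wrapper, image_ph, label_ph, image_paths, and labels are hypothetical names standing in for the caller's objects:

import tensorflow as tf

with tf.Session() as sess:
    # Bind concrete data to the placeholders and initialize the iterator
    # (all of these names are stand-ins for the caller's objects).
    sess.run(wrapper.iterator.initializer,
             feed_dict={image_ph: image_paths, label_ph: labels})
    while True:
        try:
            batch = sess.run(wrapper.next_elem)  # one batch per run call
        except tf.errors.OutOfRangeError:        # dataset exhausted
            break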
Example #3
Source File: utils.py    From mead-baseline with Apache License 2.0
def _test(self, ts, dataset=True):
        """Test an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.

        :param ts: A data feed
        :param dataset: (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`

        :return: Metrics
        """
        return self.evaluator.test(ts, dataset=dataset) 
Example #4
Source File: eager.py    From mead-baseline with Apache License 2.0
def _test(self, ts, steps=0, **kwargs):
        """Test an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.

        :param ts: A data feed
        :param steps: (`int`) The number of steps to run
        :param kwargs: See below

        :Keyword Arguments:
          * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
          * *reporting_fns* (`list`) A list of reporting hooks to use
          * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

        :return: Metrics
        """
        return self.evaluator.test(ts, steps, **kwargs) 
Example #5
Source File: distributed.py    From mead-baseline with Apache License 2.0
def distribute(self, dataset):
        return self.strategy.experimental_distribute_dataset(dataset) 
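For context, experimental_distribute_dataset shards a tf.data.Dataset across the replicas of a distribution strategy; this same one-line wrapper appears in the next two examples as well. A minimal sketch with a MirroredStrategy and illustrative data:

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
dataset = tf.data.Dataset.from_tensor_slices(tf.range(8)).batch(4)

# Each element of the distributed dataset is a per-replica batch.
dist_dataset = strategy.experimental_distribute_dataset(dataset)
for per_replica_batch in dist_dataset:
    print(per_replica_batch)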
Example #6
Source File: distributed.py    From mead-baseline with Apache License 2.0
def distribute(self, dataset):
        return self.strategy.experimental_distribute_dataset(dataset) 
Example #7
Source File: distributed.py    From mead-baseline with Apache License 2.0
def distribute(self, dataset):
        return self.strategy.experimental_distribute_dataset(dataset) 
Example #8
Source File: eager.py    From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, phase):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :return: Metrics
        """
        total_loss = 0.0
        total_toks = 0
        epochs = 0
        if phase == 'Valid':
            self.valid_epochs += 1
            epochs = self.valid_epochs
        SET_TRAIN_FLAG(False)

        start = time.time()
        h = None
        for features, y in vs:
            if self.model.requires_state:
                loss_value, h = loss_with_state(self.model, h, features, y)
            else:
                loss_value = loss_without_state(self.model, features, y)
            loss_value = loss_value.numpy()
            toks = self._num_toks(y)
            total_loss += loss_value * tf.cast(toks, tf.float32).numpy()
            total_toks += toks.numpy()

        metrics = self.calc_metrics(total_loss, total_toks)
        self.report(
            epochs, metrics, start,
            phase, 'EPOCH', reporting_fns
        )
        return metrics 
Example #9
Source File: gqn_provider.py    From tf-gqn with Apache License 2.0
def _get_dataset_files(dataset_info, mode, root):
  """Generates lists of files for a given dataset version."""
  basepath = dataset_info.basepath
  base = os.path.join(root, basepath, mode)
  if mode == 'train':
    num_files = dataset_info.train_size
  else:
    num_files = dataset_info.test_size
  length = len(str(num_files))
  template = '{:0%d}-of-{:0%d}.tfrecord' % (length, length)
  record_paths = [  # indexing runs from 1 to n
      os.path.join(base, template.format(i, num_files))
      for i in range(1, num_files + 1)]
  return record_paths 
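The returned shard paths would typically be handed straight to a TFRecord reader. A hedged sketch, assuming dataset_info and root come from the surrounding gqn_provider code:

import tensorflow as tf

# dataset_info and root are assumed to be supplied by the caller.
record_paths = _get_dataset_files(dataset_info, 'train', root)
dataset = tf.data.TFRecordDataset(record_paths)  # one dataset over all shards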
Example #10
Source File: utils.py    From mead-baseline with Apache License 2.0
def to_tensors(ts, src_lengths_key, dst=False):
    """Convert a data feed into a tuple of `features` (`dict`) and `y` values

    This method is required to produce `tf.dataset`s from the input data feed.
    Any fields ending with `_lengths` are ignored, unless they match the
    `src_lengths_key` or `tgt_lengths_key`, in which case, they are converted to `src_len` and `tgt_len`

    :param ts: The data feed to convert
    :param src_lengths_key: This is a field passed from the model params specifying the source of truth for the temporal lengths
    :param dst: `bool` that says if we should prepare a `dst` tensor.  This is needed in distributed mode
    :return: A `tuple` of `features` and `y` (labels)
    """
    keys = ts[0].keys()
    # This is kind of a hack
    keys = [k for k in keys if '_lengths' not in k and k != 'ids'] + [src_lengths_key, "tgt_lengths"]

    features = dict((k, []) for k in keys)
    for sample in ts:
        for k in keys:
            for s in sample[k]:
                features[k].append(s)
    features['src_len'] = features[src_lengths_key]
    del features[src_lengths_key]
    features['tgt_len'] = features['tgt_lengths']
    del features['tgt_lengths']
    features = dict((k, np.stack(v).astype(np.int32)) for k, v in features.items())
    if dst:
        features['dst'] = features['tgt'][:, :-1]
    tgt = features.pop('tgt')

    return features, tgt 
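A sketch of how the (features, tgt) pair produced here could be wired into a dataset, mirroring the from_tensor_slices pattern used elsewhere on this page; the batch size is illustrative:

import tensorflow as tf

# ts and src_lengths_key are assumed to come from the training loader setup.
features, tgt = to_tensors(ts, src_lengths_key, dst=True)
dataset = tf.data.Dataset.from_tensor_slices((features, tgt)).batch(20)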
Example #11
Source File: data_wrapper_base.py    From MMNet with Apache License 2.0
def resize_and_padding_before_augmentation(self, image, size):
        # If width > height, resize height to model's input height while preserving aspect ratio
        # If height > width, resize width to model's input width while preserving aspect ratio
        if self.args.debug_augmentation:
            assert size[0] == size[1], "resize_and_padding_before_augmentation only supports square target image"
            image = tf.expand_dims(image, 0)

            image_dims = tf.shape(image)
            height = image_dims[1]
            width = image_dims[2]

            min_size = min(*size)
            width_aspect = tf.maximum(min_size, tf.cast(width * min_size / height, dtype=tf.int32))
            height_aspect = tf.maximum(min_size, tf.cast(height * min_size / width, dtype=tf.int32))

            image = tf.image.resize_bilinear(image, (height_aspect, width_aspect))
            image = image[:, :self.padded_max_size, :self.padded_max_size, :]

            # Pads the image on the bottom and right with zeros until it has dimensions target_height, target_width.
            image = tf.image.pad_to_bounding_box(
                image,
                offset_height=tf.maximum(self.padded_max_size-height_aspect, 0),
                offset_width=tf.maximum(self.padded_max_size-width_aspect, 0),
                target_height=self.padded_max_size,
                target_width=self.padded_max_size,
            )

            image = tf.squeeze(image, 0)
            return image
        else:
            # Have to return a dummy tensor that has a static .get_shape() for tf.data
            return tf.constant(0, shape=self.padded_original_image_dummy_shape, dtype=tf.uint8, name="dummy") 
Example #12
Source File: data_wrapper_base.py    From MMNet with Apache License 2.0
def add_arguments(parser):
        g_common = parser.add_argument_group("(DataWrapperBase) Common Arguments for all data wrappers.")
        g_common.add_argument("--dataset_path", required=True, type=str, help="The name of the dataset to load.")
        g_common.add_argument("--dataset_split_name", required=True, type=str, nargs="*",
                              help="The name of the train/test split. Support multiple splits")

        g_common.add_argument("--batch_size", default=32, type=utils.positive_int,
                              help="The number of examples in batch.")
        g_common.add_argument("--no-shuffle", dest="shuffle", action="store_false")
        g_common.add_argument("--shuffle", dest="shuffle", action="store_true")
        g_common.set_defaults(shuffle=True)

        g_common.add_argument("--width", required=True, type=int)
        g_common.add_argument("--height", required=True, type=int)
        g_common.add_argument("--no-debug_augmentation", dest="debug_augmentation", action="store_false")
        g_common.add_argument("--debug_augmentation", dest="debug_augmentation", action="store_true")
        g_common.set_defaults(debug_augmentation=False)
        g_common.add_argument("--max_padded_size", default=224, type=int,
                              help=("We will resize & pads the original image "
                                    "until it has dimensions (padded_size, padded_size)"
                                    "Recommend to set this value as width(or height) * 1.8 ~ 2"))
        g_common.add_argument("--augmentation_method", type=str, required=True,
                              choices=_available_augmentation_methods)
        g_common.add_argument("--num_threads", default=8, type=int)
        g_common.add_argument("--buffer_size", default=1000, type=int)
        g_common.add_argument("--prefetch_factor", default=100, type=int)

        g_common.add_argument("--rotation_range", default=0, type=int,
                              help="Receives maximum angle to be rotated in terms of degree: "
                                   "The image is randomly rotated by the angle "
                                   "randomly chosen from [-rotation_range, rotation_range], "
                                   "and then cropped appropriately to remove dark areas.\n"
                                   "So, be aware that the rotation performs certain kind of zooming.")
        g_common.add_argument("--no-has_sub_dataset", dest="has_sub_dataset", action="store_false")
        g_common.add_argument("--has_sub_dataset", dest="has_sub_dataset", action="store_true")
        g_common.set_defaults(has_sub_dataset=False) 
Example #13
Source File: utils.py    From mead-baseline with Apache License 2.0
def process_batch(self, batch_dict, handle, txts, dataset=True):
        if dataset:
            guess = self.sess.run(self.model.best)
        else:
            feed_dict = self.model.make_input(batch_dict)
            guess = self.sess.run(self.model.best, feed_dict=feed_dict)

        sentence_lengths = batch_dict[self.model.lengths_key]

        ids = batch_dict['ids']
        truth = batch_dict['y']
        correct_labels = 0
        total_labels = 0

        # For fscore
        gold_chunks = []
        pred_chunks = []

        # For each sentence
        for b in range(len(guess)):
            length = sentence_lengths[b]
            sentence = guess[b][:length]
            # truth[b] is padded, cutting at :length gives us back true length
            gold = truth[b][:length]

            valid_guess = sentence[gold != Offsets.PAD]
            valid_gold = gold[gold != Offsets.PAD]
            valid_sentence_length = np.sum(gold != Offsets.PAD)
            correct_labels += np.sum(np.equal(valid_guess, valid_gold))
            total_labels += valid_sentence_length

            gold_chunks.append(set(to_spans(valid_gold, self.idx2label, self.span_type, self.verbose)))
            pred_chunks.append(set(to_spans(valid_guess, self.idx2label, self.span_type, self.verbose)))

            # Should we write a file out?  If so, we have to have txts
            if handle is not None:
                id = ids[b]
                txt = txts[id]
                write_sentence_conll(handle, valid_guess, valid_gold, txt, self.idx2label)

        return correct_labels, total_labels, gold_chunks, pred_chunks 
Example #14
Source File: utils.py    From mead-baseline with Apache License 2.0
def to_tensors(ts, lengths_key):
    """Convert a data feed into a tuple of `features` (`dict`) and `y` values

    This method is required to produce `tf.dataset`s from the input data feed.
    Any fields ending with `_lengths` are ignored (as is `ids`), unless they
    match the `lengths_key` name

    :param ts: The data feed to convert
    :param lengths_key: This is a field passed from the model params specifying source of truth of the temporal lengths
    :return: A `tuple` of `features` and `y` (labels)
    """
    keys = ts[0].keys()
    # This is kind of a hack
    keys = [k for k in keys if '_lengths' not in k and k != 'ids'] + [lengths_key]

    features = dict((k, []) for k in keys)
    for sample in ts:
        for k in features.keys():
            # add each sample
            for s in sample[k]:
                features[k].append(s)

    features['lengths'] = features[lengths_key]
    del features[lengths_key]
    features = dict((k, np.stack(v)) for k, v in features.items())
    y = features.pop('y')
    return features, y 
Example #15
Source File: pipeline.py    From ranking with Apache License 2.0
def _make_input_fn(self,
                     input_pattern,
                     batch_size,
                     list_size,
                     randomize_input=True,
                     num_epochs=None):
    """Returns the input function for the ranking model.

    Args:
      input_pattern: (str) File pattern for the input data.
      batch_size: (int) The number of input examples to process per batch.
      list_size: (int) The list size for an ELWC example.
      randomize_input: (bool) If true, randomize input example order. It should
        almost always be true except for unittest/debug purposes.
      num_epochs: (int) The number of times the input dataset must be repeated.
        None to repeat the data indefinitely.

    Returns:
      An `input_fn` for `tf.estimator.Estimator`.
    """

    def _input_fn():
      """`input_fn` for the `Estimator`."""
      return self._make_dataset(
          batch_size=batch_size,
          list_size=list_size,
          input_pattern=input_pattern,
          randomize_input=randomize_input,
          num_epochs=num_epochs)

    return _input_fn 
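A hedged sketch of wiring the returned input_fn into an Estimator; the pattern string, sizes, and the pipeline/ranker names are assumptions for illustration:

# pipeline is an instance of the class above; ranker is a tf.estimator.Estimator.
train_input_fn = pipeline._make_input_fn(
    input_pattern='/data/train-*.tfrecord',
    batch_size=32,
    list_size=100)
ranker.train(input_fn=train_input_fn, steps=10000)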
Example #16
Source File: input_fn.py    From bert-multitask-learning with MIT License
def train_eval_input_fn(params, mode='train'):
    '''Train and eval input function of estimator.
    This function will write and read tf record for training
    and evaluation.

    Usage:
        def train_input_fn(): return train_eval_input_fn(params)
        estimator.train(
            train_input_fn, max_steps=params.train_steps, hooks=[train_hook])

    Arguments:
        params {Params} -- Params object

    Keyword Arguments:
        mode {str} -- ModeKeys (default: {'train'})

    Returns:
        tf Dataset -- Tensorflow dataset
    '''
    write_tfrecord(params=params)

    dataset_dict = read_tfrecord(params=params, mode=mode)

    dataset = tf.data.experimental.sample_from_datasets(
        [ds for _, ds in dataset_dict.items()])

    if mode == 'train':
        dataset = dataset.shuffle(params.shuffle_buffer)

    dataset = dataset.prefetch(params.prefetch)
    if params.dynamic_padding:
        dataset = dataset.apply(
            tf.data.experimental.bucket_by_sequence_length(
                element_length_func=element_length_func,
                bucket_batch_sizes=params.bucket_batch_sizes,
                bucket_boundaries=params.bucket_boundaries,
            ))
    else:
        if mode == 'train':
            dataset = dataset.batch(params.batch_size)
        else:
            dataset = dataset.batch(params.batch_size*2)

    return dataset 
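The dynamic-padding branch above relies on bucket_by_sequence_length, which groups variable-length elements into length buckets and pads within each bucket. A minimal, self-contained sketch; the boundaries and batch sizes here are illustrative, not the project's config:

import tensorflow as tf

# Three variable-length sequences, produced one at a time.
ds = tf.data.Dataset.from_generator(
    lambda: ([1] * n for n in (3, 7, 12)),
    output_types=tf.int32, output_shapes=tf.TensorShape([None]))

ds = ds.apply(tf.data.experimental.bucket_by_sequence_length(
    element_length_func=lambda x: tf.shape(x)[0],
    bucket_boundaries=[5, 10],       # buckets: <5, 5-9, >=10
    bucket_batch_sizes=[4, 2, 1]))   # one batch size per bucket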
Example #17
Source File: utils.py    From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, phase, dataset=True):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        total_loss = 0.0
        total_toks = 0
        epochs = 0
        if phase == 'Valid':
            self.valid_epochs += 1
            epochs = self.valid_epochs

        if self.model.requires_state:
            state = self.model.sess.run(self.model.initial_state, self.model.make_input(vs[0], False))

        fetches = {
            "loss": self.test_loss,
        }

        if self.model.requires_state:
            fetches["final_state"] = self.model.final_state

        start = time.time()

        for batch_dict in vs:
            feed_dict = {}
            if not dataset:
                feed_dict = self.model.make_input(batch_dict, False)
            # In a Keras LSTM the order is h first, c second; it's the opposite in TF 1. However, I don't think it
            # ends up mattering here
            if self.model.requires_state:

                for i, (s1, s2) in enumerate(self.model.initial_state):
                    feed_dict[s1] = state[i][0]  # .c  # 0
                    feed_dict[s2] = state[i][1]  # .h  # 1

            vals = self.model.sess.run(fetches, feed_dict)
            loss = vals["loss"]
            toks = self._num_toks(batch_dict)
            if self.model.requires_state:
                state = vals["final_state"]
            total_loss += loss * toks
            total_toks += toks

        metrics = self.calc_metrics(total_loss, total_toks)
        self.report(
            epochs, metrics, start,
            phase, 'EPOCH', reporting_fns
        )
        return metrics 
Example #18
Source File: input_fn.py    From bert-multitask-learning with MIT License
def to_serving_input(input_file_or_list, config, mode=PREDICT, tokenizer=None):
    '''A serving input function that takes an input file path or a
    list of strings and applies BERT preprocessing. This fn will
    yield data dicts instead of a tf dataset. Used in serving.

    Arguments:
        input_file_or_list {str or list} -- file path or list of str
        config {Params} -- Params

    Keyword Arguments:
        mode {str} -- ModeKeys (default: {PREDICT})
        tokenizer {tokenizer} -- Tokenizer (default: {None})
    '''

    # if input is a string, treat it as a path to a file
    if isinstance(input_file_or_list, str):
        inputs = open(input_file_or_list, 'r', encoding='utf8').readlines()
    else:
        inputs = input_file_or_list

    if tokenizer is None:
        tokenizer = FullTokenizer(config.vocab_file)

    for doc in inputs:
        # Build a fresh dict per document so dicts yielded earlier aren't mutated.
        data_dict = {}
        inputs_a = cluster_alphnum(doc)
        tokens, target = tokenize_text_with_seqs(
            tokenizer, inputs_a, None)

        tokens_a, tokens_b, target = truncate_seq_pair(
            tokens, None, target, config.max_seq_len)

        tokens, segment_ids, target = add_special_tokens_with_seqs(
            tokens_a, tokens_b, target)

        input_mask, tokens, segment_ids, target = create_mask_and_padding(
            tokens, segment_ids, target, config.max_seq_len)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        data_dict['input_ids'] = input_ids
        data_dict['input_mask'] = input_mask
        data_dict['segment_ids'] = segment_ids
        yield data_dict 
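A hedged example of consuming the generator above, assuming config is a Params object with vocab_file and max_seq_len set:

# Each yield is one preprocessed document ready for serving.
for features in to_serving_input(['some raw input text'], config):
    print(features['input_ids'], features['input_mask'])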
Example #19
Source File: eager.py    From mead-baseline with Apache License 2.0
def _test(self, loader, steps=0, **kwargs):
        """Test an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.

        :param loader: A data feed
        :param steps: (`int`) The number of steps to run
        :param kwargs: See below

        :Keyword Arguments:
          * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
          * *reporting_fns* (`list`) A list of reporting hooks to use
          * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

        :return: Metrics
        """

        cm = ConfusionMatrix(self.model.labels)
        total_loss = 0
        total_norm = 0
        verbose = kwargs.get("verbose", None)

        pg = create_progress_bar(steps)

        SET_TRAIN_FLAG(False)
        for features, y in pg(loader):
            logits = self.model(features)
            y_ = tf.argmax(logits, axis=1, output_type=tf.int32)
            cm.add_batch(y, y_)
            lossv = tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=y, logits=logits).numpy()
            batchsz = int(y.shape[0])
            assert len(y_) == batchsz
            total_loss += lossv * batchsz
            total_norm += batchsz

        metrics = cm.get_all_metrics()
        metrics['avg_loss'] = total_loss / float(total_norm)
        verbose_output(verbose, cm)

        return metrics 
Example #20
Source File: eager.py    From mead-baseline with Apache License 2.0
def _train(self, loader, steps=0, **kwargs):
        """Train an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.  We do use a `feed_dict` for passing the `TRAIN_FLAG` in either case

        :param loader: A data feed
        :param steps: (`int`) The number of steps to run
        :param kwargs: See below

        :Keyword Arguments:
         * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
         * *reporting_fns* (`list`) A list of reporting hooks to use

        :return: Metrics
        """

        SET_TRAIN_FLAG(True)
        reporting_fns = kwargs.get('reporting_fns', [])
        pg = create_progress_bar(steps)
        epoch_loss = tf.Variable(0.0)
        epoch_div = tf.Variable(0, dtype=tf.int32)
        nstep_loss = tf.Variable(0.0)
        nstep_div = tf.Variable(0, dtype=tf.int32)
        self.nstep_start = time.time()

        @tf.function
        def _train_step(inputs):
            """Replicated training step."""
            features, y = inputs
            loss = self.optimizer.update(self.model, features, y)
            batchsz = get_shape_as_list(y)[0]
            report_loss = loss * batchsz
            return report_loss, batchsz

        for inputs in pg(loader):
            step_report_loss, step_batchsz = _train_step(inputs)
            epoch_loss.assign_add(step_report_loss)
            nstep_loss.assign_add(step_report_loss)
            epoch_div.assign_add(step_batchsz)
            nstep_div.assign_add(step_batchsz)
            step = self.optimizer.global_step.numpy() + 1

            if step % self.nsteps == 0:
                metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
                self.report(
                    step, metrics, self.nstep_start,
                    'Train', 'STEP', reporting_fns, self.nsteps
                )
                nstep_loss.assign(0.0)
                nstep_div.assign(0)
                self.nstep_start = time.time()

        epoch_loss = epoch_loss.numpy()
        epoch_div = epoch_div.numpy()
        metrics = self.calc_metrics(epoch_loss, epoch_div)
        return metrics 
Example #21
Source File: eager.py    From mead-baseline with Apache License 2.0
def train(self, ts, reporting_fns, dataset=True):
        """Train by looping over the steps

        For a `tf.dataset`-backed `fit_func`, we are using the previously wired `dataset`s
        in the model (and `dataset` is `True`).  For `feed_dict`, we convert the ts samples
        to `feed_dict`s and hand them in one-by-one

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        SET_TRAIN_FLAG(True)
        epoch_loss = tf.Variable(0.0)
        epoch_div = tf.Variable(0, dtype=tf.int32)
        nstep_loss = tf.Variable(0.0)
        nstep_div = tf.Variable(0, dtype=tf.int32)
        self.nstep_start = time.time()
        start = time.time()

        @tf.function
        def _train_step(features, y):
            """Replicated training step."""

            loss = self.optimizer.update(self.model, features, y)
            toks = self._num_toks(features['tgt_len'])
            report_loss = loss * tf.cast(toks, tf.float32)
            return report_loss, toks

        with autograph_options({"function_optimization": False, "layout_optimizer": False}):
            for features, y in ts:
                features['dst'] = y[:, :-1]
                step_report_loss, step_toks = _train_step(features, y)
                epoch_loss.assign_add(step_report_loss)
                nstep_loss.assign_add(step_report_loss)
                epoch_div.assign_add(step_toks)
                nstep_div.assign_add(step_toks)

                step = self.optimizer.global_step.numpy() + 1
                if step % self.nsteps == 0:
                    metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
                    self.report(
                        step, metrics, self.nstep_start,
                        'Train', 'STEP', reporting_fns, self.nsteps
                    )
                    nstep_loss.assign(0.0)
                    nstep_div.assign(0)
                    self.nstep_start = time.time()

        epoch_loss = epoch_loss.numpy()
        epoch_div = epoch_div.numpy()
        metrics = self.calc_metrics(epoch_loss, epoch_div)
        self.train_epochs += 1
        self.report(
            self.train_epochs, metrics, start,
            'Train', 'EPOCH', reporting_fns
        )
        return metrics 
Example #22
Source File: utils.py    From mead-baseline with Apache License 2.0
def _test(self, loader, **kwargs):
        """Test an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.

        :param loader: A data feed
        :param kwargs: See below

        :Keyword Arguments:
          * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
          * *reporting_fns* (`list`) A list of reporting hooks to use
          * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

        :return: Metrics
        """
        if self.ema:
            self.sess.run(self.ema_load)

        use_dataset = kwargs.get('dataset', True)

        cm = ConfusionMatrix(self.model.labels)
        steps = len(loader)
        total_loss = 0
        total_norm = 0
        verbose = kwargs.get("verbose", None)

        pg = create_progress_bar(steps)
        for i, batch_dict in enumerate(pg(loader)):
            y = batch_dict['y']
            if use_dataset:
                guess, lossv = self.sess.run([self.model.best, self.test_loss])
            else:
                feed_dict = self.model.make_input(batch_dict, False)
                guess, lossv = self.sess.run([self.model.best, self.test_loss], feed_dict=feed_dict)

            batchsz = len(guess)
            total_loss += lossv * batchsz
            total_norm += batchsz
            cm.add_batch(y, guess)

        metrics = cm.get_all_metrics()
        metrics['avg_loss'] = total_loss / float(total_norm)
        verbose_output(verbose, cm)

        return metrics 
Example #23
Source File: utils.py    From mead-baseline with Apache License 2.0
def _train(self, loader, dataset=True, **kwargs):
        """Train an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.  We do use a `feed_dict` for passing the `TRAIN_FLAG` in either case

        :param loader: A data feed
        :param kwargs: See below

        :Keyword Arguments:
         * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
         * *reporting_fns* (`list`) A list of reporting hooks to use

        :return: Metrics
        """
        if self.ema:
            self.sess.run(self.ema_restore)

        reporting_fns = kwargs.get('reporting_fns', [])
        epoch_loss = 0
        epoch_div = 0
        steps = len(loader)
        pg = create_progress_bar(steps)
        for batch_dict in pg(loader):
            if dataset:
                _, step, lossv = self.sess.run([self.train_op, self.global_step, self.loss],
                                               feed_dict={TRAIN_FLAG(): 1})
            else:
                feed_dict = self.model.make_input(batch_dict, True)
                _, step, lossv = self.sess.run([self.train_op, self.global_step, self.loss], feed_dict=feed_dict)

            batchsz = self._get_batchsz(batch_dict)
            report_lossv = lossv * batchsz
            epoch_loss += report_lossv
            epoch_div += batchsz
            self.nstep_agg += report_lossv
            self.nstep_div += batchsz

            if (step + 1) % self.nsteps == 0:
                metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
                self.report(
                    step + 1, metrics, self.nstep_start,
                    'Train', 'STEP', reporting_fns, self.nsteps
                )
                self.reset_nstep()

        metrics = self.calc_metrics(epoch_loss, epoch_div)
        return metrics 
Example #24
Source File: eager.py    From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, phase='Valid', dataset=True, **kwargs):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        SET_TRAIN_FLAG(False)
        if phase == 'Test':
            return self._evaluate(vs, reporting_fns, **kwargs)

        self.valid_epochs += 1

        total_loss = 0
        total_toks = 0
        preds = []
        golds = []

        start = time.time()
        for features, tgt in vs:
            features['dst'] = tgt[:, :-1]
            top_preds = self.model.predict(features, beam=1, make_input=False)
            loss_value = loss(self.model, features, tgt).numpy()
            toks = tf.cast(self._num_toks(features['tgt_len']), tf.float32).numpy()
            total_loss += loss_value * toks
            total_toks += toks
            preds.extend(convert_seq2seq_preds(top_preds[:, 0, :], self.tgt_rlut))
            golds.extend(convert_seq2seq_golds(tgt, features['tgt_len'], self.tgt_rlut))

        metrics = self.calc_metrics(total_loss, total_toks)
        metrics['bleu'] = bleu(preds, golds, self.bleu_n_grams)[0]
        self.report(
            self.valid_epochs, metrics, start,
            phase, 'EPOCH', reporting_fns
        )
        return metrics 
Example #25
Source File: pipeline.py    From ranking with Apache License 2.0
def __init__(self,
               context_feature_columns,
               example_feature_columns,
               hparams,
               estimator,
               label_feature_name="relevance",
               label_feature_type=tf.int64,
               dataset_reader=tf.data.TFRecordDataset,
               best_exporter_metric=None,
               best_exporter_metric_higher_better=True,
               size_feature_name=None):
    """Constructor.

    Args:
      context_feature_columns: (dict) Context (aka, query) feature columns.
      example_feature_columns: (dict) Example (aka, document) feature columns.
      hparams: (dict) A dict containing model hyperparameters.
      estimator: (`Estimator`) An `Estimator` instance for model train and eval.
      label_feature_name: (str) The name of the label feature.
      label_feature_type: (`tf.dtype`) The value type of the label feature.
      dataset_reader: (`tf.data.Dataset` subclass) The dataset reader for the
        input files, e.g. `tf.data.TFRecordDataset`.
      best_exporter_metric: (str) Metric key for exporting the best model. If
        None, exports the model with the minimal loss value.
      best_exporter_metric_higher_better: (bool) If a higher metric is better.
        This is only used if `best_exporter_metric` is not None.
      size_feature_name: (str) If set, populates the feature dictionary with
        this name and the corresponding value is a `tf.int32` Tensor of shape
        [batch_size] indicating the actual sizes of the example lists before
        padding and truncation. If None, which is default, this feature is not
        generated.
    """
    self._validate_parameters(estimator, hparams)

    self._context_feature_columns = context_feature_columns
    self._example_feature_columns = example_feature_columns
    self._hparams = hparams
    self._estimator = estimator
    self._label_feature_name = label_feature_name
    self._label_feature_type = label_feature_type
    self._dataset_reader = dataset_reader
    self._best_exporter_metric = best_exporter_metric
    self._best_exporter_metric_higher_better = (
        best_exporter_metric_higher_better)
    self._size_feature_name = size_feature_name 
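A hedged construction sketch for this pipeline; the class name RankingPipeline and every argument value here are assumptions for illustration, not confirmed by the excerpt:

# Hypothetical instantiation; names and values are illustrative only.
pipeline = RankingPipeline(
    context_feature_columns=context_cols,
    example_feature_columns=example_cols,
    hparams=dict(train_input_pattern='/data/train*', learning_rate=0.05),
    estimator=ranker,
    size_feature_name='example_list_size')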
Example #26
Source File: distributed.py    From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, steps=0, phase='Valid', **kwargs):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param steps: (`int`) The number of steps to run
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :return: Metrics
        """


        def _replicated_valid_step(inputs):
            features, tgt = inputs
            top_preds = self.model.predict(features, beam=1, make_input=False)
            per_replica_loss = loss(self.model, features, tgt)
            per_replica_toks = self._num_toks(features['tgt_len'])
            per_replica_report_loss = per_replica_loss * tf.cast(per_replica_toks, tf.float32)
            return per_replica_report_loss, per_replica_toks, top_preds

        if phase == 'Test':
            SET_TRAIN_FLAG(False)
            return self._evaluate(vs, reporting_fns, **kwargs)

        strategy = self.strategy
        num_replicas = strategy.num_replicas_in_sync

        with strategy.scope():

            SET_TRAIN_FLAG(False)
            self.valid_epochs += 1

            total_loss = tf.Variable(0.0)
            total_toks = tf.Variable(0, dtype=tf.int32)
            preds = []
            golds = []

            start = time.time()

            test_iter = iter(vs)

            for i in range(steps):
                features, tgt = next(test_iter)
                inputs = (features, tgt)
                per_replica_loss, per_replica_toks, _ = strategy.experimental_run_v2(_replicated_valid_step, args=(inputs,))
                total_loss.assign_add(strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None))
                total_toks.assign_add(strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_toks, axis=None))
                # Not sure a good way to get top preds merged yet

            metrics = self.calc_metrics(total_loss.numpy(), total_toks.numpy())
            self.report(
                self.valid_epochs, metrics, start,
                phase, 'EPOCH', reporting_fns
            )
            return metrics 
Example #27
Source File: utils.py    From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, phase='Valid', dataset=True):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        if phase == 'Test' and not dataset:
            return self._evaluate(vs, reporting_fns)
        self.valid_epochs += 1

        total_loss = 0
        total_toks = 0
        preds = []
        golds = []

        start = time.time()
        pg = create_progress_bar(len(vs))
        for batch_dict in pg(vs):

            if dataset:
                lossv, top_preds = self.model.sess.run([self.test_loss, self.model.decoder.best])
            else:
                feed_dict = self.model.make_input(batch_dict)
                lossv, top_preds = self.model.sess.run([self.test_loss, self.model.decoder.best], feed_dict=feed_dict)
            toks = self._num_toks(batch_dict['tgt_lengths'])
            total_loss += lossv * toks
            total_toks += toks

            preds.extend(convert_seq2seq_preds(top_preds.T, self.tgt_rlut))
            golds.extend(convert_seq2seq_golds(batch_dict['tgt'], batch_dict['tgt_lengths'], self.tgt_rlut))

        metrics = self.calc_metrics(total_loss, total_toks)
        metrics['bleu'] = bleu(preds, golds, self.bleu_n_grams)[0]
        self.report(
            self.valid_epochs, metrics, start,
            phase, 'EPOCH', reporting_fns
        )
        return metrics 
Example #28
Source File: utils.py    From mead-baseline with Apache License 2.0
def train(self, ts, reporting_fns, dataset=True):
        """Train by looping over the steps

        For a `tf.dataset`-backed `fit_func`, we are using the previously wired `dataset`s
        in the model (and `dataset` is `True`).  For `feed_dict`, we convert the ts samples
        to `feed_dict`s and hand them in one-by-one

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        epoch_loss = 0
        epoch_toks = 0

        start = time.time()
        self.nstep_start = start
        for batch_dict in ts:
            if dataset:
                _, global_step, lossv = self.sess.run([self.train_op, self.global_step, self.loss],
                                                      feed_dict={TRAIN_FLAG(): 1})
            else:
                feed_dict = self.model.make_input(batch_dict, True)
                _, global_step, lossv = self.sess.run([self.train_op, self.global_step, self.loss], feed_dict=feed_dict)

            # ?? How to get this cleaner?
            toks = self._num_toks(batch_dict['tgt_lengths'])
            report_loss = lossv * toks

            epoch_loss += report_loss
            epoch_toks += toks
            self.nstep_agg += report_loss
            self.nstep_div += toks

            if (global_step + 1) % self.nsteps == 0:
                metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
                self.report(
                    global_step + 1, metrics, self.nstep_start,
                    'Train', 'STEP', reporting_fns, self.nsteps
                )
                self.reset_nstep()

        metrics = self.calc_metrics(epoch_loss, epoch_toks)
        self.train_epochs += 1
        self.report(
            self.train_epochs, metrics, start,
            'Train', 'EPOCH', reporting_fns
        )
        return metrics 
Example #29
Source File: eager.py    From mead-baseline with Apache License 2.0
def _train(self, loader, steps=0, **kwargs):
        """Train an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.  We do use a `feed_dict` for passing the `TRAIN_FLAG` in either case

        :param loader: A data feed
        :param steps: (`int`) The number of steps to run
        :param kwargs: See below

        :Keyword Arguments:
         * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
         * *reporting_fns* (`list`) A list of reporting hooks to use

        :return: Metrics
        """
        SET_TRAIN_FLAG(True)
        reporting_fns = kwargs.get('reporting_fns', [])
        pg = create_progress_bar(steps)
        epoch_loss = tf.Variable(0.0)
        epoch_div = tf.Variable(0, dtype=tf.int32)
        nstep_loss = tf.Variable(0.0)
        nstep_div = tf.Variable(0, dtype=tf.int32)
        self.nstep_start = time.time()

        @tf.function
        def _train_step(inputs):
            features, y = inputs
            loss = self.optimizer.update(self.model, features, y)
            batchsz = get_shape_as_list(y)[0]
            report_loss = loss * batchsz
            return report_loss, batchsz

        with autograph_options({"function_optimization": False, "layout_optimizer": False}):
            for inputs in pg(loader):
                step_report_loss, step_batchsz = _train_step(inputs)
                epoch_loss.assign_add(step_report_loss)
                nstep_loss.assign_add(step_report_loss)
                epoch_div.assign_add(step_batchsz)
                nstep_div.assign_add(step_batchsz)

                step = self.optimizer.global_step.numpy() + 1
                if step % self.nsteps == 0:
                    metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
                    self.report(
                        step, metrics, self.nstep_start,
                        'Train', 'STEP', reporting_fns, self.nsteps
                    )
                    nstep_loss.assign(0.0)
                    nstep_div.assign(0)
                    self.nstep_start = time.time()

        epoch_loss = epoch_loss.numpy()
        epoch_div = epoch_div.numpy()
        metrics = self.calc_metrics(epoch_loss, epoch_div)
        return metrics 
Example #30
Source File: utils.py    From mead-baseline with Apache License 2.0
def _train(self, ts, **kwargs):
        """Train an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.  We do use a `feed_dict` for passing the `TRAIN_FLAG` in either case

        :param ts: A data feed
        :param kwargs: See below

        :Keyword Arguments:
         * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
         * *reporting_fns* (`list`) A list of reporting hooks to use

        :return: Metrics
        """
        use_dataset = kwargs.get('dataset', True)
        reporting_fns = kwargs.get('reporting_fns', [])
        epoch_loss = 0
        epoch_div = 0
        steps = len(ts)
        pg = create_progress_bar(steps)
        for batch_dict in pg(ts):
            if use_dataset:
                _, step, lossv = self.sess.run([self.train_op, self.global_step, self.loss],
                                               feed_dict={TRAIN_FLAG(): 1})
            else:
                feed_dict = self.model.make_input(batch_dict, True)
                _, step, lossv = self.sess.run([self.train_op, self.global_step, self.loss], feed_dict=feed_dict)

            batchsz = self._get_batchsz(batch_dict)
            report_loss = lossv * batchsz
            epoch_loss += report_loss
            epoch_div += batchsz
            self.nstep_agg += report_loss
            self.nstep_div += batchsz
            if (step + 1) % self.nsteps == 0:
                metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
                self.report(
                    step + 1, metrics, self.nstep_start,
                    'Train', 'STEP', reporting_fns, self.nsteps
                )
                self.reset_nstep()

        metrics = self.calc_metrics(epoch_loss, epoch_div)
        return metrics