Python tensorflow.dataset() Examples

The following are 30 code examples of tensorflow.dataset(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out all available functions/classes of the tensorflow module.
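Before diving in, a quick orientation: despite the literal name tensorflow.dataset(), what these snippets exercise is the tf.data.Dataset API. Below is a minimal, self-contained sketch of that API in TF 2.x eager style; the values are illustrative and not taken from any example on this page.

import tensorflow as tf

# Build a tiny pipeline: slice a tensor into elements, shuffle, batch, prefetch.
ds = tf.data.Dataset.from_tensor_slices(tf.range(10))
ds = ds.shuffle(buffer_size=10).batch(4).prefetch(tf.data.experimental.AUTOTUNE)

for batch in ds:  # eager iteration yields tf.Tensor batches
    print(batch.numpy())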
Example #1
Source File: eager.py    From mead-baseline with Apache License 2.0
def _evaluate(self, es, reporting_fns, **kwargs):
        """Run the model with beam search and report Bleu.

        :param es: `tf.dataset` of input
        :param reporting_fns: Input hooks
        """
        preds = []
        golds = []
        start = time.time()

        for features, tgt in es:
            features['dst'] = tgt[:, :-1]
            tgt_lens = features.pop('tgt_len')
            top_preds = self.model.predict(features, make_input=False, **kwargs)
            preds.extend(convert_seq2seq_preds(top_preds[:, 0, :], self.tgt_rlut))
            golds.extend(convert_seq2seq_golds(tgt, tgt_lens, self.tgt_rlut))
        metrics = {'bleu': bleu(preds, golds, self.bleu_n_grams)[0]}
        self.report(
            0, metrics, start, 'Test', 'EPOCH', reporting_fns
        )
        return metrics 
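A note on the indexing in the example above: predict is assumed to return token ids shaped [batch, beam, time], so top_preds[:, 0, :] keeps only the highest-scoring beam hypothesis per example. An illustrative numpy analogue:

import numpy as np

# Dummy beam-search output: [batch=2, beam=4, time=7] token ids (all zeros here).
top_preds = np.zeros((2, 4, 7), dtype=np.int32)
best_hyps = top_preds[:, 0, :]  # best hypothesis per example: shape [2, 7]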
Example #2
Source File: data_wrapper_base.py    From MMNet with Apache License 2.0
def setup_dataset(
        self,
        placeholders: Tuple[tf.placeholder, tf.placeholder],
        batch_size: int = None,
    ):
        self.batch_size = self.args.batch_size if batch_size is None else batch_size

        dataset = tf.data.Dataset.from_tensor_slices(placeholders)
        dataset = dataset.map(self._parse_function, num_parallel_calls=self.args.num_threads).prefetch(
            self.args.prefetch_factor * self.batch_size)
        if self.is_training:
            dataset = dataset.repeat()
        if self.shuffle:
            dataset = dataset.shuffle(buffer_size=self.args.buffer_size)
        self.dataset = dataset.batch(self.batch_size)
        self.iterator = self.dataset.make_initializable_iterator()
        self.next_elem = self.iterator.get_next() 
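Because setup_dataset uses make_initializable_iterator, the iterator has to be initialized with concrete values for the placeholders before any batch can be pulled. A hedged TF 1.x consumption sketch; wrapper, image_ph, label_ph, image_paths, and labels are hypothetical names standing in for the caller's objects:

import tensorflow as tf

with tf.Session() as sess:
    # Bind concrete data to the placeholders and initialize the iterator
    # (all of these names are stand-ins for the caller's objects).
    sess.run(wrapper.iterator.initializer,
             feed_dict={image_ph: image_paths, label_ph: labels})
    while True:
        try:
            batch = sess.run(wrapper.next_elem)  # one batch per run call
        except tf.errors.OutOfRangeError:        # dataset exhausted
            break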
Example #3
Source File: utils.py    From mead-baseline with Apache License 2.0
def _test(self, ts, dataset=True):
        """Test an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.

        :param ts: A data feed
        :param dataset: (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`

        :return: Metrics
        """
        return self.evaluator.test(ts, dataset=dataset) 
Example #4
Source File: eager.py    From mead-baseline with Apache License 2.0
def _test(self, ts, steps=0, **kwargs):
        """Test an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.

        :param ts: A data feed
        :param steps: (`int`) The number of steps to run
        :param kwargs: See below

        :Keyword Arguments:
          * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
          * *reporting_fns* (`list`) A list of reporting hooks to use
          * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

        :return: Metrics
        """
        return self.evaluator.test(ts, steps, **kwargs) 
Example #5
Source File: distributed.py    From mead-baseline with Apache License 2.0
def distribute(self, dataset):
        return self.strategy.experimental_distribute_dataset(dataset) 
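For context, experimental_distribute_dataset shards a tf.data.Dataset across the replicas of a distribution strategy; this same one-line wrapper appears in the next two examples as well. A minimal sketch with a MirroredStrategy and illustrative data:

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
dataset = tf.data.Dataset.from_tensor_slices(tf.range(8)).batch(4)

# Each element of the distributed dataset is a per-replica batch.
dist_dataset = strategy.experimental_distribute_dataset(dataset)
for per_replica_batch in dist_dataset:
    print(per_replica_batch)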
Example #6
Source File: distributed.py    From mead-baseline with Apache License 2.0
def distribute(self, dataset):
        return self.strategy.experimental_distribute_dataset(dataset) 
Example #7
Source File: distributed.py    From mead-baseline with Apache License 2.0
def distribute(self, dataset):
        return self.strategy.experimental_distribute_dataset(dataset) 
Example #8
Source File: eager.py    From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, phase):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :return: Metrics
        """
        total_loss = 0.0
        total_toks = 0
        epochs = 0
        if phase == 'Valid':
            self.valid_epochs += 1
            epochs = self.valid_epochs
        SET_TRAIN_FLAG(False)

        start = time.time()
        h = None
        for features, y in vs:
            if self.model.requires_state:
                loss_value, h = loss_with_state(self.model, h, features, y)
            else:
                loss_value = loss_without_state(self.model, features, y)
            loss_value = loss_value.numpy()
            toks = self._num_toks(y)
            total_loss += loss_value * tf.cast(toks, tf.float32).numpy()
            total_toks += toks.numpy()

        metrics = self.calc_metrics(total_loss, total_toks)
        self.report(
            epochs, metrics, start,
            phase, 'EPOCH', reporting_fns
        )
        return metrics 
Example #9
Source File: gqn_provider.py    From tf-gqn with Apache License 2.0
def _get_dataset_files(dataset_info, mode, root):
  """Generates lists of files for a given dataset version."""
  basepath = dataset_info.basepath
  base = os.path.join(root, basepath, mode)
  if mode == 'train':
    num_files = dataset_info.train_size
  else:
    num_files = dataset_info.test_size
  length = len(str(num_files))
  template = '{:0%d}-of-{:0%d}.tfrecord' % (length, length)
  record_paths = [  # indexing runs from 1 to n
      os.path.join(base, template.format(i, num_files))
      for i in range(1, num_files + 1)]
  return record_paths 
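The returned shard paths would typically be handed straight to a TFRecord reader. A hedged sketch, assuming dataset_info and root come from the surrounding gqn_provider code:

import tensorflow as tf

# dataset_info and root are assumed to be supplied by the caller.
record_paths = _get_dataset_files(dataset_info, 'train', root)
dataset = tf.data.TFRecordDataset(record_paths)  # one dataset over all shards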
Example #10
Source File: utils.py    From mead-baseline with Apache License 2.0
def to_tensors(ts, src_lengths_key, dst=False):
    """Convert a data feed into a tuple of `features` (`dict`) and `y` values

    This method is required to produce `tf.dataset`s from the input data feed.
    Any fields ending with `_lengths` are ignored, unless they match the
    `src_lengths_key` or `tgt_lengths_key`, in which case, they are converted to `src_len` and `tgt_len`

    :param ts: The data feed to convert
    :param src_lengths_key: This is a field passed from the model params specifying the source of truth for the temporal lengths
    :param dst: `bool` that says if we should prepare a `dst` tensor.  This is needed in distributed mode
    :return: A `tuple` of `features` and `y` (labels)
    """
    keys = ts[0].keys()
    # This is kind of a hack
    keys = [k for k in keys if '_lengths' not in k and k != 'ids'] + [src_lengths_key, "tgt_lengths"]

    features = dict((k, []) for k in keys)
    for sample in ts:
        for k in keys:
            for s in sample[k]:
                features[k].append(s)
    features['src_len'] = features[src_lengths_key]
    del features[src_lengths_key]
    features['tgt_len'] = features['tgt_lengths']
    del features['tgt_lengths']
    features = dict((k, np.stack(v).astype(np.int32)) for k, v in features.items())
    if dst:
        features['dst'] = features['tgt'][:, :-1]
    tgt = features.pop('tgt')

    return features, tgt 
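A sketch of how the (features, tgt) pair produced here could be wired into a dataset, mirroring the from_tensor_slices pattern used elsewhere on this page; the batch size is illustrative:

import tensorflow as tf

# ts and src_lengths_key are assumed to come from the training loader setup.
features, tgt = to_tensors(ts, src_lengths_key, dst=True)
dataset = tf.data.Dataset.from_tensor_slices((features, tgt)).batch(20)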
Example #11
Source File: data_wrapper_base.py    From MMNet with Apache License 2.0
def resize_and_padding_before_augmentation(self, image, size):
        # If width > height, resize height to model's input height while preserving aspect ratio
        # If height > width, resize width to model's input width while preserving aspect ratio
        if self.args.debug_augmentation:
            assert size[0] == size[1], "resize_and_padding_before_augmentation only supports square target image"
            image = tf.expand_dims(image, 0)

            image_dims = tf.shape(image)
            height = image_dims[1]
            width = image_dims[2]

            min_size = min(*size)
            width_aspect = tf.maximum(min_size, tf.cast(width * min_size / height, dtype=tf.int32))
            height_aspect = tf.maximum(min_size, tf.cast(height * min_size / width, dtype=tf.int32))

            image = tf.image.resize_bilinear(image, (height_aspect, width_aspect))
            image = image[:, :self.padded_max_size, :self.padded_max_size, :]

            # Pads the image on the bottom and right with zeros until it has dimensions target_height, target_width.
            image = tf.image.pad_to_bounding_box(
                image,
                offset_height=tf.maximum(self.padded_max_size-height_aspect, 0),
                offset_width=tf.maximum(self.padded_max_size-width_aspect, 0),
                target_height=self.padded_max_size,
                target_width=self.padded_max_size,
            )

            image = tf.squeeze(image, 0)
            return image
        else:
            # Have to return a dummy tensor that has a static .get_shape() for tf.data
            return tf.constant(0, shape=self.padded_original_image_dummy_shape, dtype=tf.uint8, name="dummy") 
Example #12
Source File: data_wrapper_base.py    From MMNet with Apache License 2.0
def add_arguments(parser):
        g_common = parser.add_argument_group("(DataWrapperBase) Common Arguments for all data wrappers.")
        g_common.add_argument("--dataset_path", required=True, type=str, help="The name of the dataset to load.")
        g_common.add_argument("--dataset_split_name", required=True, type=str, nargs="*",
                              help="The name of the train/test split. Support multiple splits")

        g_common.add_argument("--batch_size", default=32, type=utils.positive_int,
                              help="The number of examples in batch.")
        g_common.add_argument("--no-shuffle", dest="shuffle", action="store_false")
        g_common.add_argument("--shuffle", dest="shuffle", action="store_true")
        g_common.set_defaults(shuffle=True)

        g_common.add_argument("--width", required=True, type=int)
        g_common.add_argument("--height", required=True, type=int)
        g_common.add_argument("--no-debug_augmentation", dest="debug_augmentation", action="store_false")
        g_common.add_argument("--debug_augmentation", dest="debug_augmentation", action="store_true")
        g_common.set_defaults(debug_augmentation=False)
        g_common.add_argument("--max_padded_size", default=224, type=int,
                              help=("We will resize & pads the original image "
                                    "until it has dimensions (padded_size, padded_size)"
                                    "Recommend to set this value as width(or height) * 1.8 ~ 2"))
        g_common.add_argument("--augmentation_method", type=str, required=True,
                              choices=_available_augmentation_methods)
        g_common.add_argument("--num_threads", default=8, type=int)
        g_common.add_argument("--buffer_size", default=1000, type=int)
        g_common.add_argument("--prefetch_factor", default=100, type=int)

        g_common.add_argument("--rotation_range", default=0, type=int,
                              help="Receives maximum angle to be rotated in terms of degree: "
                                   "The image is randomly rotated by the angle "
                                   "randomly chosen from [-rotation_range, rotation_range], "
                                   "and then cropped appropriately to remove dark areas.\n"
                                   "So, be aware that the rotation performs certain kind of zooming.")
        g_common.add_argument("--no-has_sub_dataset", dest="has_sub_dataset", action="store_false")
        g_common.add_argument("--has_sub_dataset", dest="has_sub_dataset", action="store_true")
        g_common.set_defaults(has_sub_dataset=False) 
Example #13
Source File: utils.py    From mead-baseline with Apache License 2.0
def process_batch(self, batch_dict, handle, txts, dataset=True):
        if dataset:
            guess = self.sess.run(self.model.best)
        else:
            feed_dict = self.model.make_input(batch_dict)
            guess = self.sess.run(self.model.best, feed_dict=feed_dict)

        sentence_lengths = batch_dict[self.model.lengths_key]

        ids = batch_dict['ids']
        truth = batch_dict['y']
        correct_labels = 0
        total_labels = 0

        # For fscore
        gold_chunks = []
        pred_chunks = []

        # For each sentence
        for b in range(len(guess)):
            length = sentence_lengths[b]
            sentence = guess[b][:length]
            # truth[b] is padded, cutting at :length gives us back true length
            gold = truth[b][:length]

            valid_guess = sentence[gold != Offsets.PAD]
            valid_gold = gold[gold != Offsets.PAD]
            valid_sentence_length = np.sum(gold != Offsets.PAD)
            correct_labels += np.sum(np.equal(valid_guess, valid_gold))
            total_labels += valid_sentence_length

            gold_chunks.append(set(to_spans(valid_gold, self.idx2label, self.span_type, self.verbose)))
            pred_chunks.append(set(to_spans(valid_guess, self.idx2label, self.span_type, self.verbose)))

            # Should we write a file out?  If so, we have to have txts
            if handle is not None:
                id = ids[b]
                txt = txts[id]
                write_sentence_conll(handle, valid_guess, valid_gold, txt, self.idx2label)

        return correct_labels, total_labels, gold_chunks, pred_chunks 
Example #14
Source File: utils.py    From mead-baseline with Apache License 2.0
def to_tensors(ts, lengths_key):
    """Convert a data feed into a tuple of `features` (`dict`) and `y` values

    This method is required to produce `tf.dataset`s from the input data feed.
    Any fields ending with `_lengths` are ignored (as is `ids`), unless they
    match the `lengths_key` name

    :param ts: The data feed to convert
    :param lengths_key: This is a field passed from the model params specifying source of truth of the temporal lengths
    :return: A `tuple` of `features` and `y` (labels)
    """
    keys = ts[0].keys()
    # This is kind of a hack
    keys = [k for k in keys if '_lengths' not in k and k != 'ids'] + [lengths_key]

    features = dict((k, []) for k in keys)
    for sample in ts:
        for k in features.keys():
            # add each sample
            for s in sample[k]:
                features[k].append(s)

    features['lengths'] = features[lengths_key]
    del features[lengths_key]
    features = dict((k, np.stack(v)) for k, v in features.items())
    y = features.pop('y')
    return features, y 
Example #15
Source File: pipeline.py    From ranking with Apache License 2.0
def _make_input_fn(self,
                     input_pattern,
                     batch_size,
                     list_size,
                     randomize_input=True,
                     num_epochs=None):
    """Returns the input function for the ranking model.

    Args:
      input_pattern: (str) File pattern for the input data.
      batch_size: (int) The number of input examples to process per batch.
      list_size: (int) The list size for an ELWC example.
      randomize_input: (bool) If true, randomize input example order. It should
        almost always be true except for unittest/debug purposes.
      num_epochs: (int) The number of times the input dataset must be repeated.
        None to repeat the data indefinitely.

    Returns:
      An `input_fn` for `tf.estimator.Estimator`.
    """

    def _input_fn():
      """`input_fn` for the `Estimator`."""
      return self._make_dataset(
          batch_size=batch_size,
          list_size=list_size,
          input_pattern=input_pattern,
          randomize_input=randomize_input,
          num_epochs=num_epochs)

    return _input_fn 
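A hedged sketch of wiring the returned input_fn into an Estimator; the pattern string, sizes, and the pipeline/ranker names are assumptions for illustration:

# pipeline is an instance of the class above; ranker is a tf.estimator.Estimator.
train_input_fn = pipeline._make_input_fn(
    input_pattern='/data/train-*.tfrecord',
    batch_size=32,
    list_size=100)
ranker.train(input_fn=train_input_fn, steps=10000)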
Example #16
Source File: input_fn.py    From bert-multitask-learning with MIT License
def train_eval_input_fn(params, mode='train'):
    '''Train and eval input function of estimator.
    This function will write and read tf record for training
    and evaluation.

    Usage:
        def train_input_fn(): return train_eval_input_fn(params)
        estimator.train(
            train_input_fn, max_steps=params.train_steps, hooks=[train_hook])

    Arguments:
        params {Params} -- Params object

    Keyword Arguments:
        mode {str} -- ModeKeys (default: {'train'})

    Returns:
        tf Dataset -- Tensorflow dataset
    '''
    write_tfrecord(params=params)

    dataset_dict = read_tfrecord(params=params, mode=mode)

    dataset = tf.data.experimental.sample_from_datasets(
        [ds for _, ds in dataset_dict.items()])

    if mode == 'train':
        dataset = dataset.shuffle(params.shuffle_buffer)

    dataset = dataset.prefetch(params.prefetch)
    if params.dynamic_padding:
        dataset = dataset.apply(
            tf.data.experimental.bucket_by_sequence_length(
                element_length_func=element_length_func,
                bucket_batch_sizes=params.bucket_batch_sizes,
                bucket_boundaries=params.bucket_boundaries,
            ))
    else:
        if mode == 'train':
            dataset = dataset.batch(params.batch_size)
        else:
            dataset = dataset.batch(params.batch_size*2)

    return dataset 
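The dynamic-padding branch above relies on bucket_by_sequence_length, which groups variable-length elements into length buckets and pads within each bucket. A minimal, self-contained sketch; the boundaries and batch sizes here are illustrative, not the project's config:

import tensorflow as tf

# Three variable-length sequences, produced one at a time.
ds = tf.data.Dataset.from_generator(
    lambda: ([1] * n for n in (3, 7, 12)),
    output_types=tf.int32, output_shapes=tf.TensorShape([None]))

ds = ds.apply(tf.data.experimental.bucket_by_sequence_length(
    element_length_func=lambda x: tf.shape(x)[0],
    bucket_boundaries=[5, 10],       # buckets: <5, 5-9, >=10
    bucket_batch_sizes=[4, 2, 1]))   # one batch size per bucket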
Example #17
Source File: utils.py    From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, phase, dataset=True):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        total_loss = 0.0
        total_toks = 0
        epochs = 0
        if phase == 'Valid':
            self.valid_epochs += 1
            epochs = self.valid_epochs

        if self.model.requires_state:
            state = self.model.sess.run(self.model.initial_state, self.model.make_input(vs[0], False))

        fetches = {
            "loss": self.test_loss,
        }

        if self.model.requires_state:
            fetches["final_state"] = self.model.final_state

        start = time.time()

        for batch_dict in vs:
            feed_dict = {}
            if not dataset:
                feed_dict = self.model.make_input(batch_dict, False)
            # In a Keras LSTM the order is h first, c second; it's the opposite in TF 1. However, I don't think it
            # ends up mattering here
            if self.model.requires_state:

                for i, (s1, s2) in enumerate(self.model.initial_state):
                    feed_dict[s1] = state[i][0]  # .c  # 0
                    feed_dict[s2] = state[i][1]  # .h  # 1

            vals = self.model.sess.run(fetches, feed_dict)
            loss = vals["loss"]
            toks = self._num_toks(batch_dict)
            if self.model.requires_state:
                state = vals["final_state"]
            total_loss += loss * toks
            total_toks += toks

        metrics = self.calc_metrics(total_loss, total_toks)
        self.report(
            epochs, metrics, start,
            phase, 'EPOCH', reporting_fns
        )
        return metrics 
Example #18
Source File: input_fn.py    From bert-multitask-learning with MIT License
def to_serving_input(input_file_or_list, config, mode=PREDICT, tokenizer=None):
    '''A serving input function that takes an input file path or a
    list of strings and applies BERT preprocessing. This fn will
    yield data dicts instead of a tf dataset. Used in serving.

    Arguments:
        input_file_or_list {str or list} -- file path or list of str
        config {Params} -- Params

    Keyword Arguments:
        mode {str} -- ModeKeys (default: {PREDICT})
        tokenizer {tokenizer} -- Tokenizer (default: {None})
    '''

    # if input is a string, treat it as a path to a file
    if isinstance(input_file_or_list, str):
        inputs = open(input_file_or_list, 'r', encoding='utf8').readlines()
    else:
        inputs = input_file_or_list

    if tokenizer is None:
        tokenizer = FullTokenizer(config.vocab_file)

    for doc in inputs:
        # Build a fresh dict per document so dicts yielded earlier aren't mutated.
        data_dict = {}
        inputs_a = cluster_alphnum(doc)
        tokens, target = tokenize_text_with_seqs(
            tokenizer, inputs_a, None)

        tokens_a, tokens_b, target = truncate_seq_pair(
            tokens, None, target, config.max_seq_len)

        tokens, segment_ids, target = add_special_tokens_with_seqs(
            tokens_a, tokens_b, target)

        input_mask, tokens, segment_ids, target = create_mask_and_padding(
            tokens, segment_ids, target, config.max_seq_len)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        data_dict['input_ids'] = input_ids
        data_dict['input_mask'] = input_mask
        data_dict['segment_ids'] = segment_ids
        yield data_dict 
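A hedged example of consuming the generator above, assuming config is a Params object with vocab_file and max_seq_len set:

# Each yield is one preprocessed document ready for serving.
for features in to_serving_input(['some raw input text'], config):
    print(features['input_ids'], features['input_mask'])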
Example #19
Source File: eager.py    From mead-baseline with Apache License 2.0
def _test(self, loader, steps=0, **kwargs):
        """Test an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.

        :param loader: A data feed
        :param steps: (`int`) The number of steps to run
        :param kwargs: See below

        :Keyword Arguments:
          * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
          * *reporting_fns* (`list`) A list of reporting hooks to use
          * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

        :return: Metrics
        """

        cm = ConfusionMatrix(self.model.labels)
        total_loss = 0
        total_norm = 0
        verbose = kwargs.get("verbose", None)

        pg = create_progress_bar(steps)

        SET_TRAIN_FLAG(False)
        for features, y in pg(loader):
            logits = self.model(features)
            y_ = tf.argmax(logits, axis=1, output_type=tf.int32)
            cm.add_batch(y, y_)
            lossv = tf.compat.v1.losses.sparse_softmax_cross_entropy(labels=y, logits=logits).numpy()
            batchsz = int(y.shape[0])
            assert len(y_) == batchsz
            total_loss += lossv * batchsz
            total_norm += batchsz

        metrics = cm.get_all_metrics()
        metrics['avg_loss'] = total_loss / float(total_norm)
        verbose_output(verbose, cm)

        return metrics 
Example #20
Source File: eager.py    From mead-baseline with Apache License 2.0
def _train(self, loader, steps=0, **kwargs):
        """Train an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.  We do use a `feed_dict` for passing the `TRAIN_FLAG` in either case

        :param loader: A data feed
        :param steps: (`int`) The number of steps to run
        :param kwargs: See below

        :Keyword Arguments:
         * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
         * *reporting_fns* (`list`) A list of reporting hooks to use

        :return: Metrics
        """

        SET_TRAIN_FLAG(True)
        reporting_fns = kwargs.get('reporting_fns', [])
        pg = create_progress_bar(steps)
        epoch_loss = tf.Variable(0.0)
        epoch_div = tf.Variable(0, dtype=tf.int32)
        nstep_loss = tf.Variable(0.0)
        nstep_div = tf.Variable(0, dtype=tf.int32)
        self.nstep_start = time.time()

        @tf.function
        def _train_step(inputs):
            """Replicated training step."""
            features, y = inputs
            loss = self.optimizer.update(self.model, features, y)
            batchsz = get_shape_as_list(y)[0]
            report_loss = loss * batchsz
            return report_loss, batchsz

        for inputs in pg(loader):
            step_report_loss, step_batchsz = _train_step(inputs)
            epoch_loss.assign_add(step_report_loss)
            nstep_loss.assign_add(step_report_loss)
            epoch_div.assign_add(step_batchsz)
            nstep_div.assign_add(step_batchsz)
            step = self.optimizer.global_step.numpy() + 1

            if step % self.nsteps == 0:
                metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
                self.report(
                    step, metrics, self.nstep_start,
                    'Train', 'STEP', reporting_fns, self.nsteps
                )
                nstep_loss.assign(0.0)
                nstep_div.assign(0)
                self.nstep_start = time.time()

        epoch_loss = epoch_loss.numpy()
        epoch_div = epoch_div.numpy()
        metrics = self.calc_metrics(epoch_loss, epoch_div)
        return metrics 
Example #21
Source File: eager.py    From mead-baseline with Apache License 2.0
def train(self, ts, reporting_fns, dataset=True):
        """Train by looping over the steps

        For a `tf.dataset`-backed `fit_func`, we are using the previously wired `dataset`s
        in the model (and `dataset` is `True`).  For `feed_dict`, we convert the ts samples
        to `feed_dict`s and hand them in one-by-one

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        SET_TRAIN_FLAG(True)
        epoch_loss = tf.Variable(0.0)
        epoch_div = tf.Variable(0, dtype=tf.int32)
        nstep_loss = tf.Variable(0.0)
        nstep_div = tf.Variable(0, dtype=tf.int32)
        self.nstep_start = time.time()
        start = time.time()

        @tf.function
        def _train_step(features, y):
            """Replicated training step."""

            loss = self.optimizer.update(self.model, features, y)
            toks = self._num_toks(features['tgt_len'])
            report_loss = loss * tf.cast(toks, tf.float32)
            return report_loss, toks

        with autograph_options({"function_optimization": False, "layout_optimizer": False}):
            for features, y in ts:
                features['dst'] = y[:, :-1]
                step_report_loss, step_toks = _train_step(features, y)
                epoch_loss.assign_add(step_report_loss)
                nstep_loss.assign_add(step_report_loss)
                epoch_div.assign_add(step_toks)
                nstep_div.assign_add(step_toks)

                step = self.optimizer.global_step.numpy() + 1
                if step % self.nsteps == 0:
                    metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
                    self.report(
                        step, metrics, self.nstep_start,
                        'Train', 'STEP', reporting_fns, self.nsteps
                    )
                    nstep_loss.assign(0.0)
                    nstep_div.assign(0)
                    self.nstep_start = time.time()

        epoch_loss = epoch_loss.numpy()
        epoch_div = epoch_div.numpy()
        metrics = self.calc_metrics(epoch_loss, epoch_div)
        self.train_epochs += 1
        self.report(
            self.train_epochs, metrics, start,
            'Train', 'EPOCH', reporting_fns
        )
        return metrics 
Example #22
Source File: utils.py    From mead-baseline with Apache License 2.0
def _test(self, loader, **kwargs):
        """Test an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.

        :param loader: A data feed
        :param kwargs: See below

        :Keyword Arguments:
          * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
          * *reporting_fns* (`list`) A list of reporting hooks to use
          * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

        :return: Metrics
        """
        if self.ema:
            self.sess.run(self.ema_load)

        use_dataset = kwargs.get('dataset', True)

        cm = ConfusionMatrix(self.model.labels)
        steps = len(loader)
        total_loss = 0
        total_norm = 0
        verbose = kwargs.get("verbose", None)

        pg = create_progress_bar(steps)
        for i, batch_dict in enumerate(pg(loader)):
            y = batch_dict['y']
            if use_dataset:
                guess, lossv = self.sess.run([self.model.best, self.test_loss])
            else:
                feed_dict = self.model.make_input(batch_dict, False)
                guess, lossv = self.sess.run([self.model.best, self.test_loss], feed_dict=feed_dict)

            batchsz = len(guess)
            total_loss += lossv * batchsz
            total_norm += batchsz
            cm.add_batch(y, guess)

        metrics = cm.get_all_metrics()
        metrics['avg_loss'] = total_loss / float(total_norm)
        verbose_output(verbose, cm)

        return metrics 
Example #23
Source File: utils.py    From mead-baseline with Apache License 2.0
def _train(self, loader, dataset=True, **kwargs):
        """Train an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.  We do use a `feed_dict` for passing the `TRAIN_FLAG` in either case

        :param loader: A data feed
        :param kwargs: See below

        :Keyword Arguments:
         * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
         * *reporting_fns* (`list`) A list of reporting hooks to use

        :return: Metrics
        """
        if self.ema:
            self.sess.run(self.ema_restore)

        reporting_fns = kwargs.get('reporting_fns', [])
        epoch_loss = 0
        epoch_div = 0
        steps = len(loader)
        pg = create_progress_bar(steps)
        for batch_dict in pg(loader):
            if dataset:
                _, step, lossv = self.sess.run([self.train_op, self.global_step, self.loss],
                                               feed_dict={TRAIN_FLAG(): 1})
            else:
                feed_dict = self.model.make_input(batch_dict, True)
                _, step, lossv = self.sess.run([self.train_op, self.global_step, self.loss], feed_dict=feed_dict)

            batchsz = self._get_batchsz(batch_dict)
            report_lossv = lossv * batchsz
            epoch_loss += report_lossv
            epoch_div += batchsz
            self.nstep_agg += report_lossv
            self.nstep_div += batchsz

            if (step + 1) % self.nsteps == 0:
                metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
                self.report(
                    step + 1, metrics, self.nstep_start,
                    'Train', 'STEP', reporting_fns, self.nsteps
                )
                self.reset_nstep()

        metrics = self.calc_metrics(epoch_loss, epoch_div)
        return metrics 
Example #24
Source File: eager.py    From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, phase='Valid', dataset=True, **kwargs):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        SET_TRAIN_FLAG(False)
        if phase == 'Test':
            return self._evaluate(vs, reporting_fns, **kwargs)

        self.valid_epochs += 1

        total_loss = 0
        total_toks = 0
        preds = []
        golds = []

        start = time.time()
        for features, tgt in vs:
            features['dst'] = tgt[:, :-1]
            top_preds = self.model.predict(features, beam=1, make_input=False)
            loss_value = loss(self.model, features, tgt).numpy()
            toks = tf.cast(self._num_toks(features['tgt_len']), tf.float32).numpy()
            total_loss += loss_value * toks
            total_toks += toks
            preds.extend(convert_seq2seq_preds(top_preds[:, 0, :], self.tgt_rlut))
            golds.extend(convert_seq2seq_golds(tgt, features['tgt_len'], self.tgt_rlut))

        metrics = self.calc_metrics(total_loss, total_toks)
        metrics['bleu'] = bleu(preds, golds, self.bleu_n_grams)[0]
        self.report(
            self.valid_epochs, metrics, start,
            phase, 'EPOCH', reporting_fns
        )
        return metrics 
Example #25
Source File: pipeline.py    From ranking with Apache License 2.0
def __init__(self,
               context_feature_columns,
               example_feature_columns,
               hparams,
               estimator,
               label_feature_name="relevance",
               label_feature_type=tf.int64,
               dataset_reader=tf.data.TFRecordDataset,
               best_exporter_metric=None,
               best_exporter_metric_higher_better=True,
               size_feature_name=None):
    """Constructor.

    Args:
      context_feature_columns: (dict) Context (aka, query) feature columns.
      example_feature_columns: (dict) Example (aka, document) feature columns.
      hparams: (dict) A dict containing model hyperparameters.
      estimator: (`Estimator`) An `Estimator` instance for model train and eval.
      label_feature_name: (str) The name of the label feature.
      label_feature_type: (`tf.dtype`) The value type of the label feature.
      dataset_reader: (`tf.data.Dataset` subclass) The dataset reader for the
        input files, e.g. `tf.data.TFRecordDataset`.
      best_exporter_metric: (str) Metric key for exporting the best model. If
        None, exports the model with the minimal loss value.
      best_exporter_metric_higher_better: (bool) If a higher metric is better.
        This is only used if `best_exporter_metric` is not None.
      size_feature_name: (str) If set, populates the feature dictionary with
        this name and the corresponding value is a `tf.int32` Tensor of shape
        [batch_size] indicating the actual sizes of the example lists before
        padding and truncation. If None, which is default, this feature is not
        generated.
    """
    self._validate_parameters(estimator, hparams)

    self._context_feature_columns = context_feature_columns
    self._example_feature_columns = example_feature_columns
    self._hparams = hparams
    self._estimator = estimator
    self._label_feature_name = label_feature_name
    self._label_feature_type = label_feature_type
    self._dataset_reader = dataset_reader
    self._best_exporter_metric = best_exporter_metric
    self._best_exporter_metric_higher_better = (
        best_exporter_metric_higher_better)
    self._size_feature_name = size_feature_name 
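A hedged construction sketch for this pipeline; the class name RankingPipeline and every argument value here are assumptions for illustration, not confirmed by the excerpt:

# Hypothetical instantiation; names and values are illustrative only.
pipeline = RankingPipeline(
    context_feature_columns=context_cols,
    example_feature_columns=example_cols,
    hparams=dict(train_input_pattern='/data/train*', learning_rate=0.05),
    estimator=ranker,
    size_feature_name='example_list_size')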
Example #26
Source File: distributed.py    From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, steps=0, phase='Valid', **kwargs):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param steps: (`int`) The number of steps to run
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :return: Metrics
        """


        def _replicated_valid_step(inputs):
            features, tgt = inputs
            top_preds = self.model.predict(features, beam=1, make_input=False)
            per_replica_loss = loss(self.model, features, tgt)
            per_replica_toks = self._num_toks(features['tgt_len'])
            per_replica_report_loss = per_replica_loss * tf.cast(per_replica_toks, tf.float32)
            return per_replica_report_loss, per_replica_toks, top_preds

        if phase == 'Test':
            SET_TRAIN_FLAG(False)
            return self._evaluate(vs, reporting_fns, **kwargs)

        strategy = self.strategy
        num_replicas = strategy.num_replicas_in_sync

        with strategy.scope():

            SET_TRAIN_FLAG(False)
            self.valid_epochs += 1

            total_loss = tf.Variable(0.0)
            total_toks = tf.Variable(0, dtype=tf.int32)
            preds = []
            golds = []

            start = time.time()

            test_iter = iter(vs)

            for i in range(steps):
                features, tgt = next(test_iter)
                inputs = (features, tgt)
                per_replica_loss, per_replica_toks, _ = strategy.experimental_run_v2(_replicated_valid_step, args=(inputs,))
                total_loss.assign_add(strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None))
                total_toks.assign_add(strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_toks, axis=None))
                # Not sure a good way to get top preds merged yet

            metrics = self.calc_metrics(total_loss.numpy(), total_toks.numpy())
            self.report(
                self.valid_epochs, metrics, start,
                phase, 'EPOCH', reporting_fns
            )
            return metrics 
Example #27
Source File: utils.py    From mead-baseline with Apache License 2.0
def test(self, vs, reporting_fns, phase='Valid', dataset=True):
        """Run an epoch of testing over the dataset

        If we are using a `tf.dataset`-based `fit_func`, we will just
        cycle the number of steps and let the `dataset` yield new batches.

        If we are using `feed_dict`s, we convert each batch from the `DataFeed`
        and pass that into TF as the `feed_dict`

        :param vs: A validation set
        :param reporting_fns: Reporting hooks
        :param phase: The phase of evaluation (`Test`, `Valid`)
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        if phase == 'Test' and not dataset:
            return self._evaluate(vs, reporting_fns)
        self.valid_epochs += 1

        total_loss = 0
        total_toks = 0
        preds = []
        golds = []

        start = time.time()
        pg = create_progress_bar(len(vs))
        for batch_dict in pg(vs):

            if dataset:
                lossv, top_preds = self.model.sess.run([self.test_loss, self.model.decoder.best])
            else:
                feed_dict = self.model.make_input(batch_dict)
                lossv, top_preds = self.model.sess.run([self.test_loss, self.model.decoder.best], feed_dict=feed_dict)
            toks = self._num_toks(batch_dict['tgt_lengths'])
            total_loss += lossv * toks
            total_toks += toks

            preds.extend(convert_seq2seq_preds(top_preds.T, self.tgt_rlut))
            golds.extend(convert_seq2seq_golds(batch_dict['tgt'], batch_dict['tgt_lengths'], self.tgt_rlut))

        metrics = self.calc_metrics(total_loss, total_toks)
        metrics['bleu'] = bleu(preds, golds, self.bleu_n_grams)[0]
        self.report(
            self.valid_epochs, metrics, start,
            phase, 'EPOCH', reporting_fns
        )
        return metrics 
Example #28
Source File: utils.py    From mead-baseline with Apache License 2.0
def train(self, ts, reporting_fns, dataset=True):
        """Train by looping over the steps

        For a `tf.dataset`-backed `fit_func`, we are using the previously wired `dataset`s
        in the model (and `dataset` is `True`).  For `feed_dict`, we convert the ts samples
        to `feed_dict`s and hand them in one-by-one

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        epoch_loss = 0
        epoch_toks = 0

        start = time.time()
        self.nstep_start = start
        for batch_dict in ts:
            if dataset:
                _, global_step, lossv = self.sess.run([self.train_op, self.global_step, self.loss],
                                                      feed_dict={TRAIN_FLAG(): 1})
            else:
                feed_dict = self.model.make_input(batch_dict, True)
                _, global_step, lossv = self.sess.run([self.train_op, self.global_step, self.loss], feed_dict=feed_dict)

            # ?? How to get this cleaner?
            toks = self._num_toks(batch_dict['tgt_lengths'])
            report_loss = lossv * toks

            epoch_loss += report_loss
            epoch_toks += toks
            self.nstep_agg += report_loss
            self.nstep_div += toks

            if (global_step + 1) % self.nsteps == 0:
                metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
                self.report(
                    global_step + 1, metrics, self.nstep_start,
                    'Train', 'STEP', reporting_fns, self.nsteps
                )
                self.reset_nstep()

        metrics = self.calc_metrics(epoch_loss, epoch_toks)
        self.train_epochs += 1
        self.report(
            self.train_epochs, metrics, start,
            'Train', 'EPOCH', reporting_fns
        )
        return metrics 
Example #29
Source File: eager.py    From mead-baseline with Apache License 2.0
def _train(self, loader, steps=0, **kwargs):
        """Train an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.  We do use a `feed_dict` for passing the `TRAIN_FLAG` in either case

        :param loader: A data feed
        :param steps: (`int`) The number of steps to run
        :param kwargs: See below

        :Keyword Arguments:
         * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
         * *reporting_fns* (`list`) A list of reporting hooks to use

        :return: Metrics
        """
        SET_TRAIN_FLAG(True)
        reporting_fns = kwargs.get('reporting_fns', [])
        pg = create_progress_bar(steps)
        epoch_loss = tf.Variable(0.0)
        epoch_div = tf.Variable(0, dtype=tf.int32)
        nstep_loss = tf.Variable(0.0)
        nstep_div = tf.Variable(0, dtype=tf.int32)
        self.nstep_start = time.time()

        @tf.function
        def _train_step(inputs):
            features, y = inputs
            loss = self.optimizer.update(self.model, features, y)
            batchsz = get_shape_as_list(y)[0]
            report_loss = loss * batchsz
            return report_loss, batchsz

        with autograph_options({"function_optimization": False, "layout_optimizer": False}):
            for inputs in pg(loader):
                step_report_loss, step_batchsz = _train_step(inputs)
                epoch_loss.assign_add(step_report_loss)
                nstep_loss.assign_add(step_report_loss)
                epoch_div.assign_add(step_batchsz)
                nstep_div.assign_add(step_batchsz)

                step = self.optimizer.global_step.numpy() + 1
                if step % self.nsteps == 0:
                    metrics = self.calc_metrics(nstep_loss.numpy(), nstep_div.numpy())
                    self.report(
                        step, metrics, self.nstep_start,
                        'Train', 'STEP', reporting_fns, self.nsteps
                    )
                    nstep_loss.assign(0.0)
                    nstep_div.assign(0)
                    self.nstep_start = time.time()

        epoch_loss = epoch_loss.numpy()
        epoch_div = epoch_div.numpy()
        metrics = self.calc_metrics(epoch_loss, epoch_div)
        return metrics 
Example #30
Source File: utils.py    From mead-baseline with Apache License 2.0
def _train(self, ts, **kwargs):
        """Train an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.  We do use a `feed_dict` for passing the `TRAIN_FLAG` in either case

        :param ts: A data feed
        :param kwargs: See below

        :Keyword Arguments:
         * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
         * *reporting_fns* (`list`) A list of reporting hooks to use

        :return: Metrics
        """
        use_dataset = kwargs.get('dataset', True)
        reporting_fns = kwargs.get('reporting_fns', [])
        epoch_loss = 0
        epoch_div = 0
        steps = len(ts)
        pg = create_progress_bar(steps)
        for batch_dict in pg(ts):
            if use_dataset:
                _, step, lossv = self.sess.run([self.train_op, self.global_step, self.loss],
                                               feed_dict={TRAIN_FLAG(): 1})
            else:
                feed_dict = self.model.make_input(batch_dict, True)
                _, step, lossv = self.sess.run([self.train_op, self.global_step, self.loss], feed_dict=feed_dict)

            batchsz = self._get_batchsz(batch_dict)
            report_loss = lossv * batchsz
            epoch_loss += report_loss
            epoch_div += batchsz
            self.nstep_agg += report_loss
            self.nstep_div += batchsz
            if (step + 1) % self.nsteps == 0:
                metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
                self.report(
                    step + 1, metrics, self.nstep_start,
                    'Train', 'STEP', reporting_fns, self.nsteps
                )
                self.reset_nstep()

        metrics = self.calc_metrics(epoch_loss, epoch_div)
        return metrics