Python official.recommendation.data_preprocessing.instantiate_pipeline() Examples
The following are 8 code examples of official.recommendation.data_preprocessing.instantiate_pipeline(), drawn from open-source projects. Each example lists its source file, the project it comes from, and the project's license. You may also want to check out all available functions and classes of the module official.recommendation.data_preprocessing.
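Note that the examples below reflect two versions of the API: older snapshots (Examples #4, #7, and #8) pass batch sizes directly and receive an (ncf_dataset, cleanup_fn) pair, while newer snapshots (Examples #1-#3, #5, and #6) pass a params dict and receive (num_users, num_items, producer). As a quick orientation, here is a minimal sketch of the newer form; the dataset name, data directory, and parameter values are illustrative assumptions for this sketch, not requirements of the module:

from official.recommendation import data_preprocessing, movielens

# Illustrative parameter values (assumptions for this sketch); the keys
# mirror those used in Examples #5 and #6 below.
params = {
    "train_epochs": 1,
    "batch_size": 256,
    "eval_batch_size": 1024,
    "batches_per_step": 1,
    "stream_files": False,
    "num_neg": 4,
}

movielens.download("ml-1m", "/tmp/movielens")  # fetch the raw ratings first
num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
    dataset="ml-1m",
    data_dir="/tmp/movielens",
    params=params,
    constructor_type="bisection",  # assumed; see FLAGS.constructor_type below
    deterministic=False)

The returned producer exposes train_batches_per_epoch and eval_batches_per_epoch, which the examples below use to size their training and evaluation loops.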
Example #1
Source File: ncf_common.py From models with Apache License 2.0
def get_inputs(params):
  """Returns some parameters used by the model."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)
    num_train_steps = producer.train_batches_per_epoch
    num_eval_steps = producer.eval_batches_per_epoch

  return num_users, num_items, num_train_steps, num_eval_steps, producer
Example #2
Source File: ncf_common.py From Live-feed-object-device-identification-using-Tensorflow-and-OpenCV with Apache License 2.0
def get_inputs(params):
  """Returns some parameters used by the model."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)
    num_train_steps = producer.train_batches_per_epoch
    num_eval_steps = producer.eval_batches_per_epoch

  return num_users, num_items, num_train_steps, num_eval_steps, producer
Example #3
Source File: ncf_common.py From models with Apache License 2.0
def get_inputs(params):
  """Returns some parameters used by the model."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = movielens.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)
    num_train_steps = producer.train_batches_per_epoch
    num_eval_steps = producer.eval_batches_per_epoch

  return num_users, num_items, num_train_steps, num_eval_steps, producer
Example #4
Source File: ncf_main.py From training_results_v0.5 with Apache License 2.0
def main(_):
  """Train NCF model and evaluate its hit rate (HR) metric."""
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu,
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)
  master = tpu_cluster_resolver.master()

  ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset,
      data_dir=FLAGS.data_dir,
      # TODO(shizhiw): support multihost.
      batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=1,
      match_mlperf=FLAGS.ml_perf,
      use_subprocess=FLAGS.use_subprocess,
      cache_id=FLAGS.cache_id)

  train_params, eval_params = create_params(ncf_dataset)

  eval_graph_spec = build_graph(
      eval_params, ncf_dataset, tpu_embedding.INFERENCE)

  for epoch in range(_NUM_EPOCHS):
    tf.logging.info("Training {}...".format(epoch))
    # build training graph each epoch as number of batches per epoch
    # i.e. batch_count might change by 1 between epochs.
    train_graph_spec = build_graph(
        train_params, ncf_dataset, tpu_embedding.TRAINING)
    run_graph(master, train_graph_spec, epoch)

    tf.logging.info("Evaluating {}...".format(epoch))
    run_graph(master, eval_graph_spec, epoch)

  cleanup_fn()  # Cleanup data construction artifacts and subprocess.
Example #5
Source File: create_ncf_data.py From Live-feed-object-device-identification-using-Tensorflow-and-OpenCV with Apache License 2.0
def prepare_raw_data(flag_obj):
  """Downloads and prepares raw data for data generation."""
  movielens.download(flag_obj.dataset, flag_obj.data_dir)

  data_processing_params = {
      "train_epochs": flag_obj.num_train_epochs,
      "batch_size": flag_obj.train_prebatch_size,
      "eval_batch_size": flag_obj.eval_prebatch_size,
      "batches_per_step": 1,
      "stream_files": True,
      "num_neg": flag_obj.num_negative_samples,
  }

  num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
      dataset=flag_obj.dataset,
      data_dir=flag_obj.data_dir,
      params=data_processing_params,
      constructor_type=flag_obj.constructor_type,
      epoch_dir=flag_obj.data_dir,
      generate_data_offline=True)

  # pylint: disable=protected-access
  input_metadata = {
      "num_users": num_users,
      "num_items": num_items,
      "constructor_type": flag_obj.constructor_type,
      "num_train_elements": producer._elements_in_epoch,
      "num_eval_elements": producer._eval_elements_in_epoch,
      "num_train_epochs": flag_obj.num_train_epochs,
      "train_prebatch_size": flag_obj.train_prebatch_size,
      "eval_prebatch_size": flag_obj.eval_prebatch_size,
      "num_train_steps": producer.train_batches_per_epoch,
      "num_eval_steps": producer.eval_batches_per_epoch,
  }
  # pylint: enable=protected-access

  return producer, input_metadata
Example #6
Source File: create_ncf_data.py From models with Apache License 2.0
def prepare_raw_data(flag_obj):
  """Downloads and prepares raw data for data generation."""
  movielens.download(flag_obj.dataset, flag_obj.data_dir)

  data_processing_params = {
      "train_epochs": flag_obj.num_train_epochs,
      "batch_size": flag_obj.train_prebatch_size,
      "eval_batch_size": flag_obj.eval_prebatch_size,
      "batches_per_step": 1,
      "stream_files": True,
      "num_neg": flag_obj.num_negative_samples,
  }

  num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
      dataset=flag_obj.dataset,
      data_dir=flag_obj.data_dir,
      params=data_processing_params,
      constructor_type=flag_obj.constructor_type,
      epoch_dir=flag_obj.data_dir,
      generate_data_offline=True)

  # pylint: disable=protected-access
  input_metadata = {
      "num_users": num_users,
      "num_items": num_items,
      "constructor_type": flag_obj.constructor_type,
      "num_train_elements": producer._elements_in_epoch,
      "num_eval_elements": producer._eval_elements_in_epoch,
      "num_train_epochs": flag_obj.num_train_epochs,
      "train_prebatch_size": flag_obj.train_prebatch_size,
      "eval_prebatch_size": flag_obj.eval_prebatch_size,
      "num_train_steps": producer.train_batches_per_epoch,
      "num_eval_steps": producer.eval_batches_per_epoch,
  }
  # pylint: enable=protected-access

  return producer, input_metadata
Example #7
Source File: ncf_main.py From class-balanced-loss with MIT License
def main(_):
  """Train NCF model and evaluate its hit rate (HR) metric."""
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu,
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)
  master = tpu_cluster_resolver.master()

  ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset,
      data_dir=FLAGS.data_dir,
      # TODO(shizhiw): support multihost.
      batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=1,
      match_mlperf=FLAGS.ml_perf,
      use_subprocess=FLAGS.use_subprocess,
      cache_id=FLAGS.cache_id)

  train_params, eval_params = create_params(ncf_dataset)

  eval_graph_spec = build_graph(
      eval_params, ncf_dataset, tpu_embedding.INFERENCE)

  for epoch in range(_NUM_EPOCHS):
    tf.logging.info("Training {}...".format(epoch))
    # build training graph each epoch as number of batches per epoch
    # i.e. batch_count might change by 1 between epochs.
    train_graph_spec = build_graph(
        train_params, ncf_dataset, tpu_embedding.TRAINING)
    run_graph(master, train_graph_spec, epoch)

    tf.logging.info("Evaluating {}...".format(epoch))
    run_graph(master, eval_graph_spec, epoch)

  cleanup_fn()  # Cleanup data construction artifacts and subprocess.
Example #8
Source File: data_test.py From multilabel-image-classification-tensorflow with MIT License
def test_end_to_end(self):
  ncf_dataset, _ = data_preprocessing.instantiate_pipeline(
      dataset=DATASET, data_dir=self.temp_data_dir,
      batch_size=BATCH_SIZE,
      eval_batch_size=EVAL_BATCH_SIZE,
      num_cycles=1,
      num_data_readers=2,
      num_neg=NUM_NEG)

  g = tf.Graph()
  with g.as_default():
    input_fn, record_dir, batch_count = \
      data_preprocessing.make_input_fn(ncf_dataset, True)
    dataset = input_fn({"batch_size": BATCH_SIZE, "use_tpu": False,
                        "use_xla_for_gpu": False})
  first_epoch = self.drain_dataset(dataset=dataset, g=g)

  user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}
  item_inv_map = {v: k for k, v in ncf_dataset.item_map.items()}

  train_examples = {
      True: set(),
      False: set(),
  }
  for features, labels in first_epoch:
    for u, i, l in zip(features[movielens.USER_COLUMN],
                       features[movielens.ITEM_COLUMN], labels):
      u_raw = user_inv_map[u]
      i_raw = item_inv_map[i]
      if ((u_raw, i_raw) in self.seen_pairs) != l:
        # The evaluation item is not considered during false negative
        # generation, so it will occasionally appear as a negative example
        # during training.
        assert not l
        assert i_raw == self.holdout[u_raw][1]
      train_examples[l].add((u_raw, i_raw))
  num_positives_seen = len(train_examples[True])

  assert ncf_dataset.num_train_positives == num_positives_seen

  # This check is more heuristic because negatives are sampled with
  # replacement. It only checks that negative generation is reasonably random.
  assert len(train_examples[False]) / NUM_NEG / num_positives_seen > 0.9