Python official.recommendation.data_preprocessing.instantiate_pipeline() Examples
The following are 8 code examples of official.recommendation.data_preprocessing.instantiate_pipeline(), drawn from open-source projects. Each example lists its source file, the project it comes from, and the project's license. You may also want to check out all available functions and classes of the module official.recommendation.data_preprocessing.
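Note that the examples below reflect two versions of the API: older snapshots (Examples #4, #7, and #8) pass batch sizes directly and receive an (ncf_dataset, cleanup_fn) pair, while newer snapshots (Examples #1-#3, #5, and #6) pass a params dict and receive (num_users, num_items, producer). As a quick orientation, here is a minimal sketch of the newer form; the dataset name, data directory, and parameter values are illustrative assumptions for this sketch, not requirements of the module:

from official.recommendation import data_preprocessing, movielens

# Illustrative parameter values (assumptions for this sketch); the keys
# mirror those used in Examples #5 and #6 below.
params = {
    "train_epochs": 1,
    "batch_size": 256,
    "eval_batch_size": 1024,
    "batches_per_step": 1,
    "stream_files": False,
    "num_neg": 4,
}

movielens.download("ml-1m", "/tmp/movielens")  # fetch the raw ratings first
num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
    dataset="ml-1m",
    data_dir="/tmp/movielens",
    params=params,
    constructor_type="bisection",  # assumed; see FLAGS.constructor_type below
    deterministic=False)

The returned producer exposes train_batches_per_epoch and eval_batches_per_epoch, which the examples below use to size their training and evaluation loops.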
Example #1
Source File: ncf_common.py From models with Apache License 2.0
def get_inputs(params):
  """Returns some parameters used by the model."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)
    num_train_steps = producer.train_batches_per_epoch
    num_eval_steps = producer.eval_batches_per_epoch

  return num_users, num_items, num_train_steps, num_eval_steps, producer
Example #2
Source File: ncf_common.py From Live-feed-object-device-identification-using-Tensorflow-and-OpenCV with Apache License 2.0
def get_inputs(params):
  """Returns some parameters used by the model."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)
    num_train_steps = producer.train_batches_per_epoch
    num_eval_steps = producer.eval_batches_per_epoch

  return num_users, num_items, num_train_steps, num_eval_steps, producer
Example #3
Source File: ncf_common.py From models with Apache License 2.0
def get_inputs(params):
  """Returns some parameters used by the model."""
  if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = movielens.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)
    num_train_steps = producer.train_batches_per_epoch
    num_eval_steps = producer.eval_batches_per_epoch

  return num_users, num_items, num_train_steps, num_eval_steps, producer
Example #4
Source File: ncf_main.py From training_results_v0.5 with Apache License 2.0
def main(_):
  """Train NCF model and evaluate its hit rate (HR) metric."""
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu,
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)
  master = tpu_cluster_resolver.master()

  ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset,
      data_dir=FLAGS.data_dir,
      # TODO(shizhiw): support multihost.
      batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=1,
      match_mlperf=FLAGS.ml_perf,
      use_subprocess=FLAGS.use_subprocess,
      cache_id=FLAGS.cache_id)

  train_params, eval_params = create_params(ncf_dataset)

  eval_graph_spec = build_graph(
      eval_params, ncf_dataset, tpu_embedding.INFERENCE)

  for epoch in range(_NUM_EPOCHS):
    tf.logging.info("Training {}...".format(epoch))
    # build training graph each epoch as number of batches per epoch
    # i.e. batch_count might change by 1 between epochs.
    train_graph_spec = build_graph(
        train_params, ncf_dataset, tpu_embedding.TRAINING)
    run_graph(master, train_graph_spec, epoch)

    tf.logging.info("Evaluating {}...".format(epoch))
    run_graph(master, eval_graph_spec, epoch)

  cleanup_fn()  # Cleanup data construction artifacts and subprocess.
Example #5
Source File: create_ncf_data.py From Live-feed-object-device-identification-using-Tensorflow-and-OpenCV with Apache License 2.0
def prepare_raw_data(flag_obj):
  """Downloads and prepares raw data for data generation."""
  movielens.download(flag_obj.dataset, flag_obj.data_dir)

  data_processing_params = {
      "train_epochs": flag_obj.num_train_epochs,
      "batch_size": flag_obj.train_prebatch_size,
      "eval_batch_size": flag_obj.eval_prebatch_size,
      "batches_per_step": 1,
      "stream_files": True,
      "num_neg": flag_obj.num_negative_samples,
  }

  num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
      dataset=flag_obj.dataset,
      data_dir=flag_obj.data_dir,
      params=data_processing_params,
      constructor_type=flag_obj.constructor_type,
      epoch_dir=flag_obj.data_dir,
      generate_data_offline=True)

  # pylint: disable=protected-access
  input_metadata = {
      "num_users": num_users,
      "num_items": num_items,
      "constructor_type": flag_obj.constructor_type,
      "num_train_elements": producer._elements_in_epoch,
      "num_eval_elements": producer._eval_elements_in_epoch,
      "num_train_epochs": flag_obj.num_train_epochs,
      "train_prebatch_size": flag_obj.train_prebatch_size,
      "eval_prebatch_size": flag_obj.eval_prebatch_size,
      "num_train_steps": producer.train_batches_per_epoch,
      "num_eval_steps": producer.eval_batches_per_epoch,
  }
  # pylint: enable=protected-access

  return producer, input_metadata
Example #6
Source File: create_ncf_data.py From models with Apache License 2.0
def prepare_raw_data(flag_obj):
  """Downloads and prepares raw data for data generation."""
  movielens.download(flag_obj.dataset, flag_obj.data_dir)

  data_processing_params = {
      "train_epochs": flag_obj.num_train_epochs,
      "batch_size": flag_obj.train_prebatch_size,
      "eval_batch_size": flag_obj.eval_prebatch_size,
      "batches_per_step": 1,
      "stream_files": True,
      "num_neg": flag_obj.num_negative_samples,
  }

  num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
      dataset=flag_obj.dataset,
      data_dir=flag_obj.data_dir,
      params=data_processing_params,
      constructor_type=flag_obj.constructor_type,
      epoch_dir=flag_obj.data_dir,
      generate_data_offline=True)

  # pylint: disable=protected-access
  input_metadata = {
      "num_users": num_users,
      "num_items": num_items,
      "constructor_type": flag_obj.constructor_type,
      "num_train_elements": producer._elements_in_epoch,
      "num_eval_elements": producer._eval_elements_in_epoch,
      "num_train_epochs": flag_obj.num_train_epochs,
      "train_prebatch_size": flag_obj.train_prebatch_size,
      "eval_prebatch_size": flag_obj.eval_prebatch_size,
      "num_train_steps": producer.train_batches_per_epoch,
      "num_eval_steps": producer.eval_batches_per_epoch,
  }
  # pylint: enable=protected-access

  return producer, input_metadata
Example #7
Source File: ncf_main.py From class-balanced-loss with MIT License
def main(_):
  """Train NCF model and evaluate its hit rate (HR) metric."""
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu,
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)
  master = tpu_cluster_resolver.master()

  ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset,
      data_dir=FLAGS.data_dir,
      # TODO(shizhiw): support multihost.
      batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      num_neg=FLAGS.num_neg,
      epochs_per_cycle=1,
      match_mlperf=FLAGS.ml_perf,
      use_subprocess=FLAGS.use_subprocess,
      cache_id=FLAGS.cache_id)

  train_params, eval_params = create_params(ncf_dataset)

  eval_graph_spec = build_graph(
      eval_params, ncf_dataset, tpu_embedding.INFERENCE)

  for epoch in range(_NUM_EPOCHS):
    tf.logging.info("Training {}...".format(epoch))
    # build training graph each epoch as number of batches per epoch
    # i.e. batch_count might change by 1 between epochs.
    train_graph_spec = build_graph(
        train_params, ncf_dataset, tpu_embedding.TRAINING)
    run_graph(master, train_graph_spec, epoch)

    tf.logging.info("Evaluating {}...".format(epoch))
    run_graph(master, eval_graph_spec, epoch)

  cleanup_fn()  # Cleanup data construction artifacts and subprocess.
Example #8
Source File: data_test.py From multilabel-image-classification-tensorflow with MIT License
def test_end_to_end(self):
  ncf_dataset, _ = data_preprocessing.instantiate_pipeline(
      dataset=DATASET, data_dir=self.temp_data_dir,
      batch_size=BATCH_SIZE,
      eval_batch_size=EVAL_BATCH_SIZE,
      num_cycles=1,
      num_data_readers=2,
      num_neg=NUM_NEG)

  g = tf.Graph()
  with g.as_default():
    input_fn, record_dir, batch_count = \
      data_preprocessing.make_input_fn(ncf_dataset, True)
    dataset = input_fn({"batch_size": BATCH_SIZE, "use_tpu": False,
                        "use_xla_for_gpu": False})
  first_epoch = self.drain_dataset(dataset=dataset, g=g)

  user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}
  item_inv_map = {v: k for k, v in ncf_dataset.item_map.items()}

  train_examples = {
      True: set(),
      False: set(),
  }
  for features, labels in first_epoch:
    for u, i, l in zip(features[movielens.USER_COLUMN],
                       features[movielens.ITEM_COLUMN], labels):
      u_raw = user_inv_map[u]
      i_raw = item_inv_map[i]
      if ((u_raw, i_raw) in self.seen_pairs) != l:
        # The evaluation item is not considered during false negative
        # generation, so it will occasionally appear as a negative example
        # during training.
        assert not l
        assert i_raw == self.holdout[u_raw][1]
      train_examples[l].add((u_raw, i_raw))
  num_positives_seen = len(train_examples[True])

  assert ncf_dataset.num_train_positives == num_positives_seen

  # This check is more heuristic because negatives are sampled with
  # replacement. It only checks that negative generation is reasonably random.
  assert len(train_examples[False]) / NUM_NEG / num_positives_seen > 0.9