Python tensorflow.python.training.basic_session_run_hooks.CheckpointSaverHook() Examples
The following code examples show how to use tensorflow.python.training.basic_session_run_hooks.CheckpointSaverHook(). Each example is taken from an open-source project; the source file, project, and license are noted above it.
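Before the individual examples, here is a minimal, runnable sketch of the typical use of this hook: attaching it to a MonitoredTrainingSession so checkpoints are written on a step-based schedule. It assumes TF 1.x graph mode (reached here through tf.compat.v1) and uses tf.train.CheckpointSaverHook, the public alias of the class covered on this page; the ./ckpt directory and the toy train_op are illustrative, not taken from the examples below.

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()

# The hook names checkpoints after the global step (model.ckpt-<step>),
# so the graph must contain one.
global_step = tf.train.get_or_create_global_step()
loss = tf.Variable(1.0)
train_op = tf.group(loss.assign(loss * 0.99), global_step.assign_add(1))

# Save every 100 steps into ./ckpt.
saver_hook = tf.train.CheckpointSaverHook(checkpoint_dir='./ckpt',
                                          save_steps=100)

# Pass save_checkpoint_secs=None so the session does not add its own
# time-based saver on top of the explicit hook.
with tf.train.MonitoredTrainingSession(checkpoint_dir='./ckpt',
                                       hooks=[saver_hook],
                                       save_checkpoint_secs=None) as sess:
  for _ in range(500):
    sess.run(train_op)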
Example #1
Source File: async_checkpoint.py From training_results_v0.5 with Apache License 2.0
def __init__(self,
             checkpoint_dir,
             save_secs=None,
             save_steps=None,
             saver=None,
             checkpoint_basename="model.ckpt",
             scaffold=None,
             listeners=None):
  """Initializes a `CheckpointSaverHook`.

  Args:
    checkpoint_dir: `str`, base directory for the checkpoint files.
    save_secs: `int`, save every N secs.
    save_steps: `int`, save every N steps.
    saver: `Saver` object, used for saving.
    checkpoint_basename: `str`, base name for the checkpoint files.
    scaffold: `Scaffold`, use to get saver object.
    listeners: List of `CheckpointSaverListener` subclass instances. Used for
      callbacks that run immediately before or after this hook saves the
      checkpoint.

  Raises:
    ValueError: One of `save_steps` or `save_secs` should be set.
    ValueError: At most one of `saver` or `scaffold` should be set.
  """
  logging.info("Create AsyncCheckpointSaverHook.")
  if saver is not None and scaffold is not None:
    raise ValueError("You cannot provide both saver and scaffold.")
  self._saver = saver
  self._save_thread = None
  self._write_graph_thread = None
  self._checkpoint_dir = checkpoint_dir
  self._save_path = os.path.join(checkpoint_dir, checkpoint_basename)
  self._scaffold = scaffold
  self._timer = basic_session_run_hooks.SecondOrStepTimer(
      every_secs=save_secs, every_steps=save_steps)
  self._listeners = listeners or []
  self._steps_per_run = 1
  self._summary_writer = None
  self._global_step_tensor = None
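The listeners argument above is the hook's main extension point. Below is a small sketch of a custom CheckpointSaverListener wired into the hook; it assumes the same internal module import used throughout these examples, and LoggingListener and the /tmp/ckpt path are illustrative names only.

from tensorflow.python.training import basic_session_run_hooks


class LoggingListener(basic_session_run_hooks.CheckpointSaverListener):
  """Illustrative listener that reports each checkpoint write."""

  def before_save(self, session, global_step_value):
    print('About to write a checkpoint at step %d' % global_step_value)

  def after_save(self, session, global_step_value):
    print('Checkpoint written at step %d' % global_step_value)


hook = basic_session_run_hooks.CheckpointSaverHook(
    checkpoint_dir='/tmp/ckpt',
    save_steps=1000,
    listeners=[LoggingListener()])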
Example #2
Source File: training_test.py From tf-slim with Apache License 2.0
def testResumeTrainAchievesRoughlyTheSameLoss(self):
  number_of_steps = [300, 1, 5]
  logdir = tempfile.mkdtemp('resume_train_same_loss')
  for i in range(len(number_of_steps)):
    with ops.Graph().as_default():
      random_seed.set_random_seed(i)
      tf_inputs = constant_op.constant(self._inputs, dtype=dtypes.float32)
      tf_labels = constant_op.constant(self._labels, dtype=dtypes.float32)

      tf_predictions = logistic_classifier(tf_inputs)
      losses.log_loss(tf_labels, tf_predictions)
      total_loss = losses.get_total_loss()

      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1.0)
      train_op = training.create_train_op(total_loss, optimizer)

      saver = saver_lib.Saver()
      loss = training.train(
          train_op,
          logdir,
          hooks=[
              basic_session_run_hooks.StopAtStepHook(
                  num_steps=number_of_steps[i]),
              basic_session_run_hooks.CheckpointSaverHook(
                  logdir, save_steps=50, saver=saver),
          ],
          save_checkpoint_secs=None,
          save_summaries_steps=None)
      self.assertIsNotNone(loss)
      self.assertLess(loss, .015)
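Two details make the resume behavior work here: save_checkpoint_secs=None and save_summaries_steps=None turn off the default savers inside training.train, so the explicit CheckpointSaverHook is the only writer of checkpoints, and because every run points at the same logdir, each subsequent run restores from the latest checkpoint left by the previous one instead of starting from scratch.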
Example #3
Source File: async_checkpoint.py From training_results_v0.5 with Apache License 2.0
def begin(self):
  self._summary_writer = SummaryWriterCache.get(self._checkpoint_dir)
  self._global_step_tensor = training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
  if self._global_step_tensor is None:
    raise RuntimeError(
        "Global step should be created to use CheckpointSaverHook.")
  for l in self._listeners:
    l.begin()
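begin() runs once, when the MonitoredSession is constructed and before the graph is finalized; that is why the hook can still call _get_or_create_global_step_read() here, adding the read op to the graph if it does not already exist.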
Example #4
Source File: estimator.py From auto-alt-text-lambda-api with MIT License
def _train_model(self, input_fn, hooks):
  all_hooks = []
  self._graph = ops.Graph()
  with self._graph.as_default() as g, g.device(self._device_fn):
    random_seed.set_random_seed(self._config.tf_random_seed)
    global_step = contrib_framework.create_global_step(g)
    features, labels = input_fn()
    self._check_inputs(features, labels)
    model_fn_ops = self._call_legacy_get_train_ops(features, labels)
    ops.add_to_collection(ops.GraphKeys.LOSSES, model_fn_ops.loss)
    all_hooks.extend([
        basic_session_run_hooks.NanTensorHook(model_fn_ops.loss),
        basic_session_run_hooks.LoggingTensorHook(
            {
                'loss': model_fn_ops.loss,
                'step': global_step
            },
            every_n_iter=100)
    ])
    all_hooks.extend(hooks)

    scaffold = model_fn_ops.training_scaffold or monitored_session.Scaffold()
    if not (scaffold.saver or ops.get_collection(ops.GraphKeys.SAVERS)):
      ops.add_to_collection(
          ops.GraphKeys.SAVERS,
          saver.Saver(
              sharded=True,
              max_to_keep=self._config.keep_checkpoint_max,
              defer_build=True))

    chief_hooks = []
    if (self._config.save_checkpoints_secs or
        self._config.save_checkpoints_steps):
      saver_hook_exists = any([
          isinstance(h, basic_session_run_hooks.CheckpointSaverHook)
          for h in (all_hooks + model_fn_ops.training_hooks + chief_hooks +
                    model_fn_ops.training_chief_hooks)
      ])
      if not saver_hook_exists:
        chief_hooks = [
            basic_session_run_hooks.CheckpointSaverHook(
                self._model_dir,
                save_secs=self._config.save_checkpoints_secs,
                save_steps=self._config.save_checkpoints_steps,
                scaffold=scaffold)
        ]
    with monitored_session.MonitoredTrainingSession(
        master=self._config.master,
        is_chief=self._config.is_chief,
        checkpoint_dir=self._model_dir,
        scaffold=scaffold,
        hooks=all_hooks + model_fn_ops.training_hooks,
        chief_only_hooks=chief_hooks + model_fn_ops.training_chief_hooks,
        save_checkpoint_secs=0,  # Saving is handled by a hook.
        save_summaries_steps=self._config.save_summary_steps,
        config=self.config.tf_config) as mon_sess:
      loss = None
      while not mon_sess.should_stop():
        _, loss = mon_sess.run([model_fn_ops.train_op, model_fn_ops.loss])
    summary_io.SummaryWriterCache.clear()
    return loss
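Two details are easy to miss in this example: the estimator only constructs its own CheckpointSaverHook when no saver hook already exists in any of the hook lists, and it passes save_checkpoint_secs=0 to MonitoredTrainingSession so the session does not install a second, competing saver.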
Example #5
Source File: monitored_session.py From deep_image_model with Apache License 2.0
def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
                             is_chief=True,
                             checkpoint_dir=None,
                             hooks=None,
                             scaffold=None,
                             config=None):
  """Creates a `MonitoredSession` for training.

  For a chief, this utility sets proper session initializer/restorer. It also
  creates hooks related to checkpoint and summary saving. For workers, this
  utility sets a proper session creator which waits for the chief to
  initialize/restore.

  Args:
    master: `String` the TensorFlow master to use.
    is_chief: If `True`, it will take care of initialization and recovery of
      the underlying TensorFlow session. If `False`, it will wait on a chief
      to initialize or recover the TensorFlow session.
    checkpoint_dir: A string. Optional path to a directory where to restore
      variables.
    hooks: Optional list of `SessionRunHook` objects.
    scaffold: A `Scaffold` used for gathering or building supportive ops. If
      not specified, a default one is created. It's used to finalize the
      graph.
    config: `ConfigProto` proto used to configure the session.

  Returns:
    A `MonitoredSession` object.
  """
  hooks = hooks or []
  scaffold = scaffold or Scaffold()
  if not is_chief:
    session_creator = WorkerSessionCreator(
        scaffold=scaffold, master=master, config=config)
  else:
    session_creator = ChiefSessionCreator(
        scaffold=scaffold,
        checkpoint_dir=checkpoint_dir,
        master=master,
        config=config)
    hooks.extend([
        basic_session_run_hooks.StepCounterHook(output_dir=checkpoint_dir),
        basic_session_run_hooks.SummarySaverHook(
            scaffold=scaffold, save_steps=100, output_dir=checkpoint_dir),
        basic_session_run_hooks.CheckpointSaverHook(
            checkpoint_dir, save_secs=600, scaffold=scaffold),
    ])
  return MonitoredSession(session_creator=session_creator, hooks=hooks)
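Note that in this older API the step-counter, summary-saver, and checkpoint-saver hooks are appended only in the else branch, i.e. only on the chief; workers never write checkpoints and simply wait for the chief to initialize or recover the session.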