Python tensorflow.matching_files() Examples

The following are 16 code examples of tensorflow.matching_files(), drawn from open-source projects. Each example notes its source file, the project it comes from, and that project's license. You may also want to check out the other available functions and classes of the tensorflow module.
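Before the project examples, here is a minimal, self-contained sketch of the op itself in TensorFlow 1.x graph mode (the directory and pattern are illustrative): tf.matching_files takes a glob pattern and returns a 1-D string tensor of matching paths, expanded when the op runs rather than when the graph is built.

import tensorflow as tf

# Hypothetical pattern; the glob is expanded at run time, not at graph-build time.
pattern = '/tmp/data/train-*.csv'
filenames = tf.matching_files(pattern)  # 1-D tf.string tensor of matching paths

with tf.Session() as sess:
    print(sess.run(filenames))  # e.g. [b'/tmp/data/train-000.csv', b'/tmp/data/train-001.csv']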
Example #1
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0
def input_fn(self, name, csv_path=None):
        """Creates a dataset object for the model to consume. Input function for estimator

        Arguments:
                name : string, Name of the data [Train or Eval]
                csv_path : The path of the csv on any storage system

        Returns:
                features : tf.data.TextLineDataset object, Dataset containing batch of features
                labels : tf.data.TextLineDataset object, Dataset containing batch of labels
        """
        pattern = self._get_pattern(name, csv_path)
        tf.logging.info('The Pattern of files is : %s', pattern)
        filenames = tf.matching_files(pattern=pattern)
        dataset = tf.data.TextLineDataset(filenames).skip(1).map(
            self.parse_csv, num_parallel_calls=cpu_count())
        dataset = dataset.shuffle(buffer_size=self.batch_size * 100)
        dataset = dataset.apply(tf.contrib.data.ignore_errors())
        dataset = dataset.repeat(self.num_epochs)
        dataset = dataset.batch(self.batch_size)  # determine the ideal number
        dataset = dataset.prefetch(self.buffer_size)
        iterator = dataset.make_one_shot_iterator()
        feats, labs = iterator.get_next()
        return feats, labs 
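The parse function referenced above (self.parse_csv) is project-specific. A hedged, self-contained sketch of the same pattern, matching CSV shards, skipping a header line, and decoding each row, might look like the following; the column defaults and label position are assumptions, and note that .skip(1) drops only the first line of the concatenated stream, so per-file headers across multiple shards would need per-file handling.

import tensorflow as tf
from multiprocessing import cpu_count

def parse_csv(line):
    # Assumed schema: two float features followed by an integer label.
    fields = tf.decode_csv(line, record_defaults=[[0.0], [0.0], [0]])
    return tf.stack(fields[:-1]), fields[-1]

filenames = tf.matching_files('data/train-*.csv')    # hypothetical shards
dataset = (tf.data.TextLineDataset(filenames)
           .skip(1)                                   # drops one header line in total
           .map(parse_csv, num_parallel_calls=cpu_count())
           .shuffle(buffer_size=1000)
           .batch(32)
           .prefetch(1))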
Example #2
Source File: dataset.py    From keras_imagenet with MIT License
def get_dataset(tfrecords_dir, subset, batch_size):
    """Read TFRecords files and turn them into a TFRecordDataset."""
    files = tf.matching_files(os.path.join(tfrecords_dir, '%s-*' % subset))
    shards = tf.data.Dataset.from_tensor_slices(files)
    shards = shards.shuffle(tf.cast(tf.shape(files)[0], tf.int64))
    shards = shards.repeat()
    dataset = shards.interleave(tf.data.TFRecordDataset, cycle_length=4)
    dataset = dataset.shuffle(buffer_size=8192)
    parser = partial(
        _parse_fn, is_training=True if subset == 'train' else False)
    dataset = dataset.apply(
        tf.data.experimental.map_and_batch(
            map_func=parser,
            batch_size=batch_size,
            num_parallel_calls=config.NUM_DATA_WORKERS))
    dataset = dataset.prefetch(batch_size)
    return dataset 
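The parser _parse_fn and the config module come from the keras_imagenet project and are not shown here. As a hedged sketch of what such a parse function typically looks like for ImageNet-style TFRecords (the feature keys, image size, and augmentation step are assumptions, not the project's actual code):

import tensorflow as tf

def _parse_fn(example_proto, is_training):
    # Assumed feature keys; the real keys depend on how the TFRecords were written.
    features = {
        'image/encoded': tf.FixedLenFeature([], tf.string),
        'image/class/label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(example_proto, features)
    image = tf.image.decode_jpeg(parsed['image/encoded'], channels=3)
    image = tf.image.resize_images(image, [224, 224])
    if is_training:
        image = tf.image.random_flip_left_right(image)  # placeholder augmentation
    label = tf.cast(parsed['image/class/label'], tf.int32)
    return image, label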
Example #3
Source File: dataset_util.py    From ros_people_object_detection_tensorflow with Apache License 2.0
def read_dataset(file_read_func, decode_func, input_files, config):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
  if config.shuffle:
    filename_dataset = filename_dataset.shuffle(
        config.filenames_shuffle_buffer_size)
  filename_dataset = filename_dataset.repeat(config.num_epochs or None)

  records_dataset = filename_dataset.apply(
      tf.contrib.data.parallel_interleave(
          file_read_func, cycle_length=config.num_readers, sloppy=True))
  if config.shuffle:
    records_dataset = records_dataset.shuffle(config.shuffle_buffer_size)
  tensor_dataset = records_dataset.map(
      decode_func, num_parallel_calls=config.num_parallel_map_calls)
  return tensor_dataset.prefetch(config.prefetch_size) 
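tf.contrib.data.parallel_interleave was later folded into core tf.data. A hedged sketch of the equivalent pipeline in TensorFlow 2.x (the pattern and buffer sizes are illustrative; deterministic=False, available from TF 2.2, plays the role of sloppy=True):

import tensorflow as tf  # 2.x

filenames = tf.io.matching_files('data/shard-*.record')      # hypothetical pattern
dataset = tf.data.Dataset.from_tensor_slices(filenames)
dataset = dataset.shuffle(100)                                # shuffle file names
dataset = dataset.interleave(
    tf.data.TFRecordDataset,
    cycle_length=4,
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
    deterministic=False)                                      # analogous to sloppy=True
dataset = dataset.shuffle(2048).prefetch(tf.data.experimental.AUTOTUNE)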
Example #4
Source File: dataset_util.py    From Gun-Detector with Apache License 2.0
def read_dataset(file_read_func, decode_func, input_files, config):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
  if config.shuffle:
    filename_dataset = filename_dataset.shuffle(
        config.filenames_shuffle_buffer_size)
  elif config.num_readers > 1:
    tf.logging.warning('`shuffle` is false, but the input data stream is '
                       'still slightly shuffled since `num_readers` > 1.')

  filename_dataset = filename_dataset.repeat(config.num_epochs or None)

  records_dataset = filename_dataset.apply(
      tf.contrib.data.parallel_interleave(
          file_read_func, cycle_length=config.num_readers,
          block_length=config.read_block_length, sloppy=True))
  if config.shuffle:
    records_dataset = records_dataset.shuffle(config.shuffle_buffer_size)
  tensor_dataset = records_dataset.map(
      decode_func, num_parallel_calls=config.num_parallel_map_calls)
  return tensor_dataset.prefetch(config.prefetch_size) 
Example #5
Source File: io_ops_test.py    From deep_image_model with Apache License 2.0
def testMatchingFiles(self):
    cases = ['ABcDEF.GH', 'ABzDEF.GH', 'ABasdfjklDEF.GH', 'AB3DEF.GH',
             'AB4DEF.GH', 'ABDEF.GH', 'XYZ']
    files = [tempfile.NamedTemporaryFile(
        prefix=c, dir=self.get_temp_dir()) for c in cases]

    with self.test_session():
      # Test exact match without wildcards.
      for f in files:
        self.assertEqual(tf.matching_files(f.name).eval(),
                         tf.compat.as_bytes(f.name))

      # We will look for files matching "ABxDEF.GH*" where "x" is some wildcard.
      pos = files[0].name.find(cases[0])
      pattern = files[0].name[:pos] + 'AB%sDEF.GH*'

      self.assertEqual(set(tf.matching_files(pattern % 'z').eval()),
                       self._subset(files, [1]))
      self.assertEqual(set(tf.matching_files(pattern % '?').eval()),
                       self._subset(files, [0, 1, 3, 4]))
      self.assertEqual(set(tf.matching_files(pattern % '*').eval()),
                       self._subset(files, [0, 1, 2, 3, 4, 5]))
      self.assertEqual(set(tf.matching_files(pattern % '[cxz]').eval()),
                       self._subset(files, [0, 1]))
      self.assertEqual(set(tf.matching_files(pattern % '[0-9]').eval()),
                       self._subset(files, [3, 4])) 
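The test above exercises the glob syntax tf.matching_files understands: '*' matches any run of characters, '?' matches a single character, and '[...]' matches a character set or range. A hedged eager-mode sketch of the same idea using the TF 2.x alias (the directory and file names are illustrative):

import tensorflow as tf  # 2.x, eager execution

# Suppose /tmp/globdemo contains AB1DEF.GH, ABzDEF.GH and ABlongDEF.GH.
print(tf.io.matching_files('/tmp/globdemo/AB?DEF.GH'))      # single-character wildcard
print(tf.io.matching_files('/tmp/globdemo/AB[0-9]DEF.GH'))  # character range
print(tf.io.matching_files('/tmp/globdemo/AB*DEF.GH'))      # any run of characters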
Example #6
Source File: dataset_util.py    From ros_tensorflow with Apache License 2.0
def read_dataset(file_read_func, decode_func, input_files, config):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
  if config.shuffle:
    filename_dataset = filename_dataset.shuffle(
        config.filenames_shuffle_buffer_size)
  elif config.num_readers > 1:
    tf.logging.warning('`shuffle` is false, but the input data stream is '
                       'still slightly shuffled since `num_readers` > 1.')

  filename_dataset = filename_dataset.repeat(config.num_epochs or None)

  records_dataset = filename_dataset.apply(
      tf.contrib.data.parallel_interleave(
          file_read_func, cycle_length=config.num_readers,
          block_length=config.read_block_length, sloppy=True))
  if config.shuffle:
    records_dataset = records_dataset.shuffle(config.shuffle_buffer_size)
  tensor_dataset = records_dataset.map(
      decode_func, num_parallel_calls=config.num_parallel_map_calls)
  return tensor_dataset.prefetch(config.prefetch_size) 
Example #7
Source File: input_util.py    From professional-services with Apache License 2.0
def input_fn(input_dir, mode, batch_size, num_epochs, label_name=None,
             shuffle_buffer_size=10000, feature_spec=None):
    """Reads TFRecords and returns the features and labels."""
    if feature_spec is None:
        tf_transform_output = tft.TFTransformOutput(
            os.path.join(input_dir, 'transformed_metadata'))
        feature_spec = tf_transform_output.transformed_feature_spec()
    prefix = str(mode).lower()
    suffix = '.tfrecord'
    num_cpus = multiprocessing.cpu_count()

    file_pattern = os.path.join(input_dir, 'data', prefix, prefix+'*'+suffix)
    filenames = tf.matching_files(file_pattern)
    dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=None,
                                      num_parallel_reads=num_cpus)

    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(shuffle_buffer_size)

    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(
        lambda examples: tf.parse_example(examples, feature_spec))
    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()
    if mode == tf.estimator.ModeKeys.PREDICT:
        return features

    label = features.pop(label_name)
    return features, label 
Example #8
Source File: data_utils.py    From ID-CNN-CWS with GNU General Public License v3.0
def __init__(self, in_pattern, batch_size, num_buckets=0, num_epochs=None):
        self._batch_size = batch_size
        self.num_buckets = num_buckets
        self._epoch = 0
        self._step = 1.
        self.num_epochs = num_epochs
        file_pattern = in_pattern + '/examples.proto' if os.path.isdir(in_pattern) else in_pattern
        filenames = tf.matching_files(file_pattern)
        # filenames = tf.Print(filenames, [filenames], message='filenames: ')
        self.next_batch_op = self.input_pipeline(filenames, self._batch_size, self.num_buckets, self.num_epochs) 
Example #9
Source File: data_utils.py    From bran with Apache License 2.0
def input_pipeline(self, file_pattern, batch_size, num_epochs=None, num_threads=10):
        filenames = tf.matching_files(file_pattern)
        filename_queue = tf.train.string_input_producer(filenames, num_epochs=num_epochs, shuffle=True)
        parsed_batch = self.example_parser(filename_queue)
        min_after_dequeue = 10000
        capacity = min_after_dequeue + 12 * batch_size
        next_batch = tf.train.batch(
                parsed_batch, batch_size=batch_size, capacity=capacity,
                num_threads=num_threads, dynamic_pad=True, allow_smaller_final_batch=True)
        return next_batch 
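tf.train.string_input_producer and tf.train.batch belong to the older queue-based input pipeline, which was deprecated in favor of tf.data. A hedged sketch of an equivalent tf.data pipeline (the record schema, padded shapes, and pattern are assumptions, not the bran project's actual parser):

import tensorflow as tf

def parse_example(record):
    # Assumed schema: a variable-length token-id list plus a scalar label.
    features = {
        'tokens': tf.VarLenFeature(tf.int64),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(record, features)
    return tf.sparse.to_dense(parsed['tokens']), parsed['label']

filenames = tf.matching_files('data/examples-*.proto')        # hypothetical pattern
dataset = (tf.data.TFRecordDataset(filenames)
           .shuffle(10000)                                     # counterpart of shuffle=True
           .repeat()                                           # num_epochs=None
           .map(parse_example, num_parallel_calls=10)
           .padded_batch(32, padded_shapes=([None], [])))      # dynamic_pad analogue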
Example #10
Source File: dataset_util.py    From Elphas with Apache License 2.0
def read_dataset(file_read_func, decode_func, input_files, config):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
  if config.shuffle:
    filename_dataset = filename_dataset.shuffle(
        config.filenames_shuffle_buffer_size)
  filename_dataset = filename_dataset.repeat(config.num_epochs or None)

  records_dataset = filename_dataset.apply(
      tf.contrib.data.parallel_interleave(
          file_read_func, cycle_length=config.num_readers, sloppy=True))
  if config.shuffle:
    records_dataset = records_dataset.shuffle(config.shuffle_buffer_size)
  tensor_dataset = records_dataset.map(
      decode_func, num_parallel_calls=config.num_parallel_map_calls)
  return tensor_dataset.prefetch(config.prefetch_size) 
Example #11
Source File: dataset_util.py    From AniSeg with Apache License 2.0
def read_dataset(file_read_func, decode_func, input_files, config):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
  if config.shuffle:
    filename_dataset = filename_dataset.shuffle(
        config.filenames_shuffle_buffer_size)
  filename_dataset = filename_dataset.repeat(config.num_epochs or None)

  records_dataset = filename_dataset.apply(
      tf.contrib.data.parallel_interleave(
          file_read_func, cycle_length=config.num_readers, sloppy=True))
  if config.shuffle:
    records_dataset = records_dataset.shuffle(config.shuffle_buffer_size)
  tensor_dataset = records_dataset.map(
      decode_func, num_parallel_calls=config.num_parallel_map_calls)
  return tensor_dataset.prefetch(config.prefetch_size) 
Example #12
Source File: dataset_util.py    From Traffic-Rule-Violation-Detection-System with MIT License
def read_dataset(
    file_read_func, decode_func, input_files, config, num_workers=1,
    worker_index=0):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.
    num_workers: Number of workers / shards.
    worker_index: Id for the current worker.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  dataset = tf.data.Dataset.from_tensor_slices(filenames)
  dataset = dataset.shard(num_workers, worker_index)
  dataset = dataset.repeat(config.num_epochs or None)
  if config.shuffle:
    dataset = dataset.shuffle(config.filenames_shuffle_buffer_size,
                              reshuffle_each_iteration=True)

  # Read file records and shuffle them.
  # If cycle_length is larger than the number of files, more than one reader
  # will be assigned to the same file, leading to repetition.
  cycle_length = tf.cast(
      tf.minimum(config.num_readers, tf.size(filenames)), tf.int64)
  # TODO: find the optimal block_length.
  dataset = dataset.interleave(
      file_read_func, cycle_length=cycle_length, block_length=1)

  if config.shuffle:
    dataset = dataset.shuffle(config.shuffle_buffer_size,
                              reshuffle_each_iteration=True)

  dataset = dataset.map(decode_func, num_parallel_calls=config.num_readers)
  return dataset.prefetch(config.prefetch_buffer_size) 
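The num_workers / worker_index arguments feed tf.data.Dataset.shard, which hands each worker a disjoint slice of the matched file list before any shuffling or reading happens. A small, self-contained sketch of that behaviour (the file names are illustrative):

import tensorflow as tf

files = tf.data.Dataset.from_tensor_slices(
    ['f0.record', 'f1.record', 'f2.record', 'f3.record'])

# Worker 0 of 2 keeps every second file starting at index 0 (f0, f2);
# worker 1 of 2 keeps the rest (f1, f3).
worker0 = files.shard(num_shards=2, index=0)
worker1 = files.shard(num_shards=2, index=1)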
Example #13
Source File: dataset_util.py    From deepglobe_land_cover_classification_with_deeplabv3plus with MIT License
def read_dataset(
        file_read_func, decode_func, input_files, config, num_workers=1,
        worker_index=0):
    """Reads a dataset, and handles repetition and shuffling.

    Args:
      file_read_func: Function to use in tf.data.Dataset.interleave, to read
        every individual file into a tf.data.Dataset.
      decode_func: Function to apply to all records.
      input_files: A list of file paths to read.
      config: A input_reader_builder.InputReader object.
      num_workers: Number of workers / shards.
      worker_index: Id for the current worker.

    Returns:
      A tf.data.Dataset based on config.
    """
    # Shard, shuffle, and read files.
    filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                          0)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.shard(num_workers, worker_index)
    dataset = dataset.repeat(config.num_epochs or None)
    if config.shuffle:
        dataset = dataset.shuffle(config.filenames_shuffle_buffer_size,
                                  reshuffle_each_iteration=True)

    # Read file records and shuffle them.
    # If cycle_length is larger than the number of files, more than one reader
    # will be assigned to the same file, leading to repetition.
    cycle_length = tf.cast(
        tf.minimum(config.num_readers, tf.size(filenames)), tf.int64)
    # TODO: find the optimal block_length.
    dataset = dataset.interleave(
        file_read_func, cycle_length=cycle_length, block_length=1)

    if config.shuffle:
        dataset = dataset.shuffle(config.shuffle_buffer_size,
                                  reshuffle_each_iteration=True)

    dataset = dataset.map(decode_func, num_parallel_calls=config.num_readers)
    return dataset.prefetch(config.prefetch_buffer_size) 
Example #14
Source File: dataset_util.py    From tensorflow-deeplab-v3 with MIT License
def read_dataset(
    file_read_func, decode_func, input_files, config, num_workers=1,
    worker_index=0):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.
    num_workers: Number of workers / shards.
    worker_index: Id for the current worker.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  dataset = tf.data.Dataset.from_tensor_slices(filenames)
  dataset = dataset.shard(num_workers, worker_index)
  dataset = dataset.repeat(config.num_epochs or None)
  if config.shuffle:
    dataset = dataset.shuffle(config.filenames_shuffle_buffer_size,
                              reshuffle_each_iteration=True)

  # Read file records and shuffle them.
  # If cycle_length is larger than the number of files, more than one reader
  # will be assigned to the same file, leading to repetition.
  cycle_length = tf.cast(
      tf.minimum(config.num_readers, tf.size(filenames)), tf.int64)
  # TODO: find the optimal block_length.
  dataset = dataset.interleave(
      file_read_func, cycle_length=cycle_length, block_length=1)

  if config.shuffle:
    dataset = dataset.shuffle(config.shuffle_buffer_size,
                              reshuffle_each_iteration=True)

  dataset = dataset.map(decode_func, num_parallel_calls=config.num_readers)
  return dataset.prefetch(config.prefetch_buffer_size) 
Example #15
Source File: dataset_util.py    From tensorflow-deeplab-v3-plus with MIT License
def read_dataset(
    file_read_func, decode_func, input_files, config, num_workers=1,
    worker_index=0):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.
    num_workers: Number of workers / shards.
    worker_index: Id for the current worker.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  dataset = tf.data.Dataset.from_tensor_slices(filenames)
  dataset = dataset.shard(num_workers, worker_index)
  dataset = dataset.repeat(config.num_epochs or None)
  if config.shuffle:
    dataset = dataset.shuffle(config.filenames_shuffle_buffer_size,
                              reshuffle_each_iteration=True)

  # Read file records and shuffle them.
  # If cycle_length is larger than the number of files, more than one reader
  # will be assigned to the same file, leading to repetition.
  cycle_length = tf.cast(
      tf.minimum(config.num_readers, tf.size(filenames)), tf.int64)
  # TODO: find the optimal block_length.
  dataset = dataset.interleave(
      file_read_func, cycle_length=cycle_length, block_length=1)

  if config.shuffle:
    dataset = dataset.shuffle(config.shuffle_buffer_size,
                              reshuffle_each_iteration=True)

  dataset = dataset.map(decode_func, num_parallel_calls=config.num_readers)
  return dataset.prefetch(config.prefetch_buffer_size) 
Example #16
Source File: dataset_util.py    From LaneSegmentationNetwork with GNU Lesser General Public License v3.0
def read_dataset(
        file_read_func, decode_func, input_files, config, num_workers=1,
        worker_index=0):
    """Reads a dataset, and handles repetition and shuffling.

    Args:
      file_read_func: Function to use in tf.data.Dataset.interleave, to read
        every individual file into a tf.data.Dataset.
      decode_func: Function to apply to all records.
      input_files: A list of file paths to read.
      config: A input_reader_builder.InputReader object.
      num_workers: Number of workers / shards.
      worker_index: Id for the current worker.

    Returns:
      A tf.data.Dataset based on config.
    """
    # Shard, shuffle, and read files.
    filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                          0)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.shard(num_workers, worker_index)
    dataset = dataset.repeat(config.num_epochs or None)
    if config.shuffle:
        dataset = dataset.shuffle(config.filenames_shuffle_buffer_size,
                                  reshuffle_each_iteration=True)

    # Read file records and shuffle them.
    # If cycle_length is larger than the number of files, more than one reader
    # will be assigned to the same file, leading to repetition.
    cycle_length = tf.cast(
        tf.minimum(config.num_readers, tf.size(filenames)), tf.int64)
    # TODO: find the optimal block_length.
    dataset = dataset.interleave(
        file_read_func, cycle_length=cycle_length, block_length=1)

    if config.shuffle:
        dataset = dataset.shuffle(config.shuffle_buffer_size,
                                  reshuffle_each_iteration=True)

    dataset = dataset.map(decode_func, num_parallel_calls=config.num_readers)
    return dataset.prefetch(config.prefetch_buffer_size)