Python tensorflow.matching_files() Examples

The following are 16 code examples of tensorflow.matching_files(), drawn from open-source projects. Each example notes its source file, the project it comes from, and that project's license. You may also want to check out the other available functions and classes of the tensorflow module.
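Before the project examples, here is a minimal, self-contained sketch of the op itself in TensorFlow 1.x graph mode (the directory and pattern are illustrative): tf.matching_files takes a glob pattern and returns a 1-D string tensor of matching paths, expanded when the op runs rather than when the graph is built.

import tensorflow as tf

# Hypothetical pattern; the glob is expanded at run time, not at graph-build time.
pattern = '/tmp/data/train-*.csv'
filenames = tf.matching_files(pattern)  # 1-D tf.string tensor of matching paths

with tf.Session() as sess:
    print(sess.run(filenames))  # e.g. [b'/tmp/data/train-000.csv', b'/tmp/data/train-001.csv']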
Example #1
Source File: input_pipeline_dask.py    From professional-services with Apache License 2.0
def input_fn(self, name, csv_path=None):
        """Creates a dataset object for the model to consume. Input function for estimator

        Arguments:
                name : string, Name of the data [Train or Eval]
                csv_path : The path of the csv on any storage system

        Returns:
                features : tf.data.TextLineDataset object, Dataset containing batch of features
                labels : tf.data.TextLineDataset object, Dataset containing batch of labels
        """
        pattern = self._get_pattern(name, csv_path)
        tf.logging.info('The Pattern of files is : %s', pattern)
        filenames = tf.matching_files(pattern=pattern)
        dataset = tf.data.TextLineDataset(filenames).skip(1).map(
            self.parse_csv, num_parallel_calls=cpu_count())
        dataset = dataset.shuffle(buffer_size=self.batch_size * 100)
        dataset = dataset.apply(tf.contrib.data.ignore_errors())
        dataset = dataset.repeat(self.num_epochs)
        dataset = dataset.batch(self.batch_size)  # determine the ideal number
        dataset = dataset.prefetch(self.buffer_size)
        iterator = dataset.make_one_shot_iterator()
        feats, labs = iterator.get_next()
        return feats, labs 
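The parse function referenced above (self.parse_csv) is project-specific. A hedged, self-contained sketch of the same pattern, matching CSV shards, skipping a header line, and decoding each row, might look like the following; the column defaults and label position are assumptions, and note that .skip(1) drops only the first line of the concatenated stream, so per-file headers across multiple shards would need per-file handling.

import tensorflow as tf
from multiprocessing import cpu_count

def parse_csv(line):
    # Assumed schema: two float features followed by an integer label.
    fields = tf.decode_csv(line, record_defaults=[[0.0], [0.0], [0]])
    return tf.stack(fields[:-1]), fields[-1]

filenames = tf.matching_files('data/train-*.csv')    # hypothetical shards
dataset = (tf.data.TextLineDataset(filenames)
           .skip(1)                                   # drops one header line in total
           .map(parse_csv, num_parallel_calls=cpu_count())
           .shuffle(buffer_size=1000)
           .batch(32)
           .prefetch(1))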
Example #2
Source File: dataset.py    From keras_imagenet with MIT License
def get_dataset(tfrecords_dir, subset, batch_size):
    """Read TFRecords files and turn them into a TFRecordDataset."""
    files = tf.matching_files(os.path.join(tfrecords_dir, '%s-*' % subset))
    shards = tf.data.Dataset.from_tensor_slices(files)
    shards = shards.shuffle(tf.cast(tf.shape(files)[0], tf.int64))
    shards = shards.repeat()
    dataset = shards.interleave(tf.data.TFRecordDataset, cycle_length=4)
    dataset = dataset.shuffle(buffer_size=8192)
    parser = partial(
        _parse_fn, is_training=True if subset == 'train' else False)
    dataset = dataset.apply(
        tf.data.experimental.map_and_batch(
            map_func=parser,
            batch_size=batch_size,
            num_parallel_calls=config.NUM_DATA_WORKERS))
    dataset = dataset.prefetch(batch_size)
    return dataset 
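The parser _parse_fn and the config module come from the keras_imagenet project and are not shown here. As a hedged sketch of what such a parse function typically looks like for ImageNet-style TFRecords (the feature keys, image size, and augmentation step are assumptions, not the project's actual code):

import tensorflow as tf

def _parse_fn(example_proto, is_training):
    # Assumed feature keys; the real keys depend on how the TFRecords were written.
    features = {
        'image/encoded': tf.FixedLenFeature([], tf.string),
        'image/class/label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(example_proto, features)
    image = tf.image.decode_jpeg(parsed['image/encoded'], channels=3)
    image = tf.image.resize_images(image, [224, 224])
    if is_training:
        image = tf.image.random_flip_left_right(image)  # placeholder augmentation
    label = tf.cast(parsed['image/class/label'], tf.int32)
    return image, label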
Example #3
Source File: dataset_util.py    From ros_people_object_detection_tensorflow with Apache License 2.0
def read_dataset(file_read_func, decode_func, input_files, config):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
  if config.shuffle:
    filename_dataset = filename_dataset.shuffle(
        config.filenames_shuffle_buffer_size)
  filename_dataset = filename_dataset.repeat(config.num_epochs or None)

  records_dataset = filename_dataset.apply(
      tf.contrib.data.parallel_interleave(
          file_read_func, cycle_length=config.num_readers, sloppy=True))
  if config.shuffle:
    records_dataset = records_dataset.shuffle(config.shuffle_buffer_size)
  tensor_dataset = records_dataset.map(
      decode_func, num_parallel_calls=config.num_parallel_map_calls)
  return tensor_dataset.prefetch(config.prefetch_size) 
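tf.contrib.data.parallel_interleave was later folded into core tf.data. A hedged sketch of the equivalent pipeline in TensorFlow 2.x (the pattern and buffer sizes are illustrative; deterministic=False, available from TF 2.2, plays the role of sloppy=True):

import tensorflow as tf  # 2.x

filenames = tf.io.matching_files('data/shard-*.record')      # hypothetical pattern
dataset = tf.data.Dataset.from_tensor_slices(filenames)
dataset = dataset.shuffle(100)                                # shuffle file names
dataset = dataset.interleave(
    tf.data.TFRecordDataset,
    cycle_length=4,
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
    deterministic=False)                                      # analogous to sloppy=True
dataset = dataset.shuffle(2048).prefetch(tf.data.experimental.AUTOTUNE)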
Example #4
Source File: dataset_util.py    From Gun-Detector with Apache License 2.0
def read_dataset(file_read_func, decode_func, input_files, config):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
  if config.shuffle:
    filename_dataset = filename_dataset.shuffle(
        config.filenames_shuffle_buffer_size)
  elif config.num_readers > 1:
    tf.logging.warning('`shuffle` is false, but the input data stream is '
                       'still slightly shuffled since `num_readers` > 1.')

  filename_dataset = filename_dataset.repeat(config.num_epochs or None)

  records_dataset = filename_dataset.apply(
      tf.contrib.data.parallel_interleave(
          file_read_func, cycle_length=config.num_readers,
          block_length=config.read_block_length, sloppy=True))
  if config.shuffle:
    records_dataset = records_dataset.shuffle(config.shuffle_buffer_size)
  tensor_dataset = records_dataset.map(
      decode_func, num_parallel_calls=config.num_parallel_map_calls)
  return tensor_dataset.prefetch(config.prefetch_size) 
Example #5
Source File: io_ops_test.py    From deep_image_model with Apache License 2.0
def testMatchingFiles(self):
    cases = ['ABcDEF.GH', 'ABzDEF.GH', 'ABasdfjklDEF.GH', 'AB3DEF.GH',
             'AB4DEF.GH', 'ABDEF.GH', 'XYZ']
    files = [tempfile.NamedTemporaryFile(
        prefix=c, dir=self.get_temp_dir()) for c in cases]

    with self.test_session():
      # Test exact match without wildcards.
      for f in files:
        self.assertEqual(tf.matching_files(f.name).eval(),
                         tf.compat.as_bytes(f.name))

      # We will look for files matching "ABxDEF.GH*" where "x" is some wildcard.
      pos = files[0].name.find(cases[0])
      pattern = files[0].name[:pos] + 'AB%sDEF.GH*'

      self.assertEqual(set(tf.matching_files(pattern % 'z').eval()),
                       self._subset(files, [1]))
      self.assertEqual(set(tf.matching_files(pattern % '?').eval()),
                       self._subset(files, [0, 1, 3, 4]))
      self.assertEqual(set(tf.matching_files(pattern % '*').eval()),
                       self._subset(files, [0, 1, 2, 3, 4, 5]))
      self.assertEqual(set(tf.matching_files(pattern % '[cxz]').eval()),
                       self._subset(files, [0, 1]))
      self.assertEqual(set(tf.matching_files(pattern % '[0-9]').eval()),
                       self._subset(files, [3, 4])) 
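The test above exercises the glob syntax tf.matching_files understands: '*' matches any run of characters, '?' matches a single character, and '[...]' matches a character set or range. A hedged eager-mode sketch of the same idea using the TF 2.x alias (the directory and file names are illustrative):

import tensorflow as tf  # 2.x, eager execution

# Suppose /tmp/globdemo contains AB1DEF.GH, ABzDEF.GH and ABlongDEF.GH.
print(tf.io.matching_files('/tmp/globdemo/AB?DEF.GH'))      # single-character wildcard
print(tf.io.matching_files('/tmp/globdemo/AB[0-9]DEF.GH'))  # character range
print(tf.io.matching_files('/tmp/globdemo/AB*DEF.GH'))      # any run of characters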
Example #6
Source File: dataset_util.py    From ros_tensorflow with Apache License 2.0
def read_dataset(file_read_func, decode_func, input_files, config):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
  if config.shuffle:
    filename_dataset = filename_dataset.shuffle(
        config.filenames_shuffle_buffer_size)
  elif config.num_readers > 1:
    tf.logging.warning('`shuffle` is false, but the input data stream is '
                       'still slightly shuffled since `num_readers` > 1.')

  filename_dataset = filename_dataset.repeat(config.num_epochs or None)

  records_dataset = filename_dataset.apply(
      tf.contrib.data.parallel_interleave(
          file_read_func, cycle_length=config.num_readers,
          block_length=config.read_block_length, sloppy=True))
  if config.shuffle:
    records_dataset = records_dataset.shuffle(config.shuffle_buffer_size)
  tensor_dataset = records_dataset.map(
      decode_func, num_parallel_calls=config.num_parallel_map_calls)
  return tensor_dataset.prefetch(config.prefetch_size) 
Example #7
Source File: input_util.py    From professional-services with Apache License 2.0
def input_fn(input_dir, mode, batch_size, num_epochs, label_name=None,
             shuffle_buffer_size=10000, feature_spec=None):
    """Reads TFRecords and returns the features and labels."""
    if feature_spec is None:
        tf_transform_output = tft.TFTransformOutput(
            os.path.join(input_dir, 'transformed_metadata'))
        feature_spec = tf_transform_output.transformed_feature_spec()
    prefix = str(mode).lower()
    suffix = '.tfrecord'
    num_cpus = multiprocessing.cpu_count()

    file_pattern = os.path.join(input_dir, 'data', prefix, prefix+'*'+suffix)
    filenames = tf.matching_files(file_pattern)
    dataset = tf.data.TFRecordDataset(filenames=filenames, buffer_size=None,
                                      num_parallel_reads=num_cpus)

    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(shuffle_buffer_size)

    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(
        lambda examples: tf.parse_example(examples, feature_spec))
    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()
    if mode == tf.estimator.ModeKeys.PREDICT:
        return features

    label = features.pop(label_name)
    return features, label 
Example #8
Source File: data_utils.py    From ID-CNN-CWS with GNU General Public License v3.0
def __init__(self, in_pattern, batch_size, num_buckets=0, num_epochs=None):
        self._batch_size = batch_size
        self.num_buckets = num_buckets
        self._epoch = 0
        self._step = 1.
        self.num_epochs = num_epochs
        file_pattern = in_pattern + '/examples.proto' if os.path.isdir(in_pattern) else in_pattern
        filenames = tf.matching_files(file_pattern)
        # filenames = tf.Print(filenames, [filenames], message='filenames: ')
        self.next_batch_op = self.input_pipeline(filenames, self._batch_size, self.num_buckets, self.num_epochs) 
Example #9
Source File: data_utils.py    From bran with Apache License 2.0
def input_pipeline(self, file_pattern, batch_size, num_epochs=None, num_threads=10):
        filenames = tf.matching_files(file_pattern)
        filename_queue = tf.train.string_input_producer(filenames, num_epochs=num_epochs, shuffle=True)
        parsed_batch = self.example_parser(filename_queue)
        min_after_dequeue = 10000
        capacity = min_after_dequeue + 12 * batch_size
        next_batch = tf.train.batch(
                parsed_batch, batch_size=batch_size, capacity=capacity,
                num_threads=num_threads, dynamic_pad=True, allow_smaller_final_batch=True)
        return next_batch 
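tf.train.string_input_producer and tf.train.batch belong to the older queue-based input pipeline, which was deprecated in favor of tf.data. A hedged sketch of an equivalent tf.data pipeline (the record schema, padded shapes, and pattern are assumptions, not the bran project's actual parser):

import tensorflow as tf

def parse_example(record):
    # Assumed schema: a variable-length token-id list plus a scalar label.
    features = {
        'tokens': tf.VarLenFeature(tf.int64),
        'label': tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(record, features)
    return tf.sparse.to_dense(parsed['tokens']), parsed['label']

filenames = tf.matching_files('data/examples-*.proto')        # hypothetical pattern
dataset = (tf.data.TFRecordDataset(filenames)
           .shuffle(10000)                                     # counterpart of shuffle=True
           .repeat()                                           # num_epochs=None
           .map(parse_example, num_parallel_calls=10)
           .padded_batch(32, padded_shapes=([None], [])))      # dynamic_pad analogue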
Example #10
Source File: dataset_util.py    From Elphas with Apache License 2.0
def read_dataset(file_read_func, decode_func, input_files, config):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
  if config.shuffle:
    filename_dataset = filename_dataset.shuffle(
        config.filenames_shuffle_buffer_size)
  filename_dataset = filename_dataset.repeat(config.num_epochs or None)

  records_dataset = filename_dataset.apply(
      tf.contrib.data.parallel_interleave(
          file_read_func, cycle_length=config.num_readers, sloppy=True))
  if config.shuffle:
    records_dataset = records_dataset.shuffle(config.shuffle_buffer_size)
  tensor_dataset = records_dataset.map(
      decode_func, num_parallel_calls=config.num_parallel_map_calls)
  return tensor_dataset.prefetch(config.prefetch_size) 
Example #11
Source File: dataset_util.py    From AniSeg with Apache License 2.0
def read_dataset(file_read_func, decode_func, input_files, config):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
  if config.shuffle:
    filename_dataset = filename_dataset.shuffle(
        config.filenames_shuffle_buffer_size)
  filename_dataset = filename_dataset.repeat(config.num_epochs or None)

  records_dataset = filename_dataset.apply(
      tf.contrib.data.parallel_interleave(
          file_read_func, cycle_length=config.num_readers, sloppy=True))
  if config.shuffle:
    records_dataset = records_dataset.shuffle(config.shuffle_buffer_size)
  tensor_dataset = records_dataset.map(
      decode_func, num_parallel_calls=config.num_parallel_map_calls)
  return tensor_dataset.prefetch(config.prefetch_size) 
Example #12
Source File: dataset_util.py    From Traffic-Rule-Violation-Detection-System with MIT License
def read_dataset(
    file_read_func, decode_func, input_files, config, num_workers=1,
    worker_index=0):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.
    num_workers: Number of workers / shards.
    worker_index: Id for the current worker.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  dataset = tf.data.Dataset.from_tensor_slices(filenames)
  dataset = dataset.shard(num_workers, worker_index)
  dataset = dataset.repeat(config.num_epochs or None)
  if config.shuffle:
    dataset = dataset.shuffle(config.filenames_shuffle_buffer_size,
                              reshuffle_each_iteration=True)

  # Read file records and shuffle them.
  # If cycle_length is larger than the number of files, more than one reader
  # will be assigned to the same file, leading to repetition.
  cycle_length = tf.cast(
      tf.minimum(config.num_readers, tf.size(filenames)), tf.int64)
  # TODO: find the optimal block_length.
  dataset = dataset.interleave(
      file_read_func, cycle_length=cycle_length, block_length=1)

  if config.shuffle:
    dataset = dataset.shuffle(config.shuffle_buffer_size,
                              reshuffle_each_iteration=True)

  dataset = dataset.map(decode_func, num_parallel_calls=config.num_readers)
  return dataset.prefetch(config.prefetch_buffer_size) 
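The num_workers / worker_index arguments feed tf.data.Dataset.shard, which hands each worker a disjoint slice of the matched file list before any shuffling or reading happens. A small, self-contained sketch of that behaviour (the file names are illustrative):

import tensorflow as tf

files = tf.data.Dataset.from_tensor_slices(
    ['f0.record', 'f1.record', 'f2.record', 'f3.record'])

# Worker 0 of 2 keeps every second file starting at index 0 (f0, f2);
# worker 1 of 2 keeps the rest (f1, f3).
worker0 = files.shard(num_shards=2, index=0)
worker1 = files.shard(num_shards=2, index=1)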
Example #13
Source File: dataset_util.py    From deepglobe_land_cover_classification_with_deeplabv3plus with MIT License
def read_dataset(
        file_read_func, decode_func, input_files, config, num_workers=1,
        worker_index=0):
    """Reads a dataset, and handles repetition and shuffling.

    Args:
      file_read_func: Function to use in tf.data.Dataset.interleave, to read
        every individual file into a tf.data.Dataset.
      decode_func: Function to apply to all records.
      input_files: A list of file paths to read.
      config: A input_reader_builder.InputReader object.
      num_workers: Number of workers / shards.
      worker_index: Id for the current worker.

    Returns:
      A tf.data.Dataset based on config.
    """
    # Shard, shuffle, and read files.
    filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                          0)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.shard(num_workers, worker_index)
    dataset = dataset.repeat(config.num_epochs or None)
    if config.shuffle:
        dataset = dataset.shuffle(config.filenames_shuffle_buffer_size,
                                  reshuffle_each_iteration=True)

    # Read file records and shuffle them.
    # If cycle_length is larger than the number of files, more than one reader
    # will be assigned to the same file, leading to repetition.
    cycle_length = tf.cast(
        tf.minimum(config.num_readers, tf.size(filenames)), tf.int64)
    # TODO: find the optimal block_length.
    dataset = dataset.interleave(
        file_read_func, cycle_length=cycle_length, block_length=1)

    if config.shuffle:
        dataset = dataset.shuffle(config.shuffle_buffer_size,
                                  reshuffle_each_iteration=True)

    dataset = dataset.map(decode_func, num_parallel_calls=config.num_readers)
    return dataset.prefetch(config.prefetch_buffer_size) 
Example #14
Source File: dataset_util.py    From tensorflow-deeplab-v3 with MIT License
def read_dataset(
    file_read_func, decode_func, input_files, config, num_workers=1,
    worker_index=0):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.
    num_workers: Number of workers / shards.
    worker_index: Id for the current worker.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  dataset = tf.data.Dataset.from_tensor_slices(filenames)
  dataset = dataset.shard(num_workers, worker_index)
  dataset = dataset.repeat(config.num_epochs or None)
  if config.shuffle:
    dataset = dataset.shuffle(config.filenames_shuffle_buffer_size,
                              reshuffle_each_iteration=True)

  # Read file records and shuffle them.
  # If cycle_length is larger than the number of files, more than one reader
  # will be assigned to the same file, leading to repetition.
  cycle_length = tf.cast(
      tf.minimum(config.num_readers, tf.size(filenames)), tf.int64)
  # TODO: find the optimal block_length.
  dataset = dataset.interleave(
      file_read_func, cycle_length=cycle_length, block_length=1)

  if config.shuffle:
    dataset = dataset.shuffle(config.shuffle_buffer_size,
                              reshuffle_each_iteration=True)

  dataset = dataset.map(decode_func, num_parallel_calls=config.num_readers)
  return dataset.prefetch(config.prefetch_buffer_size) 
Example #15
Source File: dataset_util.py    From tensorflow-deeplab-v3-plus with MIT License
def read_dataset(
    file_read_func, decode_func, input_files, config, num_workers=1,
    worker_index=0):
  """Reads a dataset, and handles repetition and shuffling.

  Args:
    file_read_func: Function to use in tf.data.Dataset.interleave, to read
      every individual file into a tf.data.Dataset.
    decode_func: Function to apply to all records.
    input_files: A list of file paths to read.
    config: A input_reader_builder.InputReader object.
    num_workers: Number of workers / shards.
    worker_index: Id for the current worker.

  Returns:
    A tf.data.Dataset based on config.
  """
  # Shard, shuffle, and read files.
  filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                        0)
  dataset = tf.data.Dataset.from_tensor_slices(filenames)
  dataset = dataset.shard(num_workers, worker_index)
  dataset = dataset.repeat(config.num_epochs or None)
  if config.shuffle:
    dataset = dataset.shuffle(config.filenames_shuffle_buffer_size,
                              reshuffle_each_iteration=True)

  # Read file records and shuffle them.
  # If cycle_length is larger than the number of files, more than one reader
  # will be assigned to the same file, leading to repetition.
  cycle_length = tf.cast(
      tf.minimum(config.num_readers, tf.size(filenames)), tf.int64)
  # TODO: find the optimal block_length.
  dataset = dataset.interleave(
      file_read_func, cycle_length=cycle_length, block_length=1)

  if config.shuffle:
    dataset = dataset.shuffle(config.shuffle_buffer_size,
                              reshuffle_each_iteration=True)

  dataset = dataset.map(decode_func, num_parallel_calls=config.num_readers)
  return dataset.prefetch(config.prefetch_buffer_size) 
Example #16
Source File: dataset_util.py    From LaneSegmentationNetwork with GNU Lesser General Public License v3.0
def read_dataset(
        file_read_func, decode_func, input_files, config, num_workers=1,
        worker_index=0):
    """Reads a dataset, and handles repetition and shuffling.

    Args:
      file_read_func: Function to use in tf.data.Dataset.interleave, to read
        every individual file into a tf.data.Dataset.
      decode_func: Function to apply to all records.
      input_files: A list of file paths to read.
      config: A input_reader_builder.InputReader object.
      num_workers: Number of workers / shards.
      worker_index: Id for the current worker.

    Returns:
      A tf.data.Dataset based on config.
    """
    # Shard, shuffle, and read files.
    filenames = tf.concat([tf.matching_files(pattern) for pattern in input_files],
                          0)
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.shard(num_workers, worker_index)
    dataset = dataset.repeat(config.num_epochs or None)
    if config.shuffle:
        dataset = dataset.shuffle(config.filenames_shuffle_buffer_size,
                                  reshuffle_each_iteration=True)

    # Read file records and shuffle them.
    # If cycle_length is larger than the number of files, more than one reader
    # will be assigned to the same file, leading to repetition.
    cycle_length = tf.cast(
        tf.minimum(config.num_readers, tf.size(filenames)), tf.int64)
    # TODO: find the optimal block_length.
    dataset = dataset.interleave(
        file_read_func, cycle_length=cycle_length, block_length=1)

    if config.shuffle:
        dataset = dataset.shuffle(config.shuffle_buffer_size,
                                  reshuffle_each_iteration=True)

    dataset = dataset.map(decode_func, num_parallel_calls=config.num_readers)
    return dataset.prefetch(config.prefetch_buffer_size)