Python TensorFlow TFRecord Examples

The following are 10 code examples showing how TFRecord files are written and read with TensorFlow. You can go to the original project or source file by following the links above each example.
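
All of the examples below revolve around the TFRecord format, i.e. tf.train.Example protos serialized to disk. As a baseline, here is a minimal round trip using the same TF1-era APIs as the examples (the path and feature name are illustrative):

import tensorflow as tf

# Build one tf.train.Example with a single int64 feature.
example = tf.train.Example(features=tf.train.Features(feature={
    'value': tf.train.Feature(int64_list=tf.train.Int64List(value=[42])),
}))

# Write it to disk, then read it back with the record iterator.
with tf.python_io.TFRecordWriter('/tmp/demo.tfrecord') as writer:
    writer.write(example.SerializeToString())

for record in tf.python_io.tf_record_iterator('/tmp/demo.tfrecord'):
    parsed = tf.train.Example()
    parsed.ParseFromString(record)
    print(parsed.features.feature['value'].int64_list.value[0])  # prints 42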
Example #1
Source File: musdb_to_tfrecord.py    From vimss with GNU General Public License v3.0
def main(argv):  # pylint: disable=unused-argument
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.project is None:
        raise ValueError('GCS Project must be provided.')

    if FLAGS.gcs_output_path is None:
        raise ValueError('GCS output path must be provided.')
    elif not FLAGS.gcs_output_path.startswith('gs://'):
        raise ValueError('GCS output path must start with gs://')

    if FLAGS.local_scratch_dir is None:
        raise ValueError('Scratch directory path must be provided.')

    # Location of the raw dataset (assumed to already be present locally)
    raw_data_dir = FLAGS.raw_data_dir

    # Convert the raw data into tf-records
    training_records, test_records = convert_to_tf_records(raw_data_dir)

    # Upload to GCS
    upload_to_gcs(training_records, test_records) 
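
main() reads several FLAGS fields that are defined elsewhere in the project. A sketch of what those definitions could look like (the flag names are taken from the code above; the defaults and help strings are guesses, the real ones live in the vimss project):

import tensorflow as tf

# Hypothetical flag definitions matching the fields read by main().
tf.flags.DEFINE_string('project', None, 'GCS project for the output bucket.')
tf.flags.DEFINE_string('gcs_output_path', None, 'gs:// path for the TFRecords.')
tf.flags.DEFINE_string('local_scratch_dir', None, 'Local working directory.')
tf.flags.DEFINE_string('raw_data_dir', None, 'Directory holding the raw dataset.')
FLAGS = tf.flags.FLAGS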
Example #2
Source File: urmp_to_tfrecords.py    From vimss with GNU General Public License v3.0
def main(argv):  # pylint: disable=unused-argument
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.project is None:
        raise ValueError('GCS Project must be provided.')

    if FLAGS.gcs_output_path is None:
        raise ValueError('GCS output path must be provided.')
    elif not FLAGS.gcs_output_path.startswith('gs://'):
        raise ValueError('GCS output path must start with gs://')

    if FLAGS.local_scratch_dir is None:
        raise ValueError('Scratch directory path must be provided.')

    # Location of the raw dataset (assumed to already be present locally)
    raw_data_dir = FLAGS.raw_data_dir

    # Convert the raw data into tf-records
    training_records, test_records = convert_to_tf_records(raw_data_dir)

    # Upload to GCS (the call is commented out in this variant)
    # upload_to_gcs(training_records, test_records) 
Example #3
Source File: preprocessing.py    From training_results_v0.5 with Apache License 2.0
def write_tf_examples(filename, tf_examples, serialize=True):
    '''
    Args:
        filename: Where to write tf.records
        tf_examples: An iterable of tf.Example
        serialize: whether to serialize the examples.
    '''
    with tf.python_io.TFRecordWriter(
            filename, options=TF_RECORD_CONFIG) as writer:
        for ex in tf_examples:
            if serialize:
                writer.write(ex.SerializeToString())
            else:
                writer.write(ex)

# Read tf.Example from files 
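
TF_RECORD_CONFIG is defined elsewhere in preprocessing.py; since the matching reader in Example #8 below uses compression_type='ZLIB', it is presumably a ZLIB TFRecordOptions. A usage sketch under that assumption:

import tensorflow as tf

# Assumed to be ZLIB, to agree with read_tf_records in Example #8.
TF_RECORD_CONFIG = tf.python_io.TFRecordOptions(
    tf.python_io.TFRecordCompressionType.ZLIB)

examples = [
    tf.train.Example(features=tf.train.Features(feature={
        'x': tf.train.Feature(int64_list=tf.train.Int64List(value=[i])),
    }))
    for i in range(3)
]
write_tf_examples('/tmp/demo.tfrecord.zz', examples)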
Example #4
Source File: DeeProtein.py    From AiGEM_TeamHeidelberg2017 with MIT License
def check_data(self, tfrecords_filename):
        """Checks a specified tf.Records file for coreect dataformat.
        Check if the data format in the example files is correct. Prints the shape of the data
        stored in a tf.Records file.

        Args
          tfrecords_filename: `str`, the path to the `tf.records` file to check.
        """
        record_iterator = tf.python_io.tf_record_iterator(path=tfrecords_filename)

        for string_record in record_iterator:
            # Parse the next example
            example = tf.train.Example()
            example.ParseFromString(string_record)

            # Get the features you stored (change to match your tfrecord writing code)
            seq = (example.features.feature['seq_raw']
                   .bytes_list
                   .value[0])

            label = (example.features.feature['label_raw']
                     .bytes_list
                     .value[0])

            # Convert to numpy arrays (np.frombuffer supersedes the deprecated
            # np.fromstring; change dtype to the datatype you stored)
            seq_array = np.frombuffer(seq, dtype=np.float64)
            label_array = np.frombuffer(label, dtype=np.float64)

            # Print the array shapes; do they match your expectations?
            print(seq_array.shape)
            print(label_array.shape) 
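
For check_data to report sensible shapes, the records must carry seq_raw and label_raw features holding raw float64 bytes. A hedged sketch of the matching writer side (the shapes, path, and helper are illustrative; DeeProtein's real encoding lives in its own writer code):

import numpy as np
import tensorflow as tf

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

seq = np.random.rand(1000, 20).astype(np.float64)  # illustrative shape
label = np.zeros(886, dtype=np.float64)            # illustrative shape

example = tf.train.Example(features=tf.train.Features(feature={
    'seq_raw': _bytes_feature(seq.tobytes()),
    'label_raw': _bytes_feature(label.tobytes()),
}))
with tf.python_io.TFRecordWriter('/tmp/check.tfrecord') as writer:
    writer.write(example.SerializeToString())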
Example #5
Source File: preprocessing.py    From training with Apache License 2.0
def write_tf_examples(filename, tf_examples, serialize=True):
    """
    Args:
        filename: Where to write tf.records
        tf_examples: An iterable of tf.Example
        serialize: whether to serialize the examples.
    """
    with tf.python_io.TFRecordWriter(
            filename, options=TF_RECORD_CONFIG) as writer:
        for ex in tf_examples:
            if serialize:
                writer.write(ex.SerializeToString())
            else:
                writer.write(ex) 
Example #6
Source File: musdb_to_tfrecord.py    From vimss with GNU General Public License v3.0
def _process_audio_files_batch(chunk_data):
    """Processes and saves list of audio files as TFRecords.
    Args:
        chunk_data: tuple of chunk_files and output_file
        chunk_files: list of strings; each string is a path to an audio file
        output_file: string, unique identifier specifying the data set
    """

    chunk_files, output_file = chunk_data[0], chunk_data[1]
    # Get training files from the directory name

    writer = tf.python_io.TFRecordWriter(output_file)

    chunk_data_cache = list()
    for filename in chunk_files:
        # load all wave files into memory and create a buffer
        file_data_cache = list()
        for source in CHANNEL_NAMES:
            data, sr = librosa.core.load(filename+source, sr=SAMPLE_RATE, mono=True)
            file_data_cache.append([filename, len(data), data])

            # Option 1: use only tf to read and resample audio
            # audio_binary = tf.read_file(filename+source)
            # wav_decoder = contrib_audio.decode_wav(
            #     audio_binary,
            #     desired_channels=CHANNELS)
            # Option 2: use Soundfile and read binary files
            # SoundFile should be much faster, but it doesn't matter here because everything is stored in TFRecords
            # with sf.SoundFile(filename+source, "r") as f:
            #     print(filename+source, f.samplerate, f.channels, len(f), f.read().tobytes())

        for segment in _get_segments_from_audio_cache(file_data_cache):
            chunk_data_cache.append(segment)

    # shuffle all segments
    shuffle_idx = make_shuffle_idx(len(chunk_data_cache))
    chunk_data_cache = [chunk_data_cache[i] for i in shuffle_idx]

    for chunk in chunk_data_cache:
        example = _convert_to_example(filename=chunk[0], sample_idx=chunk[1], data_buffer=chunk[2])
        writer.write(example.SerializeToString())

    writer.close()
    tf.logging.info('Finished writing file: %s' % output_file) 
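
_convert_to_example and _get_segments_from_audio_cache are defined elsewhere in musdb_to_tfrecord.py. A hypothetical reconstruction of the former, matching the call signature above (the feature names are guesses, not the project's actual schema):

import numpy as np
import tensorflow as tf

def _convert_to_example(filename, sample_idx, data_buffer):
    """Hypothetical sketch; the real helper lives in the vimss project."""
    return tf.train.Example(features=tf.train.Features(feature={
        'audio/filename': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[filename.encode('utf-8')])),
        'audio/sample_idx': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[sample_idx])),
        'audio/data': tf.train.Feature(
            float_list=tf.train.FloatList(
                value=np.asarray(data_buffer).flatten().tolist())),
    }))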
Example #7
Source File: urmp_to_tfrecords.py    From vimss with GNU General Public License v3.0
def _process_audio_files_batch(chunk_data):
    """Processes and saves list of audio files as TFRecords.
    Args:
        chunk_data: tuple of chunk_files and output_file
        chunk_files: list of strings; each string is a path to a WAV file
        output_file: string, unique identifier specifying the data set
    """

    chunk_files, output_file = chunk_data[0], chunk_data[1]
    # Get training files from the directory name

    writer = tf.python_io.TFRecordWriter(output_file)

    chunk_data_cache = list()
    for track in chunk_files:
        # load all wave files into memory and create a buffer
        file_data_cache = list()
        for source in track:
            data, sr = librosa.core.load(source, sr=SAMPLE_RATE, mono=True)
            file_data_cache.append([track, len(data), data])

            # Option 1: use only tf to read and resample audio
            # audio_binary = tf.read_file(filename+source)
            # wav_decoder = contrib_audio.decode_wav(
            #     audio_binary,
            #     desired_channels=CHANNELS)
            # Option 2: use Soundfile and read binary files
            # SoundFile should be much faster, but it doesn't matter here because everything is stored in TFRecords
            # with sf.SoundFile(filename+source, "r") as f:
            #     print(filename+source, f.samplerate, f.channels, len(f), f.read().tobytes())

        for segment in _get_segments_from_audio_cache(file_data_cache):
            chunk_data_cache.append(segment)

    # shuffle all segments
    shuffle_idx = make_shuffle_idx(len(chunk_data_cache))
    chunk_data_cache = [chunk_data_cache[i] for i in shuffle_idx]

    for chunk in chunk_data_cache:
        labels = get_labels_from_filename(chunk[0])
        example = _convert_to_example(filename=chunk[0], sample_idx=chunk[1],
                                      data_buffer=chunk[2], num_sources=chunk[3],
                                      labels=labels)
        writer.write(example.SerializeToString())

    writer.close()
    tf.logging.info('Finished writing file: %s' % output_file) 
Example #8
Source File: preprocessing.py    From training_results_v0.5 with Apache License 2.0
def read_tf_records(batch_size, tf_records, num_repeats=None,
                    shuffle_records=True, shuffle_examples=True,
                    shuffle_buffer_size=None,
                    filter_amount=1.0):
    '''
    Args:
        batch_size: batch size to return
        tf_records: a list of tf_record filenames
        num_repeats: how many times the data should be read (default: infinite)
        shuffle_records: whether to shuffle the order of files read
        shuffle_examples: whether to shuffle the tf.Examples
        shuffle_buffer_size: how big of a buffer to fill before shuffling.
        filter_amount: what fraction of records to keep
    Returns:
        a tf dataset of batched tensors
    '''

    if shuffle_buffer_size is None:
        shuffle_buffer_size = SHUFFLE_BUFFER_SIZE
    if shuffle_records:
        random.shuffle(tf_records)
    record_list = tf.data.Dataset.from_tensor_slices(tf_records)

    # compression_type here must agree with write_tf_examples
    # cycle_length = how many tfrecord files are read in parallel
    # block_length = how many tf.Examples are read from each file before
    #   moving to the next file
    # The idea is to shuffle both the order of the files being read,
    # and the examples being read from the files.
    dataset = record_list.interleave(lambda x:
                                     tf.data.TFRecordDataset(
                                         x, compression_type='ZLIB'),
                                     cycle_length=64, block_length=16)
    # The sampling dataset replaces the filter dataset (lambda function
    # below); it's a faster implementation of the filter for this specific
    # predicate.
    #dataset = dataset.sampling(filter_amount)
    dataset = dataset.filter(lambda x: tf.less(
        tf.random_uniform([1]), filter_amount)[0])
    # TODO(amj): apply py_func for transforms here.
    if num_repeats is not None:
        dataset = dataset.repeat(num_repeats)
    else:
        dataset = dataset.repeat()
    if shuffle_examples:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.batch(batch_size)
    return dataset 
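
Consuming the returned dataset in TF1 graph mode looks like this (a sketch; the filenames are placeholders, and SHUFFLE_BUFFER_SIZE comes from the surrounding module):

import tensorflow as tf

dataset = read_tf_records(
    batch_size=256,
    tf_records=['train-0.tfrecord.zz', 'train-1.tfrecord.zz'],  # placeholders
    num_repeats=1,
    filter_amount=0.5)
batch = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    serialized = sess.run(batch)  # a batch of still-serialized tf.Examples
    print(serialized.shape)       # (256,) for a full batch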
Example #9
Source File: preprocessing.py    From training_results_v0.5 with Apache License 2.0
def read_tf_records(batch_size, tf_records, num_repeats=None,
                    shuffle_records=True, shuffle_examples=True,
                    shuffle_buffer_size=None,
                    filter_amount=1.0):
    '''
    Args:
        batch_size: batch size to return
        tf_records: a list of tf_record filenames
        num_repeats: how many times the data should be read (default: infinite)
        shuffle_records: whether to shuffle the order of files read
        shuffle_examples: whether to shuffle the tf.Examples
        shuffle_buffer_size: how big of a buffer to fill before shuffling.
        filter_amount: what fraction of records to keep
    Returns:
        a tf dataset of batched tensors
    '''

    if shuffle_buffer_size is None:
        shuffle_buffer_size = SHUFFLE_BUFFER_SIZE
    if shuffle_records:
        random.shuffle(tf_records)
    record_list = tf.data.Dataset.from_tensor_slices(tf_records)

    # compression_type here must agree with write_tf_examples
    # cycle_length = how many tfrecord files are read in parallel
    # block_length = how many tf.Examples are read from each file before
    #   moving to the next file
    # The idea is to shuffle both the order of the files being read,
    # and the examples being read from the files.
    dataset = record_list.interleave(lambda x:
                                     tf.data.TFRecordDataset(
                                         x, compression_type='ZLIB'),
                                     cycle_length=64, block_length=16)
    # The sampling dataset replaces the filter dataset (lambda function
    # below); it's a faster implementation of the filter for this specific
    # predicate.
    dataset = dataset.sampling(filter_amount)
    #dataset = dataset.filter(lambda x: tf.less(
    #    tf.random_uniform([1]), filter_amount)[0])
    # TODO(amj): apply py_func for transforms here.
    if num_repeats is not None:
        dataset = dataset.repeat(num_repeats)
    else:
        dataset = dataset.repeat()
    if shuffle_examples:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.batch(batch_size)
    return dataset 
Example #10
Source File: preprocessing.py    From training with Apache License 2.0
def read_tf_records(batch_size, tf_records, num_repeats=1,
                    shuffle_records=True, shuffle_examples=True,
                    shuffle_buffer_size=None, interleave=True,
                    filter_amount=1.0):
    """
    Args:
        batch_size: batch size to return
        tf_records: a list of tf_record filenames
        num_repeats: how many times the data should be read (default: 1)
        shuffle_records: whether to shuffle the order of files read
        shuffle_examples: whether to shuffle the tf.Examples
        shuffle_buffer_size: how big of a buffer to fill before shuffling.
        interleave: whether to interleave examples from multiple tf_records
        filter_amount: what fraction of records to keep
    Returns:
        a tf dataset of batched tensors
    """
    if shuffle_examples and not shuffle_buffer_size:
        raise ValueError("Must set shuffle buffer size if shuffling examples")

    tf_records = list(tf_records)
    if shuffle_records:
        random.shuffle(tf_records)
    record_list = tf.data.Dataset.from_tensor_slices(tf_records)

    # compression_type here must agree with write_tf_examples
    map_func = functools.partial(
        tf.data.TFRecordDataset,
        buffer_size=8 * 1024 * 1024,
        compression_type='ZLIB')

    if interleave:
        # cycle_length = how many tfrecord files are read in parallel
        # The idea is to shuffle both the order of the files being read,
        # and the examples being read from the files.
        dataset = record_list.apply(tf.data.experimental.parallel_interleave(
            map_func, cycle_length=64, sloppy=True))
    else:
        dataset = record_list.flat_map(map_func)

    if filter_amount < 1.0:
        dataset = dataset.filter(
            lambda _: tf.random_uniform([]) < filter_amount)

    dataset = dataset.repeat(num_repeats)
    if shuffle_examples:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    dataset = dataset.batch(batch_size)
    return dataset
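
The batches this function returns are still serialized tf.Example protos; downstream code typically parses them with tf.parse_example. A sketch with a made-up one-feature schema:

import tensorflow as tf

dataset = read_tf_records(
    batch_size=64,
    tf_records=['selfplay-0.tfrecord.zz'],  # placeholder filename
    shuffle_examples=True,
    shuffle_buffer_size=2000)

# Hypothetical schema; the real one depends on how the records were written.
features = {'x': tf.FixedLenFeature([], tf.int64)}
parsed = dataset.map(lambda batch: tf.parse_example(batch, features))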