Python TensorFlow TFRecord Examples

The following are 10 code examples showing how TFRecord files are written and read with TensorFlow. You can go to the original project or source file by following the links above each example.
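
All of the examples below revolve around the TFRecord format, i.e. tf.train.Example protos serialized to disk. As a baseline, here is a minimal round trip using the same TF1-era APIs as the examples (the path and feature name are illustrative):

import tensorflow as tf

# Build one tf.train.Example with a single int64 feature.
example = tf.train.Example(features=tf.train.Features(feature={
    'value': tf.train.Feature(int64_list=tf.train.Int64List(value=[42])),
}))

# Write it to disk, then read it back with the record iterator.
with tf.python_io.TFRecordWriter('/tmp/demo.tfrecord') as writer:
    writer.write(example.SerializeToString())

for record in tf.python_io.tf_record_iterator('/tmp/demo.tfrecord'):
    parsed = tf.train.Example()
    parsed.ParseFromString(record)
    print(parsed.features.feature['value'].int64_list.value[0])  # prints 42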
Example #1
Source File: musdb_to_tfrecord.py    From vimss with GNU General Public License v3.0
def main(argv):  # pylint: disable=unused-argument
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.project is None:
        raise ValueError('GCS Project must be provided.')

    if FLAGS.gcs_output_path is None:
        raise ValueError('GCS output path must be provided.')
    elif not FLAGS.gcs_output_path.startswith('gs://'):
        raise ValueError('GCS output path must start with gs://')

    if FLAGS.local_scratch_dir is None:
        raise ValueError('Scratch directory path must be provided.')

    # Location of the raw dataset (assumed to already be present locally)
    raw_data_dir = FLAGS.raw_data_dir

    # Convert the raw data into tf-records
    training_records, test_records = convert_to_tf_records(raw_data_dir)

    # Upload to GCS
    upload_to_gcs(training_records, test_records) 
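
main() reads several FLAGS fields that are defined elsewhere in the project. A sketch of what those definitions could look like (the flag names are taken from the code above; the defaults and help strings are guesses, the real ones live in the vimss project):

import tensorflow as tf

# Hypothetical flag definitions matching the fields read by main().
tf.flags.DEFINE_string('project', None, 'GCS project for the output bucket.')
tf.flags.DEFINE_string('gcs_output_path', None, 'gs:// path for the TFRecords.')
tf.flags.DEFINE_string('local_scratch_dir', None, 'Local working directory.')
tf.flags.DEFINE_string('raw_data_dir', None, 'Directory holding the raw dataset.')
FLAGS = tf.flags.FLAGS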
Example #2
Source File: urmp_to_tfrecords.py    From vimss with GNU General Public License v3.0
def main(argv):  # pylint: disable=unused-argument
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.project is None:
        raise ValueError('GCS Project must be provided.')

    if FLAGS.gcs_output_path is None:
        raise ValueError('GCS output path must be provided.')
    elif not FLAGS.gcs_output_path.startswith('gs://'):
        raise ValueError('GCS output path must start with gs://')

    if FLAGS.local_scratch_dir is None:
        raise ValueError('Scratch directory path must be provided.')

    # Location of the raw dataset (assumed to already be present locally)
    raw_data_dir = FLAGS.raw_data_dir

    # Convert the raw data into tf-records
    training_records, test_records = convert_to_tf_records(raw_data_dir)

    # Upload to GCS (the call is commented out in this variant)
    # upload_to_gcs(training_records, test_records) 
Example #3
Source File: preprocessing.py    From training_results_v0.5 with Apache License 2.0
def write_tf_examples(filename, tf_examples, serialize=True):
    '''
    Args:
        filename: Where to write tf.records
        tf_examples: An iterable of tf.Example
        serialize: whether to serialize the examples.
    '''
    with tf.python_io.TFRecordWriter(
            filename, options=TF_RECORD_CONFIG) as writer:
        for ex in tf_examples:
            if serialize:
                writer.write(ex.SerializeToString())
            else:
                writer.write(ex)

# Read tf.Example from files 
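
TF_RECORD_CONFIG is defined elsewhere in preprocessing.py; since the matching reader in Example #8 below uses compression_type='ZLIB', it is presumably a ZLIB TFRecordOptions. A usage sketch under that assumption:

import tensorflow as tf

# Assumed to be ZLIB, to agree with read_tf_records in Example #8.
TF_RECORD_CONFIG = tf.python_io.TFRecordOptions(
    tf.python_io.TFRecordCompressionType.ZLIB)

examples = [
    tf.train.Example(features=tf.train.Features(feature={
        'x': tf.train.Feature(int64_list=tf.train.Int64List(value=[i])),
    }))
    for i in range(3)
]
write_tf_examples('/tmp/demo.tfrecord.zz', examples)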
Example #4
Source File: DeeProtein.py    From AiGEM_TeamHeidelberg2017 with MIT License
def check_data(self, tfrecords_filename):
        """Checks a specified tf.Records file for coreect dataformat.
        Check if the data format in the example files is correct. Prints the shape of the data
        stored in a tf.Records file.

        Args
          tfrecords_filename: `str`, the path to the `tf.records` file to check.
        """
        record_iterator = tf.python_io.tf_record_iterator(path=tfrecords_filename)

        for string_record in record_iterator:
            # Parse the next example
            example = tf.train.Example()
            example.ParseFromString(string_record)

            # Get the features you stored (change to match your tfrecord writing code)
            seq = (example.features.feature['seq_raw']
                   .bytes_list
                   .value[0])

            label = (example.features.feature['label_raw']
                     .bytes_list
                     .value[0])

            # Convert to numpy arrays (np.frombuffer supersedes the deprecated
            # np.fromstring; change dtype to the datatype you stored)
            seq_array = np.frombuffer(seq, dtype=np.float64)
            label_array = np.frombuffer(label, dtype=np.float64)

            # Print the array shapes; do they match your expectations?
            print(seq_array.shape)
            print(label_array.shape) 
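
For check_data to report sensible shapes, the records must carry seq_raw and label_raw features holding raw float64 bytes. A hedged sketch of the matching writer side (the shapes, path, and helper are illustrative; DeeProtein's real encoding lives in its own writer code):

import numpy as np
import tensorflow as tf

def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

seq = np.random.rand(1000, 20).astype(np.float64)  # illustrative shape
label = np.zeros(886, dtype=np.float64)            # illustrative shape

example = tf.train.Example(features=tf.train.Features(feature={
    'seq_raw': _bytes_feature(seq.tobytes()),
    'label_raw': _bytes_feature(label.tobytes()),
}))
with tf.python_io.TFRecordWriter('/tmp/check.tfrecord') as writer:
    writer.write(example.SerializeToString())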
Example #5
Source File: preprocessing.py    From training with Apache License 2.0
def write_tf_examples(filename, tf_examples, serialize=True):
    """
    Args:
        filename: Where to write tf.records
        tf_examples: An iterable of tf.Example
        serialize: whether to serialize the examples.
    """
    with tf.python_io.TFRecordWriter(
            filename, options=TF_RECORD_CONFIG) as writer:
        for ex in tf_examples:
            if serialize:
                writer.write(ex.SerializeToString())
            else:
                writer.write(ex) 
Example #6
Source File: musdb_to_tfrecord.py    From vimss with GNU General Public License v3.0
def _process_audio_files_batch(chunk_data):
    """Processes and saves list of audio files as TFRecords.
    Args:
        chunk_data: tuple of chunk_files and output_file
        chunk_files: list of strings; each string is a path to an audio file
        output_file: string, unique identifier specifying the data set
    """

    chunk_files, output_file = chunk_data[0], chunk_data[1]
    # Get training files from the directory name

    writer = tf.python_io.TFRecordWriter(output_file)

    chunk_data_cache = list()
    for filename in chunk_files:
        # load all wave files into memory and create a buffer
        file_data_cache = list()
        for source in CHANNEL_NAMES:
            data, sr = librosa.core.load(filename+source, sr=SAMPLE_RATE, mono=True)
            file_data_cache.append([filename, len(data), data])

            # Option 1: use only tf to read and resample audio
            # audio_binary = tf.read_file(filename+source)
            # wav_decoder = contrib_audio.decode_wav(
            #     audio_binary,
            #     desired_channels=CHANNELS)
            # Option 2: use Soundfile and read binary files
            # SoundFile should be much faster, but it doesn't matter here because everything is stored in TFRecords
            # with sf.SoundFile(filename+source, "r") as f:
            #     print(filename+source, f.samplerate, f.channels, len(f), f.read().tobytes())

        for segment in _get_segments_from_audio_cache(file_data_cache):
            chunk_data_cache.append(segment)

    # shuffle all segments
    shuffle_idx = make_shuffle_idx(len(chunk_data_cache))
    chunk_data_cache = [chunk_data_cache[i] for i in shuffle_idx]

    for chunk in chunk_data_cache:
        example = _convert_to_example(filename=chunk[0], sample_idx=chunk[1], data_buffer=chunk[2])
        writer.write(example.SerializeToString())

    writer.close()
    tf.logging.info('Finished writing file: %s' % output_file) 
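
_convert_to_example and _get_segments_from_audio_cache are defined elsewhere in musdb_to_tfrecord.py. A hypothetical reconstruction of the former, matching the call signature above (the feature names are guesses, not the project's actual schema):

import numpy as np
import tensorflow as tf

def _convert_to_example(filename, sample_idx, data_buffer):
    """Hypothetical sketch; the real helper lives in the vimss project."""
    return tf.train.Example(features=tf.train.Features(feature={
        'audio/filename': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[filename.encode('utf-8')])),
        'audio/sample_idx': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[sample_idx])),
        'audio/data': tf.train.Feature(
            float_list=tf.train.FloatList(
                value=np.asarray(data_buffer).flatten().tolist())),
    }))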
Example #7
Source File: urmp_to_tfrecords.py    From vimss with GNU General Public License v3.0
def _process_audio_files_batch(chunk_data):
    """Processes and saves list of audio files as TFRecords.
    Args:
        chunk_data: tuple of chunk_files and output_file
        chunk_files: list of strings; each string is a path to a WAV file
        output_file: string, unique identifier specifying the data set
    """

    chunk_files, output_file = chunk_data[0], chunk_data[1]
    # Get training files from the directory name

    writer = tf.python_io.TFRecordWriter(output_file)

    chunk_data_cache = list()
    for track in chunk_files:
        # load all wave files into memory and create a buffer
        file_data_cache = list()
        for source in track:
            data, sr = librosa.core.load(source, sr=SAMPLE_RATE, mono=True)
            file_data_cache.append([track, len(data), data])

            # Option 1: use only tf to read and resample audio
            # audio_binary = tf.read_file(filename+source)
            # wav_decoder = contrib_audio.decode_wav(
            #     audio_binary,
            #     desired_channels=CHANNELS)
            # Option 2: use Soundfile and read binary files
            # SoundFile should be much faster, but it doesn't matter here because everything is stored in TFRecords
            # with sf.SoundFile(filename+source, "r") as f:
            #     print(filename+source, f.samplerate, f.channels, len(f), f.read().tobytes())

        for segment in _get_segments_from_audio_cache(file_data_cache):
            chunk_data_cache.append(segment)

    # shuffle all segments
    shuffle_idx = make_shuffle_idx(len(chunk_data_cache))
    chunk_data_cache = [chunk_data_cache[i] for i in shuffle_idx]

    for chunk in chunk_data_cache:
        labels = get_labels_from_filename(chunk[0])
        example = _convert_to_example(filename=chunk[0], sample_idx=chunk[1],
                                      data_buffer=chunk[2], num_sources=chunk[3],
                                      labels=labels)
        writer.write(example.SerializeToString())

    writer.close()
    tf.logging.info('Finished writing file: %s' % output_file) 
Example #8
Source File: preprocessing.py    From training_results_v0.5 with Apache License 2.0
def read_tf_records(batch_size, tf_records, num_repeats=None,
                    shuffle_records=True, shuffle_examples=True,
                    shuffle_buffer_size=None,
                    filter_amount=1.0):
    '''
    Args:
        batch_size: batch size to return
        tf_records: a list of tf_record filenames
        num_repeats: how many times the data should be read (default: infinite)
        shuffle_records: whether to shuffle the order of files read
        shuffle_examples: whether to shuffle the tf.Examples
        shuffle_buffer_size: how big of a buffer to fill before shuffling.
        filter_amount: what fraction of records to keep
    Returns:
        a tf dataset of batched tensors
    '''

    if shuffle_buffer_size is None:
        shuffle_buffer_size = SHUFFLE_BUFFER_SIZE
    if shuffle_records:
        random.shuffle(tf_records)
    record_list = tf.data.Dataset.from_tensor_slices(tf_records)

    # compression_type here must agree with write_tf_examples
    # cycle_length = how many tfrecord files are read in parallel
    # block_length = how many tf.Examples are read from each file before
    #   moving to the next file
    # The idea is to shuffle both the order of the files being read,
    # and the examples being read from the files.
    dataset = record_list.interleave(lambda x:
                                     tf.data.TFRecordDataset(
                                         x, compression_type='ZLIB'),
                                     cycle_length=64, block_length=16)
    # The sampling dataset replaces the filter dataset (lambda function
    # below); it's a faster implementation of the filter for this specific
    # predicate.
    #dataset = dataset.sampling(filter_amount)
    dataset = dataset.filter(lambda x: tf.less(
        tf.random_uniform([1]), filter_amount)[0])
    # TODO(amj): apply py_func for transforms here.
    if num_repeats is not None:
        dataset = dataset.repeat(num_repeats)
    else:
        dataset = dataset.repeat()
    if shuffle_examples:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.batch(batch_size)
    return dataset 
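
Consuming the returned dataset in TF1 graph mode looks like this (a sketch; the filenames are placeholders, and SHUFFLE_BUFFER_SIZE comes from the surrounding module):

import tensorflow as tf

dataset = read_tf_records(
    batch_size=256,
    tf_records=['train-0.tfrecord.zz', 'train-1.tfrecord.zz'],  # placeholders
    num_repeats=1,
    filter_amount=0.5)
batch = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    serialized = sess.run(batch)  # a batch of still-serialized tf.Examples
    print(serialized.shape)       # (256,) for a full batch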
Example #9
Source File: preprocessing.py    From training_results_v0.5 with Apache License 2.0
def read_tf_records(batch_size, tf_records, num_repeats=None,
                    shuffle_records=True, shuffle_examples=True,
                    shuffle_buffer_size=None,
                    filter_amount=1.0):
    '''
    Args:
        batch_size: batch size to return
        tf_records: a list of tf_record filenames
        num_repeats: how many times the data should be read (default: infinite)
        shuffle_records: whether to shuffle the order of files read
        shuffle_examples: whether to shuffle the tf.Examples
        shuffle_buffer_size: how big of a buffer to fill before shuffling.
        filter_amount: what fraction of records to keep
    Returns:
        a tf dataset of batched tensors
    '''

    if shuffle_buffer_size is None:
        shuffle_buffer_size = SHUFFLE_BUFFER_SIZE
    if shuffle_records:
        random.shuffle(tf_records)
    record_list = tf.data.Dataset.from_tensor_slices(tf_records)

    # compression_type here must agree with write_tf_examples
    # cycle_length = how many tfrecord files are read in parallel
    # block_length = how many tf.Examples are read from each file before
    #   moving to the next file
    # The idea is to shuffle both the order of the files being read,
    # and the examples being read from the files.
    dataset = record_list.interleave(lambda x:
                                     tf.data.TFRecordDataset(
                                         x, compression_type='ZLIB'),
                                     cycle_length=64, block_length=16)
    # The sampling dataset replaces the filter dataset (lambda function
    # below); it's a faster implementation of the filter for this specific
    # predicate.
    dataset = dataset.sampling(filter_amount)
    #dataset = dataset.filter(lambda x: tf.less(
    #    tf.random_uniform([1]), filter_amount)[0])
    # TODO(amj): apply py_func for transforms here.
    if num_repeats is not None:
        dataset = dataset.repeat(num_repeats)
    else:
        dataset = dataset.repeat()
    if shuffle_examples:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.batch(batch_size)
    return dataset 
Example #10
Source File: preprocessing.py    From training with Apache License 2.0
def read_tf_records(batch_size, tf_records, num_repeats=1,
                    shuffle_records=True, shuffle_examples=True,
                    shuffle_buffer_size=None, interleave=True,
                    filter_amount=1.0):
    """
    Args:
        batch_size: batch size to return
        tf_records: a list of tf_record filenames
        num_repeats: how many times the data should be read (default: 1)
        shuffle_records: whether to shuffle the order of files read
        shuffle_examples: whether to shuffle the tf.Examples
        shuffle_buffer_size: how big of a buffer to fill before shuffling.
        interleave: whether to interleave examples from multiple tf_records
        filter_amount: what fraction of records to keep
    Returns:
        a tf dataset of batched tensors
    """
    if shuffle_examples and not shuffle_buffer_size:
        raise ValueError("Must set shuffle buffer size if shuffling examples")

    tf_records = list(tf_records)
    if shuffle_records:
        random.shuffle(tf_records)
    record_list = tf.data.Dataset.from_tensor_slices(tf_records)

    # compression_type here must agree with write_tf_examples
    map_func = functools.partial(
        tf.data.TFRecordDataset,
        buffer_size=8 * 1024 * 1024,
        compression_type='ZLIB')

    if interleave:
        # cycle_length = how many tfrecord files are read in parallel
        # The idea is to shuffle both the order of the files being read,
        # and the examples being read from the files.
        dataset = record_list.apply(tf.data.experimental.parallel_interleave(
            map_func, cycle_length=64, sloppy=True))
    else:
        dataset = record_list.flat_map(map_func)

    if filter_amount < 1.0:
        dataset = dataset.filter(
            lambda _: tf.random_uniform([]) < filter_amount)

    dataset = dataset.repeat(num_repeats)
    if shuffle_examples:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    dataset = dataset.batch(batch_size)
    return dataset
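
The batches this function returns are still serialized tf.Example protos; downstream code typically parses them with tf.parse_example. A sketch with a made-up one-feature schema:

import tensorflow as tf

dataset = read_tf_records(
    batch_size=64,
    tf_records=['selfplay-0.tfrecord.zz'],  # placeholder filename
    shuffle_examples=True,
    shuffle_buffer_size=2000)

# Hypothetical schema; the real one depends on how the records were written.
features = {'x': tf.FixedLenFeature([], tf.int64)}
parsed = dataset.map(lambda batch: tf.parse_example(batch, features))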