Python tensorflow.records() Examples
The following are 11 code examples of tensorflow.records(). You can go to the original project or source file by following the links above each example.
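All of the examples below revolve around the same write-and-read pattern for TFRecord files. As a point of reference, here is a minimal, self-contained sketch of that pattern using the TensorFlow 1.x APIs these examples rely on; it is not taken from any of the projects below, and the feature names 'payload' and 'label' are placeholders.

import tensorflow as tf


def write_minimal_record(path):
    # Build a single tf.train.Example with one bytes feature and one int64
    # feature, then write its serialized form to a TFRecord file.
    example = tf.train.Example(features=tf.train.Features(feature={
        'payload': tf.train.Feature(bytes_list=tf.train.BytesList(value=[b'hello'])),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[1])),
    }))
    with tf.python_io.TFRecordWriter(path) as writer:
        writer.write(example.SerializeToString())


def read_minimal_record(path):
    # Iterate over the serialized records and parse each one back into an Example.
    for record in tf.python_io.tf_record_iterator(path):
        example = tf.train.Example()
        example.ParseFromString(record)
        print(example.features.feature['label'].int64_list.value[0])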
Example #1
Source File: musdb_to_tfrecord.py From vimss with GNU General Public License v3.0 | 6 votes |
def main(argv):  # pylint: disable=unused-argument
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.project is None:
        raise ValueError('GCS Project must be provided.')

    if FLAGS.gcs_output_path is None:
        raise ValueError('GCS output path must be provided.')
    elif not FLAGS.gcs_output_path.startswith('gs://'):
        raise ValueError('GCS output path must start with gs://')

    if FLAGS.local_scratch_dir is None:
        raise ValueError('Scratch directory path must be provided.')

    # Download the dataset if it is not present locally
    raw_data_dir = FLAGS.raw_data_dir

    # Convert the raw data into tf-records
    training_records, test_records = convert_to_tf_records(raw_data_dir)

    # Upload to GCS
    upload_to_gcs(training_records, test_records)
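main() relies on a FLAGS object and on convert_to_tf_records/upload_to_gcs helpers defined elsewhere in musdb_to_tfrecord.py. Assuming the standard tf.app.flags pattern, the flag definitions it expects would look roughly like this (the help strings are guesses, not taken from the project):

import tensorflow as tf

# Assumed flag definitions; main() above checks exactly these four flags.
tf.app.flags.DEFINE_string('project', None, 'GCS project for the output bucket.')
tf.app.flags.DEFINE_string('gcs_output_path', None, 'GCS output path; must start with gs://.')
tf.app.flags.DEFINE_string('local_scratch_dir', None, 'Local scratch directory for intermediate files.')
tf.app.flags.DEFINE_string('raw_data_dir', None, 'Directory containing the raw dataset.')
FLAGS = tf.app.flags.FLAGS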
Example #2
Source File: urmp_to_tfrecords.py From vimss with GNU General Public License v3.0 | 6 votes |
def main(argv):  # pylint: disable=unused-argument
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.project is None:
        raise ValueError('GCS Project must be provided.')

    if FLAGS.gcs_output_path is None:
        raise ValueError('GCS output path must be provided.')
    elif not FLAGS.gcs_output_path.startswith('gs://'):
        raise ValueError('GCS output path must start with gs://')

    if FLAGS.local_scratch_dir is None:
        raise ValueError('Scratch directory path must be provided.')

    # Download the dataset if it is not present locally
    raw_data_dir = FLAGS.raw_data_dir

    # Convert the raw data into tf-records
    training_records, test_records = convert_to_tf_records(raw_data_dir)

    # Upload to GCS
    # upload_to_gcs(training_records, test_records)
Example #3
Source File: preprocessing.py From training_results_v0.5 with Apache License 2.0 | 6 votes |
def write_tf_examples(filename, tf_examples, serialize=True):
    '''
    Args:
        filename: Where to write tf.records
        tf_examples: An iterable of tf.Example
        serialize: whether to serialize the examples.
    '''
    with tf.python_io.TFRecordWriter(
            filename, options=TF_RECORD_CONFIG) as writer:
        for ex in tf_examples:
            if serialize:
                writer.write(ex.SerializeToString())
            else:
                writer.write(ex)


# Read tf.Example from files
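TF_RECORD_CONFIG is defined elsewhere in preprocessing.py. A plausible definition, consistent with the compression_type='ZLIB' used by read_tf_records further down, together with a hypothetical call (make_example and the output file name are placeholders), would be:

import tensorflow as tf

# Assumed options object; ZLIB matches the compression_type expected by
# read_tf_records below.
TF_RECORD_CONFIG = tf.python_io.TFRecordOptions(
    tf.python_io.TFRecordCompressionType.ZLIB)


def make_example(values):
    # Hypothetical helper wrapping a list of floats in a tf.train.Example.
    return tf.train.Example(features=tf.train.Features(feature={
        'values': tf.train.Feature(float_list=tf.train.FloatList(value=values)),
    }))


# write_tf_examples accepts any iterable of tf.Example protos.
write_tf_examples('examples.tfrecord.zz',
                  (make_example([0.0, 1.0, 2.0]) for _ in range(3)))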
Example #4
Source File: preprocessing.py From training_results_v0.5 with Apache License 2.0 | 6 votes |
def write_tf_examples(filename, tf_examples, serialize=True):
    '''
    Args:
        filename: Where to write tf.records
        tf_examples: An iterable of tf.Example
        serialize: whether to serialize the examples.
    '''
    with tf.python_io.TFRecordWriter(
            filename, options=TF_RECORD_CONFIG) as writer:
        for ex in tf_examples:
            if serialize:
                writer.write(ex.SerializeToString())
            else:
                writer.write(ex)


# Read tf.Example from files
Example #5
Source File: DeeProtein.py From AiGEM_TeamHeidelberg2017 with MIT License | 5 votes |
def check_data(self, tfrecords_filename):
    """Checks a specified tf.Records file for correct data format.

    Check if the data format in the example files is correct. Prints the
    shape of the data stored in a tf.Records file.

    Args:
        tfrecords_filename: `str`, the path to the `tf.records` file to check.
    """
    record_iterator = tf.python_io.tf_record_iterator(path=tfrecords_filename)

    for string_record in record_iterator:
        # Parse the next example
        example = tf.train.Example()
        example.ParseFromString(string_record)

        # Get the features you stored (change to match your tfrecord writing code)
        seq = (example.features.feature['seq_raw']
               .bytes_list
               .value[0])

        label = (example.features.feature['label_raw']
                 .bytes_list
                 .value[0])

        # Convert to a numpy array (change dtype to the datatype you stored)
        seq_array = np.fromstring(seq, dtype=np.float64)
        label_array = np.fromstring(label, dtype=np.float64)

        # Print the shapes; do they match your expectations?
        print(seq_array.shape)
        print(label_array.shape)
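check_data only reads records; the matching writer lives elsewhere in DeeProtein. Purely for context, a hypothetical writer that would produce records in the layout check_data expects (raw float64 bytes under 'seq_raw' and 'label_raw') could look like this:

import numpy as np
import tensorflow as tf


def write_seq_label_record(path, seq_array, label_array):
    # Store both arrays as raw float64 bytes so that
    # np.fromstring(..., dtype=np.float64) recovers them on the reading side.
    example = tf.train.Example(features=tf.train.Features(feature={
        'seq_raw': tf.train.Feature(bytes_list=tf.train.BytesList(
            value=[seq_array.astype(np.float64).tobytes()])),
        'label_raw': tf.train.Feature(bytes_list=tf.train.BytesList(
            value=[label_array.astype(np.float64).tobytes()])),
    }))
    with tf.python_io.TFRecordWriter(path) as writer:
        writer.write(example.SerializeToString())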
Example #6
Source File: preprocessing.py From training with Apache License 2.0 | 5 votes |
def write_tf_examples(filename, tf_examples, serialize=True):
    """
    Args:
        filename: Where to write tf.records
        tf_examples: An iterable of tf.Example
        serialize: whether to serialize the examples.
    """
    with tf.python_io.TFRecordWriter(
            filename, options=TF_RECORD_CONFIG) as writer:
        for ex in tf_examples:
            if serialize:
                writer.write(ex.SerializeToString())
            else:
                writer.write(ex)
Example #7
Source File: musdb_to_tfrecord.py From vimss with GNU General Public License v3.0 | 4 votes |
def _process_audio_files_batch(chunk_data):
    """Processes and saves a list of audio files as TFRecords.

    Args:
        chunk_data: tuple of chunk_files and output_file
            chunk_files: list of strings; each string is a path to an audio file
            output_file: string, unique identifier specifying the data set
    """
    chunk_files, output_file = chunk_data[0], chunk_data[1]

    # Get training files from the directory name
    writer = tf.python_io.TFRecordWriter(output_file)

    chunk_data_cache = list()

    for filename in chunk_files:
        # load all wave files into memory and create a buffer
        file_data_cache = list()
        for source in CHANNEL_NAMES:
            data, sr = librosa.core.load(filename + source, sr=SAMPLE_RATE, mono=True)
            file_data_cache.append([filename, len(data), data])

            # Option 1: use only tf to read and resample audio
            # audio_binary = tf.read_file(filename + source)
            # wav_decoder = contrib_audio.decode_wav(
            #     audio_binary,
            #     desired_channels=CHANNELS)

            # Option 2: use SoundFile to read binary files
            # SoundFile should be much faster, but it does not matter here
            # because we store everything in tf.records anyway.
            # with sf.SoundFile(filename + source, "r") as f:
            #     print(filename + source, f.samplerate, f.channels, len(f), f.read().tobytes())

        for segment in _get_segments_from_audio_cache(file_data_cache):
            chunk_data_cache.append(segment)

    # shuffle all segments
    shuffle_idx = make_shuffle_idx(len(chunk_data_cache))
    chunk_data_cache = [chunk_data_cache[i] for i in shuffle_idx]

    for chunk in chunk_data_cache:
        example = _convert_to_example(filename=chunk[0], sample_idx=chunk[1],
                                      data_buffer=chunk[2])
        writer.write(example.SerializeToString())

    writer.close()
    tf.logging.info('Finished writing file: %s' % output_file)
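make_shuffle_idx and _convert_to_example are helpers defined elsewhere in musdb_to_tfrecord.py. As a rough, hypothetical sketch of what they might do (the actual feature layout in vimss may differ):

import random

import numpy as np
import tensorflow as tf


def make_shuffle_idx(n):
    # A random permutation of range(n), used above to shuffle segments
    # before they are written out.
    order = list(range(n))
    random.shuffle(order)
    return order


def _convert_to_example(filename, sample_idx, data_buffer):
    # Store each segment's audio as raw float32 bytes alongside its source
    # file name and sample offset.
    return tf.train.Example(features=tf.train.Features(feature={
        'filename': tf.train.Feature(bytes_list=tf.train.BytesList(
            value=[filename.encode('utf-8')])),
        'sample_idx': tf.train.Feature(int64_list=tf.train.Int64List(
            value=[sample_idx])),
        'data': tf.train.Feature(bytes_list=tf.train.BytesList(
            value=[np.asarray(data_buffer, dtype=np.float32).tobytes()])),
    }))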
Example #8
Source File: urmp_to_tfrecords.py From vimss with GNU General Public License v3.0 | 4 votes |
def _process_audio_files_batch(chunk_data):
    """Processes and saves a list of audio files as TFRecords.

    Args:
        chunk_data: tuple of chunk_files and output_file
            chunk_files: list of strings; each string is a path to a wav file
            output_file: string, unique identifier specifying the data set
    """
    chunk_files, output_file = chunk_data[0], chunk_data[1]

    # Get training files from the directory name
    writer = tf.python_io.TFRecordWriter(output_file)

    chunk_data_cache = list()

    for track in chunk_files:
        # load all wave files into memory and create a buffer
        file_data_cache = list()
        for source in track:
            data, sr = librosa.core.load(source, sr=SAMPLE_RATE, mono=True)
            file_data_cache.append([track, len(data), data])

            # Option 1: use only tf to read and resample audio
            # audio_binary = tf.read_file(filename + source)
            # wav_decoder = contrib_audio.decode_wav(
            #     audio_binary,
            #     desired_channels=CHANNELS)

            # Option 2: use SoundFile to read binary files
            # SoundFile should be much faster, but it does not matter here
            # because we store everything in tf.records anyway.
            # with sf.SoundFile(filename + source, "r") as f:
            #     print(filename + source, f.samplerate, f.channels, len(f), f.read().tobytes())

        for segment in _get_segments_from_audio_cache(file_data_cache):
            chunk_data_cache.append(segment)

    # shuffle all segments
    shuffle_idx = make_shuffle_idx(len(chunk_data_cache))
    chunk_data_cache = [chunk_data_cache[i] for i in shuffle_idx]

    for chunk in chunk_data_cache:
        labels = get_labels_from_filename(chunk[0])
        example = _convert_to_example(filename=chunk[0], sample_idx=chunk[1],
                                      data_buffer=chunk[2], num_sources=chunk[3],
                                      labels=labels)
        writer.write(example.SerializeToString())

    writer.close()
    tf.logging.info('Finished writing file: %s' % output_file)
Example #9
Source File: preprocessing.py From training_results_v0.5 with Apache License 2.0 | 4 votes |
def read_tf_records(batch_size, tf_records, num_repeats=None,
                    shuffle_records=True, shuffle_examples=True,
                    shuffle_buffer_size=None, filter_amount=1.0):
    '''
    Args:
        batch_size: batch size to return
        tf_records: a list of tf_record filenames
        num_repeats: how many times the data should be read (default: infinite)
        shuffle_records: whether to shuffle the order of files read
        shuffle_examples: whether to shuffle the tf.Examples
        shuffle_buffer_size: how big of a buffer to fill before shuffling.
        filter_amount: what fraction of records to keep
    Returns:
        a tf dataset of batched tensors
    '''
    if shuffle_buffer_size is None:
        shuffle_buffer_size = SHUFFLE_BUFFER_SIZE
    if shuffle_records:
        random.shuffle(tf_records)
    record_list = tf.data.Dataset.from_tensor_slices(tf_records)

    # compression_type here must agree with write_tf_examples
    # cycle_length = how many tfrecord files are read in parallel
    # block_length = how many tf.Examples are read from each file before
    #     moving to the next file
    # The idea is to shuffle both the order of the files being read,
    # and the examples being read from the files.
    dataset = record_list.interleave(
        lambda x: tf.data.TFRecordDataset(x, compression_type='ZLIB'),
        cycle_length=64, block_length=16)

    # The sampling dataset replaces the filter dataset with the lambda function
    # below. It's a faster implementation of the filter dataset with this
    # specific lambda function.
    # dataset = dataset.sampling(filter_amount)
    dataset = dataset.filter(lambda x: tf.less(
        tf.random_uniform([1]), filter_amount)[0])

    # TODO(amj): apply py_func for transforms here.
    if num_repeats is not None:
        dataset = dataset.repeat(num_repeats)
    else:
        dataset = dataset.repeat()
    if shuffle_examples:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.batch(batch_size)
    return dataset
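A minimal TF 1.x consumption sketch (the shard file names are placeholders). The returned dataset yields batches of still-serialized tf.Example strings, which the caller would normally decode with tf.parse_example and a project-specific feature spec:

import tensorflow as tf

dataset = read_tf_records(
    batch_size=16,
    tf_records=['shard-00000.tfrecord.zz', 'shard-00001.tfrecord.zz'],
    num_repeats=1,
    shuffle_buffer_size=2000)
batch = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    serialized = sess.run(batch)  # a batch of 16 serialized tf.Example strings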
Example #10
Source File: preprocessing.py From training_results_v0.5 with Apache License 2.0 | 4 votes |
def read_tf_records(batch_size, tf_records, num_repeats=None,
                    shuffle_records=True, shuffle_examples=True,
                    shuffle_buffer_size=None, filter_amount=1.0):
    '''
    Args:
        batch_size: batch size to return
        tf_records: a list of tf_record filenames
        num_repeats: how many times the data should be read (default: infinite)
        shuffle_records: whether to shuffle the order of files read
        shuffle_examples: whether to shuffle the tf.Examples
        shuffle_buffer_size: how big of a buffer to fill before shuffling.
        filter_amount: what fraction of records to keep
    Returns:
        a tf dataset of batched tensors
    '''
    if shuffle_buffer_size is None:
        shuffle_buffer_size = SHUFFLE_BUFFER_SIZE
    if shuffle_records:
        random.shuffle(tf_records)
    record_list = tf.data.Dataset.from_tensor_slices(tf_records)

    # compression_type here must agree with write_tf_examples
    # cycle_length = how many tfrecord files are read in parallel
    # block_length = how many tf.Examples are read from each file before
    #     moving to the next file
    # The idea is to shuffle both the order of the files being read,
    # and the examples being read from the files.
    dataset = record_list.interleave(
        lambda x: tf.data.TFRecordDataset(x, compression_type='ZLIB'),
        cycle_length=64, block_length=16)

    # The sampling dataset replaces the filter dataset with the lambda function
    # below. It's a faster implementation of the filter dataset with this
    # specific lambda function.
    dataset = dataset.sampling(filter_amount)
    # dataset = dataset.filter(lambda x: tf.less(
    #     tf.random_uniform([1]), filter_amount)[0])

    # TODO(amj): apply py_func for transforms here.
    if num_repeats is not None:
        dataset = dataset.repeat(num_repeats)
    else:
        dataset = dataset.repeat()
    if shuffle_examples:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.batch(batch_size)
    return dataset
Example #11
Source File: preprocessing.py From training with Apache License 2.0 | 4 votes |
def read_tf_records(batch_size, tf_records, num_repeats=1,
                    shuffle_records=True, shuffle_examples=True,
                    shuffle_buffer_size=None, interleave=True,
                    filter_amount=1.0):
    """
    Args:
        batch_size: batch size to return
        tf_records: a list of tf_record filenames
        num_repeats: how many times the data should be read (default: one)
        shuffle_records: whether to shuffle the order of files read
        shuffle_examples: whether to shuffle the tf.Examples
        shuffle_buffer_size: how big of a buffer to fill before shuffling.
        interleave: whether to interleave examples from multiple tf_records
        filter_amount: what fraction of records to keep
    Returns:
        a tf dataset of batched tensors
    """
    if shuffle_examples and not shuffle_buffer_size:
        raise ValueError("Must set shuffle buffer size if shuffling examples")

    tf_records = list(tf_records)
    if shuffle_records:
        random.shuffle(tf_records)
    record_list = tf.data.Dataset.from_tensor_slices(tf_records)

    # compression_type here must agree with write_tf_examples
    map_func = functools.partial(
        tf.data.TFRecordDataset,
        buffer_size=8 * 1024 * 1024,
        compression_type='ZLIB')

    if interleave:
        # cycle_length = how many tfrecord files are read in parallel
        # The idea is to shuffle both the order of the files being read,
        # and the examples being read from the files.
        dataset = record_list.apply(tf.data.experimental.parallel_interleave(
            map_func, cycle_length=64, sloppy=True))
    else:
        dataset = record_list.flat_map(map_func)

    if filter_amount < 1.0:
        dataset = dataset.filter(
            lambda _: tf.random_uniform([]) < filter_amount)

    dataset = dataset.repeat(num_repeats)
    if shuffle_examples:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    dataset = dataset.batch(batch_size)
    return dataset
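Usage of this newer variant is similar, except that shuffle_buffer_size must be passed explicitly whenever shuffle_examples is on. A minimal sketch, with placeholder file names and a placeholder feature spec for decoding the batched, still-serialized records:

import tensorflow as tf

dataset = read_tf_records(
    batch_size=16,
    tf_records=['shard-00000.tfrecord.zz'],
    shuffle_examples=True,
    shuffle_buffer_size=2000)
serialized_batch = dataset.make_one_shot_iterator().get_next()
# The feature names below are placeholders; the real spec depends on how the
# examples were written.
parsed = tf.parse_example(serialized_batch, features={
    'features': tf.FixedLenFeature([], tf.string),
    'label': tf.FixedLenFeature([], tf.string),
})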