Python vggish_params.EMBEDDING_SIZE Examples
The following code examples show how vggish_params.EMBEDDING_SIZE is used in open-source projects. EMBEDDING_SIZE is a module-level constant defined in vggish_params.py, the hyperparameter file of the AudioSet VGGish model, not a function: it gives the dimensionality of the VGGish embedding layer (128 in the canonical release). Because the VGGish support files are vendored unchanged into many downstream projects, the same snippets recur verbatim across forks; each distinct snippet appears once below, together with the projects it comes from. You may also want to check out the other constants and functions defined in the vggish_params module.
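For orientation, here is a minimal sketch of reading the constant directly (assuming the canonical AudioSet vggish_params.py, where EMBEDDING_SIZE is defined as 128):

import vggish_params

# EMBEDDING_SIZE is a plain integer constant, not a callable.
print(vggish_params.EMBEDDING_SIZE)  # 128 in the canonical release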
Example #1
Source File: vggish_postprocess.py From Tensorflow-Audio-Classification with Apache License 2.0. The same constructor appears verbatim in yolo_v2, models, Gun-Detector, g-tensorflow-models, and object_detection_kitti (all Apache License 2.0), and in multilabel-image-classification-tensorflow, audioset_classification, and object_detection_with_tensorflow (all MIT License).
def __init__(self, pca_params_npz_path):
  """Constructs a postprocessor.

  Args:
    pca_params_npz_path: Path to a NumPy-format .npz file that
      contains the PCA parameters used in postprocessing.
  """
  params = np.load(pca_params_npz_path)
  self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME]
  # Load means into a column vector for easier broadcasting later.
  self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
  assert self._pca_matrix.shape == (
      vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), (
          'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
  assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), (
      'Bad PCA means shape: %r' % (self._pca_means.shape,))
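A minimal usage sketch for this constructor. The Postprocessor class name and the vggish_pca_params.npz filename follow their use in Example #4 below; the path is an assumption to adapt to your checkout:

import vggish_postprocess

# The released PCA parameters; the constructor's asserts check that the
# matrix is [EMBEDDING_SIZE, EMBEDDING_SIZE] and the means form a column.
pproc = vggish_postprocess.Postprocessor('vggish_pca_params.npz')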
Example #2
Source File: vggish_postprocess.py From object_detection_with_tensorflow with MIT License. The same postprocess method appears verbatim in multilabel-image-classification-tensorflow and audioset_classification (both MIT License), and in models, g-tensorflow-models, object_detection_kitti, Gun-Detector, yolo_v2, and Tensorflow-Audio-Classification (all Apache License 2.0).
def postprocess(self, embeddings_batch):
  """Applies postprocessing to a batch of embeddings.

  Args:
    embeddings_batch: An nparray of shape [batch_size, embedding_size]
      containing output from the embedding layer of VGGish.

  Returns:
    An nparray of the same shape as the input but of type uint8,
    containing the PCA-transformed and quantized version of the input.
  """
  assert len(embeddings_batch.shape) == 2, (
      'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
  assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, (
      'Bad batch shape: %r' % (embeddings_batch.shape,))

  # Apply PCA.
  # - Embeddings come in as [batch_size, embedding_size].
  # - Transpose to [embedding_size, batch_size].
  # - Subtract pca_means column vector from each column.
  # - Premultiply by PCA matrix of shape [output_dims, input_dims]
  #   where both are equal to embedding_size in our case.
  # - Transpose result back to [batch_size, embedding_size].
  pca_applied = np.dot(self._pca_matrix,
                       (embeddings_batch.T - self._pca_means)).T

  # Quantize by:
  # - clipping to [min, max] range
  clipped_embeddings = np.clip(
      pca_applied, vggish_params.QUANTIZE_MIN_VAL,
      vggish_params.QUANTIZE_MAX_VAL)
  # - convert to 8-bit in range [0.0, 255.0]
  quantized_embeddings = (
      (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) *
      (255.0 /
       (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)))
  # - cast 8-bit float to uint8
  quantized_embeddings = quantized_embeddings.astype(np.uint8)

  return quantized_embeddings
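To make the shape contract concrete, here is a small sketch that feeds postprocess a synthetic batch in place of real VGGish output (Postprocessor and vggish_pca_params.npz as in Example #4 below; the random values are placeholders, not meaningful embeddings):

import numpy as np
import vggish_params
import vggish_postprocess

pproc = vggish_postprocess.Postprocessor('vggish_pca_params.npz')

# Three synthetic "embeddings" standing in for the VGGish fc2 output.
batch = np.random.randn(3, vggish_params.EMBEDDING_SIZE).astype(np.float32)

quantized = pproc.postprocess(batch)
print(quantized.shape, quantized.dtype)  # (3, 128) uint8, with EMBEDDING_SIZE == 128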
Example #3
Source File: vggish_slim.py From sklearn-audio-transfer-learning with ISC License. The same define_vggish_slim function appears verbatim in object_detection_with_tensorflow, multilabel-image-classification-tensorflow, and audioset_classification (all MIT License), and in object_detection_kitti, Gun-Detector, g-tensorflow-models, yolo_v2, models, and Tensorflow-Audio-Classification (all Apache License 2.0).
def define_vggish_slim(training=False):
  """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope
  'vggish/'.

  The input is a placeholder named 'vggish/input_features' of type float32
  and shape [batch_size, num_frames, num_bands] where batch_size is
  variable and num_frames and num_bands are constants, and
  [num_frames, num_bands] represents a log-mel-scale spectrogram patch
  covering num_bands frequency bands and num_frames time frames (where each
  frame step is usually 10ms). This is produced by computing the stabilized
  log(mel-spectrogram + params.LOG_OFFSET). The output is an op named
  'vggish/embedding' which produces the activations of a 128-D embedding
  layer, which is usually the penultimate layer when used as part of a full
  model with a final classifier layer.

  Args:
    training: If true, all parameters are marked trainable.

  Returns:
    The op 'vggish/embedding'.
  """
  # Defaults:
  # - All weights are initialized to N(0, INIT_STDDEV).
  # - All biases are initialized to 0.
  # - All activations are ReLU.
  # - All convolutions are 3x3 with stride 1 and SAME padding.
  # - All max-pools are 2x2 with stride 2 and SAME padding.
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      weights_initializer=tf.truncated_normal_initializer(
                          stddev=params.INIT_STDDEV),
                      biases_initializer=tf.zeros_initializer(),
                      activation_fn=tf.nn.relu,
                      trainable=training), \
       slim.arg_scope([slim.conv2d],
                      kernel_size=[3, 3], stride=1, padding='SAME'), \
       slim.arg_scope([slim.max_pool2d],
                      kernel_size=[2, 2], stride=2, padding='SAME'), \
       tf.variable_scope('vggish'):
    # Input: a batch of 2-D log-mel-spectrogram patches.
    features = tf.placeholder(
        tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS),
        name='input_features')
    # Reshape to 4-D so that we can convolve a batch with conv2d().
    net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

    # The VGG stack of alternating convolutions and max-pools.
    net = slim.conv2d(net, 64, scope='conv1')
    net = slim.max_pool2d(net, scope='pool1')
    net = slim.conv2d(net, 128, scope='conv2')
    net = slim.max_pool2d(net, scope='pool2')
    net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
    net = slim.max_pool2d(net, scope='pool3')
    net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
    net = slim.max_pool2d(net, scope='pool4')

    # Flatten before entering fully-connected layers.
    net = slim.flatten(net)
    net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
    # The embedding layer.
    net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
    return tf.identity(net, name='embedding')
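A hedged end-to-end sketch of building this graph and pulling one batch of embeddings (TF1-style sessions; load_vggish_slim_checkpoint and the tensor-name constants follow their use in Example #4 below, and vggish_model.ckpt is the released checkpoint file):

import numpy as np
import tensorflow as tf
import vggish_params as params
import vggish_slim

with tf.Graph().as_default(), tf.Session() as sess:
  vggish_slim.define_vggish_slim(training=False)
  vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
  features = sess.graph.get_tensor_by_name(params.INPUT_TENSOR_NAME)
  embedding = sess.graph.get_tensor_by_name(params.OUTPUT_TENSOR_NAME)
  # One all-zeros log-mel patch standing in for real audio features.
  patch = np.zeros((1, params.NUM_FRAMES, params.NUM_BANDS), np.float32)
  [emb] = sess.run([embedding], feed_dict={features: patch})
  print(emb.shape)  # (1, 128) with EMBEDDING_SIZE == 128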
Example #4
Source File: extract_audioset_embedding.py From audioset_classification with MIT License
def extract_audioset_embedding():
  """Extract log mel spectrogram features."""

  # Arguments & parameters
  mel_bins = vggish_params.NUM_BANDS
  sample_rate = vggish_params.SAMPLE_RATE
  input_len = vggish_params.NUM_FRAMES
  embedding_size = vggish_params.EMBEDDING_SIZE

  # You may modify EXAMPLE_HOP_SECONDS in vggish_params.py to change the
  # hop size.

  # Paths
  audio_path = 'appendixes/01.wav'
  checkpoint_path = os.path.join('vggish_model.ckpt')
  pca_params_path = os.path.join('vggish_pca_params.npz')

  if not os.path.isfile(checkpoint_path):
    raise Exception('Please download vggish_model.ckpt from '
                    'https://storage.googleapis.com/audioset/vggish_model.ckpt '
                    'and put it in the root of this codebase. ')

  if not os.path.isfile(pca_params_path):
    raise Exception('Please download vggish_pca_params.npz from '
                    'https://storage.googleapis.com/audioset/vggish_pca_params.npz '
                    'and put it in the root of this codebase. ')

  # Load model
  sess = tf.Session()

  vggish_slim.define_vggish_slim(training=False)
  vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

  features_tensor = sess.graph.get_tensor_by_name(
      vggish_params.INPUT_TENSOR_NAME)
  embedding_tensor = sess.graph.get_tensor_by_name(
      vggish_params.OUTPUT_TENSOR_NAME)

  pproc = vggish_postprocess.Postprocessor(pca_params_path)

  # Read audio
  (audio, _) = read_audio(audio_path, target_fs=sample_rate)

  # Extract log mel feature
  logmel = vggish_input.waveform_to_examples(audio, sample_rate)

  # Extract embedding feature
  [embedding_batch] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: logmel})

  # PCA
  postprocessed_batch = pproc.postprocess(embedding_batch)

  print('Audio length: {}'.format(len(audio)))
  print('Log mel shape: {}'.format(logmel.shape))
  print('Embedding feature shape: {}'.format(postprocessed_batch.shape))