Python Examples of vggish_params.EMBEDDING

Source File: vggish_postprocess.py From Tensorflow-Audio-Classification with Apache License 2.0

6 votes

def __init__(self, pca_params_npz_path):
    """Constructs a postprocessor.

    Args:
      pca_params_npz_path: Path to a NumPy-format .npz file that
        contains the PCA parameters used in postprocessing.
    """
    params = np.load(pca_params_npz_path)
    self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME]
    # Load means into a column vector for easier broadcasting later.
    self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
    assert self._pca_matrix.shape == (
        vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), (
            'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
    assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), (
        'Bad PCA means shape: %r' % (self._pca_means.shape,))

Source File: vggish_postprocess.py From multilabel-image-classification-tensorflow with MIT License

6 votes

def __init__(self, pca_params_npz_path):
    """Constructs a postprocessor.

    Args:
      pca_params_npz_path: Path to a NumPy-format .npz file that
        contains the PCA parameters used in postprocessing.
    """
    params = np.load(pca_params_npz_path)
    self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME]
    # Load means into a column vector for easier broadcasting later.
    self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
    assert self._pca_matrix.shape == (
        vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), (
            'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
    assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), (
        'Bad PCA means shape: %r' % (self._pca_means.shape,))

Source File: vggish_postprocess.py From yolo_v2 with Apache License 2.0

6 votes

def __init__(self, pca_params_npz_path):
    """Constructs a postprocessor.

    Args:
      pca_params_npz_path: Path to a NumPy-format .npz file that
        contains the PCA parameters used in postprocessing.
    """
    params = np.load(pca_params_npz_path)
    self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME]
    # Load means into a column vector for easier broadcasting later.
    self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
    assert self._pca_matrix.shape == (
        vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), (
            'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
    assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), (
        'Bad PCA means shape: %r' % (self._pca_means.shape,))

Source File: vggish_postprocess.py From models with Apache License 2.0

6 votes

def __init__(self, pca_params_npz_path):
    """Constructs a postprocessor.

    Args:
      pca_params_npz_path: Path to a NumPy-format .npz file that
        contains the PCA parameters used in postprocessing.
    """
    params = np.load(pca_params_npz_path)
    self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME]
    # Load means into a column vector for easier broadcasting later.
    self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
    assert self._pca_matrix.shape == (
        vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), (
            'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
    assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), (
        'Bad PCA means shape: %r' % (self._pca_means.shape,))

Source File: vggish_postprocess.py From Gun-Detector with Apache License 2.0

6 votes

def __init__(self, pca_params_npz_path):
    """Constructs a postprocessor.

    Args:
      pca_params_npz_path: Path to a NumPy-format .npz file that
        contains the PCA parameters used in postprocessing.
    """
    params = np.load(pca_params_npz_path)
    self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME]
    # Load means into a column vector for easier broadcasting later.
    self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
    assert self._pca_matrix.shape == (
        vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), (
            'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
    assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), (
        'Bad PCA means shape: %r' % (self._pca_means.shape,))

Source File: vggish_postprocess.py From g-tensorflow-models with Apache License 2.0

6 votes

def __init__(self, pca_params_npz_path):
    """Constructs a postprocessor.

    Args:
      pca_params_npz_path: Path to a NumPy-format .npz file that
        contains the PCA parameters used in postprocessing.
    """
    params = np.load(pca_params_npz_path)
    self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME]
    # Load means into a column vector for easier broadcasting later.
    self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
    assert self._pca_matrix.shape == (
        vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), (
            'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
    assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), (
        'Bad PCA means shape: %r' % (self._pca_means.shape,))

Source File: vggish_postprocess.py From object_detection_kitti with Apache License 2.0

6 votes

def __init__(self, pca_params_npz_path):
    """Constructs a postprocessor.

    Args:
      pca_params_npz_path: Path to a NumPy-format .npz file that
        contains the PCA parameters used in postprocessing.
    """
    params = np.load(pca_params_npz_path)
    self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME]
    # Load means into a column vector for easier broadcasting later.
    self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
    assert self._pca_matrix.shape == (
        vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), (
            'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
    assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), (
        'Bad PCA means shape: %r' % (self._pca_means.shape,))

Source File: vggish_postprocess.py From audioset_classification with MIT License

6 votes

def __init__(self, pca_params_npz_path):
    """Constructs a postprocessor.

    Args:
      pca_params_npz_path: Path to a NumPy-format .npz file that
        contains the PCA parameters used in postprocessing.
    """
    params = np.load(pca_params_npz_path)
    self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME]
    # Load means into a column vector for easier broadcasting later.
    self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
    assert self._pca_matrix.shape == (
        vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), (
            'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
    assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), (
        'Bad PCA means shape: %r' % (self._pca_means.shape,))

Source File: vggish_postprocess.py From object_detection_with_tensorflow with MIT License

6 votes

def __init__(self, pca_params_npz_path):
    """Constructs a postprocessor.

    Args:
      pca_params_npz_path: Path to a NumPy-format .npz file that
        contains the PCA parameters used in postprocessing.
    """
    params = np.load(pca_params_npz_path)
    self._pca_matrix = params[vggish_params.PCA_EIGEN_VECTORS_NAME]
    # Load means into a column vector for easier broadcasting later.
    self._pca_means = params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
    assert self._pca_matrix.shape == (
        vggish_params.EMBEDDING_SIZE, vggish_params.EMBEDDING_SIZE), (
            'Bad PCA matrix shape: %r' % (self._pca_matrix.shape,))
    assert self._pca_means.shape == (vggish_params.EMBEDDING_SIZE, 1), (
        'Bad PCA means shape: %r' % (self._pca_means.shape,))

Source File: vggish_postprocess.py From object_detection_with_tensorflow with MIT License

5 votes

def postprocess(self, embeddings_batch):
    """Applies postprocessing to a batch of embeddings.

    Args:
      embeddings_batch: An nparray of shape [batch_size, embedding_size]
        containing output from the embedding layer of VGGish.

    Returns:
      An nparray of the same shape as the input but of type uint8,
      containing the PCA-transformed and quantized version of the input.
    """
    assert len(embeddings_batch.shape) == 2, (
        'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
    assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, (
        'Bad batch shape: %r' % (embeddings_batch.shape,))

    # Apply PCA.
    # - Embeddings come in as [batch_size, embedding_size].
    # - Transpose to [embedding_size, batch_size].
    # - Subtract pca_means column vector from each column.
    # - Premultiply by PCA matrix of shape [output_dims, input_dims]
    #   where both are are equal to embedding_size in our case.
    # - Transpose result back to [batch_size, embedding_size].
    pca_applied = np.dot(self._pca_matrix,
                         (embeddings_batch.T - self._pca_means)).T

    # Quantize by:
    # - clipping to [min, max] range
    clipped_embeddings = np.clip(
        pca_applied, vggish_params.QUANTIZE_MIN_VAL,
        vggish_params.QUANTIZE_MAX_VAL)
    # - convert to 8-bit in range [0.0, 255.0]
    quantized_embeddings = (
        (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) *
        (255.0 /
         (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)))
    # - cast 8-bit float to uint8
    quantized_embeddings = quantized_embeddings.astype(np.uint8)

    return quantized_embeddings

Source File: vggish_postprocess.py From multilabel-image-classification-tensorflow with MIT License

5 votes

def postprocess(self, embeddings_batch):
    """Applies postprocessing to a batch of embeddings.

    Args:
      embeddings_batch: An nparray of shape [batch_size, embedding_size]
        containing output from the embedding layer of VGGish.

    Returns:
      An nparray of the same shape as the input but of type uint8,
      containing the PCA-transformed and quantized version of the input.
    """
    assert len(embeddings_batch.shape) == 2, (
        'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
    assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, (
        'Bad batch shape: %r' % (embeddings_batch.shape,))

    # Apply PCA.
    # - Embeddings come in as [batch_size, embedding_size].
    # - Transpose to [embedding_size, batch_size].
    # - Subtract pca_means column vector from each column.
    # - Premultiply by PCA matrix of shape [output_dims, input_dims]
    #   where both are are equal to embedding_size in our case.
    # - Transpose result back to [batch_size, embedding_size].
    pca_applied = np.dot(self._pca_matrix,
                         (embeddings_batch.T - self._pca_means)).T

    # Quantize by:
    # - clipping to [min, max] range
    clipped_embeddings = np.clip(
        pca_applied, vggish_params.QUANTIZE_MIN_VAL,
        vggish_params.QUANTIZE_MAX_VAL)
    # - convert to 8-bit in range [0.0, 255.0]
    quantized_embeddings = (
        (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) *
        (255.0 /
         (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)))
    # - cast 8-bit float to uint8
    quantized_embeddings = quantized_embeddings.astype(np.uint8)

    return quantized_embeddings

Source File: vggish_postprocess.py From models with Apache License 2.0

5 votes

def postprocess(self, embeddings_batch):
    """Applies postprocessing to a batch of embeddings.

    Args:
      embeddings_batch: An nparray of shape [batch_size, embedding_size]
        containing output from the embedding layer of VGGish.

    Returns:
      An nparray of the same shape as the input but of type uint8,
      containing the PCA-transformed and quantized version of the input.
    """
    assert len(embeddings_batch.shape) == 2, (
        'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
    assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, (
        'Bad batch shape: %r' % (embeddings_batch.shape,))

    # Apply PCA.
    # - Embeddings come in as [batch_size, embedding_size].
    # - Transpose to [embedding_size, batch_size].
    # - Subtract pca_means column vector from each column.
    # - Premultiply by PCA matrix of shape [output_dims, input_dims]
    #   where both are are equal to embedding_size in our case.
    # - Transpose result back to [batch_size, embedding_size].
    pca_applied = np.dot(self._pca_matrix,
                         (embeddings_batch.T - self._pca_means)).T

    # Quantize by:
    # - clipping to [min, max] range
    clipped_embeddings = np.clip(
        pca_applied, vggish_params.QUANTIZE_MIN_VAL,
        vggish_params.QUANTIZE_MAX_VAL)
    # - convert to 8-bit in range [0.0, 255.0]
    quantized_embeddings = (
        (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) *
        (255.0 /
         (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)))
    # - cast 8-bit float to uint8
    quantized_embeddings = quantized_embeddings.astype(np.uint8)

    return quantized_embeddings

Source File: vggish_postprocess.py From g-tensorflow-models with Apache License 2.0

5 votes

def postprocess(self, embeddings_batch):
    """Applies postprocessing to a batch of embeddings.

    Args:
      embeddings_batch: An nparray of shape [batch_size, embedding_size]
        containing output from the embedding layer of VGGish.

    Returns:
      An nparray of the same shape as the input but of type uint8,
      containing the PCA-transformed and quantized version of the input.
    """
    assert len(embeddings_batch.shape) == 2, (
        'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
    assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, (
        'Bad batch shape: %r' % (embeddings_batch.shape,))

    # Apply PCA.
    # - Embeddings come in as [batch_size, embedding_size].
    # - Transpose to [embedding_size, batch_size].
    # - Subtract pca_means column vector from each column.
    # - Premultiply by PCA matrix of shape [output_dims, input_dims]
    #   where both are are equal to embedding_size in our case.
    # - Transpose result back to [batch_size, embedding_size].
    pca_applied = np.dot(self._pca_matrix,
                         (embeddings_batch.T - self._pca_means)).T

    # Quantize by:
    # - clipping to [min, max] range
    clipped_embeddings = np.clip(
        pca_applied, vggish_params.QUANTIZE_MIN_VAL,
        vggish_params.QUANTIZE_MAX_VAL)
    # - convert to 8-bit in range [0.0, 255.0]
    quantized_embeddings = (
        (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) *
        (255.0 /
         (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)))
    # - cast 8-bit float to uint8
    quantized_embeddings = quantized_embeddings.astype(np.uint8)

    return quantized_embeddings

Source File: vggish_postprocess.py From audioset_classification with MIT License

5 votes

def postprocess(self, embeddings_batch):
    """Applies postprocessing to a batch of embeddings.

    Args:
      embeddings_batch: An nparray of shape [batch_size, embedding_size]
        containing output from the embedding layer of VGGish.

    Returns:
      An nparray of the same shape as the input but of type uint8,
      containing the PCA-transformed and quantized version of the input.
    """
    assert len(embeddings_batch.shape) == 2, (
        'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
    assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, (
        'Bad batch shape: %r' % (embeddings_batch.shape,))

    # Apply PCA.
    # - Embeddings come in as [batch_size, embedding_size].
    # - Transpose to [embedding_size, batch_size].
    # - Subtract pca_means column vector from each column.
    # - Premultiply by PCA matrix of shape [output_dims, input_dims]
    #   where both are are equal to embedding_size in our case.
    # - Transpose result back to [batch_size, embedding_size].
    pca_applied = np.dot(self._pca_matrix,
                         (embeddings_batch.T - self._pca_means)).T

    # Quantize by:
    # - clipping to [min, max] range
    clipped_embeddings = np.clip(
        pca_applied, vggish_params.QUANTIZE_MIN_VAL,
        vggish_params.QUANTIZE_MAX_VAL)
    # - convert to 8-bit in range [0.0, 255.0]
    quantized_embeddings = (
        (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) *
        (255.0 /
         (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)))
    # - cast 8-bit float to uint8
    quantized_embeddings = quantized_embeddings.astype(np.uint8)

    return quantized_embeddings

Source File: vggish_postprocess.py From object_detection_kitti with Apache License 2.0

5 votes

def postprocess(self, embeddings_batch):
    """Applies postprocessing to a batch of embeddings.

    Args:
      embeddings_batch: An nparray of shape [batch_size, embedding_size]
        containing output from the embedding layer of VGGish.

    Returns:
      An nparray of the same shape as the input but of type uint8,
      containing the PCA-transformed and quantized version of the input.
    """
    assert len(embeddings_batch.shape) == 2, (
        'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
    assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, (
        'Bad batch shape: %r' % (embeddings_batch.shape,))

    # Apply PCA.
    # - Embeddings come in as [batch_size, embedding_size].
    # - Transpose to [embedding_size, batch_size].
    # - Subtract pca_means column vector from each column.
    # - Premultiply by PCA matrix of shape [output_dims, input_dims]
    #   where both are are equal to embedding_size in our case.
    # - Transpose result back to [batch_size, embedding_size].
    pca_applied = np.dot(self._pca_matrix,
                         (embeddings_batch.T - self._pca_means)).T

    # Quantize by:
    # - clipping to [min, max] range
    clipped_embeddings = np.clip(
        pca_applied, vggish_params.QUANTIZE_MIN_VAL,
        vggish_params.QUANTIZE_MAX_VAL)
    # - convert to 8-bit in range [0.0, 255.0]
    quantized_embeddings = (
        (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) *
        (255.0 /
         (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)))
    # - cast 8-bit float to uint8
    quantized_embeddings = quantized_embeddings.astype(np.uint8)

    return quantized_embeddings

Source File: vggish_postprocess.py From Gun-Detector with Apache License 2.0

5 votes

def postprocess(self, embeddings_batch):
    """Applies postprocessing to a batch of embeddings.

    Args:
      embeddings_batch: An nparray of shape [batch_size, embedding_size]
        containing output from the embedding layer of VGGish.

    Returns:
      An nparray of the same shape as the input but of type uint8,
      containing the PCA-transformed and quantized version of the input.
    """
    assert len(embeddings_batch.shape) == 2, (
        'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
    assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, (
        'Bad batch shape: %r' % (embeddings_batch.shape,))

    # Apply PCA.
    # - Embeddings come in as [batch_size, embedding_size].
    # - Transpose to [embedding_size, batch_size].
    # - Subtract pca_means column vector from each column.
    # - Premultiply by PCA matrix of shape [output_dims, input_dims]
    #   where both are are equal to embedding_size in our case.
    # - Transpose result back to [batch_size, embedding_size].
    pca_applied = np.dot(self._pca_matrix,
                         (embeddings_batch.T - self._pca_means)).T

    # Quantize by:
    # - clipping to [min, max] range
    clipped_embeddings = np.clip(
        pca_applied, vggish_params.QUANTIZE_MIN_VAL,
        vggish_params.QUANTIZE_MAX_VAL)
    # - convert to 8-bit in range [0.0, 255.0]
    quantized_embeddings = (
        (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) *
        (255.0 /
         (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)))
    # - cast 8-bit float to uint8
    quantized_embeddings = quantized_embeddings.astype(np.uint8)

    return quantized_embeddings

Source File: vggish_postprocess.py From yolo_v2 with Apache License 2.0

5 votes

def postprocess(self, embeddings_batch):
    """Applies postprocessing to a batch of embeddings.

    Args:
      embeddings_batch: An nparray of shape [batch_size, embedding_size]
        containing output from the embedding layer of VGGish.

    Returns:
      An nparray of the same shape as the input but of type uint8,
      containing the PCA-transformed and quantized version of the input.
    """
    assert len(embeddings_batch.shape) == 2, (
        'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
    assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, (
        'Bad batch shape: %r' % (embeddings_batch.shape,))

    # Apply PCA.
    # - Embeddings come in as [batch_size, embedding_size].
    # - Transpose to [embedding_size, batch_size].
    # - Subtract pca_means column vector from each column.
    # - Premultiply by PCA matrix of shape [output_dims, input_dims]
    #   where both are are equal to embedding_size in our case.
    # - Transpose result back to [batch_size, embedding_size].
    pca_applied = np.dot(self._pca_matrix,
                         (embeddings_batch.T - self._pca_means)).T

    # Quantize by:
    # - clipping to [min, max] range
    clipped_embeddings = np.clip(
        pca_applied, vggish_params.QUANTIZE_MIN_VAL,
        vggish_params.QUANTIZE_MAX_VAL)
    # - convert to 8-bit in range [0.0, 255.0]
    quantized_embeddings = (
        (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) *
        (255.0 /
         (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)))
    # - cast 8-bit float to uint8
    quantized_embeddings = quantized_embeddings.astype(np.uint8)

    return quantized_embeddings

Source File: vggish_postprocess.py From Tensorflow-Audio-Classification with Apache License 2.0

5 votes

def postprocess(self, embeddings_batch):
    """Applies postprocessing to a batch of embeddings.

    Args:
      embeddings_batch: An nparray of shape [batch_size, embedding_size]
        containing output from the embedding layer of VGGish.

    Returns:
      An nparray of the same shape as the input but of type uint8,
      containing the PCA-transformed and quantized version of the input.
    """
    assert len(embeddings_batch.shape) == 2, (
        'Expected 2-d batch, got %r' % (embeddings_batch.shape,))
    assert embeddings_batch.shape[1] == vggish_params.EMBEDDING_SIZE, (
        'Bad batch shape: %r' % (embeddings_batch.shape,))

    # Apply PCA.
    # - Embeddings come in as [batch_size, embedding_size].
    # - Transpose to [embedding_size, batch_size].
    # - Subtract pca_means column vector from each column.
    # - Premultiply by PCA matrix of shape [output_dims, input_dims]
    #   where both are are equal to embedding_size in our case.
    # - Transpose result back to [batch_size, embedding_size].
    pca_applied = np.dot(self._pca_matrix,
                         (embeddings_batch.T - self._pca_means)).T

    # Quantize by:
    # - clipping to [min, max] range
    clipped_embeddings = np.clip(
        pca_applied, vggish_params.QUANTIZE_MIN_VAL,
        vggish_params.QUANTIZE_MAX_VAL)
    # - convert to 8-bit in range [0.0, 255.0]
    quantized_embeddings = (
        (clipped_embeddings - vggish_params.QUANTIZE_MIN_VAL) *
        (255.0 /
         (vggish_params.QUANTIZE_MAX_VAL - vggish_params.QUANTIZE_MIN_VAL)))
    # - cast 8-bit float to uint8
    quantized_embeddings = quantized_embeddings.astype(np.uint8)

    return quantized_embeddings

Source File: vggish_slim.py From sklearn-audio-transfer-learning with ISC License

4 votes

def define_vggish_slim(training=False):
  """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope 'vggish/'.

  The input is a placeholder named 'vggish/input_features' of type float32 and
  shape [batch_size, num_frames, num_bands] where batch_size is variable and
  num_frames and num_bands are constants, and [num_frames, num_bands] represents
  a log-mel-scale spectrogram patch covering num_bands frequency bands and
  num_frames time frames (where each frame step is usually 10ms). This is
  produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET).
  The output is an op named 'vggish/embedding' which produces the activations of
  a 128-D embedding layer, which is usually the penultimate layer when used as
  part of a full model with a final classifier layer.

  Args:
    training: If true, all parameters are marked trainable.

  Returns:
    The op 'vggish/embeddings'.
  """
  # Defaults:
  # - All weights are initialized to N(0, INIT_STDDEV).
  # - All biases are initialized to 0.
  # - All activations are ReLU.
  # - All convolutions are 3x3 with stride 1 and SAME padding.
  # - All max-pools are 2x2 with stride 2 and SAME padding.
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      weights_initializer=tf.truncated_normal_initializer(
                          stddev=params.INIT_STDDEV),
                      biases_initializer=tf.zeros_initializer(),
                      activation_fn=tf.nn.relu,
                      trainable=training), \
       slim.arg_scope([slim.conv2d],
                      kernel_size=[3, 3], stride=1, padding='SAME'), \
       slim.arg_scope([slim.max_pool2d],
                      kernel_size=[2, 2], stride=2, padding='SAME'), \
       tf.variable_scope('vggish'):
    # Input: a batch of 2-D log-mel-spectrogram patches.
    features = tf.placeholder(
        tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS),
        name='input_features')
    # Reshape to 4-D so that we can convolve a batch with conv2d().
    net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

    # The VGG stack of alternating convolutions and max-pools.
    net = slim.conv2d(net, 64, scope='conv1')
    net = slim.max_pool2d(net, scope='pool1')
    net = slim.conv2d(net, 128, scope='conv2')
    net = slim.max_pool2d(net, scope='pool2')
    net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
    net = slim.max_pool2d(net, scope='pool3')
    net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
    net = slim.max_pool2d(net, scope='pool4')

    # Flatten before entering fully-connected layers
    net = slim.flatten(net)
    net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
    # The embedding layer.
    net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
    return tf.identity(net, name='embedding')

Source File: vggish_slim.py From object_detection_with_tensorflow with MIT License

4 votes

def define_vggish_slim(training=False):
  """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope 'vggish/'.

  The input is a placeholder named 'vggish/input_features' of type float32 and
  shape [batch_size, num_frames, num_bands] where batch_size is variable and
  num_frames and num_bands are constants, and [num_frames, num_bands] represents
  a log-mel-scale spectrogram patch covering num_bands frequency bands and
  num_frames time frames (where each frame step is usually 10ms). This is
  produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET).
  The output is an op named 'vggish/embedding' which produces the activations of
  a 128-D embedding layer, which is usually the penultimate layer when used as
  part of a full model with a final classifier layer.

  Args:
    training: If true, all parameters are marked trainable.

  Returns:
    The op 'vggish/embeddings'.
  """
  # Defaults:
  # - All weights are initialized to N(0, INIT_STDDEV).
  # - All biases are initialized to 0.
  # - All activations are ReLU.
  # - All convolutions are 3x3 with stride 1 and SAME padding.
  # - All max-pools are 2x2 with stride 2 and SAME padding.
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      weights_initializer=tf.truncated_normal_initializer(
                          stddev=params.INIT_STDDEV),
                      biases_initializer=tf.zeros_initializer(),
                      activation_fn=tf.nn.relu,
                      trainable=training), \
       slim.arg_scope([slim.conv2d],
                      kernel_size=[3, 3], stride=1, padding='SAME'), \
       slim.arg_scope([slim.max_pool2d],
                      kernel_size=[2, 2], stride=2, padding='SAME'), \
       tf.variable_scope('vggish'):
    # Input: a batch of 2-D log-mel-spectrogram patches.
    features = tf.placeholder(
        tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS),
        name='input_features')
    # Reshape to 4-D so that we can convolve a batch with conv2d().
    net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

    # The VGG stack of alternating convolutions and max-pools.
    net = slim.conv2d(net, 64, scope='conv1')
    net = slim.max_pool2d(net, scope='pool1')
    net = slim.conv2d(net, 128, scope='conv2')
    net = slim.max_pool2d(net, scope='pool2')
    net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
    net = slim.max_pool2d(net, scope='pool3')
    net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
    net = slim.max_pool2d(net, scope='pool4')

    # Flatten before entering fully-connected layers
    net = slim.flatten(net)
    net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
    # The embedding layer.
    net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
    return tf.identity(net, name='embedding')

Source File: extract_audioset_embedding.py From audioset_classification with MIT License

4 votes

def extract_audioset_embedding():
    """Extract log mel spectrogram features. 
    """
    
    # Arguments & parameters
    mel_bins = vggish_params.NUM_BANDS
    sample_rate = vggish_params.SAMPLE_RATE
    input_len = vggish_params.NUM_FRAMES
    embedding_size = vggish_params.EMBEDDING_SIZE
    
    '''You may modify the EXAMPLE_HOP_SECONDS in vggish_params.py to change the 
    hop size. '''

    # Paths
    audio_path = 'appendixes/01.wav'
    checkpoint_path = os.path.join('vggish_model.ckpt')
    pcm_params_path = os.path.join('vggish_pca_params.npz')
    
    if not os.path.isfile(checkpoint_path):
        raise Exception('Please download vggish_model.ckpt from '
            'https://storage.googleapis.com/audioset/vggish_model.ckpt '
            'and put it in the root of this codebase. ')
        
    if not os.path.isfile(pcm_params_path):
        raise Exception('Please download pcm_params_path from '
        'https://storage.googleapis.com/audioset/vggish_pca_params.npz '
        'and put it in the root of this codebase. ')
    
    # Load model
    sess = tf.Session()
    
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
    features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)
    
    pproc = vggish_postprocess.Postprocessor(pcm_params_path)

    # Read audio
    (audio, _) = read_audio(audio_path, target_fs=sample_rate)
    
    # Extract log mel feature
    logmel = vggish_input.waveform_to_examples(audio, sample_rate)

    # Extract embedding feature
    [embedding_batch] = sess.run([embedding_tensor], feed_dict={features_tensor: logmel})
    
    # PCA
    postprocessed_batch = pproc.postprocess(embedding_batch)
    
    print('Audio length: {}'.format(len(audio)))
    print('Log mel shape: {}'.format(logmel.shape))
    print('Embedding feature shape: {}'.format(postprocessed_batch.shape))

Source File: vggish_slim.py From object_detection_kitti with Apache License 2.0

4 votes

def define_vggish_slim(training=False):
  """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope 'vggish/'.

  The input is a placeholder named 'vggish/input_features' of type float32 and
  shape [batch_size, num_frames, num_bands] where batch_size is variable and
  num_frames and num_bands are constants, and [num_frames, num_bands] represents
  a log-mel-scale spectrogram patch covering num_bands frequency bands and
  num_frames time frames (where each frame step is usually 10ms). This is
  produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET).
  The output is an op named 'vggish/embedding' which produces the activations of
  a 128-D embedding layer, which is usually the penultimate layer when used as
  part of a full model with a final classifier layer.

  Args:
    training: If true, all parameters are marked trainable.

  Returns:
    The op 'vggish/embeddings'.
  """
  # Defaults:
  # - All weights are initialized to N(0, INIT_STDDEV).
  # - All biases are initialized to 0.
  # - All activations are ReLU.
  # - All convolutions are 3x3 with stride 1 and SAME padding.
  # - All max-pools are 2x2 with stride 2 and SAME padding.
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      weights_initializer=tf.truncated_normal_initializer(
                          stddev=params.INIT_STDDEV),
                      biases_initializer=tf.zeros_initializer(),
                      activation_fn=tf.nn.relu,
                      trainable=training), \
       slim.arg_scope([slim.conv2d],
                      kernel_size=[3, 3], stride=1, padding='SAME'), \
       slim.arg_scope([slim.max_pool2d],
                      kernel_size=[2, 2], stride=2, padding='SAME'), \
       tf.variable_scope('vggish'):
    # Input: a batch of 2-D log-mel-spectrogram patches.
    features = tf.placeholder(
        tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS),
        name='input_features')
    # Reshape to 4-D so that we can convolve a batch with conv2d().
    net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

    # The VGG stack of alternating convolutions and max-pools.
    net = slim.conv2d(net, 64, scope='conv1')
    net = slim.max_pool2d(net, scope='pool1')
    net = slim.conv2d(net, 128, scope='conv2')
    net = slim.max_pool2d(net, scope='pool2')
    net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
    net = slim.max_pool2d(net, scope='pool3')
    net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
    net = slim.max_pool2d(net, scope='pool4')

    # Flatten before entering fully-connected layers
    net = slim.flatten(net)
    net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
    # The embedding layer.
    net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
    return tf.identity(net, name='embedding')

Source File: vggish_slim.py From audioset_classification with MIT License

4 votes

def define_vggish_slim(training=False):
  """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope 'vggish/'.

  The input is a placeholder named 'vggish/input_features' of type float32 and
  shape [batch_size, num_frames, num_bands] where batch_size is variable and
  num_frames and num_bands are constants, and [num_frames, num_bands] represents
  a log-mel-scale spectrogram patch covering num_bands frequency bands and
  num_frames time frames (where each frame step is usually 10ms). This is
  produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET).
  The output is an op named 'vggish/embedding' which produces the activations of
  a 128-D embedding layer, which is usually the penultimate layer when used as
  part of a full model with a final classifier layer.

  Args:
    training: If true, all parameters are marked trainable.

  Returns:
    The op 'vggish/embeddings'.
  """
  # Defaults:
  # - All weights are initialized to N(0, INIT_STDDEV).
  # - All biases are initialized to 0.
  # - All activations are ReLU.
  # - All convolutions are 3x3 with stride 1 and SAME padding.
  # - All max-pools are 2x2 with stride 2 and SAME padding.
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      weights_initializer=tf.truncated_normal_initializer(
                          stddev=params.INIT_STDDEV),
                      biases_initializer=tf.zeros_initializer(),
                      activation_fn=tf.nn.relu,
                      trainable=training), \
       slim.arg_scope([slim.conv2d],
                      kernel_size=[3, 3], stride=1, padding='SAME'), \
       slim.arg_scope([slim.max_pool2d],
                      kernel_size=[2, 2], stride=2, padding='SAME'), \
       tf.variable_scope('vggish'):
    # Input: a batch of 2-D log-mel-spectrogram patches.
    features = tf.placeholder(
        tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS),
        name='input_features')
    # Reshape to 4-D so that we can convolve a batch with conv2d().
    net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

    # The VGG stack of alternating convolutions and max-pools.
    net = slim.conv2d(net, 64, scope='conv1')
    net = slim.max_pool2d(net, scope='pool1')
    net = slim.conv2d(net, 128, scope='conv2')
    net = slim.max_pool2d(net, scope='pool2')
    net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
    net = slim.max_pool2d(net, scope='pool3')
    net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
    net = slim.max_pool2d(net, scope='pool4')

    # Flatten before entering fully-connected layers
    net = slim.flatten(net)
    net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
    # The embedding layer.
    net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
    return tf.identity(net, name='embedding')

Source File: vggish_slim.py From Gun-Detector with Apache License 2.0

4 votes

def define_vggish_slim(training=False):
  """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope 'vggish/'.

  The input is a placeholder named 'vggish/input_features' of type float32 and
  shape [batch_size, num_frames, num_bands] where batch_size is variable and
  num_frames and num_bands are constants, and [num_frames, num_bands] represents
  a log-mel-scale spectrogram patch covering num_bands frequency bands and
  num_frames time frames (where each frame step is usually 10ms). This is
  produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET).
  The output is an op named 'vggish/embedding' which produces the activations of
  a 128-D embedding layer, which is usually the penultimate layer when used as
  part of a full model with a final classifier layer.

  Args:
    training: If true, all parameters are marked trainable.

  Returns:
    The op 'vggish/embeddings'.
  """
  # Defaults:
  # - All weights are initialized to N(0, INIT_STDDEV).
  # - All biases are initialized to 0.
  # - All activations are ReLU.
  # - All convolutions are 3x3 with stride 1 and SAME padding.
  # - All max-pools are 2x2 with stride 2 and SAME padding.
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      weights_initializer=tf.truncated_normal_initializer(
                          stddev=params.INIT_STDDEV),
                      biases_initializer=tf.zeros_initializer(),
                      activation_fn=tf.nn.relu,
                      trainable=training), \
       slim.arg_scope([slim.conv2d],
                      kernel_size=[3, 3], stride=1, padding='SAME'), \
       slim.arg_scope([slim.max_pool2d],
                      kernel_size=[2, 2], stride=2, padding='SAME'), \
       tf.variable_scope('vggish'):
    # Input: a batch of 2-D log-mel-spectrogram patches.
    features = tf.placeholder(
        tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS),
        name='input_features')
    # Reshape to 4-D so that we can convolve a batch with conv2d().
    net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

    # The VGG stack of alternating convolutions and max-pools.
    net = slim.conv2d(net, 64, scope='conv1')
    net = slim.max_pool2d(net, scope='pool1')
    net = slim.conv2d(net, 128, scope='conv2')
    net = slim.max_pool2d(net, scope='pool2')
    net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
    net = slim.max_pool2d(net, scope='pool3')
    net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
    net = slim.max_pool2d(net, scope='pool4')

    # Flatten before entering fully-connected layers
    net = slim.flatten(net)
    net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
    # The embedding layer.
    net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
    return tf.identity(net, name='embedding')

Source File: vggish_slim.py From g-tensorflow-models with Apache License 2.0

4 votes

def define_vggish_slim(training=False):
  """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope 'vggish/'.

  The input is a placeholder named 'vggish/input_features' of type float32 and
  shape [batch_size, num_frames, num_bands] where batch_size is variable and
  num_frames and num_bands are constants, and [num_frames, num_bands] represents
  a log-mel-scale spectrogram patch covering num_bands frequency bands and
  num_frames time frames (where each frame step is usually 10ms). This is
  produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET).
  The output is an op named 'vggish/embedding' which produces the activations of
  a 128-D embedding layer, which is usually the penultimate layer when used as
  part of a full model with a final classifier layer.

  Args:
    training: If true, all parameters are marked trainable.

  Returns:
    The op 'vggish/embeddings'.
  """
  # Defaults:
  # - All weights are initialized to N(0, INIT_STDDEV).
  # - All biases are initialized to 0.
  # - All activations are ReLU.
  # - All convolutions are 3x3 with stride 1 and SAME padding.
  # - All max-pools are 2x2 with stride 2 and SAME padding.
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      weights_initializer=tf.truncated_normal_initializer(
                          stddev=params.INIT_STDDEV),
                      biases_initializer=tf.zeros_initializer(),
                      activation_fn=tf.nn.relu,
                      trainable=training), \
       slim.arg_scope([slim.conv2d],
                      kernel_size=[3, 3], stride=1, padding='SAME'), \
       slim.arg_scope([slim.max_pool2d],
                      kernel_size=[2, 2], stride=2, padding='SAME'), \
       tf.variable_scope('vggish'):
    # Input: a batch of 2-D log-mel-spectrogram patches.
    features = tf.placeholder(
        tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS),
        name='input_features')
    # Reshape to 4-D so that we can convolve a batch with conv2d().
    net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

    # The VGG stack of alternating convolutions and max-pools.
    net = slim.conv2d(net, 64, scope='conv1')
    net = slim.max_pool2d(net, scope='pool1')
    net = slim.conv2d(net, 128, scope='conv2')
    net = slim.max_pool2d(net, scope='pool2')
    net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
    net = slim.max_pool2d(net, scope='pool3')
    net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
    net = slim.max_pool2d(net, scope='pool4')

    # Flatten before entering fully-connected layers
    net = slim.flatten(net)
    net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
    # The embedding layer.
    net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
    return tf.identity(net, name='embedding')

Source File: vggish_slim.py From yolo_v2 with Apache License 2.0

4 votes

def define_vggish_slim(training=False):
  """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope 'vggish/'.

  The input is a placeholder named 'vggish/input_features' of type float32 and
  shape [batch_size, num_frames, num_bands] where batch_size is variable and
  num_frames and num_bands are constants, and [num_frames, num_bands] represents
  a log-mel-scale spectrogram patch covering num_bands frequency bands and
  num_frames time frames (where each frame step is usually 10ms). This is
  produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET).
  The output is an op named 'vggish/embedding' which produces the activations of
  a 128-D embedding layer, which is usually the penultimate layer when used as
  part of a full model with a final classifier layer.

  Args:
    training: If true, all parameters are marked trainable.

  Returns:
    The op 'vggish/embeddings'.
  """
  # Defaults:
  # - All weights are initialized to N(0, INIT_STDDEV).
  # - All biases are initialized to 0.
  # - All activations are ReLU.
  # - All convolutions are 3x3 with stride 1 and SAME padding.
  # - All max-pools are 2x2 with stride 2 and SAME padding.
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      weights_initializer=tf.truncated_normal_initializer(
                          stddev=params.INIT_STDDEV),
                      biases_initializer=tf.zeros_initializer(),
                      activation_fn=tf.nn.relu,
                      trainable=training), \
       slim.arg_scope([slim.conv2d],
                      kernel_size=[3, 3], stride=1, padding='SAME'), \
       slim.arg_scope([slim.max_pool2d],
                      kernel_size=[2, 2], stride=2, padding='SAME'), \
       tf.variable_scope('vggish'):
    # Input: a batch of 2-D log-mel-spectrogram patches.
    features = tf.placeholder(
        tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS),
        name='input_features')
    # Reshape to 4-D so that we can convolve a batch with conv2d().
    net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

    # The VGG stack of alternating convolutions and max-pools.
    net = slim.conv2d(net, 64, scope='conv1')
    net = slim.max_pool2d(net, scope='pool1')
    net = slim.conv2d(net, 128, scope='conv2')
    net = slim.max_pool2d(net, scope='pool2')
    net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
    net = slim.max_pool2d(net, scope='pool3')
    net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
    net = slim.max_pool2d(net, scope='pool4')

    # Flatten before entering fully-connected layers
    net = slim.flatten(net)
    net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
    # The embedding layer.
    net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
    return tf.identity(net, name='embedding')

Source File: vggish_slim.py From models with Apache License 2.0

4 votes

def define_vggish_slim(training=False):
  """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope 'vggish/'.

  The input is a placeholder named 'vggish/input_features' of type float32 and
  shape [batch_size, num_frames, num_bands] where batch_size is variable and
  num_frames and num_bands are constants, and [num_frames, num_bands] represents
  a log-mel-scale spectrogram patch covering num_bands frequency bands and
  num_frames time frames (where each frame step is usually 10ms). This is
  produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET).
  The output is an op named 'vggish/embedding' which produces the activations of
  a 128-D embedding layer, which is usually the penultimate layer when used as
  part of a full model with a final classifier layer.

  Args:
    training: If true, all parameters are marked trainable.

  Returns:
    The op 'vggish/embeddings'.
  """
  # Defaults:
  # - All weights are initialized to N(0, INIT_STDDEV).
  # - All biases are initialized to 0.
  # - All activations are ReLU.
  # - All convolutions are 3x3 with stride 1 and SAME padding.
  # - All max-pools are 2x2 with stride 2 and SAME padding.
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      weights_initializer=tf.truncated_normal_initializer(
                          stddev=params.INIT_STDDEV),
                      biases_initializer=tf.zeros_initializer(),
                      activation_fn=tf.nn.relu,
                      trainable=training), \
       slim.arg_scope([slim.conv2d],
                      kernel_size=[3, 3], stride=1, padding='SAME'), \
       slim.arg_scope([slim.max_pool2d],
                      kernel_size=[2, 2], stride=2, padding='SAME'), \
       tf.variable_scope('vggish'):
    # Input: a batch of 2-D log-mel-spectrogram patches.
    features = tf.placeholder(
        tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS),
        name='input_features')
    # Reshape to 4-D so that we can convolve a batch with conv2d().
    net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

    # The VGG stack of alternating convolutions and max-pools.
    net = slim.conv2d(net, 64, scope='conv1')
    net = slim.max_pool2d(net, scope='pool1')
    net = slim.conv2d(net, 128, scope='conv2')
    net = slim.max_pool2d(net, scope='pool2')
    net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
    net = slim.max_pool2d(net, scope='pool3')
    net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
    net = slim.max_pool2d(net, scope='pool4')

    # Flatten before entering fully-connected layers
    net = slim.flatten(net)
    net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
    # The embedding layer.
    net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
    return tf.identity(net, name='embedding')

Source File: vggish_slim.py From Tensorflow-Audio-Classification with Apache License 2.0

4 votes

def define_vggish_slim(training=False):
  """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope 'vggish/'.

  The input is a placeholder named 'vggish/input_features' of type float32 and
  shape [batch_size, num_frames, num_bands] where batch_size is variable and
  num_frames and num_bands are constants, and [num_frames, num_bands] represents
  a log-mel-scale spectrogram patch covering num_bands frequency bands and
  num_frames time frames (where each frame step is usually 10ms). This is
  produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET).
  The output is an op named 'vggish/embedding' which produces the activations of
  a 128-D embedding layer, which is usually the penultimate layer when used as
  part of a full model with a final classifier layer.

  Args:
    training: If true, all parameters are marked trainable.

  Returns:
    The op 'vggish/embeddings'.
  """
  # Defaults:
  # - All weights are initialized to N(0, INIT_STDDEV).
  # - All biases are initialized to 0.
  # - All activations are ReLU.
  # - All convolutions are 3x3 with stride 1 and SAME padding.
  # - All max-pools are 2x2 with stride 2 and SAME padding.
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      weights_initializer=tf.truncated_normal_initializer(
                          stddev=params.INIT_STDDEV),
                      biases_initializer=tf.zeros_initializer(),
                      activation_fn=tf.nn.relu,
                      trainable=training), \
       slim.arg_scope([slim.conv2d],
                      kernel_size=[3, 3], stride=1, padding='SAME'), \
       slim.arg_scope([slim.max_pool2d],
                      kernel_size=[2, 2], stride=2, padding='SAME'), \
       tf.variable_scope('vggish'):
    # Input: a batch of 2-D log-mel-spectrogram patches.
    features = tf.placeholder(
        tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS),
        name='input_features')
    # Reshape to 4-D so that we can convolve a batch with conv2d().
    net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

    # The VGG stack of alternating convolutions and max-pools.
    net = slim.conv2d(net, 64, scope='conv1')
    net = slim.max_pool2d(net, scope='pool1')
    net = slim.conv2d(net, 128, scope='conv2')
    net = slim.max_pool2d(net, scope='pool2')
    net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
    net = slim.max_pool2d(net, scope='pool3')
    net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
    net = slim.max_pool2d(net, scope='pool4')

    # Flatten before entering fully-connected layers
    net = slim.flatten(net)
    net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
    # The embedding layer.
    net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
    return tf.identity(net, name='embedding')

Source File: vggish_slim.py From multilabel-image-classification-tensorflow with MIT License

4 votes

def define_vggish_slim(training=False):
  """Defines the VGGish TensorFlow model.

  All ops are created in the current default graph, under the scope 'vggish/'.

  The input is a placeholder named 'vggish/input_features' of type float32 and
  shape [batch_size, num_frames, num_bands] where batch_size is variable and
  num_frames and num_bands are constants, and [num_frames, num_bands] represents
  a log-mel-scale spectrogram patch covering num_bands frequency bands and
  num_frames time frames (where each frame step is usually 10ms). This is
  produced by computing the stabilized log(mel-spectrogram + params.LOG_OFFSET).
  The output is an op named 'vggish/embedding' which produces the activations of
  a 128-D embedding layer, which is usually the penultimate layer when used as
  part of a full model with a final classifier layer.

  Args:
    training: If true, all parameters are marked trainable.

  Returns:
    The op 'vggish/embeddings'.
  """
  # Defaults:
  # - All weights are initialized to N(0, INIT_STDDEV).
  # - All biases are initialized to 0.
  # - All activations are ReLU.
  # - All convolutions are 3x3 with stride 1 and SAME padding.
  # - All max-pools are 2x2 with stride 2 and SAME padding.
  with slim.arg_scope([slim.conv2d, slim.fully_connected],
                      weights_initializer=tf.truncated_normal_initializer(
                          stddev=params.INIT_STDDEV),
                      biases_initializer=tf.zeros_initializer(),
                      activation_fn=tf.nn.relu,
                      trainable=training), \
       slim.arg_scope([slim.conv2d],
                      kernel_size=[3, 3], stride=1, padding='SAME'), \
       slim.arg_scope([slim.max_pool2d],
                      kernel_size=[2, 2], stride=2, padding='SAME'), \
       tf.variable_scope('vggish'):
    # Input: a batch of 2-D log-mel-spectrogram patches.
    features = tf.placeholder(
        tf.float32, shape=(None, params.NUM_FRAMES, params.NUM_BANDS),
        name='input_features')
    # Reshape to 4-D so that we can convolve a batch with conv2d().
    net = tf.reshape(features, [-1, params.NUM_FRAMES, params.NUM_BANDS, 1])

    # The VGG stack of alternating convolutions and max-pools.
    net = slim.conv2d(net, 64, scope='conv1')
    net = slim.max_pool2d(net, scope='pool1')
    net = slim.conv2d(net, 128, scope='conv2')
    net = slim.max_pool2d(net, scope='pool2')
    net = slim.repeat(net, 2, slim.conv2d, 256, scope='conv3')
    net = slim.max_pool2d(net, scope='pool3')
    net = slim.repeat(net, 2, slim.conv2d, 512, scope='conv4')
    net = slim.max_pool2d(net, scope='pool4')

    # Flatten before entering fully-connected layers
    net = slim.flatten(net)
    net = slim.repeat(net, 2, slim.fully_connected, 4096, scope='fc1')
    # The embedding layer.
    net = slim.fully_connected(net, params.EMBEDDING_SIZE, scope='fc2')
    return tf.identity(net, name='embedding')

Python vggish_params.EMBEDDING_SIZE Examples