Python scipy.spatial.distance.hamming() Examples
The following are 14 code examples of scipy.spatial.distance.hamming(). You can go to the original project or source file by following the link above each example, or check out all the available functions/classes of the module scipy.spatial.distance.
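Before the examples, a quick note on semantics: scipy.spatial.distance.hamming(u, v) returns the proportion of positions at which the two 1-D arrays disagree, not the raw count of mismatches. A minimal sketch (array values invented for illustration):

import numpy as np
from scipy.spatial.distance import hamming

u = np.array([1, 0, 1, 1])
v = np.array([1, 1, 1, 0])

# Two of the four positions differ, so the result is 2/4 = 0.5,
# a fraction rather than a raw count of differing entries.
assert hamming(u, v) == 0.5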
Example #1
Source File: test_classification.py From Mastering-Elasticsearch-7.0 with MIT License
def test_multilabel_hamming_loss():
    # Dense label indicator matrix format
    y1 = np.array([[0, 1, 1], [1, 0, 1]])
    y2 = np.array([[0, 0, 1], [1, 0, 1]])
    w = np.array([1, 3])

    assert_equal(hamming_loss(y1, y2), 1 / 6)
    assert_equal(hamming_loss(y1, y1), 0)
    assert_equal(hamming_loss(y2, y2), 0)
    assert_equal(hamming_loss(y2, 1 - y2), 1)
    assert_equal(hamming_loss(y1, 1 - y1), 1)
    assert_equal(hamming_loss(y1, np.zeros(y1.shape)), 4 / 6)
    assert_equal(hamming_loss(y2, np.zeros(y1.shape)), 0.5)
    assert_equal(hamming_loss(y1, y2, sample_weight=w), 1. / 12)
    assert_equal(hamming_loss(y1, 1 - y2, sample_weight=w), 11. / 12)
    assert_equal(hamming_loss(y1, np.zeros_like(y1), sample_weight=w), 2. / 3)
    # sp_hamming only works with 1-D arrays
    assert_equal(hamming_loss(y1[0], y2[0]), sp_hamming(y1[0], y2[0]))
    assert_warns_message(DeprecationWarning,
                         "The labels parameter is unused. It was"
                         " deprecated in version 0.21 and"
                         " will be removed in version 0.23",
                         hamming_loss, y1, y2, labels=[0, 1])
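As the sp_hamming assertion above indicates, sklearn's hamming_loss and scipy's hamming agree on 1-D binary label vectors; a minimal check (label values invented for illustration):

import numpy as np
from scipy.spatial.distance import hamming
from sklearn.metrics import hamming_loss

a = np.array([0, 1, 1, 0])
b = np.array([0, 0, 1, 1])

# Both report the fraction of mismatched labels: 2 of 4, i.e. 0.5.
assert hamming_loss(a, b) == hamming(a, b) == 0.5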
Example #2
Source File: cost_sensitive_reference_pair_encoding.py From libact with BSD 2-Clause "Simplified" License
def make_query(self):
    dataset = self.dataset
    unlabeled_entry_ids, X_pool = dataset.get_unlabeled_entries()
    X_pool = np.asarray(X_pool)

    self.csrpe_.train(dataset)
    self.model_.train(dataset)

    predY = self.model_.predict(X_pool)
    Z = self.csrpe_.predicted_code(X_pool)
    predZ = self.csrpe_.encode(predY)

    dist = paired_distances(Z, predZ, metric=hamming)  # z1 z2
    dist2 = self.csrpe_.predict_dist(X_pool)  # z1 zt
    #dist3 = self.csrpe.distance(predZ) # z2 zt
    dist = dist + dist2
    #dist = dist + dist3

    ask_id = self.random_state_.choice(
        np.where(np.isclose(dist, np.max(dist)))[0])
    return unlabeled_entry_ids[ask_id]
Example #3
Source File: classification.py From brainiak with Apache License 2.0
def example_of_aggregating_sim_matrix(raw_data, labels, num_subjects,
                                      num_epochs_per_subj):
    # aggregate the kernel matrix to save memory
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1, gamma='auto')
    clf = Classifier(svm_clf, num_processed_voxels=1000,
                     epochs_per_subj=num_epochs_per_subj)
    rearranged_data = raw_data[num_epochs_per_subj:] + raw_data[0:num_epochs_per_subj]
    rearranged_labels = labels[num_epochs_per_subj:] + labels[0:num_epochs_per_subj]
    clf.fit(list(zip(rearranged_data, rearranged_data)), rearranged_labels,
            num_training_samples=num_epochs_per_subj*(num_subjects-1))
    predict = clf.predict()
    print(predict)
    print(clf.decision_function())
    test_labels = labels[0:num_epochs_per_subj]
    incorrect_predict = hamming(predict, np.asanyarray(test_labels)) * num_epochs_per_subj
    logger.info(
        'when aggregating the similarity matrix to save memory, '
        'the accuracy is %d / %d = %.2f' %
        (num_epochs_per_subj-incorrect_predict, num_epochs_per_subj,
         (num_epochs_per_subj-incorrect_predict) * 1.0 / num_epochs_per_subj)
    )
    # when the kernel matrix is computed in portion, the test data is already in
    print(clf.score(None, test_labels))
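Note the idiom hamming(predict, ...) * num_epochs_per_subj in this and the following brainiak examples: since hamming() returns the fraction of mismatched predictions, scaling by the number of test epochs converts it back into an error count. A toy check (values invented for illustration):

import numpy as np
from scipy.spatial.distance import hamming

predict = np.array([0, 1, 1, 0])
truth = np.array([0, 1, 0, 1])

# Half of the four predictions are wrong: 0.5 * 4 = 2 errors.
assert hamming(predict, truth) * len(truth) == 2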
Example #4
Source File: classification.py From brainiak with Apache License 2.0
def example_of_correlating_two_components(raw_data, raw_data2, labels,
                                          num_subjects, num_epochs_per_subj):
    # aggregate the kernel matrix to save memory
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1, gamma='auto')
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    num_training_samples = num_epochs_per_subj*(num_subjects-1)
    clf.fit(list(zip(raw_data[0:num_training_samples],
                     raw_data2[0:num_training_samples])),
            labels[0:num_training_samples])
    X = list(zip(raw_data[num_training_samples:], raw_data2[num_training_samples:]))
    predict = clf.predict(X)
    print(predict)
    print(clf.decision_function(X))
    test_labels = labels[num_training_samples:]
    incorrect_predict = hamming(predict, np.asanyarray(test_labels)) * num_epochs_per_subj
    logger.info(
        'when aggregating the similarity matrix to save memory, '
        'the accuracy is %d / %d = %.2f' %
        (num_epochs_per_subj-incorrect_predict, num_epochs_per_subj,
         (num_epochs_per_subj-incorrect_predict) * 1.0 / num_epochs_per_subj)
    )
    # when the kernel matrix is computed in portion, the test data is already in
    print(clf.score(X, test_labels))
Example #5
Source File: classification.py From brainiak with Apache License 2.0
def example_of_correlating_two_components_aggregating_sim_matrix(
        raw_data, raw_data2, labels, num_subjects, num_epochs_per_subj):
    # aggregate the kernel matrix to save memory
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1, gamma='auto')
    clf = Classifier(svm_clf, num_processed_voxels=1000,
                     epochs_per_subj=num_epochs_per_subj)
    num_training_samples = num_epochs_per_subj*(num_subjects-1)
    clf.fit(list(zip(raw_data, raw_data2)), labels,
            num_training_samples=num_training_samples)
    predict = clf.predict()
    print(predict)
    print(clf.decision_function())
    test_labels = labels[num_training_samples:]
    incorrect_predict = hamming(predict, np.asanyarray(test_labels)) * num_epochs_per_subj
    logger.info(
        'when aggregating the similarity matrix to save memory, '
        'the accuracy is %d / %d = %.2f' %
        (num_epochs_per_subj-incorrect_predict, num_epochs_per_subj,
         (num_epochs_per_subj-incorrect_predict) * 1.0 / num_epochs_per_subj)
    )
    # when the kernel matrix is computed in portion, the test data is already in
    print(clf.score(None, test_labels))

# python3 classification.py face_scene bet.nii.gz face_scene/prefrontal_top_mask.nii.gz face_scene/fs_epoch_labels.npy
Example #6
Source File: test_libdist.py From enspara with GNU General Public License v3.0
def test_hamming_distance():
    dtypes = ['|S1']
    for elem_size in ['8', '16', '32', '64']:
        for int_type in ['int', 'uint']:
            dtypes.append(int_type + elem_size)

    for dtype in dtypes:
        X = np.array([[1, 3, 8],
                      [3, 1, 8],
                      [1, 1, 7]]).astype(dtype)
        y = np.array([1, 2, 3]).astype(dtype)

        d_expected = np.zeros((len(X)))
        for i in range(len(X)):
            d_expected[i] = scipy_hamming(X[i], y)

        d_enspara = libdist.hamming(X, y)

        assert_array_equal(d_expected, d_enspara)
Example #7
Source File: test_classification.py From twitter-stock-recommendation with MIT License
def test_multilabel_hamming_loss():
    # Dense label indicator matrix format
    y1 = np.array([[0, 1, 1], [1, 0, 1]])
    y2 = np.array([[0, 0, 1], [1, 0, 1]])
    w = np.array([1, 3])

    assert_equal(hamming_loss(y1, y2), 1 / 6)
    assert_equal(hamming_loss(y1, y1), 0)
    assert_equal(hamming_loss(y2, y2), 0)
    assert_equal(hamming_loss(y2, 1 - y2), 1)
    assert_equal(hamming_loss(y1, 1 - y1), 1)
    assert_equal(hamming_loss(y1, np.zeros(y1.shape)), 4 / 6)
    assert_equal(hamming_loss(y2, np.zeros(y1.shape)), 0.5)
    assert_equal(hamming_loss(y1, y2, sample_weight=w), 1. / 12)
    assert_equal(hamming_loss(y1, 1 - y2, sample_weight=w), 11. / 12)
    assert_equal(hamming_loss(y1, np.zeros_like(y1), sample_weight=w), 2. / 3)
    # sp_hamming only works with 1-D arrays
    assert_equal(hamming_loss(y1[0], y2[0]), sp_hamming(y1[0], y2[0]))
    assert_warns(DeprecationWarning, hamming_loss, y1, y2, classes=[0, 1])
Example #8
Source File: plotting.py From kvae with MIT License
def plot_trajectory_uncertainty(true, gen, filter, smooth, filename):
    sequences, timesteps, h, w = true.shape
    errors = dict(Generated=list(), Filtered=list(), Smoothed=list())
    for label, var in zip(('Generated', 'Filtered', 'Smoothed'), (gen, filter, smooth)):
        for step in range(timesteps):
            errors[label].append(hamming(true[:, step].ravel() > 0.5,
                                         var[:, step].ravel() > 0.5))
        plt.plot(np.linspace(1, timesteps, num=timesteps).astype(int),
                 errors[label], linewidth=3, ms=20, label=label)
    plt.xlabel('Steps', fontsize=20)
    plt.ylabel('Hamming distance', fontsize=20)
    plt.legend(fontsize=20)
    plt.savefig(filename)
    plt.close()
Example #9
Source File: classification.py From brainiak with Apache License 2.0
def example_of_cross_validation_with_detailed_info(raw_data, labels, num_subjects,
                                                   num_epochs_per_subj):
    # no shrinking, set C=1
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1, gamma='auto')
    #logit_clf = LogisticRegression()
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    # doing leave-one-subject-out cross validation
    for i in range(num_subjects):
        leave_start = i * num_epochs_per_subj
        leave_end = (i+1) * num_epochs_per_subj
        training_data = raw_data[0:leave_start] + raw_data[leave_end:]
        test_data = raw_data[leave_start:leave_end]
        training_labels = labels[0:leave_start] + labels[leave_end:]
        test_labels = labels[leave_start:leave_end]
        clf.fit(list(zip(training_data, training_data)), training_labels)
        # joblib can be used for saving and loading models
        #joblib.dump(clf, 'model/logistic.pkl')
        #clf = joblib.load('model/svm.pkl')
        predict = clf.predict(list(zip(test_data, test_data)))
        print(predict)
        print(clf.decision_function(list(zip(test_data, test_data))))
        incorrect_predict = hamming(predict, np.asanyarray(test_labels)) * num_epochs_per_subj
        logger.info(
            'when leaving subject %d out for testing, the accuracy is %d / %d = %.2f' %
            (i, num_epochs_per_subj-incorrect_predict, num_epochs_per_subj,
             (num_epochs_per_subj-incorrect_predict) * 1.0 / num_epochs_per_subj)
        )
        print(clf.score(list(zip(test_data, test_data)), test_labels))
Example #10
Source File: utils.py From teneto with GNU General Public License v3.0
def get_distance_function(requested_metric):
    """
    This function returns a specified distance function.

    Parameters
    ----------
    requested_metric : str
        Distance function. Can be any function in:
        https://docs.scipy.org/doc/scipy/reference/spatial.distance.html.

    Returns
    -------
    requested_metric : distance function
    """
    distance_options = {
        'braycurtis': distance.braycurtis,
        'canberra': distance.canberra,
        'chebyshev': distance.chebyshev,
        'cityblock': distance.cityblock,
        'correlation': distance.correlation,
        'cosine': distance.cosine,
        'euclidean': distance.euclidean,
        'sqeuclidean': distance.sqeuclidean,
        'dice': distance.dice,
        'hamming': distance.hamming,
        'jaccard': distance.jaccard,
        'kulsinski': distance.kulsinski,
        'matching': distance.matching,
        'rogerstanimoto': distance.rogerstanimoto,
        'russellrao': distance.russellrao,
        'sokalmichener': distance.sokalmichener,
        'sokalsneath': distance.sokalsneath,
        'yule': distance.yule,
    }
    if requested_metric in distance_options:
        return distance_options[requested_metric]
    else:
        raise ValueError('Distance function cannot be found.')
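A usage sketch of the lookup above (metric name and input vectors chosen purely for illustration):

# Resolve a metric name to its scipy implementation, then apply it.
dist_fn = get_distance_function('hamming')
d = dist_fn([1, 0, 1, 1], [1, 1, 1, 0])  # 0.5: half the entries differ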
Example #11
Source File: utils.py From teneto with GNU General Public License v3.0
def check_distance_funciton_input(distance_func_name, netinfo):
    """
    Checks distance_func_name; if it is specified as 'default', selects a
    default distance function given the type of the network.

    Parameters
    ----------
    distance_func_name : str
        distance function name.
    netinfo : dict
        the output of utils.process_input

    Returns
    -------
    distance_func_name : str
        distance function name.
    """
    if distance_func_name == 'default' and netinfo['nettype'][0] == 'b':
        print('Default distance function specified. As network is binary, using Hamming')
        distance_func_name = 'hamming'
    elif distance_func_name == 'default' and netinfo['nettype'][0] == 'w':
        distance_func_name = 'euclidean'
        print(
            'Default distance function specified. '
            'As network is weighted, using Euclidean')
    return distance_func_name
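A quick sketch of the default-selection logic above; the 'bu'/'wd' nettype codes are assumptions inferred from the netinfo['nettype'][0] check, not taken from this file:

# Binary network ('b...') -> Hamming; weighted network ('w...') -> Euclidean.
assert check_distance_funciton_input('default', {'nettype': 'bu'}) == 'hamming'
assert check_distance_funciton_input('default', {'nettype': 'wd'}) == 'euclidean'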
Example #12
Source File: taar_similarity.py From telemetry-airflow with Mozilla Public License 2.0
def similarity_function(x, y):
    """ Similarity function for comparing user features.

    This actually really should be implemented in taar.similarity_recommender
    and then imported here for consistency.
    """
    def safe_get(field, row, default_value):
        # Safely get a value from the Row. If the value is None, return the
        # default value.
        return row[field] if row[field] is not None else default_value

    # Extract the values for the categorical and continuous features for both
    # the x and y samples. Use an empty string as the default value for missing
    # categorical fields and 0 for the continuous ones.
    x_categorical_features = [safe_get(k, x, "") for k in CATEGORICAL_FEATURES]
    y_categorical_features = [safe_get(k, y, "") for k in CATEGORICAL_FEATURES]
    x_continuous_features = [
        float(safe_get(k, x, 0)) for k in CONTINUOUS_FEATURES
    ]
    y_continuous_features = [
        float(safe_get(k, y, 0)) for k in CONTINUOUS_FEATURES
    ]

    # Here a larger distance indicates a poorer match between categorical variables.
    j_d = distance.hamming(x_categorical_features, y_categorical_features)
    j_c = distance.canberra(x_continuous_features, y_continuous_features)

    # Take the product of similarities to attain a univariate similarity score.
    # Add a minimal constant to prevent zero values from categorical features.
    # Note: since both the distance functions return a Numpy type, we need to
    # call the |item| function to get the underlying Python type. If we don't
    # do that this job will fail when performing KDE due to SPARK-20803 on
    # Spark 2.2.0.
    return abs((j_c + 0.001) * j_d).item()
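For intuition on the combination step, here is a hedged sketch of the same arithmetic on invented feature lists (the feature values are not from the project):

from scipy.spatial import distance

# Hypothetical categorical and continuous features for two users.
x_cat, y_cat = ['en-US', 'Windows'], ['en-US', 'Linux']  # 1 of 2 fields differ
x_cont, y_cont = [2.0, 10.0], [3.0, 10.0]

j_d = distance.hamming(x_cat, y_cat)     # 0.5, fraction of mismatched fields
j_c = distance.canberra(x_cont, y_cont)  # 0.2, sum of |x - y| / (|x| + |y|)

# Same combination as above: the 0.001 offset keeps the score nonzero
# when the continuous features match exactly (j_c == 0) but j_d does not.
score = abs((j_c + 0.001) * j_d)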
Example #13
Source File: taar_similarity.py From python_mozetl with MIT License
def similarity_function(x, y):
    """ Similarity function for comparing user features.

    This actually really should be implemented in taar.similarity_recommender
    and then imported here for consistency.
    """
    def safe_get(field, row, default_value):
        # Safely get a value from the Row. If the value is None, return the
        # default value.
        return row[field] if row[field] is not None else default_value

    # Extract the values for the categorical and continuous features for both
    # the x and y samples. Use an empty string as the default value for missing
    # categorical fields and 0 for the continuous ones.
    x_categorical_features = [safe_get(k, x, "") for k in CATEGORICAL_FEATURES]
    x_continuous_features = [safe_get(k, x, 0) for k in CONTINUOUS_FEATURES]
    y_categorical_features = [safe_get(k, y, "") for k in CATEGORICAL_FEATURES]
    y_continuous_features = [safe_get(k, y, 0) for k in CONTINUOUS_FEATURES]

    # Here a larger distance indicates a poorer match between categorical variables.
    j_d = distance.hamming(x_categorical_features, y_categorical_features)
    j_c = distance.canberra(x_continuous_features, y_continuous_features)

    # Take the product of similarities to attain a univariate similarity score.
    # Add a minimal constant to prevent zero values from categorical features.
    # Note: since both the distance functions return a Numpy type, we need to
    # call the |item| function to get the underlying Python type. If we don't
    # do that this job will fail when performing KDE due to SPARK-20803 on
    # Spark 2.2.0.
    return abs((j_c + 0.001) * j_d).item()
Example #14
Source File: ripple_carry_adder.py From forest-benchmarking with Apache License 2.0
def get_error_hamming_distributions_from_results(results: Sequence[Sequence[Sequence[int]]]) \
        -> Sequence[Sequence[float]]:
    """
    Get the distribution of the hamming weight of the error vector (number of bits flipped
    between output and expected answer) for each possible pair of two n_bit summands using
    results output by get_n_bit_adder_results

    :param results: a list of results output from a call to get_n_bit_adder_results
    :return: the relative frequency of observing each hamming weight, 0 to n_bits+1, for the
        error that occurred when adding each pair of two n_bit summands
    """
    num_shots = len(results[0])
    n_bits = len(results[0][0]) - 1

    hamming_wt_distrs = []
    # loop over all binary strings of length n_bits
    for result, bits in zip(results, all_bitstrings(2 * n_bits)):
        # Input nums are written from (MSB .... LSB) = (a_n, ..., a_1, a_0)
        num_a = bit_array_to_int(bits[:n_bits])
        num_b = bit_array_to_int(bits[n_bits:])

        # add the numbers
        ans = num_a + num_b
        ans_bits = int_to_bit_array(ans, n_bits + 1)

        # record the fraction of shots that resulted in an error of the given weight
        hamming_wt_distr = [0. for _ in range(len(ans_bits) + 1)]
        for shot in result:
            # multiply relative hamming distance by the length of the output for the weight
            wt = len(ans_bits) * hamming(ans_bits, shot)
            hamming_wt_distr[int(wt)] += 1. / num_shots

        hamming_wt_distrs.append(hamming_wt_distr)

    return hamming_wt_distrs