Python Examples of sklearn.neighbors.LSHForest

Source File: kneighbour_l2r_classifier.py From Quadflor with BSD 3-Clause "New" or "Revised" License

6 votes

def __init__(self, use_lsh_forest=False, n_neighbors=20, max_iterations=300,
             count_concepts=False, number_of_concepts=0, count_terms=False,
             training_validation_split=0.8, algorithm_id='7',
             l2r_metric="ERR@k", n_jobs=1, translation_probability=False,
             **kwargs):
        """Store the configuration and build the neighbour-search backend.

        ``use_lsh_forest`` selects ``LSHForest`` over ``NearestNeighbors``;
        either one receives ``n_neighbors`` plus any extra ``**kwargs`` and is
        wrapped in ``BatchKNeighbors``. All remaining arguments are stored
        unchanged as attributes of the same name.
        """
        self.n_neighbors = n_neighbors

        # Only the selected backend is instantiated.
        if use_lsh_forest:
            searcher = LSHForest(n_neighbors=n_neighbors, **kwargs)
        else:
            searcher = NearestNeighbors(n_neighbors=n_neighbors, **kwargs)
        self.knn = BatchKNeighbors(searcher)

        # Set to None here; nothing in this method assigns it a real value.
        self.y = None

        # Plain settings, kept verbatim.
        self.max_iterations = max_iterations
        self.count_concepts = count_concepts
        self.count_terms = count_terms
        self.number_of_concepts = number_of_concepts
        self.training_validation_split = training_validation_split
        self.algorithm_id = algorithm_id
        self.l2r_metric = l2r_metric
        self.n_jobs = n_jobs
        self.translation_probability = translation_probability

Source File: test_approximate.py From twitter-stock-recommendation with MIT License

6 votes

def test_graphs():
    """Smoke-test both graph methods on a few small random datasets."""
    n_features = 3
    rng = np.random.RandomState(42)

    for n_samples in (5, 10, 20):
        data = rng.rand(n_samples, n_features)
        make_forest = ignore_warnings(LSHForest, category=DeprecationWarning)
        forest = make_forest(min_hash_match=0)
        ignore_warnings(forest.fit)(data)

        # Build both graphs first, then check that each one is square with
        # one row and one column per sample.
        graphs = (forest.kneighbors_graph(data),
                  forest.radius_neighbors_graph(data))
        for graph in graphs:
            assert_equal(graph.shape[0], n_samples)
            assert_equal(graph.shape[1], n_samples)

Source File: recommender.py From atap with Apache License 2.0

5 votes

def __init__(self, k=3, **kwargs):
        """Create the ``NearestNeighbors`` model stored on ``self.model``.

        Note: LSHForest was tried as well and was still too slow.

        :param k: passed to ``NearestNeighbors`` as ``n_neighbors``.
        :param kwargs: forwarded unchanged to ``NearestNeighbors``.
        """
        neighbour_model = NearestNeighbors(n_neighbors=k, **kwargs)
        self.model = neighbour_model

Source File: nearest_neighbor.py From Quadflor with BSD 3-Clause "New" or "Revised" License

5 votes

def __init__(self, use_lsh_forest=False, metric='cosine', algorithm='brute'):
        """Build the single-nearest-neighbour search backend.

        With ``use_lsh_forest`` set, an ``LSHForest`` with the fixed settings
        ``n_candidates=400`` and ``n_estimators=35`` is created, and ``metric``
        and ``algorithm`` are not passed on. Otherwise those two arguments
        configure a ``NearestNeighbors`` instance. The chosen model is wrapped
        in ``BatchKNeighbors``.
        """
        self.lsh = use_lsh_forest
        self.y = None

        if use_lsh_forest:
            searcher = LSHForest(n_neighbors=1, n_candidates=400,
                                 n_estimators=35)
        else:
            searcher = NearestNeighbors(n_neighbors=1, metric=metric,
                                        algorithm=algorithm)
        self.knn = BatchKNeighbors(searcher)

Source File: br_kneighbor_classifier.py From Quadflor with BSD 3-Clause "New" or "Revised" License

5 votes

def __init__(self, threshold=0.2, use_lsh_forest=False, mode='b',
             n_neighbors=50, scoring='f1_samples', auto_optimize_k=False,
             n_neighbor_candidates=(3, 5, 8, 13, 21, 34, 55, 84, 139, 223, 362),
             algorithm='brute', metric='cosine'):
        """Store the classifier settings and build the neighbour search.

        With ``use_lsh_forest`` set, an ``LSHForest`` with the fixed settings
        ``n_candidates=400`` and ``n_estimators=35`` is created, and
        ``algorithm`` and ``metric`` are not passed on. Otherwise those two
        arguments configure a ``NearestNeighbors`` instance. Both variants
        receive ``n_neighbors`` and are wrapped in ``BatchKNeighbors``.
        """
        # Plain settings, stored verbatim.
        self.auto_optimize_k = auto_optimize_k
        self.scoring = scoring
        self.n_neighbor_candidates = n_neighbor_candidates
        self.mode = mode
        self.n_neighbors = n_neighbors
        self.threshold = threshold

        if use_lsh_forest:
            searcher = LSHForest(n_neighbors=n_neighbors, n_candidates=400,
                                 n_estimators=35)
        else:
            searcher = NearestNeighbors(n_neighbors=n_neighbors,
                                        algorithm=algorithm, metric=metric)
        self.knn = BatchKNeighbors(searcher)
        self.y = None

Source File: util_funcs.py From PointCNN with MIT License

5 votes

def knn_indices_func_approx(rep_pts : FloatTensor,  # (N, pts, dim)
                            pts : FloatTensor,      # (N, x, dim)
                            K : int, D : int
                           ) -> LongTensor:         # (N, pts, K)
    """
    Approximate CPU-based indexing function based on K-nearest-neighbors search.

    For every batch entry n, an LSHForest is fitted on pts[n] and queried with
    rep_pts[n] for D*K + 1 neighbors; the neighbor columns 1, 1 + D, 1 + 2*D, ...
    of the result are kept.

    :param rep_pts: Representative points.
    :param pts: Point cloud to get indices from.
    :param K: Number of nearest neighbors to collect.
    :param D: "Spread" of neighboring points.
    :return: Array of indices, P_idx, into pts such that pts[n][P_idx[n],:]
    is the set k-nearest neighbors for the representative points in pts[n].

    NOTE(review): no return statement is visible in this excerpt even though
    the signature declares a LongTensor result; the snippet appears to be
    truncated - confirm against the full source.
    """
    # Move both tensors to the CPU before converting them to NumPy arrays.
    if rep_pts.is_cuda:
        rep_pts = rep_pts.cpu()
    if pts.is_cuda:
        pts = pts.cpu()
    rep_pts = rep_pts.data.numpy()
    pts = pts.data.numpy()

    # One index array per batch entry is collected here.
    region_idx = []

    for n, p in enumerate(rep_pts):
        P_particular = pts[n]
        # A fresh forest is built per batch entry; D*K + 1 neighbors are
        # requested so that K columns remain after the strided slice below.
        lshf = LSHForest(n_estimators = 20, n_candidates = 100, n_neighbors = D*K + 1)
        lshf.fit(P_particular)
        indices = lshf.kneighbors(p, return_distance = False)
        # Drop column 0 and keep every D-th column after it.
        # Presumably column 0 is the query point itself when rep_pts is drawn
        # from pts - TODO confirm.
        region_idx.append(indices[:,1::D])

Source File: test_approximate.py From twitter-stock-recommendation with MIT License

5 votes

def test_lsh_forest_deprecation():
    """Instantiating LSHForest must emit the expected deprecation message."""
    expected_message = ("LSHForest has poor performance and has been "
                        "deprecated in 0.19. It will be removed "
                        "in version 0.21.")
    assert_warns_message(DeprecationWarning, expected_message, LSHForest)

Source File: test_approximate.py From twitter-stock-recommendation with MIT License

5 votes

def test_neighbors_accuracy_with_n_candidates():
    """Check that accuracy does not decrease as `n_candidates` increases."""
    n_candidates_values = np.array([.1, 50, 500])
    n_samples, n_features = 100, 10
    n_iter, n_points = 10, 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_candidates_values.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for idx, n_candidates in enumerate(n_candidates_values):
        make_forest = ignore_warnings(LSHForest, category=DeprecationWarning)
        forest = make_forest(n_candidates=n_candidates)
        ignore_warnings(forest.fit)(X)

        for _ in range(n_iter):
            # Query with a randomly chosen training sample.
            query = X[rng.randint(0, n_samples)].reshape(1, -1)

            approx = forest.kneighbors(query, n_neighbors=n_points,
                                       return_distance=False)
            # Reference answer: the n_points smallest cosine distances.
            exact_dists = pairwise_distances(query, X, metric='cosine')
            exact = np.argsort(exact_dists)[0, :n_points]

            # Fraction of the reference neighbours that were also returned
            # by the forest.
            n_common = np.intersect1d(exact, approx).shape[0]
            accuracies[idx] += n_common / float(n_points)

        # Turn the accumulated sum into a mean over the queries.
        accuracies[idx] /= float(n_iter)

    # The mean accuracies must already be in non-decreasing order.
    print('accuracies:', accuracies)
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # There must be a real improvement between the extremes.
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")

Source File: test_approximate.py From twitter-stock-recommendation with MIT License

5 votes

def test_neighbors_accuracy_with_n_estimators():
    """Check that accuracy does not decrease as `n_estimators` increases."""
    n_estimators = np.array([1, 10, 100])
    n_samples, n_features = 100, 10
    n_iter, n_points = 10, 5
    rng = np.random.RandomState(42)
    accuracies = np.zeros(n_estimators.shape[0], dtype=float)
    X = rng.rand(n_samples, n_features)

    for idx, n_trees in enumerate(n_estimators):
        make_forest = ignore_warnings(LSHForest, category=DeprecationWarning)
        forest = make_forest(n_candidates=500, n_estimators=n_trees)
        ignore_warnings(forest.fit)(X)

        for _ in range(n_iter):
            # Query with a randomly chosen training sample.
            query = X[rng.randint(0, n_samples)].reshape(1, -1)
            approx = forest.kneighbors(query, n_neighbors=n_points,
                                       return_distance=False)
            # Reference answer: the n_points smallest cosine distances.
            exact_dists = pairwise_distances(query, X, metric='cosine')
            exact = np.argsort(exact_dists)[0, :n_points]

            # Fraction of the reference neighbours that were also returned
            # by the forest.
            n_common = np.intersect1d(exact, approx).shape[0]
            accuracies[idx] += n_common / float(n_points)

        # Turn the accumulated sum into a mean over the queries.
        accuracies[idx] /= float(n_iter)

    # The mean accuracies must already be in non-decreasing order.
    assert_true(np.all(np.diff(accuracies) >= 0),
                msg="Accuracies are not non-decreasing.")
    # There must be a real improvement between the extremes.
    assert_true(np.ptp(accuracies) > 0,
                msg="Highest accuracy is not strictly greater than lowest.")

Source File: test_approximate.py From twitter-stock-recommendation with MIT License

5 votes

def test_fit():
    """Check that `fit` populates every fitted attribute as expected."""
    n_samples, n_features = 12, 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    make_forest = ignore_warnings(LSHForest, category=DeprecationWarning)
    forest = make_forest(n_estimators=n_estimators)
    ignore_warnings(forest.fit)(X)

    # The training data is stored as-is.
    assert_array_equal(X, forest._fit_X)

    # One hash function per tree, each with 32 components.
    assert_equal(n_estimators, len(forest.hash_functions_))
    assert_equal(32, forest.hash_functions_[0].components_.shape[0])

    # One tree per estimator, each holding an entry for every sample.
    assert_equal(n_estimators, len(forest.trees_))
    assert_equal(n_samples, len(forest.trees_[0]))

    # One array of original indices per tree, again one entry per sample.
    assert_equal(n_estimators, len(forest.original_indices_))
    assert_equal(n_samples, len(forest.original_indices_[0]))

Source File: test_approximate.py From twitter-stock-recommendation with MIT License

5 votes

def test_partial_fit():
    """Check that `partial_fit` keeps the fitted attributes consistent."""
    n_samples, n_samples_partial_fit = 12, 3
    n_features = 2
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    X_partial_fit = rng.rand(n_samples_partial_fit, n_features)
    n_total = n_samples + n_samples_partial_fit

    forest = ignore_warnings(LSHForest, category=DeprecationWarning)()

    # On an unfitted estimator, partial_fit stores the data it is given.
    ignore_warnings(forest.partial_fit)(X)
    assert_array_equal(X, forest._fit_X)

    ignore_warnings(forest.fit)(X)

    # Data with a different number of features must be rejected.
    wrong_width = np.random.randn(n_samples_partial_fit, n_features - 1)
    assert_raises(ValueError, forest.partial_fit, wrong_width)

    ignore_warnings(forest.partial_fit)(X_partial_fit)

    # After insertion, the stored data has one row per original sample plus
    # one per inserted sample.
    assert_equal(forest._fit_X.shape[0], n_total)
    # The first array of original indices grew by the same amount.
    assert_equal(len(forest.original_indices_[0]), n_total)
    # So did the second tree.
    assert_equal(len(forest.trees_[1]), n_total)

Source File: test_approximate.py From twitter-stock-recommendation with MIT License

5 votes

def test_hash_functions():
    """Check that the per-tree hash functions are not all identical.

    The variance and mean of each individual hash function (projection
    vector) are compared with the variance and mean taken over all hash
    functions together. If every function were built from the same seed,
    those statistics would coincide.
    """
    n_samples, n_features = 12, 2
    n_estimators = 5
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    make_forest = ignore_warnings(LSHForest, category=DeprecationWarning)
    forest = make_forest(
        n_estimators=n_estimators,
        random_state=rng.randint(0, np.iinfo(np.int32).max))
    ignore_warnings(forest.fit)(X)

    all_components = [forest.hash_functions_[i].components_
                      for i in range(n_estimators)]

    # All variance checks run before any mean check.
    for i in range(n_estimators):
        own_components = forest.hash_functions_[i].components_
        assert_not_equal(np.var(all_components), np.var(own_components))

    for i in range(n_estimators):
        own_components = forest.hash_functions_[i].components_
        assert_not_equal(np.mean(all_components), np.mean(own_components))

Source File: test_approximate.py From twitter-stock-recommendation with MIT License

5 votes

def test_candidates():
    """Check the behaviour when too few candidates are found.

    The user must be warned when the number of candidates is smaller than
    the requested number of neighbors, and the requested number of
    neighbors must still be returned.
    """
    X_train = np.array([[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1],
                        [6, 10, 2]], dtype=np.float32)
    X_test = np.array([7, 10, 3], dtype=np.float32).reshape(1, -1)

    # (min_hash_match, n_neighbors) pairs: the first is the zero-candidates
    # case, the second the fewer-candidates-than-neighbors case.
    scenarios = ((32, 3), (31, 5))

    for min_hash_match, n_neighbors in scenarios:
        make_forest = ignore_warnings(LSHForest, category=DeprecationWarning)
        forest = make_forest(min_hash_match=min_hash_match)
        ignore_warnings(forest.fit)(X_train)

        message = ("Number of candidates is not sufficient to retrieve"
                   " %i neighbors with"
                   " min_hash_match = %i. Candidates are filled up"
                   " uniformly from unselected"
                   " indices." % (n_neighbors, min_hash_match))
        assert_warns_message(UserWarning, message, forest.kneighbors,
                             X_test, n_neighbors=n_neighbors)

        # The requested number of neighbors is still returned.
        distances, neighbors = forest.kneighbors(X_test,
                                                 n_neighbors=n_neighbors)
        assert_equal(distances.shape[1], n_neighbors)

Source File: test_approximate.py From twitter-stock-recommendation with MIT License

4 votes

def test_kneighbors():
    """Check that `kneighbors` returns the requested number of neighbors.

    The forest is built with `min_hash_match=0`, the setting under which
    the requested number of neighbors is expected to be returned.
    """
    n_samples, n_features = 12, 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    forest = ignore_warnings(LSHForest, category=DeprecationWarning)(
        min_hash_match=0)

    # Querying before fitting must fail.
    assert_raises(ValueError, forest.kneighbors, X[0])

    ignore_warnings(forest.fit)(X)

    for _ in range(n_iter):
        # Draw the neighbor count first, then the query row, so the random
        # stream is consumed in a fixed order.
        n_neighbors = rng.randint(0, n_samples)
        query = X[rng.randint(0, n_samples)].reshape(1, -1)
        found = forest.kneighbors(query, n_neighbors=n_neighbors,
                                  return_distance=False)
        assert_equal(found.shape[1], n_neighbors)

    # Several query points at once, with distances.
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = forest.kneighbors(queries, n_neighbors=1,
                                             return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)

    # Several query points at once, indices only.
    neighbors = forest.kneighbors(queries, n_neighbors=1,
                                  return_distance=False)
    assert_equal(neighbors.shape[0], n_queries)

    # A random point that is not part of the data set.
    query = rng.randn(n_features).reshape(1, -1)
    forest.kneighbors(query, n_neighbors=1, return_distance=False)

    # Without an explicit n_neighbors, the value set at initialization is
    # used.
    neighbors = forest.kneighbors(query, return_distance=False)
    assert_equal(neighbors.shape[1], 5)

    # Indices must have an integer dtype.
    assert_true(neighbors.dtype.kind == 'i',
                msg="neighbors are not in integer dtype.")

Source File: test_approximate.py From twitter-stock-recommendation with MIT License

4 votes

def test_radius_neighbors_boundary_handling():
    """Compare exact and LSH radius queries when a sample lies on the radius."""
    X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
    n_points = len(X)

    # Exact brute-force cosine model, used as the reference.
    nnbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    # LSHForest with hyperparameters chosen so that results are exact on
    # this toy dataset.
    lsfh = ignore_warnings(LSHForest, category=DeprecationWarning)(
        min_hash_match=0, n_candidates=n_points, random_state=42).fit(X)

    # Query aligned with the first axis, and its exact cosine distances to
    # the four samples.
    query = [[1., 0.]]
    dists = pairwise_distances(query, X, metric='cosine').ravel()

    # Sample 0 is almost aligned with the query: distance close to 0.
    assert_almost_equal(dists[0], 0, decimal=5)
    # Sample 1 is at 45 degrees to the query.
    assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))
    # Sample 2 is orthogonal to the query: distance exactly 1.
    assert_almost_equal(dists[2], 1)
    # Sample 3 points almost opposite to the query: distance close to the
    # maximum possible value of 2.
    assert_almost_equal(dists[3], 2, decimal=5)

    eps = np.finfo(np.float64).eps
    scenarios = (
        # Radius 1: sample 2 lies on the boundary and is included; only
        # the last sample is left out.
        (1, [0, 1, 2], dists[:-1]),
        # Slightly smaller radius: the boundary sample is now rejected.
        (1 - eps, [0, 1], dists[:-2]),
    )

    for radius, expected_idx, expected_dists in scenarios:
        exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=radius)
        approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=radius)

        assert_array_equal(np.sort(exact_idx[0]), expected_idx)
        assert_array_equal(np.sort(approx_idx[0]), expected_idx)
        assert_array_almost_equal(np.sort(exact_dists[0]), expected_dists)
        assert_array_almost_equal(np.sort(approx_dists[0]), expected_dists)

Python sklearn.neighbors.LSHForest() Examples