Python sklearn.neighbors Examples
The following are 22 code examples of the sklearn.neighbors module.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn, or try the search function.
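Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the most common entry point of the module, KNeighborsClassifier, on toy data:

# Minimal illustrative sketch of sklearn.neighbors.KNeighborsClassifier.
import sklearn.neighbors

X_train = [[0.0], [1.0], [2.0], [3.0]]   # toy 1-D features
y_train = [0, 0, 1, 1]                   # toy labels

clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)

print(clf.predict([[1.1]]))        # majority vote of the 3 nearest points -> [0]
print(clf.predict_proba([[0.9]]))  # per-class vote fractions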
Example #1
Source File: cub_utils.py From omgh with MIT License | 6 votes |
def _pre_calculate(self, force=False):
    if self.final_storage.check_exists(self.final_storage.instance_path) and not force:
        self.NNS = self.final_storage.load_instance(self.final_storage.instance_path)
    else:
        self.ssfeature_loader.setup()
        self.Xtrain = self.ssfeature_loader.load_train()
        self.Xtest = self.ssfeature_loader.load_test()
        if self.normalize:
            self.Xtrain = utils.l2_feat_norm(self.Xtrain)
            self.Xtest = utils.l2_feat_norm(self.Xtest)
        self.nn_model = sklearn.neighbors.NearestNeighbors(
            n_neighbors=self.n_neighbors, algorithm='ball_tree', metric='minkowski', p=2)
        self.nn_model.fit(self.Xtrain)
        self.NNS = self.nn_model.kneighbors(self.Xtest, self.n_neighbors, return_distance=False)
        self.final_storage.save_instance(self.final_storage.instance_path, self.NNS)

    # this needs change for larger n_neighbors
    if self.n_neighbors == 1:
        self.NNS = self.NNS.T[0]
    else:
        pass
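The method above caches, for every test sample, the indices of its nearest training samples. A stand-alone sketch of the same pattern outside the class context (data and names are illustrative, not from the omgh project):

import numpy as np
import sklearn.neighbors

Xtrain = np.random.RandomState(0).rand(100, 8)
Xtest = np.random.RandomState(1).rand(10, 8)

nn_model = sklearn.neighbors.NearestNeighbors(
    n_neighbors=1, algorithm='ball_tree', metric='minkowski', p=2)
nn_model.fit(Xtrain)

# return_distance=False yields only the index array, shape (n_test, n_neighbors).
NNS = nn_model.kneighbors(Xtest, 1, return_distance=False)
nearest_train_rows = NNS.T[0]  # same flattening as the example when n_neighbors == 1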
Example #2
Source File: embeddings.py From stanza-old with Apache License 2.0 | 6 votes |
def k_nearest_approx(self, vec, k):
    """Get the k nearest neighbors of a vector (in terms of cosine similarity).

    :param (np.array) vec: query vector
    :param (int) k: number of top neighbors to return

    :return (list[tuple[str, float]]): a list of (word, cosine similarity) pairs, in descending order
    """
    if not hasattr(self, 'lshf'):
        self.lshf = self._init_lsh_forest()
    # TODO(kelvin): make this inner product score, to be consistent with k_nearest
    distances, neighbors = self.lshf.kneighbors([vec], n_neighbors=k, return_distance=True)
    scores = np.subtract(1, distances)
    nbr_score_pairs = self._word_to_score(np.squeeze(neighbors), np.squeeze(scores))
    return sorted(nbr_score_pairs.items(), key=lambda x: x[1], reverse=True)
Example #3
Source File: models.py From jh-kaggle-util with Apache License 2.0 | 5 votes |
def run_sklearn():
    n_trees = 100
    n_folds = 3

    # https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
    alg_list = [
        ['rforest', RandomForestClassifier(n_estimators=1000, n_jobs=-1, verbose=1, max_depth=3)],
        ['extree', ExtraTreesClassifier(n_estimators=1000, max_depth=3, n_jobs=-1)],
        ['adaboost', AdaBoostClassifier(base_estimator=None, n_estimators=600, learning_rate=1.0)],
        ['knn', sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, n_jobs=-1)]
    ]

    start_time = time.time()

    for name, alg in alg_list:
        train = jhkaggle.train_sklearn.TrainSKLearn("1", name, alg, False)
        train.run()
        train = None
Example #4
Source File: baselines.py From rmnist with MIT License | 5 votes |
def baselines(n):
    td, vd, ts = data_loader.load_data(n)
    classifiers = [
        sklearn.svm.SVC(C=1000),
        sklearn.svm.SVC(kernel="linear", C=0.1),
        sklearn.neighbors.KNeighborsClassifier(1),
        sklearn.tree.DecisionTreeClassifier(),
        sklearn.ensemble.RandomForestClassifier(max_depth=10, n_estimators=500, max_features=1),
        sklearn.neural_network.MLPClassifier(alpha=1, hidden_layer_sizes=(500, 100))
    ]
    for clf in classifiers:
        clf.fit(td[0], td[1])
        print "\n{}: {}".format(type(clf).__name__, round(clf.score(vd[0], vd[1])*100, 2))
Example #5
Source File: transfer.py From rmnist with MIT License | 5 votes |
def transfer(n):
    td, vd, ts = data_loader.load_data(n, abstract=True, expanded=expanded)
    classifiers = [
        #sklearn.svm.SVC(),
        #sklearn.svm.SVC(kernel="linear", C=0.1),
        #sklearn.neighbors.KNeighborsClassifier(1),
        #sklearn.tree.DecisionTreeClassifier(),
        #sklearn.ensemble.RandomForestClassifier(max_depth=10, n_estimators=500, max_features=1),
        sklearn.neural_network.MLPClassifier(alpha=1.0, hidden_layer_sizes=(300,), max_iter=500)
    ]
    for clf in classifiers:
        clf.fit(td[0], td[1])
        print "\n{}: {}".format(type(clf).__name__, round(clf.score(vd[0], vd[1])*100, 2))
Example #6
Source File: embeddings.py From stanza-old with Apache License 2.0 | 5 votes |
def _init_lsh_forest(self):
    """Construct an LSH forest for nearest neighbor search."""
    import sklearn.neighbors
    lshf = sklearn.neighbors.LSHForest()
    lshf.fit(self.array)
    return lshf
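Note that LSHForest was deprecated in scikit-learn 0.19 and removed in 0.21, so this example only runs on older releases. On current versions a rough substitute is exact nearest-neighbor search with cosine distance; a sketch (not part of the stanza code, data is a stand-in for self.array):

import numpy as np
import sklearn.neighbors

array = np.random.RandomState(0).rand(1000, 50)   # stand-in for self.array

# Exact (not approximate) cosine nearest neighbors as an LSHForest replacement.
nn = sklearn.neighbors.NearestNeighbors(n_neighbors=10, metric='cosine')
nn.fit(array)
distances, neighbors = nn.kneighbors(array[:1], n_neighbors=5)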
Example #7
Source File: embeddings.py From stanza-old with Apache License 2.0 | 5 votes |
def k_nearest(self, vec, k):
    """Get the k nearest neighbors of a vector (in terms of highest inner products).

    :param (np.array) vec: query vector
    :param (int) k: number of top neighbors to return

    :return (list[tuple[str, float]]): a list of (word, score) pairs, in descending order
    """
    nbr_score_pairs = self.inner_products(vec)
    return sorted(nbr_score_pairs.items(), key=lambda x: x[1], reverse=True)[:k]
Example #8
Source File: test_cli.py From mlflow with Apache License 2.0 | 5 votes |
def sk_model(iris_data):
    x, y = iris_data
    knn_model = sklearn.neighbors.KNeighborsClassifier()
    knn_model.fit(x, y)
    return knn_model
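In the mlflow test, iris_data is a pytest fixture. A self-contained equivalent using the iris dataset bundled with scikit-learn (the fixture itself is assumed, not shown in the source):

import sklearn.datasets
import sklearn.neighbors

x, y = sklearn.datasets.load_iris(return_X_y=True)

knn_model = sklearn.neighbors.KNeighborsClassifier()
knn_model.fit(x, y)
print(knn_model.score(x, y))   # training accuracy, just to show the model is fitted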
Example #9
Source File: k_neighbors_classifier.py From lale with Apache License 2.0 | 5 votes |
def __init__(self, **hyperparams):
    self._hyperparams = hyperparams
    self._wrapped_model = sklearn.neighbors.KNeighborsClassifier(**self._hyperparams)
Example #10
Source File: utils.py From NeuroKit with MIT License | 5 votes |
def _get_embedded(signal, delay=1, dimension=2, r="default", distance="chebyshev", approximate=True, fuzzy=False):
    """Examples
    ----------
    >>> import neurokit2 as nk
    >>>
    >>> signal = nk.signal_simulate(duration=2, frequency=5)
    >>> delay = nk.complexity_delay(signal)
    >>>
    >>> embbeded, count = _get_embedded(signal, delay, r=0.2 * np.std(signal, ddof=1), dimension=2,
    ...                                 distance='chebyshev', approximate=False)
    """
    # Sanity checks
    if distance not in sklearn.neighbors.KDTree.valid_metrics:
        raise ValueError(
            "NeuroKit error: _get_embedded(): The given metric (%s) is not valid."
            "The valid metric names are: %s" % (distance, sklearn.neighbors.KDTree.valid_metrics)
        )

    # Get embedded
    embedded = complexity_embedding(signal, delay=delay, dimension=dimension)
    if approximate is False:
        embedded = embedded[:-1]  # Removes the last line

    if fuzzy is False:
        # Get neighbors count
        count = _get_count(embedded, r=r, distance=distance)
    else:
        # FuzzyEn: Remove the local baselines of vectors
        embedded -= np.mean(embedded, axis=1, keepdims=True)
        count = _get_count_fuzzy(embedded, r=r, distance=distance, n=1)
    return embedded, count


# =============================================================================
# Get Count
# =============================================================================
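The sanity check above relies on KDTree.valid_metrics, the list of metric names a KDTree accepts (how that list is exposed has shifted slightly across scikit-learn releases, so treat the attribute access as version-dependent). A small stand-alone sketch of building and querying a KDTree with the Chebyshev metric, which is what _get_count does further down:

import numpy as np
import sklearn.neighbors

embedded = np.random.RandomState(0).rand(200, 2)   # stand-in for the embedded signal

kdtree = sklearn.neighbors.KDTree(embedded, metric="chebyshev")
# Count, for every point, how many points lie within radius r of it.
counts = kdtree.query_radius(embedded, r=0.2, count_only=True)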
Example #11
Source File: graph.py From TextCategorization with MIT License | 5 votes |
def distance_lshforest(z, k=4, metric='cosine'):
    """Return an approximation of the k-nearest cosine distances."""
    assert metric is 'cosine'
    lshf = sklearn.neighbors.LSHForest()
    lshf.fit(z)
    dist, idx = lshf.kneighbors(z, n_neighbors=k + 1)
    assert dist.min() < 1e-10
    dist[dist < 0] = 0
    return dist, idx

# TODO: other ANNs s.a. NMSLIB, EFANNA, FLANN, Annoy, sklearn neighbors, PANN
Example #12
Source File: field_based_ml_field_detection.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 5 votes |
def init_classifier_impl(field_code: str, init_script: str):
    if init_script is not None:
        init_script = init_script.strip()

    if not init_script:
        from sklearn import tree as sklearn_tree
        return sklearn_tree.DecisionTreeClassifier()

    from sklearn import tree as sklearn_tree
    from sklearn import neural_network as sklearn_neural_network
    from sklearn import neighbors as sklearn_neighbors
    from sklearn import svm as sklearn_svm
    from sklearn import gaussian_process as sklearn_gaussian_process
    from sklearn.gaussian_process import kernels as sklearn_gaussian_process_kernels
    from sklearn import ensemble as sklearn_ensemble
    from sklearn import naive_bayes as sklearn_naive_bayes
    from sklearn import discriminant_analysis as sklearn_discriminant_analysis
    from sklearn import linear_model as sklearn_linear_model

    eval_locals = {
        'sklearn_linear_model': sklearn_linear_model,
        'sklearn_tree': sklearn_tree,
        'sklearn_neural_network': sklearn_neural_network,
        'sklearn_neighbors': sklearn_neighbors,
        'sklearn_svm': sklearn_svm,
        'sklearn_gaussian_process': sklearn_gaussian_process,
        'sklearn_gaussian_process_kernels': sklearn_gaussian_process_kernels,
        'sklearn_ensemble': sklearn_ensemble,
        'sklearn_naive_bayes': sklearn_naive_bayes,
        'sklearn_discriminant_analysis': sklearn_discriminant_analysis
    }
    return eval_script('classifier init script of field {0}'.format(field_code), init_script, eval_locals)
Example #13
Source File: graph.py From gconvRNN with MIT License | 5 votes |
def distance_lshforest(z, k=4, metric='cosine'):
    """Return an approximation of the k-nearest cosine distances."""
    assert metric is 'cosine'
    lshf = sklearn.neighbors.LSHForest()
    lshf.fit(z)
    dist, idx = lshf.kneighbors(z, n_neighbors=k+1)
    assert dist.min() < 1e-10
    dist[dist < 0] = 0
    return dist, idx

# TODO: other ANNs s.a. NMSLIB, EFANNA, FLANN, Annoy, sklearn neighbors, PANN
Example #14
Source File: similarity_encoder.py From dirty_cat with BSD 3-Clause "New" or "Revised" License | 5 votes |
def get_kmeans_prototypes(X, n_prototypes, hashing_dim=128, ngram_range=(3, 3), sparse=False,
                          sample_weight=None, random_state=None):
    """
    Computes prototypes based on:
      - dimensionality reduction (via hashing n-grams)
      - k-means clustering
      - nearest neighbor
    """
    vectorizer = HashingVectorizer(analyzer='char', norm=None, alternate_sign=False,
                                   ngram_range=ngram_range, n_features=hashing_dim)
    projected = vectorizer.transform(X)
    if not sparse:
        projected = projected.toarray()
    kmeans = KMeans(n_clusters=n_prototypes, random_state=random_state)
    kmeans.fit(projected, sample_weight=sample_weight)
    centers = kmeans.cluster_centers_
    neighbors = NearestNeighbors()
    neighbors.fit(projected)
    indexes_prototypes = np.unique(neighbors.kneighbors(centers, 1)[-1])
    if indexes_prototypes.shape[0] < n_prototypes:
        warnings.warn('Final number of unique prototypes is lower than '
                      'n_prototypes (expected)')
    return np.sort(X[indexes_prototypes])
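The key trick above is snapping each k-means center to the closest real sample, so that the returned prototypes are actual entries of X rather than synthetic centroids. A reduced sketch of just that step on numeric data (names and data are illustrative, not the dirty_cat API):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

projected = np.random.RandomState(0).rand(500, 16)   # stand-in for hashed n-gram features

kmeans = KMeans(n_clusters=10, random_state=0).fit(projected)
centers = kmeans.cluster_centers_

neighbors = NearestNeighbors().fit(projected)
# kneighbors(centers, 1) -> (distances, indices); [-1] keeps the index array.
prototype_rows = np.unique(neighbors.kneighbors(centers, 1)[-1])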
Example #15
Source File: knn_matting.py From knn-matting with MIT License | 5 votes |
def knn_matte(img, trimap, mylambda=100):
    [m, n, c] = img.shape
    img, trimap = img/255.0, trimap/255.0
    foreground = (trimap > 0.99).astype(int)
    background = (trimap < 0.01).astype(int)
    all_constraints = foreground + background

    print('Finding nearest neighbors')
    a, b = np.unravel_index(np.arange(m*n), (m, n))
    feature_vec = np.append(np.transpose(img.reshape(m*n, c)), [a, b]/np.sqrt(m*m + n*n), axis=0).T
    nbrs = sklearn.neighbors.NearestNeighbors(n_neighbors=10, n_jobs=4).fit(feature_vec)
    knns = nbrs.kneighbors(feature_vec)[1]

    # Compute Sparse A
    print('Computing sparse A')
    row_inds = np.repeat(np.arange(m*n), 10)
    col_inds = knns.reshape(m*n*10)
    vals = 1 - np.linalg.norm(feature_vec[row_inds] - feature_vec[col_inds], axis=1)/(c+2)
    A = scipy.sparse.coo_matrix((vals, (row_inds, col_inds)), shape=(m*n, m*n))

    D_script = scipy.sparse.diags(np.ravel(A.sum(axis=1)))
    L = D_script - A
    D = scipy.sparse.diags(np.ravel(all_constraints[:, :, 0]))
    v = np.ravel(foreground[:, :, 0])
    c = 2*mylambda*np.transpose(v)
    H = 2*(L + mylambda*D)

    print('Solving linear system for alpha')
    warnings.filterwarnings('error')
    alpha = []
    try:
        alpha = np.minimum(np.maximum(scipy.sparse.linalg.spsolve(H, c), 0), 1).reshape(m, n)
    except Warning:
        x = scipy.sparse.linalg.lsqr(H, c)
        alpha = np.minimum(np.maximum(x[0], 0), 1).reshape(m, n)
    return alpha
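The matting code builds the sparse affinity matrix A by hand from the kneighbors output. For the plain "sparse k-NN graph" part of that construction, scikit-learn also provides sklearn.neighbors.kneighbors_graph; a hedged sketch (the weights here are raw distances, not the 1 - ||.||/(c+2) similarity used above, and the data is a stand-in for the pixel features):

import numpy as np
import sklearn.neighbors

feature_vec = np.random.RandomState(0).rand(1000, 5)   # stand-in for the per-pixel feature vectors

# Sparse (n_samples, n_samples) matrix holding the distances to the 10 nearest neighbors.
G = sklearn.neighbors.kneighbors_graph(feature_vec, n_neighbors=10, mode='distance', n_jobs=4)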
Example #16
Source File: advanced_supvervised_model_trainer.py From healthcareai-py with MIT License | 5 votes |
def knn(self, scoring_metric='roc_auc', hyperparameter_grid=None, randomized_search=True,
        number_iteration_samples=10):
    """
    A light wrapper for Sklearn's knn classifier that performs randomized search over an overridable default
    hyperparameter grid.

    Args:
        scoring_metric (str): Any sklearn scoring metric appropriate for classification
        hyperparameter_grid (dict): hyperparameters by name
        randomized_search (bool): True for randomized search (default)
        number_iteration_samples (int): Number of models to train during the randomized search for exploring the
            hyperparameter space. More may lead to a better model, but will take longer.

    Returns:
        TrainedSupervisedModel:
    """
    self.validate_classification('KNN')
    if hyperparameter_grid is None:
        neighbors = list(range(5, 26))
        hyperparameter_grid = {'n_neighbors': neighbors, 'weights': ['uniform', 'distance']}
        number_iteration_samples = 10

        print('KNN Grid: {}'.format(hyperparameter_grid))
    algorithm = get_algorithm(KNeighborsClassifier,
                              scoring_metric,
                              hyperparameter_grid,
                              randomized_search,
                              number_iteration_samples=number_iteration_samples)

    trained_supervised_model = self._create_trained_supervised_model(algorithm)

    return trained_supervised_model
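get_algorithm is a healthcareai helper; the same idea can be reproduced with scikit-learn's RandomizedSearchCV directly. A sketch on assumed toy data (not the healthcareai API; accuracy is used as the scorer here because iris is multiclass):

import sklearn.datasets
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

x, y = sklearn.datasets.load_iris(return_X_y=True)

hyperparameter_grid = {'n_neighbors': list(range(5, 26)),
                       'weights': ['uniform', 'distance']}

# Sample 10 hyperparameter combinations at random and keep the best by CV score.
search = RandomizedSearchCV(KNeighborsClassifier(),
                            param_distributions=hyperparameter_grid,
                            n_iter=10,
                            scoring='accuracy',
                            cv=3)
search.fit(x, y)
print(search.best_params_)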
Example #17
Source File: mutual_information.py From NeuroKit with MIT License | 5 votes |
def _entropy(X, k=1):
    """Returns the entropy of X.

    From https://gist.github.com/GaelVaroquaux/ead9898bd3c973c40429.

    Parameters
    ----------
    X : array-like or shape (n_samples, n_features)
        The data the entropy of which is computed
    k : int (optional)
        number of nearest neighbors for density estimation

    Returns
    -------
    float
        entropy of X.

    Notes
    -----
    - Kozachenko, L. F. & Leonenko, N. N. 1987 Sample estimate of entropy of a random vector.
      Probl. Inf. Transm. 23, 95-101.
    - Evans, D. 2008 A computationally efficient estimator for mutual information,
      Proc. R. Soc. A 464 (2093), 1203-1215.
    - Kraskov A, Stogbauer H, Grassberger P. (2004). Estimating mutual information.
      Phys Rev E 69(6 Pt 2):066138.
    """
    # Distance to kth nearest neighbor
    r = _nearest_distances(X, k)  # squared distances
    n, d = X.shape
    volume_unit_ball = (np.pi ** (0.5 * d)) / scipy.special.gamma(0.5 * d + 1)

    # Perez-Cruz et al. (2008). Estimation of Information Theoretic Measures for
    # Continuous Random Variables, suggests returning:
    # return d*mean(log(r))+log(volume_unit_ball)+log(n-1)-log(k)
    return (
        d * np.mean(np.log(r + np.finfo(X.dtype).eps))
        + np.log(volume_unit_ball)
        + scipy.special.psi(n)
        - scipy.special.psi(k)
    )
Example #18
Source File: mutual_information.py From NeuroKit with MIT License | 5 votes |
def _nearest_distances(X, k=1):
    """From https://gist.github.com/GaelVaroquaux/ead9898bd3c973c40429

    X = array(N,M)
    N = number of points
    M = number of dimensions

    returns the distance to the kth nearest neighbor for every point in X
    """
    knn = sklearn.neighbors.NearestNeighbors(n_neighbors=k + 1)
    knn.fit(X)
    d, _ = knn.kneighbors(X)  # the first nearest neighbor is itself
    return d[:, -1]  # returns the distance to the kth nearest neighbor
Example #19
Source File: utils.py From NeuroKit with MIT License | 5 votes |
def _get_count_fuzzy(embedded, r, distance="chebyshev", n=1):
    dist = sklearn.neighbors.DistanceMetric.get_metric(distance)
    dist = dist.pairwise(embedded)

    if n > 1:
        sim = np.exp(-(dist ** n) / r)
    else:
        sim = np.exp(-dist / r)
    # Return the count
    return np.sum(sim, axis=0)


# =============================================================================
# Get R
# =============================================================================
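sklearn.neighbors.DistanceMetric is the import path used by this NeuroKit version; newer scikit-learn releases expose the same class as sklearn.metrics.DistanceMetric and deprecate the neighbors path, so treat the exact import as version-dependent. A minimal sketch of the pairwise call on stand-in data:

import numpy as np
from sklearn.metrics import DistanceMetric   # sklearn.neighbors.DistanceMetric on older versions

embedded = np.random.RandomState(0).rand(50, 3)

dist = DistanceMetric.get_metric("chebyshev")
pairwise = dist.pairwise(embedded)   # (50, 50) matrix of Chebyshev distances
sim = np.exp(-pairwise / 0.2)        # fuzzy membership, as in _get_count_fuzzy with n=1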
Example #20
Source File: utils.py From NeuroKit with MIT License | 5 votes |
def _get_count(embedded, r, distance="chebyshev"):
    kdtree = sklearn.neighbors.KDTree(embedded, metric=distance)
    # Return the count
    return kdtree.query_radius(embedded, r, count_only=True).astype(np.float64)
Example #21
Source File: runDBSCAN.py From simsearch with MIT License | 4 votes |
def findEps(ssearch):
    """
    Find a good epsilon value to use.
    """

    ###########################################################################
    # Calculate nearest neighbors
    ###########################################################################

    # Create a nearest neighbors model--we need 2 nearest neighbors since the
    # nearest neighbor to a point is going to be itself.
    nbrs_model = NearestNeighbors(n_neighbors=2, algorithm='brute', metric='cosine').fit(ssearch.index.index)

    t0 = time.time()

    # Find nearest neighbors.
    distances, indices = nbrs_model.kneighbors(ssearch.index.index)

    elapsed = time.time() - t0
    print 'Took %.2f seconds' % elapsed

    distances = [d[1] for d in distances]
    indeces = [ind[1] for ind in indices]

    ###########################################################################
    # Histogram the nearest neighbor distances.
    ###########################################################################

    import matplotlib.pyplot as plt

    counts, bins, patches = plt.hist(distances, bins=16)
    plt.title("Nearest neighbor distances")
    plt.xlabel("Distance")
    plt.ylabel("Frequency")

    print '\n%d bins:' % len(counts)

    countAcc = 0
    num_points = len(ssearch.index.index)

    for i in range(0, len(counts)):
        countAcc += counts[i]

        # Calculate the percentage of values which fall below the upper limit
        # of this bin.
        prcnt = float(countAcc) / float(num_points) * 100.0

        print '  %.2f%% < %.2f' % (prcnt, bins[i + 1])
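The snippet above is Python 2 (print statements). The core idea, looking at every point's distance to its nearest other point and picking DBSCAN's eps from the "elbow" of that distribution, can be sketched in Python 3 as follows (ssearch.index.index is replaced by an assumed feature matrix):

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(500, 20)   # stand-in for ssearch.index.index

# 2 neighbors: the closest "neighbor" of each point is the point itself.
nbrs_model = NearestNeighbors(n_neighbors=2, algorithm='brute', metric='cosine').fit(X)
distances, indices = nbrs_model.kneighbors(X)

# Distance to the nearest *other* point, sorted for a k-distance ("elbow") plot.
k_distances = np.sort(distances[:, 1])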
Example #22
Source File: runDBSCAN.py From simsearch with MIT License | 4 votes |
def findMinPts(ssearch, eps):
    """
    Find a good value for MinPts.
    """

    ###########################################################################
    # Count neighbors within threshold
    ###########################################################################

    print 'Calculating pair-wise distances...'

    # Calculate pair-wise cosine distance for all documents.
    t0 = time.time()

    DD = sklearn.metrics.pairwise.cosine_distances(ssearch.index.index)

    elapsed = time.time() - t0
    print '    Took %.2f seconds' % elapsed

    print 'Counting number of neighbors...'
    t0 = time.time()

    # Create a list to hold the number of neighbors for each point.
    numNeighbors = [0]*len(DD)

    for i in range(0, len(DD)):
        dists = DD[i]

        count = 0
        for j in range(0, len(DD)):
            if (dists[j] < eps):
                count += 1

        numNeighbors[i] = count

    elapsed = time.time() - t0
    print '    Took %.2f seconds' % elapsed

    ###############################################################################
    # Histogram the nearest neighbor distances.
    ###############################################################################

    import matplotlib.pyplot as plt

    counts, bins, patches = plt.hist(numNeighbors, bins=60)
    plt.title("Number of neighbors")
    plt.xlabel("Number of neighbors")
    plt.ylabel("Frequency")

    print '\n%d bins:' % (len(bins) - 1)

    binsStr = ''
    for b in bins:
        binsStr += '  %0.2f' % b

    print binsStr
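The double loop over the full distance matrix can also be expressed with the neighbors API itself: radius_neighbors returns, for each point, the indices of all points within eps, and the lengths of those arrays are the neighbor counts. A hedged sketch using the same kind of stand-in data as above:

import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(500, 20)   # stand-in for ssearch.index.index
eps = 0.3

nn = NearestNeighbors(metric='cosine', algorithm='brute').fit(X)
neighborhoods = nn.radius_neighbors(X, radius=eps, return_distance=False)

# Number of neighbors within eps for each point (includes the point itself).
numNeighbors = np.array([len(ind) for ind in neighborhoods])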