Python Examples of faiss.IndexFlatL2

Source File: faissext.py From metric-learning-divide-and-conquer with GNU Lesser General Public License v3.0

7 votes

def test_knn_search(size=10000, gpu_id=None, dim=512, k=21):
    """Smoke-test and time a brute-force L2 k-NN search on random data.

    Builds a flat L2 index over ``size`` random float32 vectors, searches the
    index with those same vectors, and prints the build time, the search time,
    the result shapes and the first five result rows.

    Args:
        size: number of random vectors to index and to query with.
        gpu_id: ``None`` builds a CPU ``faiss.IndexFlatL2``; any other value is
            assigned to ``GpuIndexFlatConfig.device`` and a
            ``faiss.GpuIndexFlatL2`` is built instead.
        dim: dimensionality of the random vectors.
        k: number of neighbours requested per query.
    """
    x = np.random.rand(size, dim).astype('float32')
    d = x.shape[1]

    build_start = time.time()
    if gpu_id is None:
        index = faiss.IndexFlatL2(d)
    else:
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = gpu_id

        # Bound to a local name so the resources object outlives the index use below.
        resources = faiss.StandardGpuResources()
        index = faiss.GpuIndexFlatL2(resources, d, cfg)
    index.add(x)
    print('Index built in {} sec'.format(time.time() - build_start))

    # Restart the clock: the search figure must not include the build time.
    search_start = time.time()
    distances, I = index.search(x, k)
    print('Searched in {} sec'.format(time.time() - search_start))
    print(distances.shape)
    print(I.shape)
    print(distances[:5])
    print(I[:5])

Source File: faiss.py From gntp with MIT License

7 votes

def _build_approximate_index(self, data: np.ndarray):
            """Build an IVF-Flat index over ``data`` and store it in ``self.index``.

            The coarse quantizer and the IVF metric are chosen from
            ``self.kernel_name``: L2 for ``'rbf'``, inner product otherwise.
            The index is copied to GPU device 0 using ``self.resource``, then
            trained on and populated with ``data``.

            :param data: 2-D array; ``data.shape[1]`` is used as the vector
                dimensionality and ``data.shape[0]`` decides the list count.
            """
            dimensionality = data.shape[1]
            # 100 inverted lists, or only 2 when there are at most 100 vectors.
            nlist = 100 if data.shape[0] > 100 else 2

            if self.kernel_name in {'rbf'}:
                quantizer = faiss.IndexFlatL2(dimensionality)
                metric = faiss.METRIC_L2
            else:
                quantizer = faiss.IndexFlatIP(dimensionality)
                # IndexIVFFlat falls back to L2 when no metric is given, which
                # would disagree with the inner-product quantizer above.
                metric = faiss.METRIC_INNER_PRODUCT
            cpu_index_flat = faiss.IndexIVFFlat(quantizer, dimensionality, nlist, metric)

            gpu_index_ivf = faiss.index_cpu_to_gpu(self.resource, 0, cpu_index_flat)
            gpu_index_ivf.train(data)
            gpu_index_ivf.add(data)
            self.index = gpu_index_ivf

Source File: __init__.py From bbknn with MIT License

6 votes

def create_tree(data,approx,metric,use_faiss,n_trees):
	'''
	Build the nearest neighbour index for one batch and return it. Depending on the
	arguments this is an annoy index, a faiss flat L2 index, a cKDTree or a KDTree.
	All undescribed input as in ``bbknn.bbknn()``.

	Input
	-----
	data : ``numpy.array``
		PCA coordinates of a batch's cells to index.
	'''
	if approx:
		# approximate search: one annoy item per row, then build the forest
		index = AnnoyIndex(data.shape[1],metric=metric)
		for row in range(data.shape[0]):
			index.add_item(row,data[row,:])
		index.build(n_trees)
		return index
	if metric != 'euclidean':
		return KDTree(data,metric=metric)
	# exact euclidean search: faiss only when it is loaded and requested
	if 'faiss' in sys.modules and use_faiss:
		index = faiss.IndexFlatL2(data.shape[1])
		index.add(data)
		return index
	return cKDTree(data)

Source File: stat_utils.py From pytorch-metric-learning with MIT License

6 votes

def run_kmeans(x, nmb_clusters):
    """
    Cluster the rows of x with the Faiss k-means implementation.

    Args:
        x: data, a 2-D array (its shape is unpacked as ``n_data, d``)
        nmb_clusters (int): number of clusters
    Returns:
        list: one int per row of x, the id of the top-1 search result in the
            index that was used for training (i.e. the row's cluster id)
    """
    n_data, d = x.shape
    logging.info("running k-means clustering with k=%d"%nmb_clusters)
    logging.info("embedding dimensionality is %d"%d)

    # configure the clustering: 20 iterations, and a very large per-centroid
    # point cap
    clustering = faiss.Clustering(d, nmb_clusters)
    clustering.niter = 20
    clustering.max_points_per_centroid = 10000000

    # flat L2 index, moved to all GPUs when Faiss reports at least one
    assign_index = faiss.IndexFlatL2(d)
    if faiss.get_num_gpus() > 0:
        assign_index = faiss.index_cpu_to_all_gpus(assign_index)

    clustering.train(x, assign_index)
    nearest = assign_index.search(x, 1)[1]

    return [int(row[0]) for row in nearest]


# modified from https://github.com/facebookresearch/faiss/wiki/Faiss-building-blocks:-clustering,-PCA,-quantization

Source File: losses.py From SO-Net with MIT License

6 votes

def build_nn_index(self, database):
        '''
        Build a flat L2 Faiss index over ``database``.

        A CPU ``IndexFlatL2`` of size ``self.dimension`` is created and handed
        to ``faiss.index_cpu_to_gpu`` with ``self.res`` and ``self.opt.gpu_id``;
        the vectors are added to the index that call returns.

        :param database: numpy array of Nx3 (per the original author; not checked here)
        :return: the populated Faiss index returned by ``index_cpu_to_gpu``
        '''
        flat_cpu = faiss.IndexFlatL2(self.dimension)
        nn_index = faiss.index_cpu_to_gpu(self.res, self.opt.gpu_id, flat_cpu)
        nn_index.add(database)
        return nn_index

Source File: data.py From dynamicslearn with MIT License

6 votes

def cluster(vectorized, ncentroids, niter=50, verbose=True):
    """Reduce ``vectorized`` to the rows that lie closest to its k-means centroids.

    Runs Faiss k-means on the data, then looks up, for every centroid, the
    single nearest data row (L2) and returns those rows.

    Args:
        vectorized: 2-D array of vectors; ``shape[1]`` is the dimensionality.
            NOTE(review): Faiss presumably needs float32 input - confirm at
            the call site.
        ncentroids: number of k-means centroids, i.e. number of rows returned.
        niter: number of k-means iterations.
        verbose: forwarded to ``faiss.Kmeans``.

    Returns:
        Array of shape ``(ncentroids, d)`` holding one row of ``vectorized``
        per centroid.
    """
    import faiss

    x = vectorized
    d = x.shape[1]
    kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
    kmeans.train(x)

    index = faiss.IndexFlatL2(d)
    index.add(x)
    _, nearest = index.search(kmeans.centroids, 1)
    # `nearest` is (ncentroids, 1). Take its only column instead of calling a
    # bare squeeze(), which would also drop the row axis when ncentroids == 1.
    return x[nearest[:, 0]]

Source File: RecognitionThread.py From TUT-live-age-estimator with MIT License

6 votes

def initialize_celeb(self):
        """Load the celebrity feature network and build the celebrity search index.

        Loads ``feature_model.h5`` from ``self.siamesepath`` into
        ``self.siameseNet``, reads the ``features`` and ``path_ends`` datasets
        from the HDF5 file for ``self.celeb_dataset``, fills
        ``self.path_ends`` / ``self.celeb_files`` and stores a flat L2 Faiss
        index over the features in ``self.celeb_index``.
        """
        print("Initializing celebrity network...")

        # Custom layers / losses the saved model needs to be deserialised.
        custom_objects = {'relu6': keras.layers.ReLU(6.),
                          'DepthwiseConv2D': keras.layers.DepthwiseConv2D,
                          'lifted_struct_loss': lifted_struct_loss,
                          'triplet_loss': triplet_loss}
        with CustomObjectScope(custom_objects):
            model_file = os.path.join(self.siamesepath, "feature_model.h5")
            self.siameseNet = keras.models.load_model(model_file)

        self.siameseNet._make_predict_function()

        # Read the precomputed celebrity features
        features_file = self.siamesepath + os.sep + "features_" + self.celeb_dataset + ".h5"
        print("Reading celebrity data from {}...".format(features_file))

        with h5py.File(features_file, "r") as h5:
            feature_matrix = np.array(h5["features"]).astype(np.float32)
            self.path_ends = list(h5["path_ends"])
            # path_ends entries are bytes; decode and anchor them at visualization_path
            self.celeb_files = [
                os.path.join(self.visualization_path, end.decode("utf-8"))
                for end in self.path_ends
            ]

        print("Building index...")
        index = faiss.IndexFlatL2(feature_matrix.shape[1])
        self.celeb_index = index
        self.celeb_index.add(feature_matrix)

Source File: run_index.py From denspi with Apache License 2.0

6 votes

def train_coarse_quantizer(data, quantizer_path, num_clusters, hnsw=False, niter=10, cuda=False):
    """Cluster ``data`` and write an index over the centroids to ``quantizer_path``.

    Args:
        data: 2-D array of training vectors; ``shape[1]`` is the dimensionality.
        quantizer_path: destination passed to ``faiss.write_index``.
        num_clusters: number of centroids to train.
        hnsw: store the centroids in an ``IndexHNSWFlat`` (32 links,
            ``efSearch`` 128) instead of an ``IndexFlatL2``.
        niter: number of clustering iterations.
        cuda: run the clustering assignment index on GPU device 0.
    """
    d = data.shape[1]

    # Index used by the clustering itself, optionally moved to the GPU.
    assign_index = faiss.IndexFlatL2(d)
    if cuda:
        res = faiss.StandardGpuResources()
        assign_index = faiss.index_cpu_to_gpu(res, 0, assign_index)

    clus = faiss.Clustering(d, num_clusters)
    clus.verbose = True
    clus.niter = niter
    clus.train(data, assign_index)

    # The centroids come back as a flat float vector; restore the 2-D layout.
    flat_centroids = faiss.vector_float_to_array(clus.centroids)
    centroids = flat_centroids.reshape(num_clusters, d)

    if not hnsw:
        quantizer = faiss.IndexFlatL2(d)
    else:
        quantizer = faiss.IndexHNSWFlat(d, 32)
        quantizer.hnsw.efSearch = 128
        quantizer.train(centroids)
    quantizer.add(centroids)

    faiss.write_index(quantizer, quantizer_path)

Source File: reranking.py From Landmark2019-1st-and-3rd-Place-Solution with Apache License 2.0

5 votes

def __init__(self, database, method):
        """Create the flat index for ``method`` and fill it via ``self.add()``.

        ``method`` must be ``'cosine'`` (inner-product index) or ``'euclidean'``
        (L2 index); any other value raises ``KeyError``. When the
        ``CUDA_VISIBLE_DEVICES`` environment variable is set and non-empty, its
        value is printed and the index is moved to all GPUs.
        """
        super().__init__(database, method)
        index_types = {'cosine': faiss.IndexFlatIP,
                       'euclidean': faiss.IndexFlatL2}
        self.index = index_types[method](self.D)
        if os.environ.get('CUDA_VISIBLE_DEVICES'):
            visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
            print('CUDA', visible_devices)
            self.index = faiss.index_cpu_to_all_gpus(self.index)
        self.add()

Source File: stat_utils.py From pytorch-metric-learning with MIT License

5 votes

def get_knn(
    reference_embeddings, test_embeddings, k, embeddings_come_from_same_source=False
):
    """
    Look up, for every row of test_embeddings, its k nearest rows in
    reference_embeddings using a flat L2 Faiss index (moved to all GPUs when
    Faiss reports at least one).
    Args:
        reference_embeddings: numpy array of size (num_samples, dimensionality).
        test_embeddings: numpy array of size (num_samples2, dimensionality).
        k: int, number of nearest neighbors to find
        embeddings_come_from_same_source: if True, the first search result of
                                         each element (expected to be the
                                         element itself) is dropped.
    Returns:
        numpy array: indices of nearest k neighbors
        numpy array: corresponding distances
    """
    dim = reference_embeddings.shape[1]
    logging.info("running k-nn with k=%d"%k)
    logging.info("embedding dimensionality is %d"%dim)

    knn_index = faiss.IndexFlatL2(dim)
    if faiss.get_num_gpus() > 0:
        knn_index = faiss.index_cpu_to_all_gpus(knn_index)
    knn_index.add(reference_embeddings)

    # Always fetch one extra neighbour so the self-match can be discarded.
    distances, indices = knn_index.search(test_embeddings, k + 1)
    if embeddings_come_from_same_source:
        columns = slice(1, None)
    else:
        columns = slice(None, k)
    return indices[:, columns], distances[:, columns]


# modified from https://raw.githubusercontent.com/facebookresearch/deepcluster/

Source File: knn.py From diffusion with MIT License

5 votes

def __init__(self, database, method, M=128, nbits=8, nlist=316, nprobe=64):
        """Create, train and fill an IVF-PQ index for ``method``.

        ``method`` selects the coarse quantizer: ``'cosine'`` (inner product)
        or ``'euclidean'`` (L2). The index is trained on a random fifth of
        ``database`` (``self.N // 5`` rows), filled via ``self.add()`` and then
        has its ``nprobe`` set.
        """
        super().__init__(database, method)
        quantizer_types = {'cosine': faiss.IndexFlatIP,
                           'euclidean': faiss.IndexFlatL2}
        self.quantizer = quantizer_types[method](self.D)
        self.index = faiss.IndexIVFPQ(self.quantizer, self.D, nlist, M, nbits)

        # Random training subset: the first N // 5 entries of a permutation.
        shuffled = np.random.permutation(np.arange(self.N))
        train_rows = shuffled[:self.N // 5]
        samples = database[train_rows]
        print("[ANN] train")
        self.index.train(samples)
        self.add()
        self.index.nprobe = nprobe

Source File: knn.py From diffusion with MIT License

5 votes

def __init__(self, database, method):
        """Create the flat index for ``method`` and fill it via ``self.add()``.

        ``method`` must be ``'cosine'`` (inner-product index) or ``'euclidean'``
        (L2 index). The index is moved to all GPUs when the
        ``CUDA_VISIBLE_DEVICES`` environment variable is set and non-empty.
        """
        super().__init__(database, method)
        index_types = {'cosine': faiss.IndexFlatIP,
                       'euclidean': faiss.IndexFlatL2}
        flat_index = index_types[method](self.D)
        if os.environ.get('CUDA_VISIBLE_DEVICES'):
            flat_index = faiss.index_cpu_to_all_gpus(flat_index)
        self.index = flat_index
        self.add()

Source File: streamlit_demo.py From RecNN with Apache License 2.0

5 votes

def get_index():
    """Build L2, inner-product and cosine Faiss indexes over the movie embeddings.

    The embeddings come from the first value returned by ``get_embeddings()``
    and are converted to a float32 numpy array. The index dimensionality is
    taken from that matrix instead of being hard-coded.

    Returns:
        dict with keys ``'L2'``, ``'IP'`` and ``'COS'``. The ``'COS'`` index is
        an inner-product index over the L2-normalised rows.
    """
    import faiss
    from sklearn.preprocessing import normalize

    mov_mat, _, _ = get_embeddings()
    mov_mat = mov_mat.numpy().astype('float32')
    dim = mov_mat.shape[1]

    # test indexes
    indexL2 = faiss.IndexFlatL2(dim)
    indexIP = faiss.IndexFlatIP(dim)
    indexCOS = faiss.IndexFlatIP(dim)

    indexL2.add(mov_mat)
    indexIP.add(mov_mat)
    # Inner product over unit-length rows gives cosine similarity.
    indexCOS.add(normalize(mov_mat, axis=1, norm='l2'))
    return {'L2': indexL2, 'IP': indexIP, 'COS': indexCOS}

Source File: faiss.py From ann-benchmarks with MIT License

5 votes

def fit(self, X):
        """Train and populate an IVF-Flat (L2) index over ``X``.

        Rows are L2-normalised first when ``self._metric`` is ``'angular'``,
        and ``X`` is cast to float32 if it has another dtype. The quantizer is
        kept in ``self.quantizer`` and the finished index in ``self.index``;
        the number of inverted lists is ``self._n_list``.
        """
        if self._metric == 'angular':
            X = sklearn.preprocessing.normalize(X, axis=1, norm='l2')

        needs_cast = X.dtype != numpy.float32
        if needs_cast:
            X = X.astype(numpy.float32)

        dim = X.shape[1]
        self.quantizer = faiss.IndexFlatL2(dim)
        ivf_index = faiss.IndexIVFFlat(
            self.quantizer, dim, self._n_list, faiss.METRIC_L2)
        ivf_index.train(X)
        ivf_index.add(X)
        self.index = ivf_index

Source File: lossess.py From RL-GAN-Net with MIT License

5 votes

def build_nn_index(self, database):
        '''
        Build a flat L2 Faiss index over ``database``.

        A CPU ``IndexFlatL2`` of size ``self.dimension`` is created and handed
        to ``faiss.index_cpu_to_gpu`` with ``self.res`` and ``self.opt.gpu_id``;
        the vectors are added to the index that call returns.

        :param database: numpy array of Nx3 (per the original author; not checked here)
        :return: the populated Faiss index returned by ``index_cpu_to_gpu``
        '''
        flat_cpu = faiss.IndexFlatL2(self.dimension)
        nn_index = faiss.index_cpu_to_gpu(self.res, self.opt.gpu_id, flat_cpu)
        nn_index.add(database)
        return nn_index

Source File: lossess.py From RL-GAN-Net with MIT License

5 votes

def build_nn_index(self, database):
        '''
        Build a flat L2 Faiss index over ``database``.

        A CPU ``IndexFlatL2`` of size ``self.dimension`` is created and handed
        to ``faiss.index_cpu_to_gpu`` with ``self.res`` and ``self.opt.gpu_id``;
        the vectors are added to the index that call returns.

        :param database: numpy array of Nx3 (per the original author; not checked here)
        :return: the populated Faiss index returned by ``index_cpu_to_gpu``
        '''
        flat_cpu = faiss.IndexFlatL2(self.dimension)
        nn_index = faiss.index_cpu_to_gpu(self.res, self.opt.gpu_id, flat_cpu)
        nn_index.add(database)
        return nn_index

Source File: faiss.py From gntp with MIT License

5 votes

def _build_exact_index(self, data: np.ndarray):
            """Build an exact (flat) index over ``data`` and store it in ``self.index``.

            The flat index is L2 when ``self.kernel_name`` is ``'rbf'`` and
            inner-product otherwise; it is always kept in
            ``self.cpu_index_flat``. Unless ``self.cpu`` is truthy, the index
            that receives the data is a copy on GPU device 0 created with
            ``self.resource``.

            :param data: 2-D array; ``data.shape[1]`` is the dimensionality.
            """
            dimensionality = data.shape[1]

            use_l2 = self.kernel_name in {'rbf'}
            index_type = faiss.IndexFlatL2 if use_l2 else faiss.IndexFlatIP
            self.cpu_index_flat = index_type(dimensionality)

            if self.cpu:
                self.index = self.cpu_index_flat
            else:
                self.index = faiss.index_cpu_to_gpu(self.resource, 0, self.cpu_index_flat)
            self.index.add(data)

Source File: auxiliaries.py From Deep-Metric-Learning-Baselines with Apache License 2.0

5 votes

def recover_closest_one_dataset(feature_matrix_all, image_paths, save_path, n_image_samples=10, n_closest=3):
    """
    Provide sample recoveries.

    Randomly picks rows of the embedding matrix (with replacement, as
    np.random.choice does by default), retrieves the n_closest+1 nearest rows
    of each pick from the same matrix and saves a grid of the corresponding
    images. The first image of every grid row is marked red, the others green.

    Args:
        feature_matrix_all: np.ndarray [n_samples x embed_dim], full data embedding of test samples.
        image_paths:        list [n_samples], entries whose first element is the datapath
                            corresponding to <feature_matrix_all>
        save_path:          str, where to store sample image.
        n_image_samples:    Number of sample recoveries.
        n_closest:          Number of closest recoveries to show.
    Returns:
        Nothing!
    """
    image_paths = np.array([x[0] for x in image_paths])
    sample_idxs = np.random.choice(np.arange(len(feature_matrix_all)), n_image_samples)

    faiss_search_index = faiss.IndexFlatL2(feature_matrix_all.shape[-1])
    faiss_search_index.add(feature_matrix_all)
    # Only the sampled rows are plotted, so query just those rather than
    # searching the whole dataset against itself.
    _, closest_feature_idxs = faiss_search_index.search(feature_matrix_all[sample_idxs], n_closest+1)

    # [n_image_samples x (n_closest+1)] paths, one grid row per sample.
    sample_paths = image_paths[closest_feature_idxs]

    f,axes = plt.subplots(n_image_samples, n_closest+1)
    for i,(ax,plot_path) in enumerate(zip(axes.reshape(-1), sample_paths.reshape(-1))):
        ax.imshow(np.array(Image.open(plot_path)))
        ax.set_xticks([])
        ax.set_yticks([])
        # First column of each row is the top-1 hit (red), the rest are green.
        if i%(n_closest+1):
            ax.axvline(x=0, color='g', linewidth=13)
        else:
            ax.axvline(x=0, color='r', linewidth=13)
    f.set_size_inches(10,20)
    f.tight_layout()
    f.savefig(save_path)
    # Close this figure explicitly, not whichever figure happens to be current.
    plt.close(f)


####### RECOVER CLOSEST EXAMPLE IMAGES #######

Source File: matching_localfeatures.py From Landmark2019-1st-and-3rd-Place-Solution with Apache License 2.0

5 votes

def euclidean_search_test(fn_npy):
    """Rank the train features for every test feature and save the ranks.

    Loads the ensemble dataset, indexes ``feats_train`` in a flat L2 Faiss
    index, retrieves the top 100 train indices for each ``feats_test`` row and
    stores them with ``np.save`` at ``fn_npy`` (parent directories are created
    if needed).
    """
    ds = load_train_ensemble_dataset()

    flat_index = faiss.IndexFlatL2(ds['feats_train'].shape[1])
    flat_index.add(ds['feats_train'])

    all_ranks = flat_index.search(x=ds['feats_test'], k=100)[1]

    out_dir = Path(fn_npy).parent
    out_dir.mkdir(parents=True, exist_ok=True)
    np.save(fn_npy, all_ranks)

Source File: submit_recognition.py From Landmark2019-1st-and-3rd-Place-Solution with Apache License 2.0

5 votes

def get_df_and_dists(topk=100):
    """Search the ensembled train features for every test feature.

    Loads weighted, normalised ensemble features for six experiment runs via
    ``utils.prepare_ids_and_feats``, indexes the train features in a flat L2
    Faiss index and retrieves the ``topk`` nearest train entries per test row.

    Returns:
        df: DataFrame with the test ids in column ``id`` and the
            space-joined ids of the retrieved train entries in ``images``.
        dists: the distance matrix returned by the Faiss search.
    """
    # (experiment version, shared feature-directory suffix, ensemble weight)
    ensemble = [
        ('v19c', 'L2_ep4_scaleup_ep3_freqthresh-2_loss-cosface_pooling-G,G,G,G_verifythresh-30/', 0.5),
        ('v20c', 'L2_ep5_augmentation-middle_epochs-7_freqthresh-3_loss-arcface_verifythresh-30/', 1.0),
        ('v21c', 'L2_ep6_scaleup_ep5_augmentation-middle_epochs-7_freqthresh-3_loss-arcface_verifythresh-30/', 1.0),
        ('v22c', 'L2_ep4_scaleup_ep3_base_margin-0.4_freqthresh-2_verifythresh-30/', 0.5),
        ('v23c', 'L2_ep6_scaleup_ep5_augmentation-middle_epochs-7_freqthresh-3_verifythresh-30/', 1.0),
        ('v24c', 'L2_ep5_augmentation-middle_epochs-7_freqthresh-3_loss-cosface_verifythresh-30/', 1.0),
    ]
    test_dirs = [
        ROOT + 'exp/' + version + '/feats_test19_ms_' + suffix
        for version, suffix, _ in ensemble
    ]
    train_dirs = [
        ROOT + 'exp/' + version + '/feats_train_ms_' + suffix
        for version, suffix, _ in ensemble
    ]
    weights = [weight for _, _, weight in ensemble]

    logger.info('load ids and features.')
    ids_test, feats_test = utils.prepare_ids_and_feats(test_dirs, weights, normalize=True)
    ids_train, feats_train = utils.prepare_ids_and_feats(train_dirs, weights, normalize=True)
    logger.info('done.')

    logger.info('build index...')
    train_index = faiss.IndexFlatL2(feats_train.shape[1])
    train_index.add(feats_train)
    dists, topk_idx = train_index.search(x=feats_test, k=topk)
    logger.info('query search done.')

    # One row per test id; retrieved train ids are joined into a single string.
    df = pd.DataFrame(ids_test, columns=['id'])
    retrieved_ids = ids_train[topk_idx]
    df['images'] = np.apply_along_axis(' '.join, axis=1, arr=retrieved_ids)

    return df, dists

Source File: submit_retrieval.py From Landmark2019-1st-and-3rd-Place-Solution with Apache License 2.0

5 votes

def main():
    """Create the retrieval submission file and submit it with the kaggle CLI.

    Loads normalised index / test / train features, retrieves the 100 nearest
    index images per test image with a flat L2 Faiss index on all GPUs,
    passes the result through ``reranking_submission``, writes the gzipped
    CSV under ``ROOT + 'output/'`` and runs the kaggle submit command.
    """
    feature_root = '../exp/v2clean/'
    run_suffix = '_ms_L2_ep4_freqthresh-3_loss-arcface_verifythresh-30/'
    index_dirs = [feature_root + 'feats_index19' + run_suffix]
    test_dirs = [feature_root + 'feats_test19' + run_suffix]
    train_dirs = [feature_root + 'feats_train' + run_suffix]

    ids_index, feats_index = utils.prepare_ids_and_feats(index_dirs, normalize=True)
    ids_test, feats_test = utils.prepare_ids_and_feats(test_dirs, normalize=True)
    ids_train, feats_train = utils.prepare_ids_and_feats(train_dirs, normalize=True)

    print('build index...')
    flat_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(feats_index.shape[1]))
    flat_index.add(feats_index)
    dists, topk_idx = flat_index.search(x=feats_test, k=100)
    print('query search done.')

    # One row per test id; retrieved index ids are joined into a single string.
    subm = pd.DataFrame(ids_test, columns=['id'])
    retrieved_ids = ids_index[topk_idx]
    subm['images'] = np.apply_along_axis(' '.join, axis=1, arr=retrieved_ids)

    subm = reranking_submission(ids_index, feats_index,
                                ids_test, feats_test,
                                ids_train, feats_train,
                                subm, topk=100)

    output_name = ROOT + 'output/submit_retrieval.csv.gz'
    submission_columns = subm[['id', 'images']]
    submission_columns.to_csv(output_name, compression='gzip', index=False)
    print('saved to ' + output_name)

    # NOTE(review): the command is run through the shell; output_name is built
    # from ROOT, so a path containing spaces would break it.
    cmd = f'kaggle c submit -c landmark-retrieval-2019 -f {output_name} -m "" '
    print(cmd)
    subprocess.run(cmd, shell=True)

Source File: modelnet_shrec_loader.py From SO-Net with MIT License

5 votes

def build_nn_index(self, database):
        '''
        Build a flat L2 Faiss index of size ``self.dimension`` over ``database``.

        :param database: numpy array of Nx3 (per the original author; not checked here)
        :return: Faiss index, in CPU
        '''
        # the original author notes that the dimension is 3
        cpu_index = faiss.IndexFlatL2(self.dimension)
        cpu_index.add(database)
        return cpu_index

Source File: faissext.py From metric-learning-divide-and-conquer with GNU Lesser General Public License v3.0

5 votes

def compute_cluster_assignment(centroids, x):
    """Assign every row of ``x`` to its nearest centroid (L2).

    Args:
        centroids: 2-D array of centroids; must not be ``None``.
        x: 2-D array of query vectors.

    Returns:
        1-D array with the index of the nearest centroid for each row of ``x``.
    """
    assert centroids is not None, "should train before assigning"
    centroid_index = faiss.IndexFlatL2(centroids.shape[1])
    centroid_index.add(centroids)
    # search returns (distances, labels); only the labels are needed
    nearest = centroid_index.search(x, 1)[1]
    return nearest.ravel()

Source File: auxiliaries.py From Deep-Metric-Learning-Baselines with Apache License 2.0

5 votes

def recover_closest_inshop(query_feature_matrix_all, gallery_feature_matrix_all, query_image_paths, gallery_image_paths, save_path, n_image_samples=10, n_closest=3):
    """
    Provide sample recoveries.

    Randomly picks query rows (with replacement, as np.random.choice does by
    default), retrieves the n_closest nearest gallery rows of each pick and
    saves a grid of images: the query image first (marked red), followed by
    its gallery matches (marked green).

    Args:
        query_feature_matrix_all:   np.ndarray [n_query_samples x embed_dim], full data embedding of query samples.
        gallery_feature_matrix_all: np.ndarray [n_gallery_samples x embed_dim], full data embedding of gallery samples.
        query_image_paths:          list [n_samples], list of datapaths corresponding to <query_feature_matrix_all>
        gallery_image_paths:        list [n_samples], list of datapaths corresponding to <gallery_feature_matrix_all>
        save_path:          str, where to store sample image.
        n_image_samples:    Number of sample recoveries.
        n_closest:          Number of closest recoveries to show.
    Returns:
        Nothing!
    """
    query_image_paths, gallery_image_paths   = np.array(query_image_paths), np.array(gallery_image_paths)
    sample_idxs = np.random.choice(np.arange(len(query_feature_matrix_all)), n_image_samples)

    faiss_search_index = faiss.IndexFlatL2(gallery_feature_matrix_all.shape[-1])
    faiss_search_index.add(gallery_feature_matrix_all)
    # Only the sampled queries are plotted, so search just those.
    _, closest_feature_idxs = faiss_search_index.search(query_feature_matrix_all[sample_idxs], n_closest)

    # One row per sampled query: [query path, gallery match 1, ..., gallery match n_closest].
    # The rows are already in sample order. They must not be indexed with the
    # gallery indices again, which would pick unrelated rows or go out of bounds.
    sample_paths = np.concatenate(
        [query_image_paths[sample_idxs].reshape(-1,1), gallery_image_paths[closest_feature_idxs]],
        axis=-1)

    f,axes = plt.subplots(n_image_samples, n_closest+1)
    for i,(ax,plot_path) in enumerate(zip(axes.reshape(-1), sample_paths.reshape(-1))):
        ax.imshow(np.array(Image.open(plot_path)))
        ax.set_xticks([])
        ax.set_yticks([])
        # First column of each row is the query (red), the rest are matches (green).
        if i%(n_closest+1):
            ax.axvline(x=0, color='g', linewidth=13)
        else:
            ax.axvline(x=0, color='r', linewidth=13)
    f.set_size_inches(10,20)
    f.tight_layout()
    f.savefig(save_path)
    # Close this figure explicitly, not whichever figure happens to be current.
    plt.close(f)

Source File: matching_localfeatures.py From Landmark2019-1st-and-3rd-Place-Solution with Apache License 2.0

4 votes

def faiss_search_results_train_train(block_id=1, n_splits=1):
    """Find same-landmark neighbours among the train images for one block.

    The train set is cut into ``n_splits`` consecutive parts and the part with
    1-based number ``block_id`` is used as queries (the last part also takes
    any remainder). For every query the top 1000 train images are retrieved
    with a flat L2 Faiss index on all GPUs; up to 100 of them that share the
    query's landmark id are kept as ``rank:distance:image_id`` strings. The
    result is written as a gzipped CSV under ``ROOT + 'input/'``.
    """
    dataset = load_train_dataset()
    print('Loading train19 landmark dict')
    landmark_dict = load_train19_landmark_dict()

    size_train = dataset.feats_train.shape[0]
    part_size = int(size_train / n_splits)
    idx_train_start = (block_id - 1) * part_size
    # the final block absorbs whatever the integer split left over
    idx_train_end = size_train if block_id == n_splits else block_id * part_size

    gpu_index = faiss.index_cpu_to_all_gpus(
        faiss.IndexFlatL2(dataset.feats_train.shape[1]))
    gpu_index.add(dataset.feats_train)
    dists, topk_idx = gpu_index.search(
        x=dataset.feats_train[idx_train_start:idx_train_end],
        k=1000)

    df = pd.DataFrame(
        dataset.ids_train[idx_train_start:idx_train_end],
        columns=['id'])
    df['images'] = np.apply_along_axis(
        ' '.join, axis=1, arr=dataset.ids_train[topk_idx])

    print('generate sub')
    rows = []
    id_image_pairs = zip(df['id'], df['images'])
    for imidx, (query_id, images) in tqdm.tqdm(enumerate(id_image_pairs),
                                               total=len(df)):
        landmark_id = landmark_dict[query_id]
        same_landmark_images = []
        for rank, imid in enumerate(images.split(' ')):
            if landmark_id != landmark_dict[imid]:
                continue
            same_landmark_images.append(
                f'{rank}:{dists[imidx, rank]:.8f}:{imid}')
            # keep at most 100 matches per query
            if len(same_landmark_images) >= 100:
                break

        rows.append({
            'id': query_id,
            'landmark_id': landmark_id,
            'matched': ' '.join(same_landmark_images),
        })

    fn = (ROOT + 'input/' +
          f'train19_train19_faiss_search_same_landmarks_blk{block_id}.csv.gz')
    Path(fn).parent.mkdir(parents=True, exist_ok=True)

    print('to_csv')
    pd.DataFrame(rows).to_csv(fn, index=False, compression='gzip')

Source File: utils.py From DeMa-BWE with BSD 3-Clause "New" or "Revised" License

4 votes

def get_nn_avg_dist_mog(emb, query, knn):
    """
    Compute an average nearest-neighbor score for every query against the
    embeddings. Use Faiss if available.

    With Faiss, the result is the mean L2 distance returned by a flat index
    for a fixed top 10 neighbors (`knn` is not used on this path; the original
    author labels it an ad-hoc implementation). Without Faiss, the result is
    the mean of the `knn` largest dot products, computed in batches of 1024
    queries.

    Note from the original author: emb has divided sqrt(2) * var
    (not verifiable from this function).
    """
    if not FAISS_AVAILABLE:
        bs = 1024
        emb = emb.transpose(0, 1).contiguous()
        batch_means = []
        for start in range(0, query.shape[0], bs):
            scores = query[start:start + bs].mm(emb)
            best_scores, _ = scores.topk(knn, dim=1, largest=True, sorted=True)
            batch_means.append(best_scores.mean(1).cpu())
        return torch.cat(batch_means).numpy()

    emb = emb.cpu().numpy()
    query = query.cpu().numpy()
    dim = emb.shape[1]
    if hasattr(faiss, 'StandardGpuResources'):
        # gpu mode
        res = faiss.StandardGpuResources()
        config = faiss.GpuIndexFlatConfig()
        config.device = 0
        index = faiss.GpuIndexFlatL2(res, dim, config)
    else:
        # cpu mode
        index = faiss.IndexFlatL2(dim)
    index.add(emb)
    # Ad-hoc implementation: fixed neighbourhood size
    topK = 10
    distances, _ = index.search(query, topK)
    return distances.mean(1)

Source File: faissext.py From metric-learning-divide-and-conquer with GNU Lesser General Public License v3.0

4 votes

def find_nearest_neighbors(x, queries=None, k=5, gpu_id=None):
    """
    Find k nearest neighbors for each query example using a flat L2 Faiss
    index. Distances are computed using Squared Euclidean distance metric.

    Arguments:
    ----------
    x (ndarray): N examples to search within. Reshaped to [N x d] float32.
    queries (ndarray): find nearest neighbors for each query example. [M x d] matrix.
        If None, find the k nearest neighbors of each row of x
        (excluding the row itself).
    k (int): number of nearest neighbors to find.
    gpu_id (int): use CPU if None else use GPU with the specified id.

    Return
    nns (ndarray): Indices of the nearest neighbors. [M x k]
    distances (ndarray): Distances to the nearest neighbors. [M x k]

    Raises ValueError if gpu_id is neither None nor an int.
    """
    if not (gpu_id is None or isinstance(gpu_id, int)):
        raise ValueError('gpu_id must be None or int')
    x = np.asarray(x.reshape(x.shape[0], -1), dtype=np.float32)

    # When x is searched against itself one extra neighbor is fetched so the
    # self-match can be removed afterwards.
    remove_self = queries is None
    if remove_self:
        queries = x
        k += 1
    requested = k - int(remove_self)

    d = x.shape[1]

    tic = time.time()
    if gpu_id is None:
        logging.debug('FAISS: cpu::find {} nearest neighbors'.format(requested))
        index = faiss.IndexFlatL2(d)
    else:
        logging.debug('FAISS: gpu[{}]::find {} nearest neighbors'.format(gpu_id, requested))
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = gpu_id

        resources = faiss.StandardGpuResources()
        index = faiss.GpuIndexFlatL2(resources, d, cfg)
    index.add(x)
    distances, nns = index.search(queries, k)
    if remove_self:
        for row in range(nns.shape[0]):
            # columns whose neighbor is not the query row itself, in rank order
            keep = np.flatnonzero(nns[row, :] != row)
            # if the row never matched itself, drop the farthest neighbor instead
            if keep.size > k - 1:
                keep = keep[:-1]
            nns[row, :-1] = nns[row, keep]
            distances[row, :-1] = distances[row, keep]
        nns = nns[:, :-1]
        distances = distances[:, :-1]
    logging.debug('FAISS: Neighbors search total elapsed time: {:.2f} sec'.format(time.time() - tic))
    return nns, distances

Source File: faissext.py From metric-learning-divide-and-conquer with GNU Lesser General Public License v3.0

4 votes

def train_kmeans(x, num_clusters=1000, gpu_ids=None, niter=100, nredo=1, verbose=0):
    """
    Runs k-means clustering on the CPU or on one or several GPUs.

    Arguments:
    ----------
    x (ndarray): training vectors, [N x d]; must not contain NaN or Inf.
    num_clusters (int): number of centroids to train.
    gpu_ids (None, int or sequence of int): None trains with a CPU index; an
        int or a non-empty sequence selects the GPU device(s) to use.
    niter (int): number of k-means iterations.
    nredo (int): value assigned to the clustering's `nredo` setting.
    verbose: truthy to enable Faiss clustering output.

    Return
    centroids (ndarray): trained centroids, [num_clusters x d].

    Raises AssertionError if x contains NaN/Inf or gpu_ids is an empty sequence.
    """
    assert np.all(~np.isnan(x)), 'x contains NaN'
    assert np.all(np.isfinite(x)), 'x contains Inf'
    if isinstance(gpu_ids, int):
        gpu_ids = [gpu_ids]
    assert gpu_ids is None or len(gpu_ids)

    d = x.shape[1]
    kmeans = faiss.Clustering(d, num_clusters)
    kmeans.verbose = bool(verbose)
    kmeans.niter = niter
    kmeans.nredo = nredo

    # otherwise the kmeans implementation sub-samples the training set
    kmeans.max_points_per_centroid = 10000000

    if gpu_ids is None:
        index = faiss.IndexFlatL2(d)
    else:
        # `res` stays bound for the whole call so the GPU resources outlive training.
        res = [faiss.StandardGpuResources() for _ in gpu_ids]

        sub_indexes = []
        for resource, device in zip(res, gpu_ids):
            cfg = faiss.GpuIndexFlatConfig()
            cfg.useFloat16 = False
            cfg.device = device
            sub_indexes.append(faiss.GpuIndexFlatL2(resource, d, cfg))

        if len(sub_indexes) == 1:
            index = sub_indexes[0]
        else:
            # IndexProxy was renamed IndexReplicas in later Faiss releases;
            # use whichever name this build provides.
            replicas_type = getattr(faiss, 'IndexReplicas', None) or faiss.IndexProxy
            index = replicas_type()
            for sub_index in sub_indexes:
                index.addIndex(sub_index)

    # perform the training
    kmeans.train(x, index)
    centroids = faiss.vector_float_to_array(kmeans.centroids)

    # The objective history was previously read from `kmeans.obj` but never
    # used. That attribute is missing in newer Faiss releases, so it is no
    # longer read.
    return centroids.reshape(num_clusters, d)

Python faiss.IndexFlatL2() Examples