Python faiss.index_factory() Examples
The following are 10 code examples of faiss.index_factory().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module faiss, or try the search function.
Example #1
Source File: train.py From Mosaicer with MIT License | 8 votes |
def faiss_train(fn_feature, root_path, index_path='train.index', id_path='data.json'):
    """Build, train and persist an IVF-Flat inner-product index over per-folder features.

    Every entry of ``root_path`` is treated as one class.  ``fn_feature`` is
    called with the entry's full path and must return a 2-D array holding one
    feature vector per row.  All rows of a folder are stored in the index
    under that folder's enumeration position, and the position -> folder name
    mapping is written as JSON to ``id_path``.

    Args:
        fn_feature: callable taking a folder path and returning a 2-D array
            of feature vectors.
        root_path: directory whose entries are the class folders.
        index_path: file the trained index is written to.
        id_path: file the id -> folder-name mapping is written to.

    Returns:
        A ``(index, id_json)`` tuple: the trained index and the mapping that
        was written to ``id_path``.

    Raises:
        ValueError: if ``root_path`` has no entries, or if the folders yield
            no feature vectors at all.
    """
    folder_names = os.listdir(root_path)
    logging.info('directory %s ', folder_names)
    print(folder_names)

    id_json = {}
    feature_chunks = []
    id_chunks = []
    for idx, folder_name in enumerate(folder_names):
        id_json[str(idx)] = folder_name
        feature_val = fn_feature(os.path.join(root_path, folder_name))
        feature_chunks.append(feature_val)
        # Explicit int64 so the id array never degrades to a float array
        # (empty list) or to a platform-dependent integer width.
        id_chunks.append(
            np.asarray([idx] * feature_val.shape[0], dtype=np.int64))

    if not feature_chunks:
        raise ValueError(
            'no feature folders found in {!r}'.format(root_path))

    # Concatenate once instead of once per folder.  The chunks are reversed
    # because the previous implementation prepended each new folder, so the
    # row order inside the index stays the same.
    vals = np.concatenate(feature_chunks[::-1], axis=0)
    ids = np.concatenate(id_chunks[::-1], axis=0)

    N, dim = vals.shape
    if N == 0:
        raise ValueError(
            'no feature vectors were extracted from {!r}'.format(root_path))

    x = int(2 * math.sqrt(N))
    index_description = "IVF{x},Flat".format(x=x)
    # Use the real feature width rather than a hard-coded 7 * 7 * 512 so that
    # feature extractors with a different output size work as well.
    index = faiss.index_factory(dim, index_description,
                                faiss.METRIC_INNER_PRODUCT)
    index.train(vals)
    index.add_with_ids(vals, ids)
    faiss.write_index(index, index_path)

    with open(id_path, 'w', encoding='utf-8') as f:
        json.dump(id_json, f, ensure_ascii=False, indent=4)
    print(id_json)

    return index, id_json
Example #2
Source File: _faiss.py From mars with Apache License 2.0 | 6 votes |
def execute(cls, ctx, op):
    """Create a faiss index for ``op``, train it on the op's input and store it.

    The single input chunk is fetched from ``ctx`` through ``as_same_device``.
    The index is built with ``faiss.index_factory`` from the input's second
    dimension, ``op.faiss_index`` and ``op.faiss_metric_type``; the value
    returned by ``_store_index`` is written to ``ctx`` under the key of the
    op's first output.
    """
    chunks, device_id, _ = as_same_device(
        [ctx[op.input.key]], device=op.device, ret_extra=True)
    (samples,) = chunks

    with device(device_id):
        trained = faiss.index_factory(
            samples.shape[1], op.faiss_index, op.faiss_metric_type)

        if device_id < 0:
            # non-GPU path: train directly on the array
            trained.train(samples)
        else:  # pragma: no cover
            # GPU path: move the index first, then train through the
            # pointer-based ``train_c`` entry point.
            trained = _index_to_gpu(trained, device_id)
            trained.train_c(samples.shape[0],
                            _swig_ptr_from_cupy_float32_array(samples))

        ctx[op.outputs[0].key] = _store_index(
            ctx, op, trained, device_id)
Example #3
Source File: _faiss.py From mars with Apache License 2.0 | 5 votes |
def _execute_one_chunk(cls, ctx, op):
    """Create, train (when required) and fill a faiss index in a single step.

    The op's inputs are fetched from ``ctx`` via ``as_same_device`` (exactly
    one array is expected).  The finished index is handed to ``_store_index``
    and the result is stored in ``ctx`` under the key of the first output.
    """
    (vectors,), device_id, xp = as_same_device(
        [ctx[chunk.key] for chunk in op.inputs],
        device=op.device, ret_extra=True)

    with device(device_id):
        on_gpu = device_id >= 0

        # build the (still empty) index, moving it to the GPU if requested
        index = faiss.index_factory(
            vectors.shape[1], op.faiss_index, op.faiss_metric_type)
        if on_gpu:  # pragma: no cover
            index = _index_to_gpu(index, device_id)

        # indexes that report ``is_trained`` as false are trained on
        # ``op.n_sample`` rows drawn without replacement
        if not index.is_trained:
            assert op.n_sample is not None
            picked_rows = xp.random.choice(
                vectors.shape[0], size=op.n_sample, replace=False)
            index.train(vectors[picked_rows])

        if op.metric == 'cosine':
            # faiss has no direct cosine distance; the data has to be
            # normalized before it is added to the index, see
            # https://github.com/facebookresearch/faiss/wiki/FAQ#how-can-i-index-vectors-for-cosine-distance
            # NOTE(review): the training sample above is taken before this
            # normalisation -- confirm that this is intended.
            faiss.normalize_L2(vectors)

        # add the vectors to the index
        if on_gpu:  # pragma: no cover
            vectors = vectors.astype(np.float32, copy=False)
            index.add_c(vectors.shape[0],
                        _swig_ptr_from_cupy_float32_array(vectors))
        else:
            index.add(vectors)

        ctx[op.outputs[0].key] = _store_index(ctx, op, index, device_id)
Example #4
Source File: _faiss.py From mars with Apache License 2.0 | 5 votes |
def build_faiss_index(X, index_name='auto', n_sample=None, metric="euclidean",
                      random_state=None, same_distribution=True,
                      accuracy=False, memory_require=None, **kw):
    """Create the ``FaissBuildIndex`` operand for ``X`` and apply it.

    Args:
        X: data to index; converted with ``astensor``.
        index_name: faiss index-factory string, or ``'auto'``.  Any value
            other than ``'auto'`` is validated up front by instantiating it
            with ``faiss.index_factory``.
        n_sample: forwarded to ``FaissBuildIndex``.
        metric: must be a key of ``METRIC_TO_FAISS_METRIC_TYPE``.
        random_state: passed to ``check_random_state``; used to derive the
            operand's seed.
        same_distribution: forwarded to ``FaissBuildIndex``.
        accuracy: forwarded to ``FaissBuildIndex``.
        memory_require: ``None`` selects ``MemoryRequirementGrade.low``;
            anything else is converted with ``_get_memory_require``.
        **kw: extra keyword arguments forwarded to ``FaissBuildIndex``.

    Returns:
        The result of calling the operand on ``X``.

    Raises:
        ValueError: if ``metric`` is unknown, or if ``index_name`` is
            rejected by ``faiss.index_factory``.
    """
    X = astensor(X)

    if metric not in METRIC_TO_FAISS_METRIC_TYPE:
        raise ValueError('unknown metric: {}'.format(metric))

    if index_name != 'auto':
        # Dry-run the factory so that a bad index string fails here rather
        # than later during execution.
        try:
            faiss.index_factory(X.shape[1], index_name,
                                METRIC_TO_FAISS_METRIC_TYPE[metric])
        except RuntimeError as err:
            # Chain explicitly so the faiss error is reported as the cause.
            raise ValueError(
                'illegal faiss index: {}'.format(index_name)) from err

    rs = check_random_state(random_state)
    if isinstance(rs, RandomState):
        rs = rs.to_numpy()
    seed = gen_random_seeds(1, rs)[0]

    if memory_require is None:
        memory_require = MemoryRequirementGrade.low
    else:
        memory_require = _get_memory_require(memory_require)

    op = FaissBuildIndex(faiss_index=index_name, metric=metric,
                         n_sample=n_sample, gpu=X.op.gpu, seed=seed,
                         same_distribution=same_distribution,
                         accuracy=accuracy, memory_require=memory_require,
                         **kw)
    return op(X)
Example #5
Source File: faiss_gpu.py From ann-benchmarks with MIT License | 5 votes |
def fit(self, X):
    """Build a GPU IVF-Flat (L2) index from ``X``, train it and add ``X``.

    ``X`` is converted to float32 first.  The index is stored on
    ``self._index`` before training, and its probe count is set from
    ``self._n_probes`` once all vectors have been added.
    """
    vectors = X.astype(numpy.float32)
    dim = len(vectors[0])

    # An earlier variant (previously kept here as commented-out code) built
    # "IVF%d,Flat" % self._n_bits with faiss.index_factory and cloned it to
    # GPU 0 through faiss.index_cpu_to_gpu with float16 cloner options.
    self._index = faiss.GpuIndexIVFFlat(
        self._res, dim, self._n_bits, faiss.METRIC_L2)

    gpu_index = self._index
    gpu_index.train(vectors)
    gpu_index.add(vectors)
    gpu_index.setNumProbes(self._n_probes)
Example #6
Source File: cdp.py From capture_reid with Apache License 2.0 | 5 votes |
def cluster(features, th_knn, max_size=300, labels=None):
    """Cluster ``features`` from their 80 nearest neighbours.

    Unlike face-train, the clustering similarities used here are NOT passed
    through the ``1 - x`` conversion.

    An inner-product faiss index is built over ``features`` (the factory
    string depends on the number of rows), every row is queried against it,
    and the stacked similarity / neighbour arrays are handed to
    ``cluster_by_knns``.

    :param features: 2-D feature array, one vector per row
    :param th_knn: forwarded unchanged to ``cluster_by_knns``
    :param max_size: forwarded unchanged to ``cluster_by_knns``
    :param labels: forwarded unchanged to ``cluster_by_knns``
    :return: whatever ``cluster_by_knns`` returns
    """
    num_neighbors = 80
    nprobe = 8

    n_rows, n_dims = features.shape
    nlist = min(4096, 8 * round(math.sqrt(n_rows)))

    # choose the index layout from the dataset size (largest tier first)
    if n_rows >= 200 * 10000:
        fac_str = "IVF16384,PQ8"
    elif n_rows >= 80 * 10000:
        fac_str = "IVF16384,Flat"
    elif n_rows >= 4 * 10000:
        fac_str = "IVF" + str(nlist) + ",Flat"
    else:
        fac_str = "Flat"
    logger.info("cdp cluster fac str %s", fac_str)

    index = faiss.index_factory(n_dims, fac_str, faiss.METRIC_INNER_PRODUCT)
    index.train(features)
    index.nprobe = min(nprobe, nlist)
    assert index.is_trained
    logger.info('cdp cluster nlist: {}, nprobe: {}'.format(nlist, nprobe))
    index.add(features)

    sims, ners = index.search(features, k=num_neighbors)
    if "Flat" not in fac_str:
        # non-Flat storage: similarities are recomputed from the raw features
        sims = sim_by_feature(features, features, ners)

    # stack similarities and neighbour ids along a new middle axis
    sims_f32 = sims[:, np.newaxis].astype(np.float32)
    ners_f32 = ners[:, np.newaxis].astype(np.float32)
    knns = np.concatenate([sims_f32, ners_f32], axis=1)

    return cluster_by_knns(knns, features, th_knn, max_size, labels)
Example #7
Source File: _faiss.py From mars with Apache License 2.0 | 4 votes |
def _execute_map(cls, ctx, op):
    """Add one chunk of data to a faiss index and store the result in ``ctx``.

    When the op has two inputs, the second one refers to an already trained
    index that is loaded with ``_load_index``; otherwise a new index is
    created with ``faiss.index_factory``.  The filled index is passed to
    ``_store_index`` and the result is stored under the key of the first
    output.
    """
    (data,), device_id, _ = as_same_device(
        [ctx[op.inputs[0].key]], device=op.device, ret_extra=True)
    stored = ctx[op.inputs[1].key] if len(op.inputs) == 2 else None

    with device(device_id):
        on_gpu = device_id >= 0

        if stored is None:
            target = faiss.index_factory(
                data.shape[1], op.faiss_index, op.faiss_metric_type)
            # With ``same_distribution`` set the index is only created here;
            # otherwise the distributions differ and every chunk trains its
            # own index.
            if not op.same_distribution:
                target.train(data)

            # NOTE(review): the GPU move is kept inside the "fresh index"
            # branch -- confirm against the upstream layout.
            if on_gpu:  # pragma: no cover
                target = _index_to_gpu(target, device_id)
        else:
            # fetch the trained index
            target = _load_index(ctx, op, stored, device_id)
            if _get_index_type(op.return_index_type, ctx) == 'object':
                # clone a new one, because faiss does not guarantee
                # thread-safety for operations that change an index, see
                # https://github.com/facebookresearch/faiss/wiki/Threads-and-asynchronous-calls#thread-safety
                target = faiss.clone_index(target)

        if op.metric == 'cosine':
            # faiss has no direct cosine distance; the data has to be
            # normalized before it is added to the index, see
            # https://github.com/facebookresearch/faiss/wiki/FAQ#how-can-i-index-vectors-for-cosine-distance
            faiss.normalize_L2(data)

        # add the data to the index
        if on_gpu:  # pragma: no cover
            target.add_c(data.shape[0],
                         _swig_ptr_from_cupy_float32_array(data))
        else:
            target.add(data)

        ctx[op.outputs[0].key] = _store_index(ctx, op, target, device_id)
Example #8
Source File: test_faiss.py From mars with Apache License 2.0 | 4 votes |
def testGenIndexStringAndSampleCount(self):
    """Check the index string / sample count chosen for various sizes and memory grades."""
    d = 32

    # accuracy=True, could be Flat only
    ret = _gen_index_string_and_sample_count((10 ** 9, d), None, True, 'minimum')
    self.assertEqual(ret, ('Flat', None))

    # (shape, memory grade, expected result, index expected to be trained)
    valid_cases = [
        # no memory concern
        ((10 ** 5, d), 'maximum', ('HNSW32', None), True),
        # memory concern not much
        ((10 ** 5, d), 'high', ('IVF1580,Flat', 47400), False),
        # memory quite important
        ((5 * 10 ** 6, d), 'low',
         ('PCAR16,IVF65536_HNSW32,SQ8', 32 * 65536), False),
        # memory very important
        ((10 ** 8, d), 'minimum',
         ('OPQ16_32,IVF1048576_HNSW32,PQ16', 64 * 65536), False),
        ((10 ** 10, d), 'low',
         ('PCAR16,IVF1048576_HNSW32,SQ8', 64 * 65536), False),
    ]
    for shape, memory_grade, expected, expect_trained in valid_cases:
        ret = _gen_index_string_and_sample_count(
            shape, None, False, memory_grade)
        self.assertEqual(ret, expected)
        index = faiss.index_factory(d, ret[0])
        check = self.assertTrue if expect_trained else self.assertFalse
        check(index.is_trained)

    # (memory grade, extra keyword arguments) combinations that must fail
    invalid_cases = [
        ('maximum', {'M': 128}),            # M > 64 raise error
        ('minimum', {'M': 128}),            # M > 64
        ('minimum', {'M': 16, 'dim': 17}),  # dim should be multiple of M
        ('low', {'k': 5}),
    ]
    for memory_grade, extra in invalid_cases:
        with self.assertRaises(ValueError):
            _gen_index_string_and_sample_count(
                (10 ** 5, d), None, False, memory_grade, **extra)
Example #9
Source File: knn.py From learn-to-cluster with MIT License | 4 votes |
def __init__(self, feats, k, index_path='', index_key='', nprobe=128, omp_num_threads=None, rebuild_index=True, verbose=True, **kwargs):
    """Build (or load) a faiss index over ``feats`` and compute top-``k`` neighbours.

    The result is stored on ``self.knns``: either the ``'data'`` array of a
    cached ``<index_path>.npz`` file, or a list of
    ``(neighbour_ids, 1 - similarity)`` pairs, one per row of ``feats``.

    Args:
        feats: 2-D feature array, one vector per row.
        k: number of neighbours to retrieve per row.
        index_path: where the index is saved / loaded; ``''`` disables both
            the index file and the ``.npz`` neighbour cache.
        index_key: ``''`` for a flat inner-product index, ``'IVF'`` for an
            ``IndexIVFFlat`` on top of it, anything else is passed to
            ``faiss.index_factory``.  Keys containing ``'HNSW'`` are rejected.
        nprobe: upper bound for the ``nprobe`` set on non-flat indexes.
        omp_num_threads: if given, passed to ``faiss.omp_set_num_threads``.
        rebuild_index: when false and ``index_path`` exists, the index is
            read from disk instead of being rebuilt.
        verbose: stored on ``self.verbose`` and passed to ``Timer``.
        **kwargs: accepted and ignored.
    """
    import faiss

    if omp_num_threads is not None:
        faiss.omp_set_num_threads(omp_num_threads)
    self.verbose = verbose

    with Timer('[faiss] build index', verbose):
        if index_path != '' and not rebuild_index and os.path.exists(
                index_path):
            print('[faiss] read index from {}'.format(index_path))
            index = faiss.read_index(index_path)
        else:
            feats = feats.astype('float32')
            size, dim = feats.shape
            index = faiss.IndexFlatIP(dim)
            if index_key != '':
                assert index_key.find(
                    'HNSW') < 0, 'HNSW returns distances insted of sims'
                metric = faiss.METRIC_INNER_PRODUCT
                nlist = min(4096, 8 * round(math.sqrt(size)))
                if index_key == 'IVF':
                    # reuse the flat index as the coarse quantizer
                    quantizer = index
                    index = faiss.IndexIVFFlat(quantizer, dim, nlist,
                                               metric)
                else:
                    index = faiss.index_factory(dim, index_key, metric)
                if index_key.find('Flat') < 0:
                    assert not index.is_trained
                index.train(feats)
                index.nprobe = min(nprobe, nlist)
                assert index.is_trained
                print('nlist: {}, nprobe: {}'.format(nlist, nprobe))
            index.add(feats)
            if index_path != '':
                print('[faiss] save index to {}'.format(index_path))
                mkdir_if_no_exists(index_path)
                faiss.write_index(index, index_path)

    with Timer('[faiss] query topk {}'.format(k), verbose):
        knn_ofn = index_path + '.npz'
        # Only consult the cache when an index path was given; otherwise a
        # stray ".npz" file in the working directory would be picked up.
        if index_path != '' and os.path.exists(knn_ofn):
            print('[faiss] read knns from {}'.format(knn_ofn))
            # close the archive once the array has been read
            with np.load(knn_ofn) as cached:
                self.knns = cached['data']
        else:
            # The query needs float32 too; when the index was loaded from
            # disk ``feats`` has not been converted yet.
            feats = feats.astype('float32', copy=False)
            sims, nbrs = index.search(feats, k=k)
            self.knns = [(np.array(nbr, dtype=np.int32),
                          1 - np.array(sim, dtype=np.float32))
                         for nbr, sim in zip(nbrs, sims)]
Example #10
Source File: faiss_gpu.py From learn-to-cluster with MIT License | 4 votes |
def __init__(self, target, nprobe=128, index_factory_str=None, verbose=False, mode='proxy', using_gpu=True):
    """Build a multi-GPU faiss index over ``target`` and store it on ``self.index``.

    Args:
        target: 2-D array of vectors to index; row ``i`` is added with id ``i``.
        nprobe: ``nprobe`` set on the CPU index before it is cloned.
        index_factory_str: faiss index-factory string.  ``None`` selects
            ``"IVF<n>,PQ32"`` with ``n = min(8192, 16 * round(sqrt(size)))``.
        verbose: assigned to the final index's ``verbose`` attribute.
        mode: ``'proxy'`` registers one clone per GPU on an ``IndexProxy``;
            ``'shard'`` uses ``index_cpu_to_all_gpus`` with ``shard = True``.
        using_gpu: only consulted in ``'proxy'`` mode; when false the CPU
            index itself is used instead of GPU clones.

    Raises:
        KeyError: if ``mode`` is neither ``'proxy'`` nor ``'shard'``.
    """
    import re  # local import: only needed to parse the factory string

    self._res_list = []
    num_gpu = faiss.get_num_gpus()
    print('[faiss gpu] #GPU: {}'.format(num_gpu))

    size, dim = target.shape
    assert size > 0, "size: {}".format(size)

    if index_factory_str is None:
        # int() keeps the factory string integral even if round() hands
        # back a NumPy float (which would format as e.g. "IVF464.0").
        default_nlist = min(8192, 16 * int(round(np.sqrt(size))))
        index_factory_str = "IVF{},PQ{}".format(default_nlist, 32)
    cpu_index = faiss.index_factory(dim, index_factory_str)
    cpu_index.nprobe = nprobe

    if mode == 'proxy':
        co = faiss.GpuClonerOptions()
        co.useFloat16 = True
        co.usePrecomputed = False

        index = faiss.IndexProxy()
        if using_gpu:
            for i in range(num_gpu):
                res = faiss.StandardGpuResources()
                # keep the resources referenced for the index's lifetime
                self._res_list.append(res)
                index.addIndex(
                    faiss.index_cpu_to_gpu(res, i, cpu_index, co))
        else:
            # Register the CPU index exactly once: adding the same object
            # once per GPU would feed it every vector several times, and
            # with zero GPUs the proxy would stay empty.
            index.addIndex(cpu_index)
    elif mode == 'shard':
        co = faiss.GpuMultipleClonerOptions()
        co.useFloat16 = True
        co.usePrecomputed = False
        co.shard = True
        index = faiss.index_cpu_to_all_gpus(cpu_index, co, ngpu=num_gpu)
    else:
        raise KeyError("Unknown index mode")

    index = faiss.IndexIDMap(index)
    index.verbose = verbose

    # training
    if not index.is_trained:
        # nlist decides how many samples are used for training; it is only
        # parsed when training is actually required, and the regex also
        # copes with components such as "IVF4096_HNSW32".
        ivf_match = re.search(r'IVF(\d+)', index_factory_str)
        if ivf_match is not None:
            nlist = int(ivf_match.group(1))
            indexes_sample_for_train = np.random.randint(
                0, size, nlist * 256)
            index.train(target[indexes_sample_for_train])
        else:
            # no IVF component to size the sample from: train on all rows
            index.train(target)

    # add with ids
    target_ids = np.arange(0, size)
    index.add_with_ids(target, target_ids)
    self.index = index