Python faiss.write_index() Examples
The following are 13 code examples of faiss.write_index(), drawn from open-source projects. To see each example in context, follow the link to the original project and source file above it. For the full set of available functions and classes, see the faiss module itself.
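Before the examples, here is a minimal sketch of the basic round trip with faiss.write_index() and faiss.read_index(); the file name and data below are illustrative, not taken from any project on this page:

import faiss
import numpy as np

# build a small exact (flat) L2 index over random vectors
d = 64
xb = np.random.rand(1000, d).astype('float32')
index = faiss.IndexFlatL2(d)
index.add(xb)

# persist the index to disk, then load it back
faiss.write_index(index, 'example.index')   # illustrative file name
restored = faiss.read_index('example.index')
assert restored.ntotal == index.ntotal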
Example #1
Source File: train.py From Mosaicer with MIT License
def faiss_train(fn_feature, root_path, index_path='train.index', id_path='data.json'):
    folder_names = os.listdir(root_path)
    logging.info('directory %s ', folder_names)
    ids = None
    vals = None
    id_json = {}
    print(folder_names)
    # accumulate features and matching integer ids, one label per folder
    for idx, folder_name in enumerate(folder_names):
        id_json[str(idx)] = folder_name
        now_path = os.path.join(root_path, folder_name)
        feature_val = fn_feature(now_path)
        vals = np.concatenate((feature_val, vals), axis=0) if vals is not None else feature_val
        id_np = np.asarray([idx] * feature_val.shape[0])
        ids = np.concatenate((id_np, ids), axis=0) if ids is not None else id_np
    N, dim = vals.shape
    # IVF index with roughly 2 * sqrt(N) clusters
    x = int(2 * math.sqrt(N))
    index_description = "IVF{x},Flat".format(x=x)
    index = faiss.index_factory(7 * 7 * 512, index_description, faiss.METRIC_INNER_PRODUCT)
    index.train(vals)
    index.add_with_ids(vals, ids)
    faiss.write_index(index, index_path)
    with open(id_path, 'w', encoding='utf-8') as f:
        json.dump(id_json, f, ensure_ascii=False, indent=4)
    print(id_json)
    return index, id_json
Example #2
Source File: h5_to_faiss.py From Seq2Seq-Vis with Apache License 2.0
def main():
    f = h5py.File(opt.states, "r")
    data = f[opt.data]
    seqs, slens, hid = data.shape
    print("Processing {} Sequences".format(seqs))
    print("with {} tokens each".format(slens))
    print("and {} states".format(hid))
    # Initialize a new index
    index = faiss.IndexFlatIP(hid)
    # Fill it in chunks of opt.stepsize sequences
    # (note: this range can skip up to opt.stepsize trailing sequences)
    for ix in tqdm(range(0, seqs - opt.stepsize, opt.stepsize)):
        cdata = np.array(data[ix:ix + opt.stepsize]
                         .reshape(-1, hid), dtype="float32")
        index.add(cdata)
    f.close()
    faiss.write_index(index, opt.output)
Example #3
Source File: run_index.py From denspi with Apache License 2.0
def train_coarse_quantizer(data, quantizer_path, num_clusters, hnsw=False, niter=10, cuda=False):
    d = data.shape[1]
    index_flat = faiss.IndexFlatL2(d)
    # make it into a gpu index
    if cuda:
        res = faiss.StandardGpuResources()
        index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)
    clus = faiss.Clustering(d, num_clusters)
    clus.verbose = True
    clus.niter = niter
    clus.train(data, index_flat)
    centroids = faiss.vector_float_to_array(clus.centroids)
    centroids = centroids.reshape(num_clusters, d)
    if hnsw:
        quantizer = faiss.IndexHNSWFlat(d, 32)
        quantizer.hnsw.efSearch = 128
        quantizer.train(centroids)
        quantizer.add(centroids)
    else:
        quantizer = faiss.IndexFlatL2(d)
        quantizer.add(centroids)
    faiss.write_index(quantizer, quantizer_path)
Example #4
Source File: run_index.py From denspi with Apache License 2.0
def train_index(data, quantizer_path, trained_index_path, fine_quant='SQ8', cuda=False):
    quantizer = faiss.read_index(quantizer_path)
    if fine_quant == 'SQ8':
        # 8-bit scalar quantizer; QT_8bit is assumed here, since the
        # constructor expects a quantizer type before the metric
        trained_index = faiss.IndexIVFScalarQuantizer(quantizer, quantizer.d, quantizer.ntotal,
                                                      faiss.ScalarQuantizer.QT_8bit,
                                                      faiss.METRIC_L2)
    elif fine_quant.startswith('PQ'):
        m = int(fine_quant[2:])
        trained_index = faiss.IndexIVFPQ(quantizer, quantizer.d, quantizer.ntotal, m, 8)
    else:
        raise ValueError(fine_quant)

    if cuda:
        if fine_quant.startswith('PQ'):
            print('PQ not supported on GPU; keeping CPU.')
        else:
            res = faiss.StandardGpuResources()
            gpu_index = faiss.index_cpu_to_gpu(res, 0, trained_index)
            gpu_index.train(data)
            trained_index = faiss.index_gpu_to_cpu(gpu_index)
    else:
        trained_index.train(data)
    faiss.write_index(trained_index, trained_index_path)
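A trained index written this way is typically read back, filled with the full dataset, and queried. A minimal sketch, assuming 'trained.index' was produced by the function above; the path, data, and nprobe value are illustrative:

import faiss
import numpy as np

index = faiss.read_index('trained.index')
data = np.random.rand(100000, index.d).astype('float32')
index.add(data)                    # populate the trained IVF structure
index.nprobe = 16                  # inverted lists visited per query
dists, ids = index.search(data[:5], 10)   # top-10 neighbors for 5 queries
faiss.write_index(index, 'filled.index')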
Example #5
Source File: embedding_based_indexer.py From forte with Apache License 2.0
def save(self, path: str) -> None:
    r"""Save the index and meta data in ``path`` directory. The index
    will be saved as ``index.faiss`` and ``index.meta_data``
    respectively inside ``path`` directory.

    Args:
        path (str): A path to the directory where the index will be saved
    """
    if os.path.exists(path):
        logging.warning("%s directory already exists. Index will be "
                        "saved into an existing directory", path)
    else:
        os.makedirs(path)

    cpu_index = faiss.index_gpu_to_cpu(self._index) \
        if self._index.__class__.__name__.startswith("Gpu") else self._index
    faiss.write_index(cpu_index, f"{path}/index.faiss")
    with open(f"{path}/index.meta_data", "wb") as f:
        pickle.dump(self._meta_data, f)
Example #6
Source File: _faiss.py From mars with Apache License 2.0
def _store_index(ctx, op, index, device_id):
    return_index_type = _get_index_type(op.return_index_type, ctx)

    if return_index_type == 'object':
        # no need to serialize
        return index
    elif return_index_type == 'filename':
        # save to file, then return filename
        if device_id >= 0:  # pragma: no cover
            # for gpu, convert to cpu first
            index = faiss.index_gpu_to_cpu(index)

        fn = tempfile.mkstemp('.index', prefix='faiss_')[1]
        faiss.write_index(index, fn)

        atexit.register(lambda: os.remove(fn))

        return fn
    else:
        if device_id >= 0:  # pragma: no cover
            # for gpu, convert to cpu first
            index = faiss.index_gpu_to_cpu(index)

        # distributed, save to file, then return in-memory bytes
        fn = tempfile.mkstemp('.index', prefix='faiss_')[1]
        faiss.write_index(index, fn)
        try:
            with open(fn, 'rb') as f:
                return f.read()
        finally:
            os.remove(fn)
Example #7
Source File: indexing.py From faiss-server with MIT License
def do_indexing(word2vec_model=None):
    if not os.path.isfile(INDEX_FILE_PATH):
        index = faiss.IndexFlatIP(word2vec_model.vector_size)
        index.add(word2vec_model.wv.syn0norm)
        faiss.write_index(index, INDEX_FILE_PATH)
        return index
    else:
        return faiss.read_index(INDEX_FILE_PATH)
Example #8
Source File: faiss_indexer.py From BLINK with MIT License
def serialize(self, index_file: str):
    logger.info("Serializing index to %s", index_file)
    faiss.write_index(self.index, index_file)
Example #9
Source File: create_faiss.py From exbert with Apache License 2.0
def save_indexes(idxs, outdir, base_name=LAYER_TEMPLATE):
    """Save the faiss index into a file for each index in idxs"""
    base_dir = Path(outdir)
    if not base_dir.exists():
        base_dir.mkdir(exist_ok=True, parents=True)

    out_name = str(base_dir / base_name)
    for i, idx in enumerate(idxs):
        name = out_name.format(i)
        print(f"Saving to {name}")
        faiss.write_index(idx, name)
Example #10
Source File: run_index.py From denspi with Apache License 2.0
def merge_indexes(subindex_dir, trained_index_path, target_index_path, target_idx2id_path,
                  target_inv_path):
    # target_inv_path = merged_index.ivfdata
    names = os.listdir(subindex_dir)
    idx2id_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.hdf5')]
    index_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.faiss')]

    print('copying idx2id')
    with h5py.File(target_idx2id_path, 'w') as out:
        for idx2id_path in tqdm(idx2id_paths, desc='copying idx2id'):
            with h5py.File(idx2id_path, 'r') as in_:
                for key, g in in_.items():
                    offset = str(g.attrs['offset'])
                    assert key == offset
                    group = out.create_group(offset)
                    group.create_dataset('doc', data=in_['doc'])
                    group.create_dataset('para', data=in_['para'])
                    group.create_dataset('word', data=in_['word'])

    print('loading invlists')
    ivfs = []
    for index_path in tqdm(index_paths, desc='loading invlists'):
        # the IO_FLAG_MMAP is to avoid actually loading the data, thus
        # the total size of the inverted lists can exceed the
        # available RAM
        index = faiss.read_index(index_path, faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)

        # avoid that the invlists get deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(trained_index_path)

    # prepare the output inverted lists. They will be written
    # to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size, target_inv_path)

    # merge all the inverted lists
    print('merging')
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in tqdm(ivfs):
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    print(ntotal)

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print('writing index')
    faiss.write_index(index, target_index_path)
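Note that the merged index written at the end references the inverted-list data at target_inv_path rather than embedding it, so the .ivfdata file must remain available at the recorded path when the index is loaded later:

# later: loading memory-maps the on-disk inverted lists
index = faiss.read_index(target_index_path)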
Example #11
Source File: remove_doc_id.py From denspi with Apache License 2.0
def remove_doc_ids(args):
    if os.path.isdir(args.subindex_dir):
        names = os.listdir(args.subindex_dir)
        index_names = [name for name in names if name.endswith('.faiss')]
        index_paths = [os.path.join(args.subindex_dir, name) for name in index_names]
        target_paths = [os.path.join(args.target_dir, name) for name in index_names]
        idx2id_paths = [path.replace('.faiss', '.hdf5') for path in index_paths]
        if not os.path.exists(args.target_dir):
            os.makedirs(args.target_dir)
        with open(args.ignore_path, 'r') as fp:
            ignore_counter = json.load(fp)
        count = sum(ignore_counter.values())
        th = count * args.ratio
        ignores = [int(key) for key, val in ignore_counter.items() if val > th]
        print('thresholding at %.1f, removing following document ids:' % th)
        for ignore in ignores:
            print(ignore)
        for idx2id_path, index_path, target_path in zip(idx2id_paths, tqdm(index_paths),
                                                        target_paths):
            with h5py.File(idx2id_path, 'r') as f:
                doc_ids = f['doc'][:]
                offset = f.attrs['offset']
            idxs, = np.where(np.any(np.expand_dims(doc_ids, 1) == ignores, 1))
            if len(idxs) > 0:
                idxs = idxs + offset
                print('found %d ids to remove' % len(idxs))
                index = faiss.read_index(index_path)
                index.remove_ids(idxs)
                faiss.write_index(index, target_path)
            else:
                print('no ignore list found at %s' % index_path)
    else:
        index_path = args.subindex_dir
        target_path = args.target_dir
        idx2id_path = args.subindex_dir.replace('index.faiss', 'idx2id.hdf5')
        with open(args.ignore_path, 'r') as fp:
            ignores = np.array(list(map(int, json.load(fp))))
        with h5py.File(idx2id_path, 'r') as f:
            for offset, group in f.items():
                doc_ids = group['doc'][:]
                offset = int(offset)
                idxs, = np.where(np.any(np.expand_dims(doc_ids, 1) == ignores, 1))
                if len(idxs) > 0:
                    idxs = idxs + offset
                    print(idxs)
                    index = faiss.read_index(index_path)
                    index.remove_ids(idxs)
                    faiss.write_index(index, target_path)
                else:
                    print('no ignore list found at %d' % offset)
Example #12
Source File: run_index.py From denspi with Apache License 2.0
def run_index(args):
    phrase_path = os.path.join(args.dump_dir, 'phrase.hdf5')
    if os.path.exists(phrase_path):
        dump_paths = [phrase_path]
    else:
        dump_names = os.listdir(os.path.join(args.dump_dir, 'phrase'))
        dump_paths = [os.path.join(args.dump_dir, 'phrase', name) for name in dump_names
                      if name.endswith('.hdf5')]

    data = None

    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            data, max_norm = sample_data(dump_paths, max_norm=args.max_norm, para=args.para,
                                         doc_sample_ratio=args.doc_sample_ratio,
                                         vec_sample_ratio=args.vec_sample_ratio,
                                         max_norm_cf=args.max_norm_cf,
                                         num_dummy_zeros=args.num_dummy_zeros,
                                         norm_th=args.norm_th)
            with open(args.max_norm_path, 'w') as fp:
                json.dump(max_norm, fp)
            train_coarse_quantizer(data, args.quantizer_path, args.num_clusters, cuda=args.cuda)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if data is None:
                data, _ = sample_data(dump_paths, max_norm=max_norm, para=args.para,
                                      doc_sample_ratio=args.doc_sample_ratio,
                                      vec_sample_ratio=args.vec_sample_ratio,
                                      num_dummy_zeros=args.num_dummy_zeros,
                                      norm_th=args.norm_th)
            train_index(data, args.quantizer_path, args.trained_index_path,
                        fine_quant=args.fine_quant, cuda=args.cuda)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if args.dump_paths is not None:
                dump_paths = args.dump_paths
                if not os.path.exists(args.subindex_dir):
                    os.makedirs(args.subindex_dir)
            add_to_index(dump_paths, args.trained_index_path, args.index_path, args.idx2id_path,
                         max_norm=max_norm, para=args.para,
                         num_dummy_zeros=args.num_dummy_zeros, cuda=args.cuda,
                         num_docs_per_add=args.num_docs_per_add, offset=args.offset,
                         norm_th=args.norm_th, fine_quant=args.fine_quant)

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path, args.index_path,
                          args.idx2id_path, args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size, args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path)
Example #13
Source File: knn.py From learn-to-cluster with MIT License
def __init__(self, feats, k, index_path='', index_key='', nprobe=128,
             omp_num_threads=None, rebuild_index=True, verbose=True, **kwargs):
    import faiss
    if omp_num_threads is not None:
        faiss.omp_set_num_threads(omp_num_threads)
    self.verbose = verbose

    with Timer('[faiss] build index', verbose):
        if index_path != '' and not rebuild_index and os.path.exists(index_path):
            print('[faiss] read index from {}'.format(index_path))
            index = faiss.read_index(index_path)
        else:
            feats = feats.astype('float32')
            size, dim = feats.shape
            index = faiss.IndexFlatIP(dim)
            if index_key != '':
                assert index_key.find('HNSW') < 0, 'HNSW returns distances instead of sims'
                metric = faiss.METRIC_INNER_PRODUCT
                nlist = min(4096, 8 * round(math.sqrt(size)))
                if index_key == 'IVF':
                    quantizer = index
                    index = faiss.IndexIVFFlat(quantizer, dim, nlist, metric)
                else:
                    index = faiss.index_factory(dim, index_key, metric)
                if index_key.find('Flat') < 0:
                    assert not index.is_trained
                index.train(feats)
                index.nprobe = min(nprobe, nlist)
                assert index.is_trained
                print('nlist: {}, nprobe: {}'.format(nlist, nprobe))
            index.add(feats)
            if index_path != '':
                print('[faiss] save index to {}'.format(index_path))
                mkdir_if_no_exists(index_path)
                faiss.write_index(index, index_path)

    with Timer('[faiss] query topk {}'.format(k), verbose):
        knn_ofn = index_path + '.npz'
        if os.path.exists(knn_ofn):
            print('[faiss] read knns from {}'.format(knn_ofn))
            self.knns = np.load(knn_ofn)['data']
        else:
            sims, nbrs = index.search(feats, k=k)
            # store (neighbor ids, distances); inner-product sims are
            # converted to distances as 1 - sim
            self.knns = [(np.array(nbr, dtype=np.int32),
                          1 - np.array(sim, dtype=np.float32))
                         for nbr, sim in zip(nbrs, sims)]