Python faiss.read_index() Examples
The following are 14 code examples of faiss.read_index(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module faiss, or try the search function.
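Before the examples, a quick orientation: faiss.read_index() deserializes an index that was previously saved with faiss.write_index(). A minimal round-trip sketch (file name and sizes are arbitrary):

import faiss
import numpy as np

d = 64                                         # vector dimensionality
xb = np.random.random((1000, d)).astype('float32')

index = faiss.IndexFlatL2(d)                   # exact L2 search
index.add(xb)
faiss.write_index(index, "example.index")      # serialize to disk

index2 = faiss.read_index("example.index")     # load it back
assert index2.ntotal == 1000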
Example #1
Source File: _faiss.py From mars with Apache License 2.0 | 7 votes

def _load_index(ctx, op, index, device_id):
    return_index_type = _get_index_type(op.return_index_type, ctx)
    if return_index_type == 'object':
        # local
        return index
    elif return_index_type == 'filename':
        # local cluster
        return faiss.read_index(index)
    else:
        # distributed
        fn = tempfile.mkstemp('.index', prefix='faiss_')[1]
        with open(fn, 'wb') as f:
            f.write(index)
        index = faiss.read_index(f.name)
        if device_id >= 0:  # pragma: no cover
            index = _index_to_gpu(index, device_id)
        return index
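When the serialized bytes are already in memory, as in the distributed branch above, recent faiss versions can avoid the temporary file: faiss.serialize_index() returns a uint8 NumPy array and faiss.deserialize_index() rebuilds an index from one. A minimal sketch, assuming a faiss version that ships these helpers:

import faiss

index = faiss.IndexFlatL2(16)
buf = faiss.serialize_index(index)     # uint8 NumPy array, safe to pass between processes
index2 = faiss.deserialize_index(buf)  # no temporary file needed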
Example #2
Source File: mips.py From denspi with Apache License 2.0 | 6 votes

def __init__(self, phrase_dump_dir, start_index_path, idx2id_path, max_answer_length,
             para=False, num_dummy_zeros=0, cuda=False):
    if os.path.isdir(phrase_dump_dir):
        self.phrase_dump_paths = sorted(
            [os.path.join(phrase_dump_dir, name) for name in os.listdir(phrase_dump_dir)
             if 'hdf5' in name])
        dump_names = [os.path.splitext(os.path.basename(path))[0]
                      for path in self.phrase_dump_paths]
        self.dump_ranges = [list(map(int, name.split('-'))) for name in dump_names]
    else:
        self.phrase_dump_paths = [phrase_dump_dir]
    self.phrase_dumps = [h5py.File(path, 'r') for path in self.phrase_dump_paths]
    self.max_answer_length = max_answer_length
    self.para = para

    print('reading %s' % start_index_path)
    self.start_index = faiss.read_index(start_index_path,
                                        faiss.IO_FLAG_ONDISK_SAME_DIR)
    self.idx_f = self.load_idx_f(idx2id_path)
    self.has_offset = 'doc' not in self.idx_f
    # with h5py.File(idx2id_path, 'r') as f:
    #     self.idx2doc_id = f['doc'][:]
    #     self.idx2para_id = f['para'][:]
    #     self.idx2word_id = f['word'][:]
    self.num_dummy_zeros = num_dummy_zeros
    self.cuda = cuda
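IO_FLAG_ONDISK_SAME_DIR matters for indexes whose inverted lists live in a separate .ivfdata file: it makes read_index resolve that file relative to the directory of the index file rather than the current working directory, so the two files can be moved around as a pair. A sketch with hypothetical paths:

import faiss

# start.index references an .ivfdata file; the flag looks it up next to
# start.index instead of in the process working directory
index = faiss.read_index("/data/start.index", faiss.IO_FLAG_ONDISK_SAME_DIR)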
Example #3
Source File: run_index.py From denspi with Apache License 2.0 | 6 votes

def train_index(data, quantizer_path, trained_index_path, fine_quant='SQ8', cuda=False):
    quantizer = faiss.read_index(quantizer_path)
    if fine_quant == 'SQ8':
        trained_index = faiss.IndexIVFScalarQuantizer(quantizer, quantizer.d, quantizer.ntotal,
                                                      faiss.ScalarQuantizer.QT_8bit,
                                                      faiss.METRIC_L2)
    elif fine_quant.startswith('PQ'):
        m = int(fine_quant[2:])
        trained_index = faiss.IndexIVFPQ(quantizer, quantizer.d, quantizer.ntotal, m, 8)
    else:
        raise ValueError(fine_quant)

    if cuda:
        if fine_quant.startswith('PQ'):
            print('PQ not supported on GPU; keeping CPU.')
            trained_index.train(data)  # train on CPU instead
        else:
            res = faiss.StandardGpuResources()
            gpu_index = faiss.index_cpu_to_gpu(res, 0, trained_index)
            gpu_index.train(data)
            trained_index = faiss.index_gpu_to_cpu(gpu_index)
    else:
        trained_index.train(data)

    faiss.write_index(trained_index, trained_index_path)
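Note the pattern: the coarse quantizer read from disk holds one centroid per inverted list, so quantizer.ntotal doubles as the nlist argument. The same IVF-SQ8 construction as a self-contained sketch with a freshly trained quantizer (sizes are illustrative):

import faiss
import numpy as np

d, nlist = 32, 64
xt = np.random.random((5000, d)).astype('float32')

quantizer = faiss.IndexFlatL2(d)       # will hold the nlist centroids
index = faiss.IndexIVFScalarQuantizer(quantizer, d, nlist,
                                      faiss.ScalarQuantizer.QT_8bit,
                                      faiss.METRIC_L2)
index.train(xt)                        # learns centroids and quantizer parameters
assert index.is_trained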
Example #4
Source File: indexing.py From faiss-server with MIT License | 5 votes

def do_indexing(word2vec_model=None):
    if not os.path.isfile(INDEX_FILE_PATH):
        index = faiss.IndexFlatIP(word2vec_model.vector_size)
        index.add(word2vec_model.wv.syn0norm)
        faiss.write_index(index, INDEX_FILE_PATH)
        return index
    else:
        return faiss.read_index(INDEX_FILE_PATH)
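wv.syn0norm is how older gensim versions expose the L2-normalized word vectors, so the IndexFlatIP above effectively ranks by cosine similarity. The same effect without gensim, normalizing explicitly:

import faiss
import numpy as np

vecs = np.random.random((100, 50)).astype('float32')
faiss.normalize_L2(vecs)                  # in-place row normalization
index = faiss.IndexFlatIP(vecs.shape[1])
index.add(vecs)                           # inner product on unit vectors == cosine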
Example #5
Source File: faissVectorIndex.py From Seq2Seq-Vis with Apache License 2.0 | 5 votes

def __init__(self, file_name, dim_vector=500, sentence_max_len=50):
    self.u = faiss.read_index(file_name)  # type: faiss.Index
    self.sentence_max_length = sentence_max_len
Example #6
Source File: faiss_indexer.py From BLINK with MIT License | 5 votes

def deserialize_from(self, index_file: str):
    logger.info("Loading index from %s", index_file)
    self.index = faiss.read_index(index_file)
    logger.info(
        "Loaded index of type %s and size %d", type(self.index), self.index.ntotal
    )

# DenseFlatIndexer does exact search
Example #7
Source File: index_wrapper.py From exbert with Apache License 2.0 | 5 votes

def __init_indexes(self):
    for fname in self.base_dir.glob(self.pattern):
        print(fname)
        idx = fname.stem.split('_')[-1]
        self.indexes[int(idx)] = faiss.read_index(str(fname))
Example #8
Source File: run_index.py From denspi with Apache License 2.0 | 5 votes

def merge_indexes(subindex_dir, trained_index_path, target_index_path, target_idx2id_path,
                  target_inv_path):
    # target_inv_path = merged_index.ivfdata
    names = os.listdir(subindex_dir)
    idx2id_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.hdf5')]
    index_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.faiss')]

    print('copying idx2id')
    with h5py.File(target_idx2id_path, 'w') as out:
        for idx2id_path in tqdm(idx2id_paths, desc='copying idx2id'):
            with h5py.File(idx2id_path, 'r') as in_:
                for key, g in in_.items():
                    offset = str(g.attrs['offset'])
                    assert key == offset
                    group = out.create_group(offset)
                    group.create_dataset('doc', data=g['doc'])
                    group.create_dataset('para', data=g['para'])
                    group.create_dataset('word', data=g['word'])

    print('loading invlists')
    ivfs = []
    for index_path in tqdm(index_paths, desc='loading invlists'):
        # the IO_FLAG_MMAP is to avoid actually loading the data, so
        # the total size of the inverted lists can exceed the available RAM
        index = faiss.read_index(index_path, faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)
        # avoid that the invlists get deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(trained_index_path)

    # prepare the output inverted lists; they will be written to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size, target_inv_path)

    # merge all the inverted lists
    print('merging')
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in tqdm(ivfs):
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists" % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    print(ntotal)

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print('writing index')
    faiss.write_index(index, target_index_path)
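Note that the merged index written at the end stores only a reference to target_inv_path; if the index and its .ivfdata file are later moved together, load them with the IO_FLAG_ONDISK_SAME_DIR flag shown in Example #2 so the inverted lists are found next to the index file.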
Example #9
Source File: embedding_based_indexer.py From forte with Apache License 2.0 | 5 votes

def load(self, path: str, device: Optional[str] = None) -> None:
    r"""Load the index and meta data from ``path`` directory.

    Args:
        path (str): A path to the directory to load the index from.
        device (optional str): Device to load the index into. If None,
            value will be picked from hyperparameters.
    """
    if not os.path.exists(path):
        raise ValueError(f"Failed to load the index. {path} "
                         f"does not exist.")

    cpu_index = faiss.read_index(f"{path}/index.faiss")
    if device is None:
        device = self._config.device

    if device.lower().startswith("gpu"):
        gpu_resource = faiss.StandardGpuResources()
        gpu_id = int(device[3:])
        if gpu_id >= faiss.get_num_gpus():  # valid device ids are 0..num_gpus-1
            gpu_id = 0
            logging.warning("Cannot create the index on device %s. "
                            "Total number of GPUs on this machine is "
                            "%s. Using the gpu0 for the index.",
                            device, faiss.get_num_gpus())
        self._index = faiss.index_cpu_to_gpu(gpu_resource, gpu_id, cpu_index)
    else:
        self._index = cpu_index

    with open(f"{path}/index.meta_data", "rb") as f:
        self._meta_data = pickle.load(f)
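The CPU-to-GPU handoff used above, in isolation (requires a GPU build of faiss; device 0 is assumed):

import faiss
import numpy as np

cpu_index = faiss.IndexFlatL2(64)
cpu_index.add(np.random.random((1000, 64)).astype('float32'))

if faiss.get_num_gpus() > 0:                               # GPU build only
    res = faiss.StandardGpuResources()
    gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)  # copy to GPU 0
    cpu_again = faiss.index_gpu_to_cpu(gpu_index)          # e.g. before write_index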
Example #10
Source File: label.py From Mosaicer with MIT License | 5 votes

def calculate(self, images):
    predicted = []
    index = faiss.read_index(self.index_path)
    with open(self.id_path) as f:
        id_json = json.load(f)
    logging.info('database load')
    imgs = self.feature.get_feature(images)
    D, I = index.search(imgs, k=1)
    for p in I:
        predicted.append(id_json[str(p[0])])
    return predicted
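search() returns two (n, k) arrays, distances D and vector ids I, which is why the single nearest neighbor of each query row p is read as p[0] above. A tiny illustration:

import faiss
import numpy as np

index = faiss.IndexFlatL2(8)
index.add(np.random.random((10, 8)).astype('float32'))

queries = np.random.random((3, 8)).astype('float32')
D, I = index.search(queries, 1)   # D: (3, 1) squared L2 distances; I: (3, 1) ids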
Example #11
Source File: server.py From ig65m-pytorch with MIT License | 5 votes

def main(args):
    index = read_index(str(args.index.with_suffix(".idx")))
    index.nprobe = args.num_probes

    with args.index.with_suffix(".json").open() as fp:
        metadata = json.load(fp)

    def query(batch, n):
        feats = np.frombuffer(batch.data, dtype=np.float32)
        feats = rearrange(feats, "(n d) -> n d", d=args.dimension)

        assert len(feats.shape) == 2
        assert feats.shape[1] == args.dimension
        assert feats.dtype == np.float32

        dists, indices = index.search(feats, n)
        meta = [[metadata[i] for i in row] for row in indices]  # renamed to avoid shadowing `batch`

        return dists.tolist(), indices.tolist(), meta

    with SimpleXMLRPCServer((args.host, args.port), logRequests=False) as server:
        server.register_function(query)

        try:
            print("⏳ Waiting for similarity calls on {}:{}".format(args.host, args.port),
                  file=sys.stderr)
            server.serve_forever()
        except KeyboardInterrupt:
            print("\n⌛ Done", file=sys.stderr)
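nprobe, set from args.num_probes above, is the recall/latency knob for IVF indexes: the number of inverted lists scanned per query, from 1 (fastest) up to nlist (exhaustive). It can be changed at any time on a loaded index, e.g.:

import faiss
import numpy as np

index = faiss.index_factory(64, "IVF256,Flat")
index.train(np.random.random((10000, 64)).astype('float32'))
index.nprobe = 16   # scan 16 of the 256 lists per query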
Example #12
Source File: remove_doc_id.py From denspi with Apache License 2.0 | 4 votes

def remove_doc_ids(args):
    if os.path.isdir(args.subindex_dir):
        names = os.listdir(args.subindex_dir)
        index_names = [name for name in names if name.endswith('.faiss')]
        index_paths = [os.path.join(args.subindex_dir, name) for name in index_names]
        target_paths = [os.path.join(args.target_dir, name) for name in index_names]
        idx2id_paths = [path.replace('.faiss', '.hdf5') for path in index_paths]
        if not os.path.exists(args.target_dir):
            os.makedirs(args.target_dir)

        with open(args.ignore_path, 'r') as fp:
            ignore_counter = json.load(fp)
        count = sum(ignore_counter.values())
        th = count * args.ratio
        ignores = [int(key) for key, val in ignore_counter.items() if val > th]
        print('thresholding at %.1f, removing following document ids:' % th)
        for ignore in ignores:
            print(ignore)

        for idx2id_path, index_path, target_path in zip(idx2id_paths, tqdm(index_paths),
                                                        target_paths):
            with h5py.File(idx2id_path, 'r') as f:
                doc_ids = f['doc'][:]
                offset = f.attrs['offset']
            idxs, = np.where(np.any(np.expand_dims(doc_ids, 1) == ignores, 1))
            if len(idxs) > 0:
                idxs = idxs + offset
                print('found %d ids to remove' % len(idxs))
                index = faiss.read_index(index_path)
                index.remove_ids(idxs)
                faiss.write_index(index, target_path)
            else:
                print('no ignore list found at %s' % index_path)
    else:
        index_path = args.subindex_dir
        target_path = args.target_dir
        idx2id_path = args.subindex_dir.replace('index.faiss', 'idx2id.hdf5')
        with open(args.ignore_path, 'r') as fp:
            ignores = np.array(list(map(int, json.load(fp))))
        with h5py.File(idx2id_path, 'r') as f:
            for offset, group in f.items():
                doc_ids = group['doc'][:]
                offset = int(offset)
                idxs, = np.where(np.any(np.expand_dims(doc_ids, 1) == ignores, 1))
                if len(idxs) > 0:
                    idxs = idxs + offset
                    print(idxs)
                    index = faiss.read_index(index_path)
                    index.remove_ids(idxs)
                    faiss.write_index(index, target_path)
                else:
                    print('no ignore list found at %d' % offset)
Example #13
Source File: run_index.py From denspi with Apache License 2.0 | 4 votes

def run_index(args):
    phrase_path = os.path.join(args.dump_dir, 'phrase.hdf5')
    if os.path.exists(phrase_path):
        dump_paths = [phrase_path]
    else:
        dump_names = os.listdir(os.path.join(args.dump_dir, 'phrase'))
        dump_paths = [os.path.join(args.dump_dir, 'phrase', name) for name in dump_names
                      if name.endswith('.hdf5')]

    data = None

    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            data, max_norm = sample_data(dump_paths, max_norm=args.max_norm, para=args.para,
                                         doc_sample_ratio=args.doc_sample_ratio,
                                         vec_sample_ratio=args.vec_sample_ratio,
                                         max_norm_cf=args.max_norm_cf,
                                         num_dummy_zeros=args.num_dummy_zeros,
                                         norm_th=args.norm_th)
            with open(args.max_norm_path, 'w') as fp:
                json.dump(max_norm, fp)
            train_coarse_quantizer(data, args.quantizer_path, args.num_clusters, cuda=args.cuda)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if data is None:
                data, _ = sample_data(dump_paths, max_norm=max_norm, para=args.para,
                                      doc_sample_ratio=args.doc_sample_ratio,
                                      vec_sample_ratio=args.vec_sample_ratio,
                                      num_dummy_zeros=args.num_dummy_zeros,
                                      norm_th=args.norm_th)
            train_index(data, args.quantizer_path, args.trained_index_path,
                        fine_quant=args.fine_quant, cuda=args.cuda)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if args.dump_paths is not None:
                dump_paths = args.dump_paths
                if not os.path.exists(args.subindex_dir):
                    os.makedirs(args.subindex_dir)
            add_to_index(dump_paths, args.trained_index_path, args.index_path, args.idx2id_path,
                         max_norm=max_norm, para=args.para, num_dummy_zeros=args.num_dummy_zeros,
                         cuda=args.cuda, num_docs_per_add=args.num_docs_per_add,
                         offset=args.offset, norm_th=args.norm_th, fine_quant=args.fine_quant)

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path, args.index_path,
                          args.idx2id_path, args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size, args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path)
Example #14
Source File: knn.py From learn-to-cluster with MIT License | 4 votes

def __init__(self, feats, k, index_path='', index_key='', nprobe=128,
             omp_num_threads=None, rebuild_index=True, verbose=True, **kwargs):
    import faiss
    if omp_num_threads is not None:
        faiss.omp_set_num_threads(omp_num_threads)
    self.verbose = verbose

    with Timer('[faiss] build index', verbose):
        if index_path != '' and not rebuild_index and os.path.exists(index_path):
            print('[faiss] read index from {}'.format(index_path))
            index = faiss.read_index(index_path)
        else:
            feats = feats.astype('float32')
            size, dim = feats.shape
            index = faiss.IndexFlatIP(dim)
            if index_key != '':
                assert index_key.find('HNSW') < 0, 'HNSW returns distances instead of sims'
                metric = faiss.METRIC_INNER_PRODUCT
                nlist = min(4096, 8 * round(math.sqrt(size)))
                if index_key == 'IVF':
                    quantizer = index
                    index = faiss.IndexIVFFlat(quantizer, dim, nlist, metric)
                else:
                    index = faiss.index_factory(dim, index_key, metric)
                if index_key.find('Flat') < 0:
                    assert not index.is_trained
                index.train(feats)
                index.nprobe = min(nprobe, nlist)
                assert index.is_trained
                print('nlist: {}, nprobe: {}'.format(nlist, nprobe))
            index.add(feats)
            if index_path != '':
                print('[faiss] save index to {}'.format(index_path))
                mkdir_if_no_exists(index_path)
                faiss.write_index(index, index_path)

    with Timer('[faiss] query topk {}'.format(k), verbose):
        knn_ofn = index_path + '.npz'
        if os.path.exists(knn_ofn):
            print('[faiss] read knns from {}'.format(knn_ofn))
            self.knns = np.load(knn_ofn)['data']
        else:
            sims, nbrs = index.search(feats, k=k)
            self.knns = [(np.array(nbr, dtype=np.int32),
                          1 - np.array(sim, dtype=np.float32))
                         for nbr, sim in zip(nbrs, sims)]
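faiss.index_factory, used above for arbitrary index_key values, builds an index from a spec string. An illustrative call (the spec strings are examples, not the only valid ones):

import faiss

d = 128
index = faiss.index_factory(d, "IVF4096,Flat", faiss.METRIC_INNER_PRODUCT)
# "Flat" stores uncompressed vectors inside each inverted list; a key like
# "IVF4096,PQ16" would compress them to 16-byte PQ codes instead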