Python faiss.write_index() Examples
The following are 13 code examples of faiss.write_index(), drawn from open-source projects. To see each example in context, follow the link to the original project and source file above it. For the full set of available functions and classes, see the faiss module itself.
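Before the examples, here is a minimal sketch of the basic round trip with faiss.write_index() and faiss.read_index(); the file name and data below are illustrative, not taken from any project on this page:

import faiss
import numpy as np

# build a small exact (flat) L2 index over random vectors
d = 64
xb = np.random.rand(1000, d).astype('float32')
index = faiss.IndexFlatL2(d)
index.add(xb)

# persist the index to disk, then load it back
faiss.write_index(index, 'example.index')   # illustrative file name
restored = faiss.read_index('example.index')
assert restored.ntotal == index.ntotal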
Example #1
Source File: train.py From Mosaicer with MIT License
def faiss_train(fn_feature, root_path, index_path='train.index', id_path='data.json'):
    folder_names = os.listdir(root_path)
    logging.info('directory %s ', folder_names)
    ids = None
    vals = None
    id_json = {}
    print(folder_names)
    # accumulate features and matching integer ids, one label per folder
    for idx, folder_name in enumerate(folder_names):
        id_json[str(idx)] = folder_name
        now_path = os.path.join(root_path, folder_name)
        feature_val = fn_feature(now_path)
        vals = np.concatenate((feature_val, vals), axis=0) if vals is not None else feature_val
        id_np = np.asarray([idx] * feature_val.shape[0])
        ids = np.concatenate((id_np, ids), axis=0) if ids is not None else id_np
    N, dim = vals.shape
    # IVF index with roughly 2 * sqrt(N) clusters
    x = int(2 * math.sqrt(N))
    index_description = "IVF{x},Flat".format(x=x)
    index = faiss.index_factory(7 * 7 * 512, index_description, faiss.METRIC_INNER_PRODUCT)
    index.train(vals)
    index.add_with_ids(vals, ids)
    faiss.write_index(index, index_path)
    with open(id_path, 'w', encoding='utf-8') as f:
        json.dump(id_json, f, ensure_ascii=False, indent=4)
    print(id_json)
    return index, id_json
Example #2
Source File: h5_to_faiss.py From Seq2Seq-Vis with Apache License 2.0
def main():
    f = h5py.File(opt.states, "r")
    data = f[opt.data]
    seqs, slens, hid = data.shape
    print("Processing {} Sequences".format(seqs))
    print("with {} tokens each".format(slens))
    print("and {} states".format(hid))
    # Initialize a new index
    index = faiss.IndexFlatIP(hid)
    # Fill it in chunks of opt.stepsize sequences
    # (note: this range can skip up to opt.stepsize trailing sequences)
    for ix in tqdm(range(0, seqs - opt.stepsize, opt.stepsize)):
        cdata = np.array(data[ix:ix + opt.stepsize]
                         .reshape(-1, hid), dtype="float32")
        index.add(cdata)
    f.close()
    faiss.write_index(index, opt.output)
Example #3
Source File: run_index.py From denspi with Apache License 2.0
def train_coarse_quantizer(data, quantizer_path, num_clusters, hnsw=False, niter=10, cuda=False):
    d = data.shape[1]
    index_flat = faiss.IndexFlatL2(d)
    # make it into a gpu index
    if cuda:
        res = faiss.StandardGpuResources()
        index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)
    clus = faiss.Clustering(d, num_clusters)
    clus.verbose = True
    clus.niter = niter
    clus.train(data, index_flat)
    centroids = faiss.vector_float_to_array(clus.centroids)
    centroids = centroids.reshape(num_clusters, d)
    if hnsw:
        quantizer = faiss.IndexHNSWFlat(d, 32)
        quantizer.hnsw.efSearch = 128
        quantizer.train(centroids)
        quantizer.add(centroids)
    else:
        quantizer = faiss.IndexFlatL2(d)
        quantizer.add(centroids)
    faiss.write_index(quantizer, quantizer_path)
Example #4
Source File: run_index.py From denspi with Apache License 2.0
def train_index(data, quantizer_path, trained_index_path, fine_quant='SQ8', cuda=False):
    quantizer = faiss.read_index(quantizer_path)
    if fine_quant == 'SQ8':
        # 8-bit scalar quantizer; QT_8bit is assumed here, since the
        # constructor expects a quantizer type before the metric
        trained_index = faiss.IndexIVFScalarQuantizer(quantizer, quantizer.d, quantizer.ntotal,
                                                      faiss.ScalarQuantizer.QT_8bit,
                                                      faiss.METRIC_L2)
    elif fine_quant.startswith('PQ'):
        m = int(fine_quant[2:])
        trained_index = faiss.IndexIVFPQ(quantizer, quantizer.d, quantizer.ntotal, m, 8)
    else:
        raise ValueError(fine_quant)

    if cuda:
        if fine_quant.startswith('PQ'):
            print('PQ not supported on GPU; keeping CPU.')
        else:
            res = faiss.StandardGpuResources()
            gpu_index = faiss.index_cpu_to_gpu(res, 0, trained_index)
            gpu_index.train(data)
            trained_index = faiss.index_gpu_to_cpu(gpu_index)
    else:
        trained_index.train(data)
    faiss.write_index(trained_index, trained_index_path)
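A trained index written this way is typically read back, filled with the full dataset, and queried. A minimal sketch, assuming 'trained.index' was produced by the function above; the path, data, and nprobe value are illustrative:

import faiss
import numpy as np

index = faiss.read_index('trained.index')
data = np.random.rand(100000, index.d).astype('float32')
index.add(data)                    # populate the trained IVF structure
index.nprobe = 16                  # inverted lists visited per query
dists, ids = index.search(data[:5], 10)   # top-10 neighbors for 5 queries
faiss.write_index(index, 'filled.index')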
Example #5
Source File: embedding_based_indexer.py From forte with Apache License 2.0
def save(self, path: str) -> None:
    r"""Save the index and meta data in ``path`` directory. The index
    will be saved as ``index.faiss`` and ``index.meta_data``
    respectively inside ``path`` directory.

    Args:
        path (str): A path to the directory where the index will be saved
    """
    if os.path.exists(path):
        logging.warning("%s directory already exists. Index will be "
                        "saved into an existing directory", path)
    else:
        os.makedirs(path)

    cpu_index = faiss.index_gpu_to_cpu(self._index) \
        if self._index.__class__.__name__.startswith("Gpu") else self._index
    faiss.write_index(cpu_index, f"{path}/index.faiss")
    with open(f"{path}/index.meta_data", "wb") as f:
        pickle.dump(self._meta_data, f)
Example #6
Source File: _faiss.py From mars with Apache License 2.0
def _store_index(ctx, op, index, device_id):
    return_index_type = _get_index_type(op.return_index_type, ctx)

    if return_index_type == 'object':
        # no need to serialize
        return index
    elif return_index_type == 'filename':
        # save to file, then return filename
        if device_id >= 0:  # pragma: no cover
            # for gpu, convert to cpu first
            index = faiss.index_gpu_to_cpu(index)

        fn = tempfile.mkstemp('.index', prefix='faiss_')[1]
        faiss.write_index(index, fn)

        atexit.register(lambda: os.remove(fn))

        return fn
    else:
        if device_id >= 0:  # pragma: no cover
            # for gpu, convert to cpu first
            index = faiss.index_gpu_to_cpu(index)

        # distributed, save to file, then return in-memory bytes
        fn = tempfile.mkstemp('.index', prefix='faiss_')[1]
        faiss.write_index(index, fn)
        try:
            with open(fn, 'rb') as f:
                return f.read()
        finally:
            os.remove(fn)
Example #7
Source File: indexing.py From faiss-server with MIT License
def do_indexing(word2vec_model=None):
    if not os.path.isfile(INDEX_FILE_PATH):
        index = faiss.IndexFlatIP(word2vec_model.vector_size)
        index.add(word2vec_model.wv.syn0norm)
        faiss.write_index(index, INDEX_FILE_PATH)
        return index
    else:
        return faiss.read_index(INDEX_FILE_PATH)
Example #8
Source File: faiss_indexer.py From BLINK with MIT License
def serialize(self, index_file: str):
    logger.info("Serializing index to %s", index_file)
    faiss.write_index(self.index, index_file)
Example #9
Source File: create_faiss.py From exbert with Apache License 2.0
def save_indexes(idxs, outdir, base_name=LAYER_TEMPLATE):
    """Save the faiss index into a file for each index in idxs"""
    base_dir = Path(outdir)
    if not base_dir.exists():
        base_dir.mkdir(exist_ok=True, parents=True)

    out_name = str(base_dir / base_name)
    for i, idx in enumerate(idxs):
        name = out_name.format(i)
        print(f"Saving to {name}")
        faiss.write_index(idx, name)
Example #10
Source File: run_index.py From denspi with Apache License 2.0
def merge_indexes(subindex_dir, trained_index_path, target_index_path, target_idx2id_path,
                  target_inv_path):
    # target_inv_path = merged_index.ivfdata
    names = os.listdir(subindex_dir)
    idx2id_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.hdf5')]
    index_paths = [os.path.join(subindex_dir, name) for name in names if name.endswith('.faiss')]

    print('copying idx2id')
    with h5py.File(target_idx2id_path, 'w') as out:
        for idx2id_path in tqdm(idx2id_paths, desc='copying idx2id'):
            with h5py.File(idx2id_path, 'r') as in_:
                for key, g in in_.items():
                    offset = str(g.attrs['offset'])
                    assert key == offset
                    group = out.create_group(offset)
                    group.create_dataset('doc', data=in_['doc'])
                    group.create_dataset('para', data=in_['para'])
                    group.create_dataset('word', data=in_['word'])

    print('loading invlists')
    ivfs = []
    for index_path in tqdm(index_paths, desc='loading invlists'):
        # the IO_FLAG_MMAP is to avoid actually loading the data, thus
        # the total size of the inverted lists can exceed the
        # available RAM
        index = faiss.read_index(index_path, faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)

        # avoid that the invlists get deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(trained_index_path)

    # prepare the output inverted lists. They will be written
    # to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size, target_inv_path)

    # merge all the inverted lists
    print('merging')
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in tqdm(ivfs):
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    print(ntotal)

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print('writing index')
    faiss.write_index(index, target_index_path)
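Note that the merged index written at the end references the inverted-list data at target_inv_path rather than embedding it, so the .ivfdata file must remain available at the recorded path when the index is loaded later:

# later: loading memory-maps the on-disk inverted lists
index = faiss.read_index(target_index_path)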
Example #11
Source File: remove_doc_id.py From denspi with Apache License 2.0
def remove_doc_ids(args):
    if os.path.isdir(args.subindex_dir):
        names = os.listdir(args.subindex_dir)
        index_names = [name for name in names if name.endswith('.faiss')]
        index_paths = [os.path.join(args.subindex_dir, name) for name in index_names]
        target_paths = [os.path.join(args.target_dir, name) for name in index_names]
        idx2id_paths = [path.replace('.faiss', '.hdf5') for path in index_paths]
        if not os.path.exists(args.target_dir):
            os.makedirs(args.target_dir)
        with open(args.ignore_path, 'r') as fp:
            ignore_counter = json.load(fp)
        count = sum(ignore_counter.values())
        th = count * args.ratio
        ignores = [int(key) for key, val in ignore_counter.items() if val > th]
        print('thresholding at %.1f, removing following document ids:' % th)
        for ignore in ignores:
            print(ignore)
        for idx2id_path, index_path, target_path in zip(idx2id_paths, tqdm(index_paths),
                                                        target_paths):
            with h5py.File(idx2id_path, 'r') as f:
                doc_ids = f['doc'][:]
                offset = f.attrs['offset']
            idxs, = np.where(np.any(np.expand_dims(doc_ids, 1) == ignores, 1))
            if len(idxs) > 0:
                idxs = idxs + offset
                print('found %d ids to remove' % len(idxs))
                index = faiss.read_index(index_path)
                index.remove_ids(idxs)
                faiss.write_index(index, target_path)
            else:
                print('no ignore list found at %s' % index_path)
    else:
        index_path = args.subindex_dir
        target_path = args.target_dir
        idx2id_path = args.subindex_dir.replace('index.faiss', 'idx2id.hdf5')
        with open(args.ignore_path, 'r') as fp:
            ignores = np.array(list(map(int, json.load(fp))))
        with h5py.File(idx2id_path, 'r') as f:
            for offset, group in f.items():
                doc_ids = group['doc'][:]
                offset = int(offset)
                idxs, = np.where(np.any(np.expand_dims(doc_ids, 1) == ignores, 1))
                if len(idxs) > 0:
                    idxs = idxs + offset
                    print(idxs)
                    index = faiss.read_index(index_path)
                    index.remove_ids(idxs)
                    faiss.write_index(index, target_path)
                else:
                    print('no ignore list found at %d' % offset)
Example #12
Source File: run_index.py From denspi with Apache License 2.0
def run_index(args):
    phrase_path = os.path.join(args.dump_dir, 'phrase.hdf5')
    if os.path.exists(phrase_path):
        dump_paths = [phrase_path]
    else:
        dump_names = os.listdir(os.path.join(args.dump_dir, 'phrase'))
        dump_paths = [os.path.join(args.dump_dir, 'phrase', name) for name in dump_names
                      if name.endswith('.hdf5')]

    data = None

    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            data, max_norm = sample_data(dump_paths, max_norm=args.max_norm, para=args.para,
                                         doc_sample_ratio=args.doc_sample_ratio,
                                         vec_sample_ratio=args.vec_sample_ratio,
                                         max_norm_cf=args.max_norm_cf,
                                         num_dummy_zeros=args.num_dummy_zeros,
                                         norm_th=args.norm_th)
            with open(args.max_norm_path, 'w') as fp:
                json.dump(max_norm, fp)
            train_coarse_quantizer(data, args.quantizer_path, args.num_clusters, cuda=args.cuda)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if data is None:
                data, _ = sample_data(dump_paths, max_norm=max_norm, para=args.para,
                                      doc_sample_ratio=args.doc_sample_ratio,
                                      vec_sample_ratio=args.vec_sample_ratio,
                                      num_dummy_zeros=args.num_dummy_zeros,
                                      norm_th=args.norm_th)
            train_index(data, args.quantizer_path, args.trained_index_path,
                        fine_quant=args.fine_quant, cuda=args.cuda)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            with open(args.max_norm_path, 'r') as fp:
                max_norm = json.load(fp)
            if args.dump_paths is not None:
                dump_paths = args.dump_paths
                if not os.path.exists(args.subindex_dir):
                    os.makedirs(args.subindex_dir)
            add_to_index(dump_paths, args.trained_index_path, args.index_path, args.idx2id_path,
                         max_norm=max_norm, para=args.para,
                         num_dummy_zeros=args.num_dummy_zeros, cuda=args.cuda,
                         num_docs_per_add=args.num_docs_per_add, offset=args.offset,
                         norm_th=args.norm_th, fine_quant=args.fine_quant)

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path, args.index_path,
                          args.idx2id_path, args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size, args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path)
Example #13
Source File: knn.py From learn-to-cluster with MIT License
def __init__(self, feats, k, index_path='', index_key='', nprobe=128,
             omp_num_threads=None, rebuild_index=True, verbose=True, **kwargs):
    import faiss
    if omp_num_threads is not None:
        faiss.omp_set_num_threads(omp_num_threads)
    self.verbose = verbose

    with Timer('[faiss] build index', verbose):
        if index_path != '' and not rebuild_index and os.path.exists(index_path):
            print('[faiss] read index from {}'.format(index_path))
            index = faiss.read_index(index_path)
        else:
            feats = feats.astype('float32')
            size, dim = feats.shape
            index = faiss.IndexFlatIP(dim)
            if index_key != '':
                assert index_key.find('HNSW') < 0, 'HNSW returns distances instead of sims'
                metric = faiss.METRIC_INNER_PRODUCT
                nlist = min(4096, 8 * round(math.sqrt(size)))
                if index_key == 'IVF':
                    quantizer = index
                    index = faiss.IndexIVFFlat(quantizer, dim, nlist, metric)
                else:
                    index = faiss.index_factory(dim, index_key, metric)
                if index_key.find('Flat') < 0:
                    assert not index.is_trained
                index.train(feats)
                index.nprobe = min(nprobe, nlist)
                assert index.is_trained
                print('nlist: {}, nprobe: {}'.format(nlist, nprobe))
            index.add(feats)
            if index_path != '':
                print('[faiss] save index to {}'.format(index_path))
                mkdir_if_no_exists(index_path)
                faiss.write_index(index, index_path)

    with Timer('[faiss] query topk {}'.format(k), verbose):
        knn_ofn = index_path + '.npz'
        if os.path.exists(knn_ofn):
            print('[faiss] read knns from {}'.format(knn_ofn))
            self.knns = np.load(knn_ofn)['data']
        else:
            sims, nbrs = index.search(feats, k=k)
            # store (neighbor ids, distances); inner-product sims are
            # converted to distances as 1 - sim
            self.knns = [(np.array(nbr, dtype=np.int32),
                          1 - np.array(sim, dtype=np.float32))
                         for nbr, sim in zip(nbrs, sims)]