Python annoy.AnnoyIndex() Examples
The following are 28 code examples of annoy.AnnoyIndex(), drawn from open-source projects.
Each example notes its original project, source file, and license above the code. You may also want to check out the other functions and classes available in the annoy module.
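For orientation, here is a minimal sketch of the AnnoyIndex workflow that the examples below build on: create an index with a fixed vector dimensionality and metric, add items by integer id, build a forest of trees, then query (optionally saving the index to disk and reloading it, memory-mapped, first). The dimensionality, tree count, and file name are arbitrary illustration values, not taken from any of the examples.

import random
from annoy import AnnoyIndex

f = 3  # vector dimensionality; every added item must have this length
index = AnnoyIndex(f, metric='angular')
for i in range(100):
    index.add_item(i, [random.gauss(0, 1) for _ in range(f)])

index.build(10)         # 10 trees; more trees improve recall but enlarge the index
index.save('test.ann')  # arbitrary file name for this sketch

# A built index is immutable; reload it (memory-mapped) to query.
loaded = AnnoyIndex(f, metric='angular')
loaded.load('test.ann')
print(loaded.get_nns_by_item(0, 5))  # the 5 approximate nearest neighbours of item 0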
Example #1
Source File: knn.py From ivis with GNU General Public License v2.0 | 6 votes |
def run(self):
    try:
        index = AnnoyIndex(self.n_dims, metric='angular')
        index.load(self.index_filepath)
        for i in range(self.data_indices[0], self.data_indices[1]):
            neighbour_indexes = index.get_nns_by_item(
                i, self.k, search_k=self.search_k, include_distances=False)
            neighbour_indexes = np.array(neighbour_indexes, dtype=np.uint32)
            self.results_queue.put(
                IndexNeighbours(row_index=i, neighbour_list=neighbour_indexes))
    except Exception as e:
        self.exception = e
    finally:
        self.results_queue.close()
Example #2
Source File: __init__.py From bbknn with MIT License | 6 votes |
def create_tree(data, approx, metric, use_faiss, n_trees):
    '''
    Create a faiss/cKDTree/KDTree/annoy index for nearest neighbour lookup.
    All undescribed input as in ``bbknn.bbknn()``. Returns the resulting index.

    Input
    -----
    data : ``numpy.array``
        PCA coordinates of a batch's cells to index.
    '''
    if approx:
        ckd = AnnoyIndex(data.shape[1], metric=metric)
        for i in np.arange(data.shape[0]):
            ckd.add_item(i, data[i, :])
        ckd.build(n_trees)
    elif metric == 'euclidean':
        if 'faiss' in sys.modules and use_faiss:
            ckd = faiss.IndexFlatL2(data.shape[1])
            ckd.add(data)
        else:
            ckd = cKDTree(data)
    else:
        ckd = KDTree(data, metric=metric)
    return ckd
Example #3
Source File: test_knn.py From ivis with GNU General Public License v2.0 | 6 votes |
def test_build_sparse_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    sparse_data = csr_matrix(data)

    index = build_annoy_index(sparse_data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload()
Example #4
Source File: scanorama.py From scanorama with MIT License | 6 votes |
def nn_approx(ds1, ds2, knn=KNN, metric='manhattan', n_trees=10):
    # Build index.
    a = AnnoyIndex(ds2.shape[1], metric=metric)
    for i in range(ds2.shape[0]):
        a.add_item(i, ds2[i, :])
    a.build(n_trees)

    # Search index.
    ind = []
    for i in range(ds1.shape[0]):
        ind.append(a.get_nns_by_vector(ds1[i, :], knn, search_k=-1))
    ind = np.array(ind)

    # Match.
    match = set()
    for a, b in zip(range(ds1.shape[0]), ind):
        for b_i in b:
            match.add((a, b_i))

    return match

# Find mutual nearest neighbors.
Example #5
Source File: __init__.py From magnitude with MIT License | 6 votes |
def get_approx_index_chunks(self):
    """Gets decompressed chunks of the AnnoyIndex of the vectors from
    the database."""
    try:
        db = self._db(force_new=True)
        with lz4.frame.LZ4FrameDecompressor() as decompressor:
            chunks = db.execute(
                """
                SELECT rowid,index_file
                FROM `magnitude_approx`
                WHERE trees = ?
                """, (self.approx_trees,))
            for chunk in chunks:
                yield decompressor.decompress(chunk[1])
                if self.closed:
                    return
    except Exception as e:
        if self.closed:
            pass
        else:
            raise e
Example #6
Source File: sketch.py From geosketch with MIT License | 6 votes |
def label_approx(X, sites, site_labels, k=1):
    from annoy import AnnoyIndex

    assert(X.shape[1] == sites.shape[1])

    # Build index over site points.
    aindex = AnnoyIndex(sites.shape[1], metric='euclidean')
    for i in range(sites.shape[0]):
        aindex.add_item(i, sites[i, :])
    aindex.build(10)

    labels = []
    for i in range(X.shape[0]):
        # Find nearest site point.
        nearest_sites = aindex.get_nns_by_vector(X[i, :], k)
        if len(nearest_sites) < 1:
            labels.append(None)
            continue
        label = Counter([
            site_labels[ns] for ns in nearest_sites
        ]).most_common(1)[0][0]
        labels.append(label)

    return np.array(labels)
Example #7
Source File: index.py From rep0st with MIT License | 5 votes |
def load_index(self, index_id):
    if self.annoy_index is None:
        log.info("loading initial index with id {}", self.current_index)
    else:
        log.info("switching index from {} to {}", self.current_index, index_id)
    newindex = AnnoyIndex(108, metric='euclidean')
    newindex.load(config.index_config['index_path'] + 'index_' + str(index_id) + '.ann')
    if self.annoy_index is not None:
        self.annoy_index.unload()
    self.annoy_index = newindex
    self.current_index = index_id
    log.info("finished switching index. now using index {}", self.current_index)
Example #8
Source File: embedding.py From recoder with MIT License | 5 votes |
def __build_index(self, index_file):
    self.embedding_size = self.embeddings.shape[1]

    self.index = an.AnnoyIndex(self.embedding_size, metric='angular')

    for embedding_ind in range(self.embeddings.shape[0]):
        embedding = self.embeddings[embedding_ind, :]
        self.index.add_item(embedding_ind, embedding)

    self.index.build(self.n_trees)

    if self.id_map is None:
        self.id_map = dict([(i, i) for i in range(self.embeddings.shape[0])])

    self.inverse_id_map = dict([(v, k) for k, v in self.id_map.items()])

    if index_file:
        embeddings_file = index_file + '.embeddings'
        state = {
            'embedding_size': self.embedding_size,
            'id_map': self.id_map,
        }
        self.index.save(embeddings_file)
        with open(index_file, 'wb') as _index_file:
            pickle.dump(state, _index_file)
Example #9
Source File: matching.py From realtime-embeddings-matching with Apache License 2.0 | 5 votes |
def __init__(self, index_file):
    logging.info('Initialising matching utility...')
    self.index = AnnoyIndex(VECTOR_LENGTH)
    self.index.load(index_file, prefault=True)
    logging.info('Annoy index {} is loaded'.format(index_file))
    with open(index_file + '.mapping', 'rb') as handle:
        self.mapping = pickle.load(handle)
    logging.info('Mapping file {} is loaded'.format(index_file + '.mapping'))
    logging.info('Matching utility initialised.')
Example #10
Source File: text_graph.py From reveal-graph-embedding with Apache License 2.0 | 5 votes |
def make_text_graph(user_lemma_matrix, dimensionality, metric, number_of_estimators, number_of_neighbors):
    user_lemma_matrix_tfidf = augmented_tf_idf(user_lemma_matrix)
    # print(user_lemma_matrix_tfidf.shape)
    if (user_lemma_matrix_tfidf.shape[0] <= dimensionality) or (user_lemma_matrix_tfidf.shape[1] <= dimensionality):
        X_svd = user_lemma_matrix_tfidf.toarray()
    else:
        X_svd = TruncatedSVD(n_components=dimensionality).fit_transform(user_lemma_matrix_tfidf)

    annoy_index = AnnoyIndex(X_svd.shape[1], metric=metric)
    for q in range(X_svd.shape[0]):
        annoy_index.add_item(q, X_svd[q, :])
    annoy_index.build(number_of_estimators)

    row = list()
    col = list()
    data = list()
    for q in range(X_svd.shape[0]):
        neighbors, distances = annoy_index.get_nns_by_item(q, number_of_neighbors, include_distances=True)
        row.extend([q] * number_of_neighbors)
        col.extend(neighbors)
        data.extend(distances)
    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.array(data, dtype=np.float64)

    text_graph = spsp.coo_matrix((data, (row, col)), shape=(X_svd.shape[0], X_svd.shape[0]))
    text_graph = spsp.csr_matrix(text_graph)

    return text_graph
Example #11
Source File: index_embeddings.py From jann with MIT License | 5 votes |
def index_embeddings(args):
    """Main run function for indexing the embeddings."""
    unique_strings_path = args.infile + '.embedded.pkl_unique_strings.csv'

    # Load the unique lines
    with open(unique_strings_path) as f:
        unique_strings = [line.rstrip() for line in f]

    unique_embeddings_path = (args.infile +
                              '.embedded.pkl_unique_strings_embeddings.txt')

    # Load the unique embeddings
    with open(unique_embeddings_path) as f:
        unique_embeddings = [[float(x) for x in line.strip().split()]
                             for line in f]

    tf.logging.info('Loaded {} unique strings, {} embeddings of dimension {}'.
                    format(len(unique_strings), len(unique_embeddings),
                           len(unique_embeddings[0])))

    # Length of item vector that will be indexed
    nn_forest = AnnoyIndex(512, metric='angular')

    for i in range(len(unique_strings)):
        v = unique_embeddings[i]
        nn_forest.add_item(i, v)

    # Build an approximate nearest neighbor forest with num_trees
    nn_forest.build(int(args.num_trees))
    output_path = args.infile + '.ann'
    nn_forest.save(output_path)

    tf.logging.info('Index forest built {}'.format(output_path))

    return True
Example #12
Source File: generateArtificialSessions.py From MSMARCO-Conversational-Search with MIT License | 5 votes |
def generateAnnoy(real, artificial, annoyFilename, dimensions):
    idx2vec = np.array(artificial[2])
    t = AnnoyIndex(dimensions)
    for j in range(len(artificial[2])):
        t.add_item(j, idx2vec[j])
    print('Done Adding items to AnnoyIndex')
    t.build(TREESIZE)
    print('Done Building AnnoyIndex')
    t.save(annoyFilename)
    return t
Example #13
Source File: noveltysearchlive.py From Novelty-Search-Live with GNU General Public License v3.0 | 5 votes |
def AddToTrain(individual):
    global annoy_train
    global test_db
    global IND_SIZE
    global config
    max_memory = 5
    if set.get_master_volume() == 1:
        print set.get_master_volume()
        set.set_master_volume(0.85)
        test_db.append(individual)
        print "SAVING TO TRAINING SET. TestDB Size: " + str(len(test_db))
        annoy_train = AnnoyIndex(IND_SIZE)
        annoy_train.add_item(annoy_train.get_n_items(), individual)
        annoy_train.build(config["annoy_tree"])  # 10 trees
        if len(test_db) > max_memory:
            test_db.pop(0)
            print "delete old memory entry"
    if set.get_master_volume() == 0:
        test_db = []
        # gen_record = []
        annoy_train = AnnoyIndex(IND_SIZE)
        annoy_train.build(config["annoy_tree"])  # 10 trees
        print "clean set"
        set.set_master_volume(0.85)

############ App Main Loop ############
Example #14
Source File: dnd.py From tensorflow-rl with Apache License 2.0 | 5 votes |
def __init__(self, capacity=100000, key_size=128, cache_size=32, alpha=0.1):
    self.alpha = alpha
    self.capacity = capacity
    self.lru_cache = LRUCache(capacity)
    self.dup_cache = deque(maxlen=cache_size)
    self.index = AnnoyIndex(key_size, metric='euclidean')
    self.keys = np.zeros((capacity, key_size), dtype=np.float32)
    self.values = np.zeros((capacity,), dtype=np.float32)
    self.insert_idx = 0
    self.insertions = 0
Example #15
Source File: annoy.py From ann-benchmarks with MIT License | 5 votes |
def fit(self, X):
    self._annoy = annoy.AnnoyIndex(X.shape[1], metric=self._metric)
    for i, x in enumerate(X):
        self._annoy.add_item(i, x.tolist())
    self._annoy.build(self._n_trees)
Example #16
Source File: test_knn.py From ivis with GNU General Public License v2.0 | 5 votes |
def test_dense_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    index = build_annoy_index(data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload()
Example #17
Source File: background_job.py From rep0st with MIT License | 5 votes |
def build_index(index_id):
    n_trees = config.index_config['tree_count']
    log.info("started index build")
    session = rep.database.DBSession()
    count = session.query(Feature).filter(Feature.type == FeatureType.FEATURE_VECTOR).count()
    index = AnnoyIndex(108, metric='euclidean')

    cnt = 0
    log.info("adding {} features to index", count)
    start = time.time()
    for feature in session.query(Feature).filter(Feature.type == FeatureType.FEATURE_VECTOR).yield_per(1000):
        arr = np.asarray(bytearray(feature.data)).astype(np.float32)
        index.add_item(feature.post_id, arr)
        cnt += 1
        if cnt % 10000 == 0:
            log.debug("added {}/{} features to the index", cnt, count)
    session.close()
    stop = time.time()
    log.info("added all {} features to the index in {}", count,
             str(datetime.timedelta(seconds=stop - start)))

    log.info("building index with {} trees. this will take a while...", n_trees)
    start = time.time()
    index.build(n_trees)  # was hardcoded to 20, which contradicted the log line above
    index_file = config.index_config['index_path'] + "index_" + str(index_id) + ".ann"
    log.info("saving index to file {}", index_file)
    index.save(index_file)
    stop = time.time()
    log.debug("finished building of index. it took {}",
              str(datetime.timedelta(seconds=stop - start)))
Example #18
Source File: hf_sampler.py From hfsoftmax with MIT License | 5 votes |
def __init__(self,
             rank,
             fdim,
             sample_num,
             num_output,
             bias=False,
             ntrees=50,
             interval=100,
             start_iter=0,
             midw='0',
             midb='1'):
    super(HFSampler, self).__init__()
    self.rank = rank
    self.fdim = fdim
    self.sample_num = sample_num
    self.num_output = num_output
    self.full_cls = np.arange(self.num_output)
    # init param client
    self.client = ParameterClient(rank)
    self.midw = midw
    self.midb = midb
    self.is_bias = bias
    self.client.add_matrix(self.midw, [self.num_output, self.fdim])
    if self.is_bias:
        self.client.add_matrix(self.midb, [self.num_output, 1])
    # init hashing forest
    self.ntrees = ntrees
    self.interval = interval
    self.start_iter = start_iter
    self.iter = start_iter
    self.test_iter = start_iter
    self.anns = AnnoyIndex(self.fdim)
    self.pool = Pool(processes=2)
Example #19
Source File: hf_sampler.py From hfsoftmax with MIT License | 5 votes |
def _update_hf(self):
    if not self.iter % self.interval == 0 and \
       not self.iter == self.start_iter:
        return
    w = self.client.get_value_by_rows(self.midw, self.full_cls)
    self.anns = AnnoyIndex(self.fdim)
    for i, v in enumerate(w):
        self.anns.add_item(i, v)
    self.anns.build(self.ntrees)
Example #20
Source File: cluster_corr.py From altanalyze with Apache License 2.0 | 5 votes |
def nearest_neighbors(collection, num_neighbors=10, n_trees=100):
    """
    Finds the num_neighbors nearest neighbors to each cell in the sparse matrix

    Return result is a dictionary of lists, where the key is an index into the cells,
    and the value is the neighbors of that cell
    """
    nn_idx = AnnoyIndex(collection.num_genes())
    # Add the elements in reverse order because Annoy allocates the memory based on
    # the value of the element added - so adding in increasing order will trigger
    # lots of allocations
    for i in range(collection.num_cells() - 1, -1, -1):
        nn_idx.add_item(i, collection.get_cell_expression_vector(i))
    nn_idx.build(n_trees)
    return {i: nn_idx.get_nns_by_item(i, num_neighbors)
            for i in range(collection.num_cells())}
Example #21
Source File: similarity_finder.py From hub with Apache License 2.0 | 5 votes |
def __init__(
    self,
    module_url,
    index_file_path,
    mapping_file_path,
    dimensions,
    random_projection_matrix_file,
):
    # Load the TF-Hub module
    print('Loading the TF-Hub module...')
    self.embed_fn = hub.load(module_url)
    print('TF-hub module is loaded.')

    dimensions = self.embed_fn(['']).shape[1]

    self.random_projection_matrix = None
    if tf.io.gfile.exists(random_projection_matrix_file):
        with open(random_projection_matrix_file, 'rb') as handle:
            self.random_projection_matrix = pickle.load(handle)
        dimensions = self.random_projection_matrix.shape[1]

    self.index = annoy.AnnoyIndex(dimensions, metric=_METRIC)
    self.index.load(index_file_path, prefault=True)
    print('Annoy index is loaded.')

    with open(mapping_file_path, 'rb') as handle:
        self.mapping = pickle.load(handle)
    print('Mapping file is loaded.')
Example #22
Source File: differentiable_neural_dictionary.py From coach with Apache License 2.0 | 5 votes |
def __init__(self, dict_size, key_width, new_value_shift_coefficient=0.1,
             batch_size=100, key_error_threshold=0.01, num_neighbors=50,
             override_existing_keys=True, rebuild_on_every_update=False):
    self.rebuild_on_every_update = rebuild_on_every_update
    self.max_size = dict_size
    self.curr_size = 0
    self.new_value_shift_coefficient = new_value_shift_coefficient
    self.num_neighbors = num_neighbors
    self.override_existing_keys = override_existing_keys

    self.index = AnnoyIndex(key_width, metric='euclidean')
    self.index.set_seed(1)

    self.embeddings = np.zeros((dict_size, key_width))
    self.values = np.zeros(dict_size)
    self.additional_data = [None] * dict_size
    self.lru_timestamps = np.zeros(dict_size)
    self.current_timestamp = 0.0

    # keys that are in this distance will be considered as the same key
    self.key_error_threshold = key_error_threshold

    self.initial_update_size = batch_size
    self.min_update_size = self.initial_update_size
    self.key_dimension = key_width
    self.value_dimension = 1
    self._reset_buffer()

    self.built_capacity = 0
Example #23
Source File: differentiable_neural_dictionary.py From coach with Apache License 2.0 | 5 votes |
def load_dnd(model_dir):
    latest_checkpoint_id = -1
    latest_checkpoint = ''

    # get all checkpoint files
    for fname in os.listdir(model_dir):
        path = os.path.join(model_dir, fname)
        if os.path.isdir(path) or fname.split('.')[-1] != 'srs':
            continue
        checkpoint_id = int(fname.split('_')[0])
        if checkpoint_id > latest_checkpoint_id:
            latest_checkpoint = fname
            latest_checkpoint_id = checkpoint_id

    with open(os.path.join(model_dir, str(latest_checkpoint)), 'rb') as f:
        DND = pickle.load(f)

    for a in range(DND.num_actions):
        DND.dicts[a].index = AnnoyIndex(512, metric='euclidean')
        DND.dicts[a].index.set_seed(1)
        for idx, key in zip(range(DND.dicts[a].curr_size),
                            DND.dicts[a].embeddings[:DND.dicts[a].curr_size]):
            DND.dicts[a].index.add_item(idx, key)
        DND.dicts[a].index.build(50)

    return DND
Example #24
Source File: sketch.py From geosketch with MIT License | 5 votes |
def srs_positive_annoy(X, N, seed=None, replace=False, prenormalized=False):
    from annoy import AnnoyIndex

    n_samples, n_features = X.shape

    if not replace and N > n_samples:
        raise ValueError('Cannot sample {} elements from {} elements '
                         'without replacement'.format(N, n_samples))
    if not replace and N == n_samples:
        return range(N)

    if not seed is None:
        np.random.seed(seed)

    X = X - X.min(0)
    if not prenormalized:
        X = normalize(X).astype('float32')

    srs_idx = set()
    for i in range(N):
        aindex = AnnoyIndex(X.shape[1], metric='euclidean')
        # inner loop variable renamed from i to j to avoid shadowing the outer loop
        for j in range(X.shape[0]):
            if j not in srs_idx:
                aindex.add_item(j, X[j, :])
        aindex.build(10)

        Phi_i = np.random.normal(size=(n_features))
        Phi_i /= np.linalg.norm(Phi_i)
        nearest_site = aindex.get_nns_by_vector(Phi_i, 1)
        srs_idx.add(nearest_site[0])

    return sorted(srs_idx)
Example #25
Source File: approximate_als.py From implicit with MIT License | 5 votes |
def fit(self, Ciu, show_progress=True):
    # delay loading the annoy library in case its not installed here
    import annoy

    # train the model
    super(AnnoyAlternatingLeastSquares, self).fit(Ciu, show_progress)

    # build up an Annoy Index with all the item_factors (for calculating
    # similar items)
    if self.approximate_similar_items:
        log.debug("Building annoy similar items index")

        self.similar_items_index = annoy.AnnoyIndex(
            self.item_factors.shape[1], 'angular')
        for i, row in enumerate(self.item_factors):
            self.similar_items_index.add_item(i, row)
        self.similar_items_index.build(self.n_trees)

    # build up a separate index for the inner product (for recommend
    # methods)
    if self.approximate_recommend:
        log.debug("Building annoy recommendation index")
        self.max_norm, extra = augment_inner_product_matrix(self.item_factors)
        self.recommend_index = annoy.AnnoyIndex(extra.shape[1], 'angular')
        for i, row in enumerate(extra):
            self.recommend_index.add_item(i, row)
        self.recommend_index.build(self.n_trees)
Example #26
Source File: embedding.py From recoder with MIT License | 5 votes |
def __load_index(self, index_file):
    log.info('Loading index file from {}'.format(index_file))
    with open(index_file, 'rb') as _index_file:
        state = pickle.load(_index_file)
    self.embedding_size = state['embedding_size']
    self.id_map = state['id_map']
    embeddings_file = index_file + '.embeddings'
    self.index = an.AnnoyIndex(self.embedding_size, metric='angular')
    self.index.load(embeddings_file)
    self.inverse_id_map = dict([(v, k) for k, v in self.id_map.items()])
Example #27
Source File: recall.py From nlp_research with MIT License | 5 votes |
def __init__(self, vecs):
    assert len(vecs) > 0, 'no vecs available to init AnnoyIndex'
    size = len(vecs[0])
    self.annoy_model = AnnoyIndex(size)
    for idx, vec in enumerate(vecs):
        self.annoy_model.add_item(idx, vec)
    self.annoy_model.build(50)
Example #28
Source File: annoyVectorIndex.py From Seq2Seq-Vis with Apache License 2.0 | 5 votes |
def __init__(self, file_name, dim_vector=500):
    self.u = AnnoyIndex(dim_vector)
    self.u.load(file_name)