Python annoy.AnnoyIndex() Examples
The following are 28 code examples of annoy.AnnoyIndex(), drawn from open-source projects.
Each example notes its original project, source file, and license above the code. You may also want to check out the other functions and classes available in the annoy module.
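For orientation, here is a minimal sketch of the AnnoyIndex workflow that the examples below build on: create an index with a fixed vector dimensionality and metric, add items by integer id, build a forest of trees, then query (optionally saving the index to disk and reloading it, memory-mapped, first). The dimensionality, tree count, and file name are arbitrary illustration values, not taken from any of the examples.

import random
from annoy import AnnoyIndex

f = 3  # vector dimensionality; every added item must have this length
index = AnnoyIndex(f, metric='angular')
for i in range(100):
    index.add_item(i, [random.gauss(0, 1) for _ in range(f)])

index.build(10)         # 10 trees; more trees improve recall but enlarge the index
index.save('test.ann')  # arbitrary file name for this sketch

# A built index is immutable; reload it (memory-mapped) to query.
loaded = AnnoyIndex(f, metric='angular')
loaded.load('test.ann')
print(loaded.get_nns_by_item(0, 5))  # the 5 approximate nearest neighbours of item 0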
Example #1
Source File: knn.py From ivis with GNU General Public License v2.0 | 6 votes |
def run(self):
    try:
        index = AnnoyIndex(self.n_dims, metric='angular')
        index.load(self.index_filepath)
        for i in range(self.data_indices[0], self.data_indices[1]):
            neighbour_indexes = index.get_nns_by_item(
                i, self.k, search_k=self.search_k, include_distances=False)
            neighbour_indexes = np.array(neighbour_indexes, dtype=np.uint32)
            self.results_queue.put(
                IndexNeighbours(row_index=i, neighbour_list=neighbour_indexes))
    except Exception as e:
        self.exception = e
    finally:
        self.results_queue.close()
Example #2
Source File: __init__.py From bbknn with MIT License | 6 votes |
def create_tree(data, approx, metric, use_faiss, n_trees):
    '''
    Create a faiss/cKDTree/KDTree/annoy index for nearest neighbour lookup.
    All undescribed input as in ``bbknn.bbknn()``. Returns the resulting index.

    Input
    -----
    data : ``numpy.array``
        PCA coordinates of a batch's cells to index.
    '''
    if approx:
        ckd = AnnoyIndex(data.shape[1], metric=metric)
        for i in np.arange(data.shape[0]):
            ckd.add_item(i, data[i, :])
        ckd.build(n_trees)
    elif metric == 'euclidean':
        if 'faiss' in sys.modules and use_faiss:
            ckd = faiss.IndexFlatL2(data.shape[1])
            ckd.add(data)
        else:
            ckd = cKDTree(data)
    else:
        ckd = KDTree(data, metric=metric)
    return ckd
Example #3
Source File: test_knn.py From ivis with GNU General Public License v2.0 | 6 votes |
def test_build_sparse_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    sparse_data = csr_matrix(data)

    index = build_annoy_index(sparse_data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload()
Example #4
Source File: scanorama.py From scanorama with MIT License | 6 votes |
def nn_approx(ds1, ds2, knn=KNN, metric='manhattan', n_trees=10):
    # Build index.
    a = AnnoyIndex(ds2.shape[1], metric=metric)
    for i in range(ds2.shape[0]):
        a.add_item(i, ds2[i, :])
    a.build(n_trees)

    # Search index.
    ind = []
    for i in range(ds1.shape[0]):
        ind.append(a.get_nns_by_vector(ds1[i, :], knn, search_k=-1))
    ind = np.array(ind)

    # Match.
    match = set()
    for a, b in zip(range(ds1.shape[0]), ind):
        for b_i in b:
            match.add((a, b_i))

    return match

# Find mutual nearest neighbors.
Example #5
Source File: __init__.py From magnitude with MIT License | 6 votes |
def get_approx_index_chunks(self):
    """Gets decompressed chunks of the AnnoyIndex of the vectors from
    the database."""
    try:
        db = self._db(force_new=True)
        with lz4.frame.LZ4FrameDecompressor() as decompressor:
            chunks = db.execute(
                """
                SELECT rowid,index_file
                FROM `magnitude_approx`
                WHERE trees = ?
                """, (self.approx_trees,))
            for chunk in chunks:
                yield decompressor.decompress(chunk[1])
                if self.closed:
                    return
    except Exception as e:
        if self.closed:
            pass
        else:
            raise e
Example #6
Source File: sketch.py From geosketch with MIT License | 6 votes |
def label_approx(X, sites, site_labels, k=1):
    from annoy import AnnoyIndex

    assert(X.shape[1] == sites.shape[1])

    # Build index over site points.
    aindex = AnnoyIndex(sites.shape[1], metric='euclidean')
    for i in range(sites.shape[0]):
        aindex.add_item(i, sites[i, :])
    aindex.build(10)

    labels = []
    for i in range(X.shape[0]):
        # Find nearest site point.
        nearest_sites = aindex.get_nns_by_vector(X[i, :], k)
        if len(nearest_sites) < 1:
            labels.append(None)
            continue
        label = Counter([
            site_labels[ns] for ns in nearest_sites
        ]).most_common(1)[0][0]
        labels.append(label)

    return np.array(labels)
Example #7
Source File: index.py From rep0st with MIT License | 5 votes |
def load_index(self, index_id):
    if self.annoy_index is None:
        log.info("loading initial index with id {}", self.current_index)
    else:
        log.info("switching index from {} to {}", self.current_index, index_id)
    newindex = AnnoyIndex(108, metric='euclidean')
    newindex.load(config.index_config['index_path'] + 'index_' + str(index_id) + '.ann')
    if self.annoy_index is not None:
        self.annoy_index.unload()
    self.annoy_index = newindex
    self.current_index = index_id
    log.info("finished switching index. now using index {}", self.current_index)
Example #8
Source File: embedding.py From recoder with MIT License | 5 votes |
def __build_index(self, index_file):
    self.embedding_size = self.embeddings.shape[1]

    self.index = an.AnnoyIndex(self.embedding_size, metric='angular')

    for embedding_ind in range(self.embeddings.shape[0]):
        embedding = self.embeddings[embedding_ind, :]
        self.index.add_item(embedding_ind, embedding)

    self.index.build(self.n_trees)

    if self.id_map is None:
        self.id_map = dict([(i, i) for i in range(self.embeddings.shape[0])])

    self.inverse_id_map = dict([(v, k) for k, v in self.id_map.items()])

    if index_file:
        embeddings_file = index_file + '.embeddings'
        state = {
            'embedding_size': self.embedding_size,
            'id_map': self.id_map,
        }
        self.index.save(embeddings_file)
        with open(index_file, 'wb') as _index_file:
            pickle.dump(state, _index_file)
Example #9
Source File: matching.py From realtime-embeddings-matching with Apache License 2.0 | 5 votes |
def __init__(self, index_file):
    logging.info('Initialising matching utility...')
    self.index = AnnoyIndex(VECTOR_LENGTH)
    self.index.load(index_file, prefault=True)
    logging.info('Annoy index {} is loaded'.format(index_file))
    with open(index_file + '.mapping', 'rb') as handle:
        self.mapping = pickle.load(handle)
    logging.info('Mapping file {} is loaded'.format(index_file + '.mapping'))
    logging.info('Matching utility initialised.')
Example #10
Source File: text_graph.py From reveal-graph-embedding with Apache License 2.0 | 5 votes |
def make_text_graph(user_lemma_matrix, dimensionality, metric, number_of_estimators, number_of_neighbors):
    user_lemma_matrix_tfidf = augmented_tf_idf(user_lemma_matrix)
    # print(user_lemma_matrix_tfidf.shape)
    if (user_lemma_matrix_tfidf.shape[0] <= dimensionality) or (user_lemma_matrix_tfidf.shape[1] <= dimensionality):
        X_svd = user_lemma_matrix_tfidf.toarray()
    else:
        X_svd = TruncatedSVD(n_components=dimensionality).fit_transform(user_lemma_matrix_tfidf)

    annoy_index = AnnoyIndex(X_svd.shape[1], metric=metric)
    for q in range(X_svd.shape[0]):
        annoy_index.add_item(q, X_svd[q, :])
    annoy_index.build(number_of_estimators)

    row = list()
    col = list()
    data = list()
    for q in range(X_svd.shape[0]):
        neighbors, distances = annoy_index.get_nns_by_item(q, number_of_neighbors, include_distances=True)
        row.extend([q] * number_of_neighbors)
        col.extend(neighbors)
        data.extend(distances)
    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.array(data, dtype=np.float64)

    text_graph = spsp.coo_matrix((data, (row, col)), shape=(X_svd.shape[0], X_svd.shape[0]))
    text_graph = spsp.csr_matrix(text_graph)

    return text_graph
Example #11
Source File: index_embeddings.py From jann with MIT License | 5 votes |
def index_embeddings(args):
    """Main run function for indexing the embeddings."""
    unique_strings_path = args.infile + '.embedded.pkl_unique_strings.csv'

    # Load the unique lines
    with open(unique_strings_path) as f:
        unique_strings = [line.rstrip() for line in f]

    unique_embeddings_path = (args.infile +
                              '.embedded.pkl_unique_strings_embeddings.txt')

    # Load the unique embeddings
    with open(unique_embeddings_path) as f:
        unique_embeddings = [[float(x) for x in line.strip().split()]
                             for line in f]

    tf.logging.info('Loaded {} unique strings, {} embeddings of dimension {}'.
                    format(len(unique_strings), len(unique_embeddings),
                           len(unique_embeddings[0])))

    # Length of item vector that will be indexed
    nn_forest = AnnoyIndex(512, metric='angular')

    for i in range(len(unique_strings)):
        v = unique_embeddings[i]
        nn_forest.add_item(i, v)

    # Build an approximate nearest neighbor forest with num_trees
    nn_forest.build(int(args.num_trees))
    output_path = args.infile + '.ann'
    nn_forest.save(output_path)

    tf.logging.info('Index forest built {}'.format(output_path))

    return True
Example #12
Source File: generateArtificialSessions.py From MSMARCO-Conversational-Search with MIT License | 5 votes |
def generateAnnoy(real, artificial, annoyFilename, dimensions):
    idx2vec = np.array(artificial[2])
    t = AnnoyIndex(dimensions)
    for j in range(len(artificial[2])):
        t.add_item(j, idx2vec[j])
    print('Done Adding items to AnnoyIndex')
    t.build(TREESIZE)
    print('Done Building AnnoyIndex')
    t.save(annoyFilename)
    return t
Example #13
Source File: noveltysearchlive.py From Novelty-Search-Live with GNU General Public License v3.0 | 5 votes |
def AddToTrain(individual):
    global annoy_train
    global test_db
    global IND_SIZE
    global config
    max_memory = 5
    if set.get_master_volume() == 1:
        print set.get_master_volume()
        set.set_master_volume(0.85)
        test_db.append(individual)
        print "SAVING TO TRAINING SET. TestDB Size: " + str(len(test_db))
        annoy_train = AnnoyIndex(IND_SIZE)
        annoy_train.add_item(annoy_train.get_n_items(), individual)
        annoy_train.build(config["annoy_tree"])  # 10 trees
        if len(test_db) > max_memory:
            test_db.pop(0)
            print "delete old memory entry"
    if set.get_master_volume() == 0:
        test_db = []
        # gen_record = []
        annoy_train = AnnoyIndex(IND_SIZE)
        annoy_train.build(config["annoy_tree"])  # 10 trees
        print "clean set"
        set.set_master_volume(0.85)

############ App Main Loop ############
Example #14
Source File: dnd.py From tensorflow-rl with Apache License 2.0 | 5 votes |
def __init__(self, capacity=100000, key_size=128, cache_size=32, alpha=0.1):
    self.alpha = alpha
    self.capacity = capacity
    self.lru_cache = LRUCache(capacity)
    self.dup_cache = deque(maxlen=cache_size)
    self.index = AnnoyIndex(key_size, metric='euclidean')
    self.keys = np.zeros((capacity, key_size), dtype=np.float32)
    self.values = np.zeros((capacity,), dtype=np.float32)
    self.insert_idx = 0
    self.insertions = 0
Example #15
Source File: annoy.py From ann-benchmarks with MIT License | 5 votes |
def fit(self, X):
    self._annoy = annoy.AnnoyIndex(X.shape[1], metric=self._metric)
    for i, x in enumerate(X):
        self._annoy.add_item(i, x.tolist())
    self._annoy.build(self._n_trees)
Example #16
Source File: test_knn.py From ivis with GNU General Public License v2.0 | 5 votes |
def test_dense_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    index = build_annoy_index(data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload()
Example #17
Source File: background_job.py From rep0st with MIT License | 5 votes |
def build_index(index_id):
    n_trees = config.index_config['tree_count']
    log.info("started index build")
    session = rep.database.DBSession()
    count = session.query(Feature).filter(Feature.type == FeatureType.FEATURE_VECTOR).count()
    index = AnnoyIndex(108, metric='euclidean')

    cnt = 0
    log.info("adding {} features to index", count)
    start = time.time()
    for feature in session.query(Feature).filter(Feature.type == FeatureType.FEATURE_VECTOR).yield_per(1000):
        arr = np.asarray(bytearray(feature.data)).astype(np.float32)
        index.add_item(feature.post_id, arr)
        cnt += 1
        if cnt % 10000 == 0:
            log.debug("added {}/{} features to the index", cnt, count)
    session.close()
    stop = time.time()
    log.info("added all {} features to the index in {}", count,
             str(datetime.timedelta(seconds=stop - start)))

    log.info("building index with {} trees. this will take a while...", n_trees)
    start = time.time()
    index.build(n_trees)  # was hardcoded to 20, which contradicted the log line above
    index_file = config.index_config['index_path'] + "index_" + str(index_id) + ".ann"
    log.info("saving index to file {}", index_file)
    index.save(index_file)
    stop = time.time()
    log.debug("finished building of index. it took {}",
              str(datetime.timedelta(seconds=stop - start)))
Example #18
Source File: hf_sampler.py From hfsoftmax with MIT License | 5 votes |
def __init__(self,
             rank,
             fdim,
             sample_num,
             num_output,
             bias=False,
             ntrees=50,
             interval=100,
             start_iter=0,
             midw='0',
             midb='1'):
    super(HFSampler, self).__init__()
    self.rank = rank
    self.fdim = fdim
    self.sample_num = sample_num
    self.num_output = num_output
    self.full_cls = np.arange(self.num_output)
    # init param client
    self.client = ParameterClient(rank)
    self.midw = midw
    self.midb = midb
    self.is_bias = bias
    self.client.add_matrix(self.midw, [self.num_output, self.fdim])
    if self.is_bias:
        self.client.add_matrix(self.midb, [self.num_output, 1])
    # init hashing forest
    self.ntrees = ntrees
    self.interval = interval
    self.start_iter = start_iter
    self.iter = start_iter
    self.test_iter = start_iter
    self.anns = AnnoyIndex(self.fdim)
    self.pool = Pool(processes=2)
Example #19
Source File: hf_sampler.py From hfsoftmax with MIT License | 5 votes |
def _update_hf(self):
    if not self.iter % self.interval == 0 and \
       not self.iter == self.start_iter:
        return
    w = self.client.get_value_by_rows(self.midw, self.full_cls)
    self.anns = AnnoyIndex(self.fdim)
    for i, v in enumerate(w):
        self.anns.add_item(i, v)
    self.anns.build(self.ntrees)
Example #20
Source File: cluster_corr.py From altanalyze with Apache License 2.0 | 5 votes |
def nearest_neighbors(collection, num_neighbors=10, n_trees=100):
    """
    Finds the num_neighbors nearest neighbors to each cell in the sparse matrix

    Return result is a dictionary of lists, where the key is an index into the cells,
    and the value is the neighbors of that cell
    """
    nn_idx = AnnoyIndex(collection.num_genes())
    # Add the elements in reverse order because Annoy allocates the memory based on
    # the value of the element added - so adding in increasing order will trigger
    # lots of allocations
    for i in range(collection.num_cells() - 1, -1, -1):
        nn_idx.add_item(i, collection.get_cell_expression_vector(i))
    nn_idx.build(n_trees)
    return {i: nn_idx.get_nns_by_item(i, num_neighbors)
            for i in range(collection.num_cells())}
Example #21
Source File: similarity_finder.py From hub with Apache License 2.0 | 5 votes |
def __init__(
    self,
    module_url,
    index_file_path,
    mapping_file_path,
    dimensions,
    random_projection_matrix_file,
):
    # Load the TF-Hub module
    print('Loading the TF-Hub module...')
    self.embed_fn = hub.load(module_url)
    print('TF-hub module is loaded.')

    dimensions = self.embed_fn(['']).shape[1]

    self.random_projection_matrix = None
    if tf.io.gfile.exists(random_projection_matrix_file):
        with open(random_projection_matrix_file, 'rb') as handle:
            self.random_projection_matrix = pickle.load(handle)
        dimensions = self.random_projection_matrix.shape[1]

    self.index = annoy.AnnoyIndex(dimensions, metric=_METRIC)
    self.index.load(index_file_path, prefault=True)
    print('Annoy index is loaded.')

    with open(mapping_file_path, 'rb') as handle:
        self.mapping = pickle.load(handle)
    print('Mapping file is loaded.')
Example #22
Source File: differentiable_neural_dictionary.py From coach with Apache License 2.0 | 5 votes |
def __init__(self, dict_size, key_width, new_value_shift_coefficient=0.1,
             batch_size=100, key_error_threshold=0.01, num_neighbors=50,
             override_existing_keys=True, rebuild_on_every_update=False):
    self.rebuild_on_every_update = rebuild_on_every_update
    self.max_size = dict_size
    self.curr_size = 0
    self.new_value_shift_coefficient = new_value_shift_coefficient
    self.num_neighbors = num_neighbors
    self.override_existing_keys = override_existing_keys

    self.index = AnnoyIndex(key_width, metric='euclidean')
    self.index.set_seed(1)

    self.embeddings = np.zeros((dict_size, key_width))
    self.values = np.zeros(dict_size)
    self.additional_data = [None] * dict_size
    self.lru_timestamps = np.zeros(dict_size)
    self.current_timestamp = 0.0

    # keys that are in this distance will be considered as the same key
    self.key_error_threshold = key_error_threshold

    self.initial_update_size = batch_size
    self.min_update_size = self.initial_update_size
    self.key_dimension = key_width
    self.value_dimension = 1
    self._reset_buffer()

    self.built_capacity = 0
Example #23
Source File: differentiable_neural_dictionary.py From coach with Apache License 2.0 | 5 votes |
def load_dnd(model_dir):
    latest_checkpoint_id = -1
    latest_checkpoint = ''

    # get all checkpoint files
    for fname in os.listdir(model_dir):
        path = os.path.join(model_dir, fname)
        if os.path.isdir(path) or fname.split('.')[-1] != 'srs':
            continue
        checkpoint_id = int(fname.split('_')[0])
        if checkpoint_id > latest_checkpoint_id:
            latest_checkpoint = fname
            latest_checkpoint_id = checkpoint_id

    with open(os.path.join(model_dir, str(latest_checkpoint)), 'rb') as f:
        DND = pickle.load(f)

    for a in range(DND.num_actions):
        DND.dicts[a].index = AnnoyIndex(512, metric='euclidean')
        DND.dicts[a].index.set_seed(1)
        for idx, key in zip(range(DND.dicts[a].curr_size),
                            DND.dicts[a].embeddings[:DND.dicts[a].curr_size]):
            DND.dicts[a].index.add_item(idx, key)
        DND.dicts[a].index.build(50)

    return DND
Example #24
Source File: sketch.py From geosketch with MIT License | 5 votes |
def srs_positive_annoy(X, N, seed=None, replace=False, prenormalized=False):
    from annoy import AnnoyIndex

    n_samples, n_features = X.shape

    if not replace and N > n_samples:
        raise ValueError('Cannot sample {} elements from {} elements '
                         'without replacement'.format(N, n_samples))
    if not replace and N == n_samples:
        return range(N)

    if not seed is None:
        np.random.seed(seed)

    X = X - X.min(0)
    if not prenormalized:
        X = normalize(X).astype('float32')

    srs_idx = set()
    for i in range(N):
        aindex = AnnoyIndex(X.shape[1], metric='euclidean')
        # inner loop variable renamed from i to j to avoid shadowing the outer loop
        for j in range(X.shape[0]):
            if j not in srs_idx:
                aindex.add_item(j, X[j, :])
        aindex.build(10)

        Phi_i = np.random.normal(size=(n_features))
        Phi_i /= np.linalg.norm(Phi_i)
        nearest_site = aindex.get_nns_by_vector(Phi_i, 1)
        srs_idx.add(nearest_site[0])

    return sorted(srs_idx)
Example #25
Source File: approximate_als.py From implicit with MIT License | 5 votes |
def fit(self, Ciu, show_progress=True):
    # delay loading the annoy library in case its not installed here
    import annoy

    # train the model
    super(AnnoyAlternatingLeastSquares, self).fit(Ciu, show_progress)

    # build up an Annoy Index with all the item_factors (for calculating
    # similar items)
    if self.approximate_similar_items:
        log.debug("Building annoy similar items index")

        self.similar_items_index = annoy.AnnoyIndex(
            self.item_factors.shape[1], 'angular')
        for i, row in enumerate(self.item_factors):
            self.similar_items_index.add_item(i, row)
        self.similar_items_index.build(self.n_trees)

    # build up a separate index for the inner product (for recommend
    # methods)
    if self.approximate_recommend:
        log.debug("Building annoy recommendation index")
        self.max_norm, extra = augment_inner_product_matrix(self.item_factors)
        self.recommend_index = annoy.AnnoyIndex(extra.shape[1], 'angular')
        for i, row in enumerate(extra):
            self.recommend_index.add_item(i, row)
        self.recommend_index.build(self.n_trees)
Example #26
Source File: embedding.py From recoder with MIT License | 5 votes |
def __load_index(self, index_file):
    log.info('Loading index file from {}'.format(index_file))
    with open(index_file, 'rb') as _index_file:
        state = pickle.load(_index_file)
    self.embedding_size = state['embedding_size']
    self.id_map = state['id_map']
    embeddings_file = index_file + '.embeddings'
    self.index = an.AnnoyIndex(self.embedding_size, metric='angular')
    self.index.load(embeddings_file)
    self.inverse_id_map = dict([(v, k) for k, v in self.id_map.items()])
Example #27
Source File: recall.py From nlp_research with MIT License | 5 votes |
def __init__(self, vecs):
    assert len(vecs) > 0, 'no vecs available to init AnnoyIndex'
    size = len(vecs[0])
    self.annoy_model = AnnoyIndex(size)
    for idx, vec in enumerate(vecs):
        self.annoy_model.add_item(idx, vec)
    self.annoy_model.build(50)
Example #28
Source File: annoyVectorIndex.py From Seq2Seq-Vis with Apache License 2.0 | 5 votes |
def __init__(self, file_name, dim_vector=500):
    self.u = AnnoyIndex(dim_vector)
    self.u.load(file_name)