Python elasticsearch.helpers.streaming_bulk() Examples
The following are 10 code examples of elasticsearch.helpers.streaming_bulk(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the elasticsearch.helpers module, or try the search function.
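Before the project examples, here is a minimal, self-contained sketch of the typical streaming_bulk() call pattern. The client URL, index name, and generated documents are placeholder assumptions for illustration; the key point is that streaming_bulk() is a lazy generator that yields one (ok, result) tuple per action, so it must be consumed for anything to be indexed.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

# Hypothetical client and index; adjust the URL and name for your cluster.
es = Elasticsearch("http://localhost:9200")

def generate_actions():
    # Each action is a plain dict; "_source" carries the document body.
    for i in range(1000):
        yield {"_index": "my-index", "_id": i, "_source": {"value": i}}

# streaming_bulk() sends chunked bulk requests as we iterate and yields
# one (ok, result) tuple per document.
for ok, result in streaming_bulk(es, generate_actions(),
                                 chunk_size=500, raise_on_error=False):
    if not ok:
        print("Failed to index document: %r" % (result,))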
Example #1
Source File: commands.py From udata with GNU Affero General Public License v3.0 | 4 votes
def index_model(index_name, adapter, timeout=None):
    '''Index all objects of a given model'''
    model = adapter.model
    log.info('Indexing %s objects', model.__name__)
    qs = model.objects
    if hasattr(model.objects, 'visible'):
        qs = qs.visible()
    if adapter.exclude_fields:
        qs = qs.exclude(*adapter.exclude_fields)
    docs = iter_qs(qs, adapter)
    docs = iter_for_index(docs, index_name)
    for ok, info in streaming_bulk(es.client, docs, raise_on_error=False,
                                   request_timeout=timeout):
        if not ok:
            log.error('Unable to index %s "%s": %s', model.__name__,
                      info['index']['_id'], info['index']['error'])
Example #2
Source File: views.py From texta with GNU General Public License v3.0 | 4 votes
def insert_multiple_documents(self, list_of_documents):
    actions = [{"_source": document, "_index": self.index, "_type": self.doc_type}
               for document in list_of_documents]

    for success, response in elastic_parallelbulk(client=self.es, actions=actions):
        self.logger.info("BulkAPI response: {0}".format(str(response)))
        if not success:
            self.logger.error(str(response))
            raise ValueError(str(response))
Example #3
Source File: index_data.py From es-django-example with Apache License 2.0 | 4 votes
def verbose_run(self, model, report_every=100):
    name = model._meta.verbose_name
    print('Indexing %s: ' % name, end='')
    start = time.time()
    cnt = 0
    for _ in streaming_bulk(
            self.es,
            (m.to_search().to_dict(True) for m in model.objects.all().iterator()),
            index=settings.ES_INDEX,
            doc_type=name.lower(),
    ):
        cnt += 1
        if cnt % report_every == 0:
            # Print a progress dot every `report_every` documents.
            print('.', end='')
            sys.stdout.flush()
    print('DONE\nIndexing %d %s in %.2f seconds' % (
        cnt, name, time.time() - start
    ))
Example #4
Source File: indexer.py From georef-ar-api with MIT License | 4 votes
def _insert_documents(self, es, index, docs, count, verbose=False):
    """Inserts documents into an index.

    Args:
        es (Elasticsearch): Elasticsearch client.
        index (str): Index name.
        docs (Iterator[dict]): Iterator of documents to insert.
        count (int): Number of documents to insert.
        verbose (bool): Print additional information to the screen.

    """
    operations = self._bulk_update_generator(docs, index)
    creations, errors = 0, 0

    logger.info('Inserting documents...')

    iterator = helpers.streaming_bulk(es, operations, raise_on_error=False,
                                      request_timeout=ES_TIMEOUT)
    if verbose:
        iterator = tqdm.tqdm(iterator, total=count, file=sys.stderr)

    for ok, response in iterator:
        if ok and response['create']['result'] == 'created':
            creations += 1
        else:
            errors += 1
            identifier = response['create']['_id']
            error = response['create']['error']

            logger.warning(
                'Error processing document ID {}:'.format(identifier))
            logger.warning(json.dumps(error, indent=4, ensure_ascii=False))
            logger.warning('')

    logger.info('Summary:')
    logger.info(' + Documents processed: {}'.format(count))
    logger.info(' + Documents created: {}'.format(creations))
    logger.info(' + Errors: {}'.format(errors))
    logger.info('')
Example #5
Source File: _elasticsearch_helpers.py From mindmeld with Apache License 2.0 | 4 votes
def version_compatible_streaming_bulk(
    es_client, docs, index, chunk_size, raise_on_error, doc_type
):
    # Elasticsearch 7 removed mapping types, so `doc_type` is only passed
    # through to clusters running older versions.
    if is_es_version_7(es_client):
        return streaming_bulk(
            es_client,
            docs,
            index=index,
            chunk_size=chunk_size,
            raise_on_error=raise_on_error,
        )
    else:
        return streaming_bulk(
            es_client,
            docs,
            index=index,
            doc_type=doc_type,
            chunk_size=chunk_size,
            raise_on_error=raise_on_error,
        )
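Because streaming_bulk() returns a lazy generator, the wrapper above returns one as well: nothing is sent until the caller iterates. A hypothetical call site (the client, docs, index, and doc_type names are illustrative, not part of the original project) might look like:

for ok, info in version_compatible_streaming_bulk(
    es_client,
    docs,
    index="my-index",
    chunk_size=500,
    raise_on_error=False,
    doc_type="document",
):
    if not ok:
        print("Bulk action failed: %r" % (info,))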
Example #6
Source File: es_load.py From openslack-crawler with Apache License 2.0 | 4 votes
def load_repo(client, path=None, index='git'):
    """
    Parse a git repository with all its commits and load it into elasticsearch
    using `client`. If the index doesn't exist it will be created.
    """
    path = dirname(dirname(abspath(__file__))) if path is None else path
    repo_name = basename(path)
    repo = git.Repo(path)

    create_git_index(client, index)

    # create the parent document in case it doesn't exist
    client.create(
        index=index,
        doc_type='repos',
        id=repo_name,
        body={},
        ignore=409  # 409 - conflict - would be returned if the document is already there
    )

    # we let the streaming bulk continuously process the commits as they come
    # in - since the `parse_commits` function is a generator this will avoid
    # loading all the commits into memory
    for ok, result in streaming_bulk(
            client,
            parse_commits(repo.refs.master.commit, repo_name),
            index=index,
            doc_type='commits',
            chunk_size=50  # keep the batch sizes small for appearances only
    ):
        action, result = result.popitem()
        doc_id = '/%s/commits/%s' % (index, result['_id'])
        # process the information from ES whether the document has been
        # successfully indexed
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print(doc_id)

# we manually create es repo document and update elasticsearch-py to include metadata
Example #7
Source File: catalog_meta_indexer.py From series-tiempo-ar-api with MIT License | 4 votes
def index(self) -> bool:
    if not self.get_available_fields().count():
        self.task.info(self.task, "No series to index in this catalog")
        return False

    index_ok = False
    for success, info in streaming_bulk(self.elastic, self.generate_actions()):
        if not success:
            self.task.info(self.task, 'Indexing error: {}'.format(info))
        else:
            index_ok = True

    return index_ok
Example #8
Source File: elastic2_doc_manager.py From elastic2-doc-manager with Apache License 2.0 | 3 votes
def handle_command(self, doc, namespace, timestamp):
    # Flush the buffer before handling the command
    self.commit()
    db = namespace.split(".", 1)[0]

    if doc.get("dropDatabase"):
        dbs = self.command_helper.map_db(db)
        for _db in dbs:
            self.elastic.indices.delete(index=_db.lower())

    if doc.get("renameCollection"):
        raise errors.OperationFailed(
            "elastic_doc_manager does not support renaming a mapping."
        )

    if doc.get("create"):
        db, coll = self.command_helper.map_collection(db, doc["create"])
        if db and coll:
            self.elastic.indices.put_mapping(
                index=db.lower(), doc_type=coll, body={"_source": {"enabled": True}}
            )

    if doc.get("drop"):
        db, coll = self.command_helper.map_collection(db, doc["drop"])
        if db and coll:
            # This will delete the items in coll, but not get rid of the
            # mapping.
            warnings.warn(
                "Deleting all documents of type %s on index %s. "
                "The mapping definition will persist and must be "
                "removed manually." % (coll, db)
            )
            responses = streaming_bulk(
                self.elastic,
                (
                    dict(result, _op_type="delete")
                    for result in scan(
                        self.elastic, index=db.lower(), doc_type=coll
                    )
                ),
            )
            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Error occurred while deleting ElasticSearch document "
                        "during handling of 'drop' command: %r" % resp
                    )
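The 'drop' branch above illustrates a pattern worth knowing on its own: emulating delete-by-query by feeding scan() hits back into streaming_bulk() as delete actions. Here is a stripped-down sketch of just that pattern; the client URL and index name are placeholders, not part of the original project.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan, streaming_bulk

es = Elasticsearch("http://localhost:9200")

# scan() lazily yields every hit in the index; setting _op_type="delete"
# turns each hit dict into a bulk delete action keyed by its _id.
delete_actions = (
    dict(hit, _op_type="delete") for hit in scan(es, index="my-index")
)

for ok, resp in streaming_bulk(es, delete_actions, raise_on_error=False):
    if not ok:
        print("Delete failed: %r" % (resp,))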
Example #9
Source File: elastic2_doc_manager.py From elastic2-doc-manager with Apache License 2.0 | 3 votes
def bulk_upsert(self, docs, namespace, timestamp):
    """Insert multiple documents into Elasticsearch."""

    def docs_to_upsert():
        doc = None
        for doc in docs:
            # Remove metadata and redundant _id
            index, doc_type = self._index_and_mapping(namespace)
            doc_id = str(doc.pop("_id"))
            document_action = {
                "_index": index,
                "_type": doc_type,
                "_id": doc_id,
                "_source": self._formatter.format_document(doc),
            }
            document_meta = {
                "_index": self.meta_index_name,
                "_type": self.meta_type,
                "_id": doc_id,
                "_source": {"ns": namespace, "_ts": timestamp},
            }
            yield document_action
            yield document_meta
        if doc is None:
            raise errors.EmptyDocsError(
                "Cannot upsert an empty sequence of "
                "documents into Elastic Search"
            )

    try:
        kw = {}
        if self.chunk_size > 0:
            kw["chunk_size"] = self.chunk_size

        responses = streaming_bulk(
            client=self.elastic, actions=docs_to_upsert(), **kw
        )

        for ok, resp in responses:
            if not ok:
                LOG.error(
                    "Could not bulk-upsert document "
                    "into ElasticSearch: %r" % resp
                )
        if self.auto_commit_interval == 0:
            self.commit()
    except errors.EmptyDocsError:
        # This can happen when mongo-connector starts up, there is no
        # config file, but nothing to dump
        pass
Example #10
Source File: views.py From autocompeter with Mozilla Public License 2.0 | 3 votes
def bulk(request, domain):
    assert domain
    try:
        documents = json.loads(request.body.decode('utf-8'))['documents']
    except KeyError:
        return http.JsonResponse({'error': "Missing 'documents'"}, status=400)

    def iterator():
        for document in documents:
            url = document.get('url', '').strip()
            if not url:
                continue
            title = document.get('title', '').strip()
            if not title:
                continue
            yield TitleDoc(
                meta={'id': make_id(domain.name, url)},
                **{
                    'domain': domain.name,
                    'url': url,
                    'title': title,
                    'group': document.get('group', '').strip(),
                    'popularity': float(document.get('popularity', 0.0)),
                }
            ).to_dict(True)

    count = failures = 0
    t0 = time.time()
    for success, doc in streaming_bulk(
        connections.get_connection(),
        iterator(),
        index=settings.ES_INDEX,
        doc_type='title_doc',
    ):
        if not success:
            print("NOT SUCCESS!", doc)
            failures += 1
        count += 1
    t1 = time.time()

    return http.JsonResponse({
        'message': 'OK',
        'count': count,
        'failures': failures,
        'took': t1 - t0,
    }, status=201)