Python elasticsearch.helpers.streaming_bulk() Examples
The following are 10 code examples of elasticsearch.helpers.streaming_bulk(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the elasticsearch.helpers module, or try the search function.
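Before the project examples, here is a minimal, self-contained sketch of the typical streaming_bulk() call pattern. The client URL, index name, and generated documents are placeholder assumptions for illustration; the key point is that streaming_bulk() is a lazy generator that yields one (ok, result) tuple per action, so it must be consumed for anything to be indexed.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

# Hypothetical client and index; adjust the URL and name for your cluster.
es = Elasticsearch("http://localhost:9200")

def generate_actions():
    # Each action is a plain dict; "_source" carries the document body.
    for i in range(1000):
        yield {"_index": "my-index", "_id": i, "_source": {"value": i}}

# streaming_bulk() sends chunked bulk requests as we iterate and yields
# one (ok, result) tuple per document.
for ok, result in streaming_bulk(es, generate_actions(),
                                 chunk_size=500, raise_on_error=False):
    if not ok:
        print("Failed to index document: %r" % (result,))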
Example #1
Source File: commands.py From udata with GNU Affero General Public License v3.0 | 4 votes
def index_model(index_name, adapter, timeout=None):
    '''Index all objects of a given model'''
    model = adapter.model
    log.info('Indexing %s objects', model.__name__)
    qs = model.objects
    if hasattr(model.objects, 'visible'):
        qs = qs.visible()
    if adapter.exclude_fields:
        qs = qs.exclude(*adapter.exclude_fields)
    docs = iter_qs(qs, adapter)
    docs = iter_for_index(docs, index_name)
    for ok, info in streaming_bulk(es.client, docs, raise_on_error=False,
                                   request_timeout=timeout):
        if not ok:
            log.error('Unable to index %s "%s": %s', model.__name__,
                      info['index']['_id'], info['index']['error'])
Example #2
Source File: views.py From texta with GNU General Public License v3.0 | 4 votes
def insert_multiple_documents(self, list_of_documents):
    actions = [{"_source": document, "_index": self.index, "_type": self.doc_type}
               for document in list_of_documents]

    for success, response in elastic_parallelbulk(client=self.es, actions=actions):
        self.logger.info("BulkAPI response: {0}".format(str(response)))
        if not success:
            self.logger.error(str(response))
            raise ValueError(str(response))
Example #3
Source File: index_data.py From es-django-example with Apache License 2.0 | 4 votes
def verbose_run(self, model, report_every=100):
    name = model._meta.verbose_name
    print('Indexing %s: ' % name, end='')
    start = time.time()
    cnt = 0
    for _ in streaming_bulk(
            self.es,
            (m.to_search().to_dict(True) for m in model.objects.all().iterator()),
            index=settings.ES_INDEX,
            doc_type=name.lower(),
    ):
        cnt += 1
        if cnt % report_every == 0:
            # Print a progress dot every `report_every` documents.
            print('.', end='')
            sys.stdout.flush()
    print('DONE\nIndexing %d %s in %.2f seconds' % (
        cnt, name, time.time() - start
    ))
Example #4
Source File: indexer.py From georef-ar-api with MIT License | 4 votes
def _insert_documents(self, es, index, docs, count, verbose=False):
    """Inserts documents into an index.

    Args:
        es (Elasticsearch): Elasticsearch client.
        index (str): Index name.
        docs (Iterator[dict]): Iterator of documents to insert.
        count (int): Number of documents to insert.
        verbose (bool): Print additional information to the screen.

    """
    operations = self._bulk_update_generator(docs, index)
    creations, errors = 0, 0

    logger.info('Inserting documents...')

    iterator = helpers.streaming_bulk(es, operations, raise_on_error=False,
                                      request_timeout=ES_TIMEOUT)
    if verbose:
        iterator = tqdm.tqdm(iterator, total=count, file=sys.stderr)

    for ok, response in iterator:
        if ok and response['create']['result'] == 'created':
            creations += 1
        else:
            errors += 1
            identifier = response['create']['_id']
            error = response['create']['error']

            logger.warning(
                'Error processing document ID {}:'.format(identifier))
            logger.warning(json.dumps(error, indent=4, ensure_ascii=False))
            logger.warning('')

    logger.info('Summary:')
    logger.info(' + Documents processed: {}'.format(count))
    logger.info(' + Documents created: {}'.format(creations))
    logger.info(' + Errors: {}'.format(errors))
    logger.info('')
Example #5
Source File: _elasticsearch_helpers.py From mindmeld with Apache License 2.0 | 4 votes
def version_compatible_streaming_bulk(
    es_client, docs, index, chunk_size, raise_on_error, doc_type
):
    # Elasticsearch 7 removed mapping types, so `doc_type` is only passed
    # through to clusters running older versions.
    if is_es_version_7(es_client):
        return streaming_bulk(
            es_client,
            docs,
            index=index,
            chunk_size=chunk_size,
            raise_on_error=raise_on_error,
        )
    else:
        return streaming_bulk(
            es_client,
            docs,
            index=index,
            doc_type=doc_type,
            chunk_size=chunk_size,
            raise_on_error=raise_on_error,
        )
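Because streaming_bulk() returns a lazy generator, the wrapper above returns one as well: nothing is sent until the caller iterates. A hypothetical call site (the client, docs, index, and doc_type names are illustrative, not part of the original project) might look like:

for ok, info in version_compatible_streaming_bulk(
    es_client,
    docs,
    index="my-index",
    chunk_size=500,
    raise_on_error=False,
    doc_type="document",
):
    if not ok:
        print("Bulk action failed: %r" % (info,))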
Example #6
Source File: es_load.py From openslack-crawler with Apache License 2.0 | 4 votes
def load_repo(client, path=None, index='git'):
    """
    Parse a git repository with all its commits and load it into elasticsearch
    using `client`. If the index doesn't exist it will be created.
    """
    path = dirname(dirname(abspath(__file__))) if path is None else path
    repo_name = basename(path)
    repo = git.Repo(path)

    create_git_index(client, index)

    # create the parent document in case it doesn't exist
    client.create(
        index=index,
        doc_type='repos',
        id=repo_name,
        body={},
        ignore=409  # 409 - conflict - would be returned if the document is already there
    )

    # we let the streaming bulk continuously process the commits as they come
    # in - since the `parse_commits` function is a generator this will avoid
    # loading all the commits into memory
    for ok, result in streaming_bulk(
            client,
            parse_commits(repo.refs.master.commit, repo_name),
            index=index,
            doc_type='commits',
            chunk_size=50  # keep the batch sizes small for appearances only
    ):
        action, result = result.popitem()
        doc_id = '/%s/commits/%s' % (index, result['_id'])
        # process the information from ES whether the document has been
        # successfully indexed
        if not ok:
            print('Failed to %s document %s: %r' % (action, doc_id, result))
        else:
            print(doc_id)

# we manually create es repo document and update elasticsearch-py to include metadata
Example #7
Source File: catalog_meta_indexer.py From series-tiempo-ar-api with MIT License | 4 votes
def index(self) -> bool:
    if not self.get_available_fields().count():
        self.task.info(self.task, "No series to index in this catalog")
        return False

    index_ok = False
    for success, info in streaming_bulk(self.elastic, self.generate_actions()):
        if not success:
            self.task.info(self.task, 'Indexing error: {}'.format(info))
        else:
            index_ok = True

    return index_ok
Example #8
Source File: elastic2_doc_manager.py From elastic2-doc-manager with Apache License 2.0 | 3 votes
def handle_command(self, doc, namespace, timestamp):
    # Flush the buffer before handling the command
    self.commit()
    db = namespace.split(".", 1)[0]

    if doc.get("dropDatabase"):
        dbs = self.command_helper.map_db(db)
        for _db in dbs:
            self.elastic.indices.delete(index=_db.lower())

    if doc.get("renameCollection"):
        raise errors.OperationFailed(
            "elastic_doc_manager does not support renaming a mapping."
        )

    if doc.get("create"):
        db, coll = self.command_helper.map_collection(db, doc["create"])
        if db and coll:
            self.elastic.indices.put_mapping(
                index=db.lower(), doc_type=coll, body={"_source": {"enabled": True}}
            )

    if doc.get("drop"):
        db, coll = self.command_helper.map_collection(db, doc["drop"])
        if db and coll:
            # This will delete the items in coll, but not get rid of the
            # mapping.
            warnings.warn(
                "Deleting all documents of type %s on index %s. "
                "The mapping definition will persist and must be "
                "removed manually." % (coll, db)
            )
            responses = streaming_bulk(
                self.elastic,
                (
                    dict(result, _op_type="delete")
                    for result in scan(
                        self.elastic, index=db.lower(), doc_type=coll
                    )
                ),
            )
            for ok, resp in responses:
                if not ok:
                    LOG.error(
                        "Error occurred while deleting ElasticSearch document "
                        "during handling of 'drop' command: %r" % resp
                    )
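The 'drop' branch above illustrates a pattern worth knowing on its own: emulating delete-by-query by feeding scan() hits back into streaming_bulk() as delete actions. Here is a stripped-down sketch of just that pattern; the client URL and index name are placeholders, not part of the original project.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan, streaming_bulk

es = Elasticsearch("http://localhost:9200")

# scan() lazily yields every hit in the index; setting _op_type="delete"
# turns each hit dict into a bulk delete action keyed by its _id.
delete_actions = (
    dict(hit, _op_type="delete") for hit in scan(es, index="my-index")
)

for ok, resp in streaming_bulk(es, delete_actions, raise_on_error=False):
    if not ok:
        print("Delete failed: %r" % (resp,))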
Example #9
Source File: elastic2_doc_manager.py From elastic2-doc-manager with Apache License 2.0 | 3 votes
def bulk_upsert(self, docs, namespace, timestamp):
    """Insert multiple documents into Elasticsearch."""

    def docs_to_upsert():
        doc = None
        for doc in docs:
            # Remove metadata and redundant _id
            index, doc_type = self._index_and_mapping(namespace)
            doc_id = str(doc.pop("_id"))
            document_action = {
                "_index": index,
                "_type": doc_type,
                "_id": doc_id,
                "_source": self._formatter.format_document(doc),
            }
            document_meta = {
                "_index": self.meta_index_name,
                "_type": self.meta_type,
                "_id": doc_id,
                "_source": {"ns": namespace, "_ts": timestamp},
            }
            yield document_action
            yield document_meta
        if doc is None:
            raise errors.EmptyDocsError(
                "Cannot upsert an empty sequence of "
                "documents into Elastic Search"
            )

    try:
        kw = {}
        if self.chunk_size > 0:
            kw["chunk_size"] = self.chunk_size

        responses = streaming_bulk(
            client=self.elastic, actions=docs_to_upsert(), **kw
        )

        for ok, resp in responses:
            if not ok:
                LOG.error(
                    "Could not bulk-upsert document "
                    "into ElasticSearch: %r" % resp
                )
        if self.auto_commit_interval == 0:
            self.commit()
    except errors.EmptyDocsError:
        # This can happen when mongo-connector starts up, there is no
        # config file, but nothing to dump
        pass
Example #10
Source File: views.py From autocompeter with Mozilla Public License 2.0 | 3 votes
def bulk(request, domain):
    assert domain
    try:
        documents = json.loads(request.body.decode('utf-8'))['documents']
    except KeyError:
        return http.JsonResponse({'error': "Missing 'documents'"}, status=400)

    def iterator():
        for document in documents:
            url = document.get('url', '').strip()
            if not url:
                continue
            title = document.get('title', '').strip()
            if not title:
                continue
            yield TitleDoc(
                meta={'id': make_id(domain.name, url)},
                **{
                    'domain': domain.name,
                    'url': url,
                    'title': title,
                    'group': document.get('group', '').strip(),
                    'popularity': float(document.get('popularity', 0.0)),
                }
            ).to_dict(True)

    count = failures = 0
    t0 = time.time()
    for success, doc in streaming_bulk(
        connections.get_connection(),
        iterator(),
        index=settings.ES_INDEX,
        doc_type='title_doc',
    ):
        if not success:
            print("NOT SUCCESS!", doc)
            failures += 1
        count += 1
    t1 = time.time()

    return http.JsonResponse({
        'message': 'OK',
        'count': count,
        'failures': failures,
        'took': t1 - t0,
    }, status=201)