Python elasticsearch.helpers.parallel_bulk() Examples

The following are 7 code examples of elasticsearch.helpers.parallel_bulk(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module elasticsearch.helpers, or try the search function.
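Before the project examples, here is a minimal, self-contained sketch of a typical parallel_bulk() call. The connection URL, index name and document shape below are made up for illustration, and the thread_count/chunk_size values are simply the helper's defaults written out explicitly. One thing worth keeping in mind, which several of the examples below rely on, is that parallel_bulk() returns a lazy generator: nothing is sent until you iterate over it.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

# Hypothetical local cluster; adjust the URL for your environment.
es = Elasticsearch("http://localhost:9200")

def generate_actions():
    # One dict per bulk operation; _op_type defaults to "index".
    for i in range(1000):
        yield {
            "_op_type": "index",
            "_index": "my-index",        # hypothetical index name
            "_id": i,
            "_source": {"value": i},
        }

# parallel_bulk() is lazy: iterate over it (or wrap it in list()) so the
# underlying bulk requests are actually executed. It yields (ok, item)
# tuples, one per action.
for ok, item in parallel_bulk(es, generate_actions(), thread_count=4, chunk_size=500):
    if not ok:
        print("Failed:", item)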
Example #1
Source File: search.py    From flask-shop with BSD 3-Clause "New" or "Revised" License
def bulk_update(cls, items, chunk_size=5000, op_type="update", **kwargs):
        index = cls._index._name
        _type = cls._doc_type.name
        obj = [
            {
                "_op_type": op_type,
                "_id": f"{doc.id}",
                "_index": index,
                "_type": _type,
                "_source": get_item_data(doc),
            }
            for doc in items
        ]
        client = cls.get_es()
        # parallel_bulk() returns a lazy generator; wrapping it in list()
        # drains it so the requests are sent and per-item results collected.
        rs = list(parallel_bulk(client, obj, chunk_size=chunk_size, **kwargs))
        return rs
Example #2
Source File: bulk_daemon.py    From search-MjoLniR with MIT License
def bulk_import(**kwargs) -> Tuple[int, int, int]:
    """Bulk import data to elasticsearch.

    Tracks bulk import response metrics, reporting both externally to
    prometheus and to the caller.
    """
    log.info('Starting bulk import: {}'.format(str(kwargs)))
    good, missing, errors = 0, 0, 0
    for ok, result in parallel_bulk(raise_on_exception=False, raise_on_error=False, **kwargs):
        action, result = result.popitem()
        status_code = result.get('status', 500)
        if ok:
            good += 1
            try:
                Metric.ACTION_RESULTS[result['result']].inc()
            except KeyError:
                Metric.OK_UNKNOWN.inc()
        elif status_code == 'TIMEOUT':
            Metric.TIMEOUT.inc()
            errors += 1
        elif not isinstance(status_code, int):
            # Previously found TIMEOUT status_code here
            Metric.FAILED.inc()
            log.warning(
                'Failed bulk %s request with invalid status_code %s: %s',
                action, str(status_code), str(result)[:1024])
            errors += 1
        elif status_code == 404:
            # 404s are quite common so we count them separately. The analytics
            # side doesn't know the namespace mappings and attempts to send all
            # updates to <wiki>_content, letting the docs that don't exist fail.
            missing += 1
            Metric.MISSING.inc()
        elif status_code >= 400 and status_code < 500:
            # Bulk contained invalid records, can't do much beyond logging
            Metric.FAILED.inc()
            log.warning('Failed bulk %s request: %s', action, str(result)[:1024])
            errors += 1
        elif status_code >= 500 and status_code < 600:
            # primary not available, etc. Internal elasticsearch errors. Should be retryable
            raise Exception(
                "Internal elasticsearch error on {}, status code {}: {}".format(action, status_code, str(result)))
        else:
            raise Exception(
                "Unexpected response on {}, status code {}: {}".format(action, status_code, str(result)))

    log.info('Completed import with %d success %d missing and %d errors', good, missing, errors)
    return good, missing, errors 
Example #3
Source File: distribution_indexer.py    From series-tiempo-ar-api with MIT License
def run(self, distribution):
        actions = self.generate_es_actions(distribution)

        if not actions:
            return

        for success, info in parallel_bulk(self.elastic, actions):
            if not success:
                logger.warning(strings.BULK_REQUEST_ERROR, info)

        self.update_distribution_indexation_metadata(distribution) 
Example #4
Source File: generate_data.py    From series-tiempo-ar-api with MIT License
def index_data(self):
        """Indexa la data leía desde el archivo de datos"""
        with open(DATA_FILE_PATH) as f:
            self.elastic.indices.create(settings.TS_INDEX,
                                        body=INDEX_CREATION_BODY)

            actions = [json.loads(row) for row in f.readlines()]
            for success, info in parallel_bulk(self.elastic, actions):
                if not success:
                    print("ERROR:", info)

            segments = FORCE_MERGE_SEGMENTS
            self.elastic.indices.forcemerge(index=settings.TS_INDEX,
                                            max_num_segments=segments) 
Example #5
Source File: index.py    From series-tiempo-ar-api with MIT License
def index(self, queryset: QuerySet):
        self._init_index()

        for success, info in parallel_bulk(self.es_connection, generate_es_query(queryset)):
            if not success:
                raise RuntimeError(f"Error indexando query a ES: {info}") 
Example #6
Source File: elastic.py    From browbeat with Apache License 2.0
def flush_cache(self):
        if len(self.cache) == 0:
            return True
        retry = 2
        for i in range(retry):
            try:
                to_upload = helpers.parallel_bulk(
                    self.es, self.cache_insertable_iterable())
                counter = 0
                num_items = len(self.cache)
                for item in to_upload:
                    counter = counter + 1
                    self.logger.debug(
                        "{} of {} Elastic objects uploaded".format(
                            counter, num_items))
                output = "Pushed {} items to Elasticsearch to index {}".format(
                    num_items, self.index)
                output += " and browbeat UUID {}".format(str(browbeat_uuid))
                self.logger.info(output)
                self.cache = deque()
                self.last_upload = datetime.datetime.utcnow()
                return True
            except Exception as Err:
                self.logger.error(
                    "Error pushing data to Elasticsearch, going to retry"
                    " in 10 seconds")
                self.logger.error("Exception: {}".format(Err))
                time.sleep(10)
                if i == (retry - 1):
                    self.logger.error(
                        "Pushing Data to Elasticsearch failed in spite of retry,"
                        " dumping JSON for {} cached items".format(
                            len(
                                self.cache)))
                    for item in self.cache:
                        filename = item['test_name'] + '-' + item['identifier']
                        filename += '-elastic' + '.' + 'json'
                        elastic_file = os.path.join(item['result_dir'],
                                                    filename)

                        with open(elastic_file, 'w') as result_file:
                            json.dump(item['result'],
                                      result_file,
                                      indent=4,
                                      sort_keys=True)

                            self.logger.info(
                                "Saved Elasticsearch consumable result JSON to {}". format(
                                    elastic_file))
                    self.cache = deque()
                    self.last_upload = datetime.datetime.utcnow()
                    return False 
Example #7
Source File: importLogs.py    From aws-la with MIT License
def processFiles(f):
    # list for bulk documents
    documents = []

    for log_line in f:
        # Create the body and sanitize
        source = {"message": log_line.strip('\n') }
        body = {"_index": options.index_name, "_type": options.index_name, "pipeline": options.index_name, "_source": source }

        # append record to list before bulk send to ES
        documents.append(body)
        options.totalDocCount +=1

        if len(documents) >= options.bulk_limit:
            # bulk send all our entries
            status = helpers.parallel_bulk(es, documents)

            # look through each result for status
            for i in status:
                if not i[0]:
                    print("There was an error importing a record.  Error:", i[1])

            # Using this to have the doc count stay on one line and continually be updated
            sys.stdout.write("Total Documents sent to Elasticsearch: " + str(options.totalDocCount) + "\r")
            sys.stdout.flush()

            # now clean out the document list
            documents[:] = []

    # If we've made it here, the file ended and there may still be documents left in the list; send whatever remains
    if len(documents) > 0:
        # bulk send all our entries
        status = helpers.parallel_bulk(es, documents)

        # look through each result for status
        for i in status:
            if not i[0]:
                print("There was an error importing a record.  Error:", i[1])

        # Using this to have the doc count stay on one line and continually be updated
        sys.stdout.write("Total Documents sent to Elasticsearch: " + str(options.totalDocCount) + "\r")
        sys.stdout.flush()

        # now clean out the document list
        documents[:] = []

    # print the final doc count before moving out of the function
    sys.stdout.write("Total Documents sent to Elasticsearch: " + str(options.totalDocCount) + "\r")