Python elasticsearch.helpers.parallel_bulk() Examples

The following are 7 code examples of elasticsearch.helpers.parallel_bulk(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module elasticsearch.helpers, or try the search function.
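Before the project examples, here is a minimal, self-contained sketch of a typical parallel_bulk() call. The connection URL, index name and document shape below are made up for illustration, and the thread_count/chunk_size values are simply the helper's defaults written out explicitly. One thing worth keeping in mind, which several of the examples below rely on, is that parallel_bulk() returns a lazy generator: nothing is sent until you iterate over it.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

# Hypothetical local cluster; adjust the URL for your environment.
es = Elasticsearch("http://localhost:9200")

def generate_actions():
    # One dict per bulk operation; _op_type defaults to "index".
    for i in range(1000):
        yield {
            "_op_type": "index",
            "_index": "my-index",        # hypothetical index name
            "_id": i,
            "_source": {"value": i},
        }

# parallel_bulk() is lazy: iterate over it (or wrap it in list()) so the
# underlying bulk requests are actually executed. It yields (ok, item)
# tuples, one per action.
for ok, item in parallel_bulk(es, generate_actions(), thread_count=4, chunk_size=500):
    if not ok:
        print("Failed:", item)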
Example #1
Source File: search.py    From flask-shop with BSD 3-Clause "New" or "Revised" License
def bulk_update(cls, items, chunk_size=5000, op_type="update", **kwargs):
        index = cls._index._name
        _type = cls._doc_type.name
        obj = [
            {
                "_op_type": op_type,
                "_id": f"{doc.id}",
                "_index": index,
                "_type": _type,
                "_source": get_item_data(doc),
            }
            for doc in items
        ]
        client = cls.get_es()
        # parallel_bulk() returns a lazy generator; wrapping it in list()
        # drains it so the requests are sent and per-item results collected.
        rs = list(parallel_bulk(client, obj, chunk_size=chunk_size, **kwargs))
        return rs
Example #2
Source File: bulk_daemon.py    From search-MjoLniR with MIT License
def bulk_import(**kwargs) -> Tuple[int, int, int]:
    """Bulk import data to elasticsearch.

    Tracks bulk import response metrics, reporting both externally to
    prometheus and to the caller.
    """
    log.info('Starting bulk import: {}'.format(str(kwargs)))
    good, missing, errors = 0, 0, 0
    for ok, result in parallel_bulk(raise_on_exception=False, raise_on_error=False, **kwargs):
        action, result = result.popitem()
        status_code = result.get('status', 500)
        if ok:
            good += 1
            try:
                Metric.ACTION_RESULTS[result['result']].inc()
            except KeyError:
                Metric.OK_UNKNOWN.inc()
        elif status_code == 'TIMEOUT':
            Metric.TIMEOUT.inc()
            errors += 1
        elif not isinstance(status_code, int):
            # Previously found TIMEOUT status_code here
            Metric.FAILED.inc()
            log.warning(
                'Failed bulk %s request with invalid status_code %s: %s',
                action, str(status_code), str(result)[:1024])
            errors += 1
        elif status_code == 404:
            # 404s are quite common so we count them separately. The analytics
            # side doesn't know the namespace mappings and attempts to send all
            # updates to <wiki>_content, letting the docs that don't exist fail.
            missing += 1
            Metric.MISSING.inc()
        elif status_code >= 400 and status_code < 500:
            # Bulk contained invalid records, can't do much beyond logging
            Metric.FAILED.inc()
            log.warning('Failed bulk %s request: %s', action, str(result)[:1024])
            errors += 1
        elif status_code >= 500 and status_code < 600:
            # primary not available, etc. Internal elasticsearch errors. Should be retryable
            raise Exception(
                "Internal elasticsearch error on {}, status code {}: {}".format(action, status_code, str(result)))
        else:
            raise Exception(
                "Unexpected response on {}, status code {}: {}".format(action, status_code, str(result)))

    log.info('Completed import with %d success %d missing and %d errors', good, missing, errors)
    return good, missing, errors 
Example #3
Source File: distribution_indexer.py    From series-tiempo-ar-api with MIT License
def run(self, distribution):
        actions = self.generate_es_actions(distribution)

        if not actions:
            return

        for success, info in parallel_bulk(self.elastic, actions):
            if not success:
                logger.warning(strings.BULK_REQUEST_ERROR, info)

        self.update_distribution_indexation_metadata(distribution) 
Example #4
Source File: generate_data.py    From series-tiempo-ar-api with MIT License
def index_data(self):
        """Indexa la data leía desde el archivo de datos"""
        with open(DATA_FILE_PATH) as f:
            self.elastic.indices.create(settings.TS_INDEX,
                                        body=INDEX_CREATION_BODY)

            actions = [json.loads(row) for row in f.readlines()]
            for success, info in parallel_bulk(self.elastic, actions):
                if not success:
                    print("ERROR:", info)

            segments = FORCE_MERGE_SEGMENTS
            self.elastic.indices.forcemerge(index=settings.TS_INDEX,
                                            max_num_segments=segments) 
Example #5
Source File: index.py    From series-tiempo-ar-api with MIT License
def index(self, queryset: QuerySet):
        self._init_index()

        for success, info in parallel_bulk(self.es_connection, generate_es_query(queryset)):
            if not success:
                raise RuntimeError(f"Error indexando query a ES: {info}") 
Example #6
Source File: elastic.py    From browbeat with Apache License 2.0
def flush_cache(self):
        if len(self.cache) == 0:
            return True
        retry = 2
        for i in range(retry):
            try:
                to_upload = helpers.parallel_bulk(
                    self.es, self.cache_insertable_iterable())
                counter = 0
                num_items = len(self.cache)
                for item in to_upload:
                    counter = counter + 1
                    self.logger.debug(
                        "{} of {} Elastic objects uploaded".format(
                            counter, num_items))
                output = "Pushed {} items to Elasticsearch to index {}".format(
                    num_items, self.index)
                output += " and browbeat UUID {}".format(str(browbeat_uuid))
                self.logger.info(output)
                self.cache = deque()
                self.last_upload = datetime.datetime.utcnow()
                return True
            except Exception as Err:
                self.logger.error(
                    "Error pushing data to Elasticsearch, going to retry"
                    " in 10 seconds")
                self.logger.error("Exception: {}".format(Err))
                time.sleep(10)
                if i == (retry - 1):
                    self.logger.error(
                        "Pushing Data to Elasticsearch failed in spite of retry,"
                        " dumping JSON for {} cached items".format(
                            len(
                                self.cache)))
                    for item in self.cache:
                        filename = item['test_name'] + '-' + item['identifier']
                        filename += '-elastic' + '.' + 'json'
                        elastic_file = os.path.join(item['result_dir'],
                                                    filename)

                        with open(elastic_file, 'w') as result_file:
                            json.dump(item['result'],
                                      result_file,
                                      indent=4,
                                      sort_keys=True)

                            self.logger.info(
                                "Saved Elasticsearch consumable result JSON to {}". format(
                                    elastic_file))
                    self.cache = deque()
                    self.last_upload = datetime.datetime.utcnow()
                    return False 
Example #7
Source File: importLogs.py    From aws-la with MIT License
def processFiles(f):
    # list for bulk documents
    documents = []

    for log_line in f:
        # Create the body and sanitize
        source = {"message": log_line.strip('\n') }
        body = {"_index": options.index_name, "_type": options.index_name, "pipeline": options.index_name, "_source": source }

        # append record to list before bulk send to ES
        documents.append(body)
        options.totalDocCount +=1

        if len(documents) >= options.bulk_limit:
            # bulk send all our entries
            status = helpers.parallel_bulk(es, documents)

            # look through each result for status
            for i in status:
                if not i[0]:
                    print("There was an error importing a record.  Error:", i[1])

            # Using this to have the doc count stay on one line and continually be updated
            sys.stdout.write("Total Documents sent to Elasticsearch: " + str(options.totalDocCount) + "\r")
            sys.stdout.flush()

            # now clean out the document list
            documents[:] = []

    # If we've made it here, the file ended and there may still be documents left in the list; send whatever remains
    if len(documents) > 0:
        # bulk send all our entries
        status = helpers.parallel_bulk(es, documents)

        # look through each result for status
        for i in status:
            if not i[0]:
                print("There was an error importing a record.  Error:", i[1])

        # Using this to have the doc count stay on one line and continually be updated
        sys.stdout.write("Total Documents sent to Elasticsearch: " + str(options.totalDocCount) + "\r")
        sys.stdout.flush()

        # now clean out the document list
        documents[:] = []

    # print the final doc count before moving out of the function
    sys.stdout.write("Total Documents sent to Elasticsearch: " + str(options.totalDocCount) + "\r")