Python elasticsearch.helpers.parallel_bulk() Examples
The following are 7 code examples of elasticsearch.helpers.parallel_bulk(), drawn from open-source projects. The original project and source file for each example are noted above it.
You may also want to check out all available functions and classes of the elasticsearch.helpers module.
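Before the project examples, here is a minimal, hypothetical sketch of how parallel_bulk() is typically called. The host, index name, and documents below are made up for illustration; the key point is that parallel_bulk() returns a lazy generator of (ok, item) tuples, so it must be iterated (or wrapped in list()) before any documents are actually sent.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

# Hypothetical connection and index name, for illustration only.
es = Elasticsearch("http://localhost:9200")

def generate_actions():
    # Each action is a plain dict; "_index" and "_id" route the document,
    # and the remaining keys (or an explicit "_source") form the document body.
    for i in range(1000):
        yield {"_index": "my-index", "_id": i, "value": i}

# Nothing is sent until the generator is consumed.
for ok, item in parallel_bulk(es, generate_actions(), thread_count=4, chunk_size=500):
    if not ok:
        print("Failed action:", item)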
Example #1
Source File: search.py From flask-shop with BSD 3-Clause "New" or "Revised" License (6 votes)
def bulk_update(cls, items, chunk_size=5000, op_type="update", **kwargs):
    index = cls._index._name
    _type = cls._doc_type.name
    obj = [
        {
            "_op_type": op_type,
            "_id": f"{doc.id}",
            "_index": index,
            "_type": _type,
            "_source": get_item_data(doc),
        }
        for doc in items
    ]
    client = cls.get_es()
    rs = list(parallel_bulk(client, obj, chunk_size=chunk_size, **kwargs))
    return rs
Example #2
Source File: bulk_daemon.py From search-MjoLniR with MIT License (5 votes)
def bulk_import(**kwargs) -> Tuple[int, int, int]:
    """Bulk import data to elasticsearch.

    Tracks bulk import response metrics, reporting both externally to
    prometheus and to the caller.
    """
    log.info('Starting bulk import: {}'.format(str(kwargs)))
    good, missing, errors = 0, 0, 0
    for ok, result in parallel_bulk(raise_on_exception=False, raise_on_error=False, **kwargs):
        action, result = result.popitem()
        status_code = result.get('status', 500)
        if ok:
            good += 1
            try:
                Metric.ACTION_RESULTS[result['result']].inc()
            except KeyError:
                Metric.OK_UNKNOWN.inc()
        elif status_code == 'TIMEOUT':
            Metric.TIMEOUT.inc()
            errors += 1
        elif not isinstance(status_code, int):
            # Previously found TIMEOUT status_code here
            Metric.FAILED.inc()
            log.warning(
                'Failed bulk %s request with invalid status_code %s: %s',
                action, str(status_code), str(result)[:1024])
            errors += 1
        elif status_code == 404:
            # 404 are quite common so we log them separately. The analytics
            # side doesn't know the namespace mappings and attempts to send all
            # updates to <wiki>_content, letting the docs that don't exist fail
            missing += 1
            Metric.MISSING.inc()
        elif status_code >= 400 and status_code < 500:
            # Bulk contained invalid records, can't do much beyond logging
            Metric.FAILED.inc()
            log.warning('Failed bulk %s request: %s', action, str(result)[:1024])
            errors += 1
        elif status_code >= 500 and status_code < 600:
            # primary not available, etc. Internal elasticsearch errors. Should be retryable
            raise Exception(
                "Internal elasticsearch error on {}, status code {}: {}".format(
                    action, status_code, str(result)))
        else:
            raise Exception(
                "Unexpected response on {}, status code {}: {}".format(
                    action, status_code, str(result)))
    log.info('Completed import with %d success %d missing and %d errors',
             good, missing, errors)
    return good, missing, errors
Example #3
Source File: distribution_indexer.py From series-tiempo-ar-api with MIT License (5 votes)
def run(self, distribution):
    actions = self.generate_es_actions(distribution)

    if not actions:
        return

    for success, info in parallel_bulk(self.elastic, actions):
        if not success:
            logger.warning(strings.BULK_REQUEST_ERROR, info)

    self.update_distribution_indexation_metadata(distribution)
Example #4
Source File: generate_data.py From series-tiempo-ar-api with MIT License (5 votes)
def index_data(self):
    """Index the data read from the data file."""
    with open(DATA_FILE_PATH) as f:
        self.elastic.indices.create(settings.TS_INDEX, body=INDEX_CREATION_BODY)
        actions = [json.loads(row) for row in f.readlines()]
        for success, info in parallel_bulk(self.elastic, actions):
            if not success:
                print("ERROR:", info)

    segments = FORCE_MERGE_SEGMENTS
    self.elastic.indices.forcemerge(index=settings.TS_INDEX, max_num_segments=segments)
Example #5
Source File: index.py From series-tiempo-ar-api with MIT License (5 votes)
def index(self, queryset: QuerySet):
    self._init_index()

    for success, info in parallel_bulk(self.es_connection, generate_es_query(queryset)):
        if not success:
            raise RuntimeError(f"Error indexing query to ES: {info}")
Example #6
Source File: elastic.py From browbeat with Apache License 2.0 (4 votes)
def flush_cache(self):
    if len(self.cache) == 0:
        return True
    retry = 2
    for i in range(retry):
        try:
            to_upload = helpers.parallel_bulk(
                self.es, self.cache_insertable_iterable())
            counter = 0
            num_items = len(self.cache)
            for item in to_upload:
                self.logger.debug(
                    "{} of {} Elastic objects uploaded".format(
                        counter, num_items))
                counter = counter + 1
            output = "Pushed {} items to Elasticsearch to index {}".format(
                num_items, self.index)
            output += " and browbeat UUID {}".format(str(browbeat_uuid))
            self.logger.info(output)
            self.cache = deque()
            self.last_upload = datetime.datetime.utcnow()
            return True
        except Exception as Err:
            self.logger.error(
                "Error pushing data to Elasticsearch, going to retry"
                " in 10 seconds")
            self.logger.error("Exception: {}".format(Err))
            time.sleep(10)
            if i == (retry - 1):
                self.logger.error(
                    "Pushing Data to Elasticsearch failed in spite of retry,"
                    " dumping JSON for {} cached items".format(
                        len(self.cache)))
                for item in self.cache:
                    filename = item['test_name'] + '-' + item['identifier']
                    filename += '-elastic' + '.' + 'json'
                    elastic_file = os.path.join(item['result_dir'], filename)
                    with open(elastic_file, 'w') as result_file:
                        json.dump(item['result'], result_file,
                                  indent=4, sort_keys=True)
                    self.logger.info(
                        "Saved Elasticsearch consumable result JSON to {}".format(
                            elastic_file))
                self.cache = deque()
                self.last_upload = datetime.datetime.utcnow()
                return False
Example #7
Source File: importLogs.py From aws-la with MIT License (4 votes)
def processFiles(f):
    # list for bulk documents
    documents = []

    for log_line in f:
        # Create the body and sanitize
        source = {"message": log_line.strip('\n')}
        body = {"_index": options.index_name,
                "_type": options.index_name,
                "pipeline": options.index_name,
                "_source": source}

        # append record to list before bulk send to ES
        documents.append(body)
        options.totalDocCount += 1

        if len(documents) >= options.bulk_limit:
            # bulk send all our entries
            status = helpers.parallel_bulk(es, documents)

            # look through each result for status
            for i in status:
                if i[0] == False:
                    print "There was an error importing a record. Error: ", i[1]

            # Using this to have the doc count stay on one line and continually be updated
            sys.stdout.write("Total Documents sent to Elasticsearch: " + str(options.totalDocCount) + "\r")
            sys.stdout.flush()

            # now clean out the document list
            documents[:] = []

    # If we've made it here, then the file ended, and it's possible we still have
    # documents in documents list. Need to send what we have
    if len(documents) > 0:
        # bulk send all our entries
        status = helpers.parallel_bulk(es, documents)

        # look through each result for status
        for i in status:
            if i[0] == False:
                print "There was an error importing a record. Error: ", i[1]

        # Using this to have the doc count stay on one line and continually be updated
        sys.stdout.write("Total Documents sent to Elasticsearch: " + str(options.totalDocCount) + "\r")
        sys.stdout.flush()

        # now clean out the document list
        documents[:] = []

    # print the final doc count before moving out of the function
    sys.stdout.write("Total Documents sent to Elasticsearch: " + str(options.totalDocCount) + "\r")