Python elasticsearch.helpers.bulk() Examples
The following are 30 code examples of elasticsearch.helpers.bulk(), drawn from open-source projects. You may also want to check out all available functions and classes of the elasticsearch.helpers module.
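Before diving into the project examples, here is a minimal, self-contained sketch of the helper's basic contract, assuming a local cluster on the default port; the index name and documents are illustrative placeholders:

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()

# Each action is a plain dict: "_index" routes the document,
# "_source" carries its body, and "_op_type" defaults to "index".
actions = (
    {"_index": "demo-index", "_id": i, "_source": {"value": i}}
    for i in range(100)
)

# bulk() splits the actions into requests and returns a
# (success_count, errors) tuple; by default it raises on errors.
success, errors = helpers.bulk(es, actions)
print(success, errors)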
Example #1
Source File: function.py From Gather-Deployment with MIT License
def run(self):
    with self.input()['Emotion'].open('r') as fopen:
        emotions = json.load(fopen)
    es = Elasticsearch()
    for i in range(0, len(emotions), self.batch_size):
        batch = emotions[i : min(i + self.batch_size, len(emotions))]
        actions = [
            {
                '_index': self.index,
                '_type': 'text',
                '_id': '%d-%s' % (i + j, self.summary),
                '_source': batch[j],
            }
            for j in range(len(batch))
        ]
        helpers.bulk(es, actions)
Example #2
Source File: elastichandler.py From EventMonkey with Apache License 2.0
def BulkIndexRecords(self, records):
    '''
    Bulk Index Records
    IN
        self: EsHandler
        records: a list of records to bulk index
    '''
    ELASTIC_LOGGER.debug('[starting] Indexing Bulk Records')
    success_count, failed_items = es_bulk(
        self.esh,
        records,
        chunk_size=10000,
        raise_on_error=False
    )
    if len(failed_items) > 0:
        ELASTIC_LOGGER.error('[PID {}] {} index errors'.format(
            os.getpid(), len(failed_items)
        ))
        for failed_item in failed_items:
            ELASTIC_LOGGER.error(unicode(failed_item))
    ELASTIC_LOGGER.debug('[finished] Indexing Bulk Records')
Example #3
Source File: ceres_base.py From grimoirelab-elk with GNU General Public License v3.0
def write(self, items):
    """Upload items to ElasticSearch.

    :param items: items to be uploaded.
    """
    if self._read_only:
        raise IOError("Cannot write, Connector created as Read Only")

    # Uploading info to the new ES
    docs = []
    for item in items:
        doc = {
            "_index": self._es_index,
            "_type": "item",
            "_id": item["_id"],
            "_source": item["_source"]
        }
        docs.append(doc)
    # TODO exception and error handling
    helpers.bulk(self._es_conn, docs)
    logger.info("{} Written: {}".format(self.__log_prefix, len(docs)))
Example #4
Source File: es.py From ee-outliers with GNU General Public License v3.0
def add_remove_outlier_bulk_action(self, document):
    """
    Creates the bulk action to remove all the outlier traces from all events.
    Removing an outlier means that the "outlier" tag is removed, as well as
    the "outlier" dictionary in the event.

    :param document: the document from which the outlier information should be removed
    """
    action = {
        '_op_type': 'update',
        '_index': document["_index"],
        '_type': document["_type"],
        '_id': document["_id"],
        'retry_on_conflict': 10,
        '_source': {
            "script": {
                "source": "ctx._source.remove(\"outliers\"); " +
                          "if (ctx._source.tags != null && ctx._source.tags.indexOf(\"outlier\") > -1) { " +
                          "ctx._source.tags.remove(ctx._source.tags.indexOf(\"outlier\")); " +
                          "}",
                "lang": "painless"
            }
        }
    }
    self.add_bulk_action(action)
Example #5
Source File: elasticsearch.py From nefertari with Apache License 2.0
def bulk_index_relations(cls, items, request=None, **kwargs):
    """ Index objects related to :items: in bulk.

    Related items are first grouped in a map {model_name: {item1, item2, ...}}
    and then indexed.

    :param items: Sequence of DB objects whose related objects should be indexed.
    :param request: Pyramid Request instance.
    """
    index_map = defaultdict(set)
    for item in items:
        relations = item.get_related_documents(**kwargs)
        for model_cls, related_items in relations:
            indexable = getattr(model_cls, '_index_enabled', False)
            if indexable and related_items:
                index_map[model_cls.__name__].update(related_items)

    for model_name, instances in index_map.items():
        cls(model_name).index(to_dicts(instances), request=request)
Example #6
Source File: utils.py From searchlight with Apache License 2.0
def reindex(src_index, dst_index, type_list, chunk_size=None, time=None):
    """Reindex a set of indexes internally within ElasticSearch. All of the
    documents under the types that live in "type_list" under the index
    "src_index" will be copied into the documents under the same types
    in the index "dst_index". In other words, a perfect re-index!
    Instead of using the plugin API and consuming bandwidth to perform
    the re-index, we will allow ElasticSearch to do some heavy lifting for
    us. Under the covers we are combining scan/scroll with bulk operations
    to do this re-indexing as efficiently as possible.
    """
    es_engine = searchlight.elasticsearch.get_api()

    # Create a Query DSL string to access all documents within the specified
    # document types. We will filter on the "_type" field in this index. Since
    # there are multiple document types, we will need to use the "terms" filter.
    # All of the document types will be added to the list for "_type". We need
    # to enable version to allow the search to return the version field. This
    # will be used by the reindexer.
    body = {"version": "true",
            "query": {"bool": {"filter": {"terms": {"_type": type_list}}}}}

    # Debug: Show all documents that ES will re-index.
    # LOG.debug(es_engine.search(index=src_index, body=body, size=500))

    helper_reindex(client=es_engine, source_index=src_index,
                   target_index=dst_index, query=body)
Example #7
Source File: elastic2_doc_manager.py From elastic2-doc-manager with Apache License 2.0
def should_get_id(self, action):
    """
    Mark document to retrieve its source from Elasticsearch.
    Returns:
        True - if marking document for the first time in this bulk
        False - if document has been already marked
    """
    mapping_ids = self.doc_to_get.setdefault(action["_index"], {}).setdefault(
        action["_type"], set()
    )
    if action["_id"] in mapping_ids:
        # There is an update on this id already
        return False
    else:
        mapping_ids.add(action["_id"])
        return True
Example #8
Source File: elastic2_doc_manager.py From elastic2-doc-manager with Apache License 2.0
def __init__(self, docman):
    # Parent object
    self.docman = docman

    # Action buffer for bulk indexing
    self.action_buffer = []

    # Docs to update
    # Dict stores all documents for which firstly
    # source has to be retrieved from Elasticsearch
    # and then apply_update needs to be performed
    # Format: [ (doc, update_spec, action_buffer_index, get_from_ES) ]
    self.doc_to_update = []

    # Below dictionary contains ids of documents
    # which need to be retrieved from Elasticsearch
    # It prevents fetching the same document multiple times from ES
    # Format: {"_index": {"_type": {"_id": True}}}
    self.doc_to_get = {}

    # Dictionary of sources
    # Format: {"_index": {"_type": {"_id": {"_source": actual_source}}}}
    self.sources = {}
Example #9
Source File: elastic2_doc_manager.py From elastic2-doc-manager with Apache License 2.0
def send_buffered_operations(self):
    """Send buffered operations to Elasticsearch.

    This method is periodically called by the AutoCommitThread.
    """
    with self.lock:
        try:
            action_buffer = self.BulkBuffer.get_buffer()
            if action_buffer:
                successes, errors = bulk(self.elastic, action_buffer)
                LOG.debug(
                    "Bulk request finished, successfully sent %d "
                    "operations",
                    successes,
                )
                if errors:
                    LOG.error("Bulk request finished with errors: %r", errors)
        except es_exceptions.ElasticsearchException:
            LOG.exception("Bulk request failed with exception")
Example #10
Source File: sparrow-elastic.py From sparrow-wifi with GNU General Public License v3.0
def writeDataToIndex(es, es_index, entries, es_doc_type='_doc'):
    es_entries = []
    for doc in entries:
        entry = {"_index": es_index,
                 "_type": es_doc_type,
                 "_source": doc}
        es_entries.append(entry)

    try:
        helpers.bulk(es, es_entries, refresh=True, request_timeout=60)
    except Exception as e:
        # This can happen if the server is restarted or the connection becomes unavailable
        print(str(e))

# ------------------- Bluetooth routines ------------------------------------
Example #11
Source File: document.py From esengine with MIT License
def update_all(cls, docs, es=None, meta=None, **kwargs):
    """ Update various Doc instances in bulk

    >>> docs = (Document(value=value) for value in [1, 2, 3])

    # change all values to zero
    >>> Document.update_all(docs, value=0)

    :param docs: Iterator of Document instances
    :param es: ES client or None (if implemented a default in Model)
    :param meta: Extra values to be passed to client
    :param kwargs: Extra params to be passed to streaming_bulk
    :return: Es Metadata
    """
    actions = (
        {
            '_op_type': 'update',
            '_index': cls._index,
            '_type': cls._doctype,
            '_id': doc.id,
            'doc': kwargs
        }
        for doc in docs
    )
    return eh.bulk(cls.get_es(es), actions, **meta if meta else {})
Example #12
Source File: log_manager000.py From distributed_framework with Apache License 2.0
def _do_bulk_op(self):
    while 1:
        try:
            if self._task_queue.qsize() > 10000:
                very_nb_print('Too many log entries have piled up unexpectedly; skipping the Elasticsearch insert.')
                self.__clear_bulk_task()
                return
            # noinspection PyUnresolvedReferences
            tasks = list(self._task_queue.queue)
            self.__clear_bulk_task()
            helpers.bulk(self._es_client, tasks)
            self._last_es_op_time = time.time()
        except Exception as e:
            very_nb_print(e)
        finally:
            time.sleep(1)
Example #13
Source File: ElasticBurp.py From WASE with GNU General Public License v3.0
def genAddToES(self, msgs, component):
    def menuAddToES(e):
        progress = ProgressMonitor(component, "Feeding ElasticSearch", "", 0, len(msgs))
        i = 0
        docs = list()
        for msg in msgs:
            if not Burp_onlyResponses or msg.getResponse():
                docs.append(self.genESDoc(msg, timeStampFromResponse=True).to_dict(True))
            i += 1
            progress.setProgress(i)
        success, failed = bulk(self.es, docs, True, raise_on_error=False)
        progress.close()
        JOptionPane.showMessageDialog(self.panel, "<html><p style='width: 300px'>Successfully imported %d messages, %d messages failed.</p></html>" % (success, failed), "Finished", JOptionPane.INFORMATION_MESSAGE)
    return menuAddToES

### Interface to ElasticSearch ###
Example #14
Source File: log_manager000.py From distributed_framework with Apache License 2.0
def _do_bulk_op(self):
    if self.__class__.has_start_do_bulk_op:
        return
    self.__class__.has_start_do_bulk_op = True
    while 1:
        try:
            if self.__class__.task_queue.qsize() > 10000:
                very_nb_print('Too many log entries have piled up unexpectedly; skipping the Elasticsearch insert.')
                self.__clear_bulk_task()
                return
            tasks = list(self.__class__.task_queue.queue)
            self.__clear_bulk_task()
            helpers.bulk(self._es_client, tasks)
            self.__class__.last_es_op_time = time.time()
        except Exception as e:
            very_nb_print(e)
        finally:
            time.sleep(self.ES_INTERVAL_SECONDS)
Example #15
Source File: elasticsearch.py From nefertari with Apache License 2.0
def process_chunks(self, documents, operation):
    """ Apply `operation` to chunks of `documents` of size
    `self.chunk_size`.
    """
    chunk_size = self.chunk_size
    start = end = 0
    count = len(documents)

    while count:
        if count < chunk_size:
            chunk_size = count
        end += chunk_size
        bulk = documents[start:end]
        operation(documents_actions=bulk)
        start += chunk_size
        count -= chunk_size
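For comparison, helpers.bulk() can do this chunking itself via its chunk_size parameter, so manual slicing is only needed when you want custom control over each batch. A minimal sketch, assuming a local cluster; the index name and documents are placeholders:

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()
actions = ({"_index": "demo-index", "_source": {"n": n}} for n in range(5000))

# chunk_size (default 500) caps how many actions each underlying
# HTTP request carries; bulk() loops over the chunks for you.
success, errors = helpers.bulk(es, actions, chunk_size=1000, raise_on_error=False)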
Example #16
Source File: bulk_operation.py From distributed_framework with Apache License 2.0
def _do_bulk_operation(self):
    if self._to_be_request_queue.qsize() > 0:
        t_start = time.time()
        count = 0
        request_list = []
        for _ in range(self._threshold):
            try:
                request = self._to_be_request_queue.get_nowait()
                count += 1
                request_list.append(request)
            except Empty:
                break
        if request_list:
            # self.base_object.bulk_write(request_list, ordered=False)
            helpers.bulk(self.base_object, request_list)
        if self._is_print_log:
            self.logger.info(f'[{self.base_object}] bulk-inserted {count} tasks in {round(time.time() - t_start, 6)} seconds')
        self._current_time = time.time()
Example #17
Source File: sentiment_to_elastic.py From Gather-Deployment with MIT License
def pull_to_elastic(**kwargs):
    ti = kwargs['ti']
    sentiments = ti.xcom_pull(task_ids = 'push_sentiment', key = 'sentiment')
    es = Elasticsearch()
    for i in range(0, len(sentiments), batch_size):
        batch = sentiments[i : min(i + batch_size, len(sentiments))]
        actions = [
            {
                '_index': 'test_index',
                '_type': 'text',
                '_id': '%d-text' % (j + i),
                '_source': batch[j],
            }
            for j in range(len(batch))
        ]
        helpers.bulk(es, actions)
Example #18
Source File: utils.py From searchlight with Apache License 2.0
def set_index_refresh_interval(index_name, refresh_interval):
    """Set refresh_interval of a given index, basically it is used in the
    reindexing phase. By setting refresh_interval to -1 we disable the
    refresh of the offline index to gain a performance boost for the bulk
    updates. After reindexing is done, we will restore refresh_interval
    and put the index online.
    """
    es_engine = searchlight.elasticsearch.get_api()
    body = {
        'index': {
            'refresh_interval': refresh_interval
        }
    }
    try:
        es_engine.indices.put_settings(body, index_name)
    except Exception as e:
        LOG.error(encodeutils.exception_to_unicode(e))
        raise
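The disable-then-restore pattern the docstring describes looks roughly like this with a plain client; the index name and documents are illustrative, and "1s" is Elasticsearch's default refresh interval:

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()
index = "offline-index"  # hypothetical index being rebuilt

# Disable refresh during the bulk load, then restore it no matter what.
es.indices.put_settings({"index": {"refresh_interval": "-1"}}, index=index)
try:
    helpers.bulk(es, ({"_index": index, "_source": {"n": n}} for n in range(10000)))
finally:
    es.indices.put_settings({"index": {"refresh_interval": "1s"}}, index=index)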
Example #19
Source File: document.py From esengine with MIT License
def save_all(cls, docs, es=None, **kwargs):
    """ Save various Doc instances in bulk

    >>> docs = (Document(value=value) for value in [1, 2, 3])
    >>> Document.save_all(docs)

    :param docs: Iterator of Document instances
    :param es: ES client or None (if implemented a default in Model)
    :param kwargs: Extra params to be passed to streaming_bulk
    :return: ES metadata
    """
    actions = [
        {
            '_op_type': 'index',
            '_index': cls._index,
            '_type': cls._doctype,
            '_id': doc.id,
            '_source': doc.to_dict()
        }
        for doc in docs
    ]
    return eh.bulk(cls.get_es(es), actions, **kwargs)
Example #20
Source File: document.py From esengine with MIT License
def delete_all(cls, docs, es=None, **kwargs):
    """ Delete various Doc instances in bulk

    >>> docs = (Document(value=value) for value in [1, 2, 3])
    >>> Document.delete_all(docs)

    :param docs: Iterator of Document instances or a list of ids
    :param es: ES client or None (if implemented a default in Model)
    :param kwargs: Extra params to be passed to streaming_bulk
    :return: ES metadata
    """
    actions = [
        {
            '_op_type': 'delete',
            '_index': cls._index,
            '_type': cls._doctype,
            '_id': getattr(doc, 'id', doc),
        }
        for doc in docs
    ]
    return eh.bulk(cls.get_es(es), actions, **kwargs)
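The esengine docstrings above mention streaming_bulk, the generator-based helper that bulk() wraps internally. When per-action feedback matters, it can be called directly; a minimal sketch with a placeholder index and documents:

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()
actions = ({"_index": "demo-index", "_id": i, "_source": {"n": i}} for i in range(100))

# streaming_bulk() yields an (ok, item) pair per action, so failures
# can be inspected one by one instead of after the whole batch.
for ok, item in helpers.streaming_bulk(es, actions, raise_on_error=False):
    if not ok:
        print("failed:", item)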
Example #21
Source File: elastic.py From parade with MIT License
def store(self, df, table, **kwargs):
    if isinstance(df, pd.DataFrame):
        es = self.open()
        records = df.to_dict(orient='records')
        if df.index.name:
            actions = [{
                "_index": self.datasource.db,
                "_type": table,
                "_id": record[df.index.name],
                "_source": record
            } for record in records]
        else:
            actions = [{
                "_index": self.datasource.db,
                "_type": table,
                "_source": record
            } for record in records]
        if len(actions) > 0:
            helpers.bulk(es, actions)
Example #22
Source File: projects.py From repoxplorer with Apache License 2.0
def get_by_attr_match(self, attribute, value, source=True):
    params = {'index': self.index}
    body = {
        "query": {
            'bool': {
                'must': {'term': {attribute: value}},
            }
        }
    }
    params['body'] = body
    params['_source'] = source
    # TODO(fbo): Improve by doing it by bulk instead
    params['size'] = 10000
    res = self.es.search(**params)
    took = res['took']
    hits = res['hits']['total']
    docs = [r['_source'] for r in res['hits']['hits']]
    return took, hits, docs
Example #23
Source File: elasticdatastore.py From ethdrain with MIT License
def save(self):
    nb_blocks = sum(act["_type"] == "b" for act in self.actions)
    nb_txs = sum(act["_type"] == "tx" for act in self.actions)

    if self.actions:
        try:
            helpers.bulk(self.elastic, self.actions)
            return "{} blocks and {} transactions indexed".format(
                nb_blocks, nb_txs
            )
        except helpers.BulkIndexError as exception:
            print("Issue with {} blocks:\n{}\n".format(nb_blocks, exception))
            blocks = (act for act in self.actions if act["_type"] == "b")
            for block in blocks:
                logging.error("block: " + str(block["_id"]))
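As this example shows, bulk() raises helpers.BulkIndexError when raise_on_error is left at its default of True; the exception's errors attribute lists the per-document failures. A minimal sketch, assuming a local cluster and a placeholder index:

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()
try:
    helpers.bulk(es, [{"_index": "demo-index", "_source": {"n": 1}}])
except helpers.BulkIndexError as exc:
    # exc.errors is the list of per-document failure dicts
    for err in exc.errors:
        print(err)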
Example #24
Source File: es.py From hoover-search with MIT License
def bulk_index(collection_id, docs):
    def index(id, data):
        return dict(
            data,
            _op_type='index',
            _index=_index_name(collection_id),
            _type=DOCTYPE,
            _id=id,
        )

    with elasticsearch() as es:
        _, err = bulk(
            es,
            (index(id, data) for id, data in docs),
            stats_only=True,
            request_timeout=55,
        )
        if err:
            raise RuntimeError("Bulk indexing failed on %d documents" % err)
Example #25
Source File: crawling.py From Python-DevOps with MIT License
def run(self):
    with self.input()['Sentiment'].open('r') as fopen:
        sentiment = json.loads(fopen.read())
    es = Elasticsearch()
    for index in range(0, len(sentiment), self.batch_size):
        batch = sentiment[
            index : min(index + self.batch_size, len(sentiment))
        ]
        actions = [
            {
                '_index': self.index,
                '_type': 'news',
                '_id': batch[j]['url'],
                '_source': batch[j],
            }
            for j in range(len(batch))
        ]
        helpers.bulk(es, actions)
Example #26
Source File: sentiment_to_elastic.py From Python-DevOps with MIT License
def pull_to_elastic(**kwargs):
    ti = kwargs['ti']
    sentiments = ti.xcom_pull(task_ids = 'push_sentiment', key = 'sentiment')
    es = Elasticsearch()
    for i in range(0, len(sentiments), batch_size):
        batch = sentiments[i : min(i + batch_size, len(sentiments))]
        actions = [
            {
                '_index': 'test_index',
                '_type': 'text',
                '_id': '%d-text' % (j + i),
                '_source': batch[j],
            }
            for j in range(len(batch))
        ]
        helpers.bulk(es, actions)
Example #27
Source File: __init__.py From elasticsearch_loader with MIT License
def single_bulk_to_es(bulk, config, attempt_retry):
    bulk = bulk_builder(bulk, config)
    max_attempt = 1
    if attempt_retry:
        max_attempt += 3

    for attempt in range(1, max_attempt + 1):
        try:
            helpers.bulk(config['es_conn'], bulk, chunk_size=config['bulk_size'])
        except Exception as e:
            if attempt < max_attempt:
                wait_seconds = attempt * 3
                log('warn', 'attempt [%s/%s] got exception, will retry after %s seconds' % (attempt, max_attempt, wait_seconds))
                time.sleep(wait_seconds)
                continue

            log('error', 'attempt [%s/%s] got exception, this is permanent data loss, no more retries' % (attempt, max_attempt))
            raise e

        if attempt > 1:
            log('info', 'attempt [%s/%s] succeeded, recovered from the previous error' % (attempt, max_attempt))

        # completed successfully
        break
Example #28
Source File: reindex.py From django-seeker with BSD 2-Clause "Simplified" License
def reindex(doc_class, index, using, options):
    """
    Index all the things, using ElasticSearch's bulk API for speed.
    """
    def get_actions():
        for doc in doc_class.documents(cursor=options['cursor']):
            action = {
                '_index': index,
                '_type': doc_class._doc_type.name,
            }
            action.update(doc)
            yield action

    es = connections.get_connection(using)
    actions = get_actions() if options['quiet'] else progress(
        get_actions(), count=doc_class.count(), label=doc_class.__name__)
    bulk(es, actions)
    es.indices.refresh(index=index)
Example #29
Source File: bulkOp.py From Hippocampe with GNU Affero General Public License v3.0
def update(typeNameES, listId):
    logger.info('bulkOp.update launched')
    hippoCfg = getHippoConf()
    es = getES()
    now = strftime("%Y%m%dT%H%M%S%z")
    indexNameES = hippoCfg.get('elasticsearch', 'indexNameES')
    # k is a generator expression that produces
    # a dict to update every doc whose id is in listId
    k = ({'_op_type': 'update', '_index': indexNameES, '_type': typeNameES,
          'doc': {'lastQuery': now}, '_id': id}
         for id in listId)

    res = helpers.bulk(es, k)
    logger.info('bulkOp.update res: %s', res)
    # res looks like
    # (2650, [])
    logger.info('bulkOp.update end')
    return res[0]
Example #30
Source File: bulkOp.py From Hippocampe with GNU Affero General Public License v3.0
def index(cfgPath, listData):
    logger.info('bulkOp.index launched')
    hippoCfg = getHippoConf()
    indexNameES = hippoCfg.get('elasticsearch', 'indexNameES')

    cfg = getConf(cfgPath)
    typeNameES = cfg.get('elasticsearch', 'typeIntel')

    # create the index, only if it does not exist
    index = IndexIntel(cfgPath)
    index.createIndexIntel()

    es = getES()
    k = ({'_op_type': 'index', '_index': indexNameES, '_type': typeNameES,
          '_source': data} for data in listData)

    res = helpers.bulk(es, k, raise_on_error=False)
    # res = helpers.bulk(es, k, raise_on_exception=False)
    # res = helpers.bulk(es, k)
    logger.info('bulkOp.index res: %s', res)
    logger.info('bulkOp.index end')
    return res