Python elasticsearch.helpers.bulk() Examples

The following are 30 code examples of elasticsearch.helpers.bulk(), collected from open source projects. You may also want to check out all available functions/classes of the module elasticsearch.helpers, or try the search function.
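Before the project examples, a minimal self-contained sketch of the common pattern may help: build an iterable of action dicts and pass it to helpers.bulk(). The host, index name, and sample documents below are placeholders, and the sketch assumes an Elasticsearch node reachable on localhost.

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(['http://localhost:9200'])  # placeholder host

documents = [{'title': 'doc %d' % n} for n in range(10)]

# Each action is a plain dict; '_source' carries the document body.
actions = (
    {'_index': 'my-index', '_id': str(n), '_source': doc}
    for n, doc in enumerate(documents)
)

# bulk() returns a (success_count, errors) tuple by default.
success, errors = helpers.bulk(es, actions)
print('indexed %d documents, %d errors' % (success, len(errors)))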
Example #1
Source File: function.py    From Gather-Deployment with MIT License
def run(self):
        with self.input()['Emotion'].open('r') as fopen:
            emotions = json.load(fopen)
        es = Elasticsearch()
        for i in range(0, len(emotions), self.batch_size):
            batch = emotions[i : min(i + self.batch_size, len(emotions))]
            actions = [
                {
                    '_index': self.index,
                    '_type': 'text',
                    '_id': '%d-%s' % (i + j, self.summary),
                    '_source': batch[j],
                }
                for j in range(len(batch))
            ]
            helpers.bulk(es, actions) 
Example #2
Source File: elastichandler.py    From EventMonkey with Apache License 2.0
def BulkIndexRecords(self,records):
        '''
        Bulk Index Records
        IN
            self: EsHandler
            records: a list of records to bulk index
        '''
        ELASTIC_LOGGER.debug('[starting] Indexing Bulk Records')
        success_count, failed_items = es_bulk(
            self.esh,
            records,
            chunk_size=10000,
            raise_on_error=False
        )
        
        if len(failed_items) > 0:
            ELASTIC_LOGGER.error('[PID {}] {} index errors'.format(
                os.getpid(),len(failed_items)
            ))
            for failed_item in failed_items:
                ELASTIC_LOGGER.error(str(failed_item))
        
        ELASTIC_LOGGER.debug('[finished] Indexing Bulk Records') 
Example #3
Source File: ceres_base.py    From grimoirelab-elk with GNU General Public License v3.0
def write(self, items):
        """Upload items to ElasticSearch.

        :param items: items to be uploaded.
        """
        if self._read_only:
            raise IOError("Cannot write, Connector created as Read Only")

        # Uploading info to the new ES
        docs = []
        for item in items:
            doc = {
                "_index": self._es_index,
                "_type": "item",
                "_id": item["_id"],
                "_source": item["_source"]
            }
            docs.append(doc)
        # TODO exception and error handling
        helpers.bulk(self._es_conn, docs)
        logger.info("{} Written: {}".format(self.__log_prefix, len(docs))) 
Example #4
Source File: es.py    From ee-outliers with GNU General Public License v3.0
def add_remove_outlier_bulk_action(self, document):
        """
        Creates the bulk action to remove all the outlier traces from all events.
        Removing an outlier means that the "outlier" tag is removed, as well as the "outlier" dictionary in the event.
        :param document: the document from which the outlier information should be removed
        """
        action = {
            '_op_type': 'update',
            '_index': document["_index"],
            '_type': document["_type"],
            '_id': document["_id"],
            'retry_on_conflict': 10,
            '_source': {
                "script": {
                    "source": "ctx._source.remove(\"outliers\"); " +
                              "if (ctx._source.tags != null && ctx._source.tags.indexOf(\"outlier\") > -1) { " +
                              "ctx._source.tags.remove(ctx._source.tags.indexOf(\"outlier\")); " +
                              "}",
                    "lang": "painless"
                }
            }
        }
        self.add_bulk_action(action) 
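Note that the method above only buffers the action. Since add_bulk_action's body is not shown here, the following is a hedged sketch of how such a buffer might eventually be flushed with helpers.bulk(); the bulk_actions list, bulk_flush_size threshold, and conn attribute are assumptions for illustration, not the actual ee-outliers implementation.

from elasticsearch import helpers

def add_bulk_action(self, action):
    # Hypothetical list buffer, flushed once it grows large enough.
    self.bulk_actions.append(action)
    if len(self.bulk_actions) >= self.bulk_flush_size:
        self.flush_bulk_actions()

def flush_bulk_actions(self):
    if not self.bulk_actions:
        return
    # raise_on_error=False returns failed items instead of raising.
    helpers.bulk(self.conn, self.bulk_actions, raise_on_error=False)
    self.bulk_actions = []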
Example #5
Source File: elasticsearch.py    From nefertari with Apache License 2.0
def bulk_index_relations(cls, items, request=None, **kwargs):
        """ Index objects related to :items: in bulk.

        Related items are first grouped in a map
        {model_name: {item1, item2, ...}} and then indexed.

        :param items: Sequence of DB objects whose related
            objects should be indexed.
        :param request: Pyramid Request instance.
        """
        index_map = defaultdict(set)
        for item in items:
            relations = item.get_related_documents(**kwargs)
            for model_cls, related_items in relations:
                indexable = getattr(model_cls, '_index_enabled', False)
                if indexable and related_items:
                    index_map[model_cls.__name__].update(related_items)

        for model_name, instances in index_map.items():
            cls(model_name).index(to_dicts(instances), request=request) 
Example #6
Source File: utils.py    From searchlight with Apache License 2.0
def reindex(src_index, dst_index, type_list, chunk_size=None, time=None):
    """Reindex a set of indexes internally within ElasticSearch. All of the
       documents under the types that live in "type_list" under the index
       "src_index" will be copied into the documents under the same types
       in the index "dst_index". In other words, a perfect re-index!
       Instead of using the plugin API and consuming bandwidth to perform
       the re-index we will allow ElasticSearch to do some heavy lifting for
       us. Under the covers we are combining scan/scroll with bulk operations
       to do this re-indexing as efficiently as possible.
    """
    es_engine = searchlight.elasticsearch.get_api()

    # Create a Query DSL string to access all documents within the specified
    # document types. We will filter on the "_type" field in this index. Since
    # there are multiple document types, we will need to use the "terms" filter.
    # All of the document types will be added to the list for "_type". We need
    # to enable version to allow the search to return the version field. This
    # will be used by the reindexer.
    body = {"version": "true",
            "query": {"bool": {"filter": {"terms": {"_type": type_list}}}}}
    # Debug: Show all documents that ES will re-index.
    # LOG.debug(es_engine.search(index=src_index, body=body, size=500))
    helper_reindex(client=es_engine, source_index=src_index,
                   target_index=dst_index, query=body) 
Example #7
Source File: elastic2_doc_manager.py    From elastic2-doc-manager with Apache License 2.0
def should_get_id(self, action):
        """
        Mark document to retrieve its source from Elasticsearch.
        Returns:
            True - if marking document for the first time in this bulk
            False - if document has already been marked
        """
        mapping_ids = self.doc_to_get.setdefault(action["_index"], {}).setdefault(
            action["_type"], set()
        )
        if action["_id"] in mapping_ids:
            # There is an update on this id already
            return False
        else:
            mapping_ids.add(action["_id"])
            return True 
Example #8
Source File: elastic2_doc_manager.py    From elastic2-doc-manager with Apache License 2.0
def __init__(self, docman):

        # Parent object
        self.docman = docman

        # Action buffer for bulk indexing
        self.action_buffer = []

        # Docs to update
        # Stores every document whose source must first be
        # retrieved from Elasticsearch before apply_update
        # can be applied to it
        # Format: [ (doc, update_spec, action_buffer_index, get_from_ES) ]
        self.doc_to_update = []

        # Below dictionary contains ids of documents
        # which need to be retrieved from Elasticsearch.
        # It prevents fetching the same document from ES multiple times.
        # Format: {"_index": {"_type": {"_id": True}}}
        self.doc_to_get = {}

        # Dictionary of sources
        # Format: {"_index": {"_type": {"_id": {"_source": actual_source}}}}
        self.sources = {} 
Example #9
Source File: elastic2_doc_manager.py    From elastic2-doc-manager with Apache License 2.0
def send_buffered_operations(self):
        """Send buffered operations to Elasticsearch.

        This method is periodically called by the AutoCommitThread.
        """
        with self.lock:
            try:
                action_buffer = self.BulkBuffer.get_buffer()
                if action_buffer:
                    successes, errors = bulk(self.elastic, action_buffer)
                    LOG.debug(
                        "Bulk request finished, successfully sent %d " "operations",
                        successes,
                    )
                    if errors:
                        LOG.error("Bulk request finished with errors: %r", errors)
            except es_exceptions.ElasticsearchException:
                LOG.exception("Bulk request failed with exception") 
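For reference, helpers.bulk() reports per-document failures in one of two ways depending on raise_on_error; a short sketch, where es and actions stand in for the client and action buffer above:

from elasticsearch import helpers
from elasticsearch.helpers import BulkIndexError

# Default behaviour: per-document failures raise BulkIndexError.
try:
    successes, errors = helpers.bulk(es, actions)
except BulkIndexError as exc:
    print('%d document(s) failed' % len(exc.errors))

# With raise_on_error=False, failures come back in the errors list.
successes, errors = helpers.bulk(es, actions, raise_on_error=False)
for error in errors:
    print(error)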
Example #10
Source File: sparrow-elastic.py    From sparrow-wifi with GNU General Public License v3.0
def writeDataToIndex(es,  es_index, entries, es_doc_type='_doc'):
    es_entries = []
    for doc in entries:
        entry = {"_index": es_index,
                 "_type": es_doc_type, 
                 "_source": doc }

        es_entries.append(entry)    

    try:
        helpers.bulk(es, es_entries, refresh=True, request_timeout=60) 
    except Exception as e:
        # This can happen if the server is restarted or the connection becomes unavailable
        print(str(e))

# ------------------- Bluetooth routines ------------------------------------ 
Example #11
Source File: document.py    From esengine with MIT License
def update_all(cls, docs, es=None, meta=None, **kwargs):
        """
        Update various Doc instances in bulk

        >>> docs = (Document(value=value) for value in [1, 2, 3])
        # change all values to zero
        >>> Document.update_all(docs, value=0)

        :param docs: Iterator of Document instances
        :param es: ES client or None (if implemented a default in Model)
        :param meta: Extra values to be passed to client
        :param kwargs: Field values to set on every document (sent as the update body)
        :return: Es Metadata
        """
        actions = (
            {
                '_op_type': 'update',
                '_index': cls._index,
                '_type': cls._doctype,
                '_id': doc.id,
                'doc': kwargs
            }
            for doc in docs
        )
        return eh.bulk(cls.get_es(es), actions, **meta if meta else {}) 
Example #12
Source File: log_manager000.py    From distributed_framework with Apache License 2.0
def _do_bulk_op(self):
        while 1:
            try:
                if self._task_queue.qsize() > 10000:
                    very_nb_print('Too many logs have accumulated unexpectedly; skipping the ES insert.')
                    self.__clear_bulk_task()
                    return
                # noinspection PyUnresolvedReferences
                tasks = list(self._task_queue.queue)
                self.__clear_bulk_task()
                helpers.bulk(self._es_client, tasks)

                self._last_es_op_time = time.time()
            except Exception as e:
                very_nb_print(e)
            finally:
                time.sleep(1) 
Example #13
Source File: ElasticBurp.py    From WASE with GNU General Public License v3.0
def genAddToES(self, msgs, component):
        def menuAddToES(e):
            progress = ProgressMonitor(component, "Feeding ElasticSearch", "", 0, len(msgs))
            i = 0
            docs = list()
            for msg in msgs:
                if not Burp_onlyResponses or msg.getResponse():
                    docs.append(self.genESDoc(msg, timeStampFromResponse=True).to_dict(True))
                i += 1
                progress.setProgress(i)
            success, failed = bulk(self.es, docs, True, raise_on_error=False)
            progress.close()
            JOptionPane.showMessageDialog(self.panel, "<html><p style='width: 300px'>Successfully imported %d messages, %d messages failed.</p></html>" % (success, failed), "Finished", JOptionPane.INFORMATION_MESSAGE)
        return menuAddToES

    ### Interface to ElasticSearch ### 
Example #14
Source File: log_manager000.py    From distributed_framework with Apache License 2.0
def _do_bulk_op(self):
        if self.__class__.has_start_do_bulk_op:
            return
        self.__class__.has_start_do_bulk_op = True
        while 1:
            try:
                if self.__class__.task_queue.qsize() > 10000:
                    very_nb_print('Too many logs have accumulated unexpectedly; skipping the ES insert.')
                    self.__clear_bulk_task()
                    return
                tasks = list(self.__class__.task_queue.queue)
                self.__clear_bulk_task()
                helpers.bulk(self._es_client, tasks)
                self.__class__.last_es_op_time = time.time()
            except Exception as e:
                very_nb_print(e)
            finally:
                time.sleep(self.ES_INTERVAL_SECONDS) 
Example #15
Source File: elasticsearch.py    From nefertari with Apache License 2.0
def process_chunks(self, documents, operation):
        """ Apply `operation` to chunks of `documents` of size
        `self.chunk_size`.

        """
        chunk_size = self.chunk_size
        start = end = 0
        count = len(documents)

        while count:
            if count < chunk_size:
                chunk_size = count
            end += chunk_size

            bulk = documents[start:end]
            operation(documents_actions=bulk)

            start += chunk_size
            count -= chunk_size 
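A hedged usage sketch: process_chunks hands each slice to operation via the documents_actions keyword, so any helpers.bulk-based callable of that shape fits. The engine, es, and actions names below are assumptions for illustration.

from functools import partial
from elasticsearch import helpers

def bulk_operation(es, documents_actions):
    # Hypothetical operation matching the keyword argument used above.
    helpers.bulk(es, documents_actions)

engine.process_chunks(documents=actions, operation=partial(bulk_operation, es))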
Example #16
Source File: bulk_operation.py    From distributed_framework with Apache License 2.0
def _do_bulk_operation(self):
        if self._to_be_request_queue.qsize() > 0:
            t_start = time.time()
            count = 0
            request_list = []
            for _ in range(self._threshold):
                try:
                    request = self._to_be_request_queue.get_nowait()
                    count += 1
                    request_list.append(request)
                except Empty:
                    break
            if request_list:
                # self.base_object.bulk_write(request_list, ordered=False)
                helpers.bulk(self.base_object, request_list)
            if self._is_print_log:
                self.logger.info(f'【{self.base_object}】 bulk-inserted {count} tasks in {round(time.time() - t_start, 6)} seconds')
            self._current_time = time.time() 
Example #17
Source File: sentiment_to_elastic.py    From Gather-Deployment with MIT License
def pull_to_elastic(**kwargs):
    ti = kwargs['ti']
    sentiments = ti.xcom_pull(task_ids = 'push_sentiment', key = 'sentiment')
    es = Elasticsearch()
    for i in range(0, len(sentiments), batch_size):
        batch = sentiments[i : min(i + batch_size, len(sentiments))]
        actions = [
            {
                '_index': 'test_index',
                '_type': 'text',
                '_id': '%d-text' % (j + i),
                '_source': batch[j],
            }
            for j in range(len(batch))
        ]
        helpers.bulk(es, actions) 
Example #18
Source File: utils.py    From searchlight with Apache License 2.0
def set_index_refresh_interval(index_name, refresh_interval):
    """Set refresh_interval of a given index, basically it is used in the
       reindexing phase. By setting refresh_interval to -1 we disable the
       refresh of offline index to gain a performance boost for the bulk
       updates. After reindexing is done, we will restore refresh_interval
       and put the index online.
    """

    es_engine = searchlight.elasticsearch.get_api()

    body = {
        'index': {
            'refresh_interval': refresh_interval
        }
    }

    try:
        es_engine.indices.put_settings(body, index_name)
    except Exception as e:
        LOG.error(encodeutils.exception_to_unicode(e))
        raise 
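A sketch of the disable-then-restore pattern the docstring describes, wrapped around the reindex() helper from Example #6; the index names and the '1s' restore value are placeholders.

set_index_refresh_interval('searchlight-new', -1)        # pause refresh for the bulk load
try:
    reindex('searchlight-old', 'searchlight-new', ['OS::Nova::Server'])
finally:
    set_index_refresh_interval('searchlight-new', '1s')  # restore and put the index back online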
Example #19
Source File: document.py    From esengine with MIT License
def save_all(cls, docs, es=None, **kwargs):
        """
        Save various Doc instances in bulk

        >>> docs = (Document(value=value) for value in [1, 2, 3])
        >>> Document.save_all(docs)

        :param docs: Iterator of Document instances
        :param es: ES client or None (if implemented a default in Model)
        :param kwargs: Extra params to be passed to streaming_bulk
        :return: ES metadata
        """
        actions = [
            {
                '_op_type': 'index',
                '_index': cls._index,
                '_type': cls._doctype,
                '_id': doc.id,
                '_source': doc.to_dict()
            }
            for doc in docs
        ]
        return eh.bulk(cls.get_es(es), actions, **kwargs) 
Example #20
Source File: document.py    From esengine with MIT License
def delete_all(cls, docs, es=None, **kwargs):
        """
        Delete various Doc instances in bulk

        >>> docs = (Document(value=value) for value in [1, 2, 3])
        >>> Document.delete_all(docs)

        :param docs: Iterator of Document instances or a list of ids
        :param es: ES client or None (if implemented a default in Model)
        :param kwargs: Extra params to be passed to streaming_bulk
        :return: ES metadata
        """
        actions = [
            {
                '_op_type': 'delete',
                '_index': cls._index,
                '_type': cls._doctype,
                '_id': getattr(doc, 'id', doc),
            }
            for doc in docs
        ]
        return eh.bulk(cls.get_es(es), actions, **kwargs) 
Example #21
Source File: elastic.py    From parade with MIT License
def store(self, df, table, **kwargs):
        if isinstance(df, pd.DataFrame):
            es = self.open()

            records = df.to_dict(orient='records')

            if df.index.name:
                actions = [{
                    "_index": self.datasource.db,
                    "_type": table,
                    "_id": record[df.index.name],
                    "_source": record
                } for record in records]
            else:
                actions = [{
                    "_index": self.datasource.db,
                    "_type": table,
                    "_source": record
                } for record in records]

            if len(actions) > 0:
                helpers.bulk(es, actions) 
Example #22
Source File: projects.py    From repoxplorer with Apache License 2.0
def get_by_attr_match(self, attribute, value, source=True):
        params = {'index': self.index}

        body = {
            "query": {
                'bool': {
                    'must': {'term': {attribute: value}},
                }
            }
        }
        params['body'] = body
        params['_source'] = source
        # TODO(fbo): Improve by doing it by bulk instead
        params['size'] = 10000
        res = self.es.search(**params)
        took = res['took']
        hits = res['hits']['total']
        docs = [r['_source'] for r in res['hits']['hits']]
        return took, hits, docs 
Example #23
Source File: elasticdatastore.py    From ethdrain with MIT License
def save(self):
        nb_blocks = sum(act["_type"] == "b" for act in self.actions)
        nb_txs = sum(act["_type"] == "tx" for act in self.actions)

        if self.actions:
            try:
                helpers.bulk(self.elastic, self.actions)
                return "{} blocks and {} transactions indexed".format(
                    nb_blocks, nb_txs
                )

            except helpers.BulkIndexError as exception:
                print("Issue with {} blocks:\n{}\n".format(nb_blocks, exception))
                blocks = (act for act in self.actions if act["_type"] == "b")
                for block in blocks:
                    logging.error("block: " + str(block["_id"])) 
Example #24
Source File: es.py    From hoover-search with MIT License
def bulk_index(collection_id, docs):
    def index(id, data):
        return dict(
            data,
            _op_type='index',
            _index=_index_name(collection_id),
            _type=DOCTYPE,
            _id=id,
        )

    with elasticsearch() as es:
        _, err = bulk(
            es,
            (index(id, data) for id, data in docs),
            stats_only=True,
            request_timeout=55,
        )
    if err:
        raise RuntimeError("Bulk indexing failed on %d documents" % err) 
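With stats_only=True, as used above, bulk() returns plain counters instead of per-item details; a quick sketch of the difference, with es and actions assumed as in the surrounding examples:

from elasticsearch import helpers

# stats_only=True: both return values are integers.
ok_count, err_count = helpers.bulk(es, actions, stats_only=True, raise_on_error=False)

# Default: a success count plus a list of per-item error dicts.
ok_count, error_items = helpers.bulk(es, actions, raise_on_error=False)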
Example #25
Source File: crawling.py    From Python-DevOps with MIT License
def run(self):
        with self.input()['Sentiment'].open('r') as fopen:
            sentiment = json.loads(fopen.read())
        es = Elasticsearch()
        for index in range(0, len(sentiment), self.batch_size):
            batch = sentiment[
                index : min(index + self.batch_size, len(sentiment))
            ]
            actions = [
                {
                    '_index': self.index,
                    '_type': 'news',
                    '_id': batch[j]['url'],
                    '_source': batch[j],
                }
                for j in range(len(batch))
            ]
            helpers.bulk(es, actions) 
Example #26
Source File: sentiment_to_elastic.py    From Python-DevOps with MIT License
def pull_to_elastic(**kwargs):
    ti = kwargs['ti']
    sentiments = ti.xcom_pull(task_ids = 'push_sentiment', key = 'sentiment')
    es = Elasticsearch()
    for i in range(0, len(sentiments), batch_size):
        batch = sentiments[i : min(i + batch_size, len(sentiments))]
        actions = [
            {
                '_index': 'test_index',
                '_type': 'text',
                '_id': '%d-text' % (j + i),
                '_source': batch[j],
            }
            for j in range(len(batch))
        ]
        helpers.bulk(es, actions) 
Example #27
Source File: __init__.py    From elasticsearch_loader with MIT License
def single_bulk_to_es(bulk, config, attempt_retry):
    bulk = bulk_builder(bulk, config)

    max_attempt = 1
    if attempt_retry:
        max_attempt += 3

    for attempt in range(1, max_attempt + 1):
        try:
            helpers.bulk(config['es_conn'], bulk, chunk_size=config['bulk_size'])
        except Exception as e:
            if attempt < max_attempt:
                wait_seconds = attempt * 3
                log('warn', 'attempt [%s/%s] got exception, will retry after %s seconds' % (attempt, max_attempt, wait_seconds))
                time.sleep(wait_seconds)
                continue

            log('error', 'attempt [%s/%s] got exception; giving up, the data is permanently lost' % (attempt, max_attempt))
            raise e

        if attempt > 1:
            log('info', 'attempt [%s/%s] succeeded; recovered from the previous error' % (attempt, max_attempt))

        # completed successfully
        break 
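As an alternative to hand-rolled retry loops, newer elasticsearch-py releases can back off on HTTP 429 rejections themselves via streaming_bulk; a hedged sketch (check that your client version supports max_retries before relying on it):

from elasticsearch import helpers

# streaming_bulk retries 429-rejected documents with exponential backoff;
# raise_on_error=False makes failed items come back as (False, item) pairs.
for ok, item in helpers.streaming_bulk(
        config['es_conn'], bulk,
        chunk_size=config['bulk_size'],
        max_retries=3, initial_backoff=2, max_backoff=60,
        raise_on_error=False):
    if not ok:
        log('error', 'failed item: %s' % (item,))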
Example #28
Source File: reindex.py    From django-seeker with BSD 2-Clause "Simplified" License
def reindex(doc_class, index, using, options):
    """
    Index all the things, using ElasticSearch's bulk API for speed.
    """
    def get_actions():
        for doc in doc_class.documents(cursor=options['cursor']):
            action = {
                '_index': index,
                '_type': doc_class._doc_type.name,
            }
            action.update(doc)
            yield action
    es = connections.get_connection(using)
    actions = get_actions() if options['quiet'] else progress(get_actions(), count=doc_class.count(), label=doc_class.__name__)
    bulk(es, actions)
    es.indices.refresh(index=index) 
Example #29
Source File: bulkOp.py    From Hippocampe with GNU Affero General Public License v3.0
def update(typeNameES, listId):
	logger.info('bulkOp.update launched')
	hippoCfg = getHippoConf()
	es = getES()
	now = strftime("%Y%m%dT%H%M%S%z")
	indexNameES = hippoCfg.get('elasticsearch', 'indexNameES')
	# k is a generator expression that produces a
	# dict updating every doc whose id is in listId
	k = ({'_op_type': 'update', '_index':indexNameES, '_type':typeNameES, 'doc':{'lastQuery': now}, '_id': id}
		for id in listId)

	res = helpers.bulk(es, k)
	logger.info('bulkOp.update res: %s', res)
	#res looks like
	#(2650, [])  
	logger.info('bulkOp.update end')
	return res[0] 
Example #30
Source File: bulkOp.py    From Hippocampe with GNU Affero General Public License v3.0
def index(cfgPath, listData):
	logger.info('bulkOp.index launched')
	hippoCfg = getHippoConf()
	indexNameES = hippoCfg.get('elasticsearch', 'indexNameES')

	cfg = getConf(cfgPath)
	typeNameES = cfg.get('elasticsearch', 'typeIntel')
	
	# create the index only if it does not exist
	index = IndexIntel(cfgPath)
	index.createIndexIntel()

	es = getES()
	k = ({'_op_type': 'index', '_index':indexNameES, '_type':typeNameES, '_source': data}
		for data in listData)
	res = helpers.bulk(es,k, raise_on_error=False)
	#res = helpers.bulk(es,k, raise_on_exception=False)
	#res = helpers.bulk(es,k)
	logger.info('bulkOp.index res: %s', res)
	logger.info('bulkOp.index end')
	return res