Python elasticsearch.helpers Examples
The following are 20 code examples of the elasticsearch.helpers module.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
elasticsearch, or try the search function.
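Before the project examples, here is a minimal sketch of the two helpers that appear most often below, elasticsearch.helpers.bulk and elasticsearch.helpers.scan. The host URL and the "demo-index" index name are assumptions made up for this sketch, not taken from any of the projects.

from elasticsearch import Elasticsearch
import elasticsearch.helpers

# Hypothetical connection and index names; adjust for your own cluster.
es = Elasticsearch(["http://localhost:9200"])

# bulk() sends an iterable of action dicts to the _bulk endpoint in batches.
actions = (
    {"_index": "demo-index", "_id": str(i), "_source": {"value": i}}
    for i in range(10)
)
elasticsearch.helpers.bulk(es, actions)

# scan() lazily pages through every document that matches the query.
for hit in elasticsearch.helpers.scan(es, index="demo-index",
                                      query={"query": {"match_all": {}}}):
    print(hit["_id"], hit["_source"])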
Example #1
Source File: __init__.py From estnltk with GNU General Public License v2.0 | 6 votes |
def sentences(self, exclude_ids=None, query=None, return_estnltk_object=True, **kwargs):
    if query is None:
        query = {}
    if return_estnltk_object:
        if query.get('fields', None) is None:
            query['fields'] = ['estnltk_text_object']
        else:
            if 'estnltk_text_object' not in query['fields']:
                raise AssertionError('Query contained the "fields" parameter without the "estnltk_text_object" argument. '
                                     'Consider setting the "return_estnltk_object" parameter to False to disable response handling')
    if exclude_ids is None:
        for document in elasticsearch.helpers.scan(self.client, query=query, doc_type='sentence', **kwargs):
            if return_estnltk_object:
                yield Text(json.loads(document['fields']['estnltk_text_object'][0]))
            else:
                yield json.loads(document)
    else:
        raise NotImplementedError('ID exclusion is not implemented')
Example #2
Source File: elasticsearch_load.py From edx-analytics-pipeline with GNU Affero General Public License v3.0 | 6 votes |
def next_bulk_action_batch(self, document_iterator):
    """
    Read a batch of documents from the iterator and convert them into bulk index actions.

    Elasticsearch expects each document to actually be transmitted on two lines, the first of which details the
    action to take, and the second contains the actual document. See the
    `Cheaper in Bulk <https://www.elastic.co/guide/en/elasticsearch/guide/1.x/bulk.html>`_ guide.

    Arguments:
        document_iterator (iterator of dicts):

    Returns:
        A list of dicts that can be transmitted to elasticsearch using the "bulk" request.
    """
    bulk_action_batch = []
    for raw_data in islice(document_iterator, self.batch_size):
        action, data = elasticsearch.helpers.expand_action(raw_data)
        bulk_action_batch.append(action)
        if data is not None:
            bulk_action_batch.append(data)
    return bulk_action_batch
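For context, the two-line action/document format described in the docstring above looks roughly like this; the record contents are made up for illustration, and the exact keys returned can vary between client versions:

import elasticsearch.helpers

# Hypothetical source record; "_op_type" selects the bulk operation.
raw_data = {"_op_type": "index", "_index": "courses", "_id": "demo", "name": "Demo Course"}

action, data = elasticsearch.helpers.expand_action(raw_data)
# action is the action line, e.g. {"index": {"_index": "courses", "_id": "demo"}}
# data is the document line, e.g. {"name": "Demo Course"}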
Example #3
Source File: esbtc.py From blockchain-elasticsearch with Apache License 2.0 | 6 votes |
def get_opreturn_data(self, bottom=None, top=None):
    query = {
        "_source": ["tx", "height", "n", "txid", "vin.txid"],
        "query": {"match_all": {}}
    }

    if bottom is not None and top is not None:
        query['query'] = {"range": {"height": {"gte": bottom, "lte": top}}}

    return elasticsearch.helpers.scan(self.es, index="btc-opreturn", query=query, size=100, scroll='1m')
Example #4
Source File: esbtc.py From blockchain-elasticsearch with Apache License 2.0 | 6 votes |
def get_nulldata_transactions(self, index):
    # This is a mess. Apologies if you're looking at this
    query = {
        "_source": ["hash", "height", "txid", "vin.txid",
                    "vout.scriptPubKey.asm", "vout.scriptPubKey.type", "vout.n"],
        "query": {
            "bool": {
                "must": [
                    {"term": {"vout.scriptPubKey.type": "nulldata"}}
                ]
            }
        }
    }

    return elasticsearch.helpers.scan(self.es, index=index, query=query, scroll='5m')
Example #5
Source File: SearchObjects.py From data_pipeline with Apache License 2.0 | 6 votes |
def store_in_elasticsearch(so_it, dry_run, es, index, workers_write, queue_write):
    # write into elasticsearch
    chunk_size = 1000  # TODO make configurable
    actions = elasticsearch_actions(so_it, dry_run, index)
    failcount = 0

    if not dry_run:
        results = None
        if workers_write > 0:
            results = elasticsearch.helpers.parallel_bulk(es, actions,
                                                          thread_count=workers_write,
                                                          queue_size=queue_write,
                                                          chunk_size=chunk_size)
        else:
            results = elasticsearch.helpers.streaming_bulk(es, actions,
                                                           chunk_size=chunk_size)

        for success, details in results:
            if not success:
                failcount += 1

        if failcount:
            raise RuntimeError("%s relations failed to index" % failcount)
Example #6
Source File: DataDrivenRelation.py From data_pipeline with Apache License 2.0 | 6 votes |
def get_target_labels(ids, es, index):
    res = elasticsearch.helpers.scan(client=es,
                                     query={
                                         "query": {
                                             "ids": {"values": ids}
                                         },
                                         '_source': 'approved_symbol',
                                         'size': 1,
                                     },
                                     index=index)

    return dict((hit['_id'], hit['_source']['approved_symbol']) for hit in res)
Example #7
Source File: DataDrivenRelation.py From data_pipeline with Apache License 2.0 | 6 votes |
def store_in_elasticsearch(results, es, dry_run, workers_write, queue_write, index):
    chunk_size = 1000  # TODO make configurable
    actions = elasticsearch_actions(results, dry_run, index)
    failcount = 0

    if not dry_run:
        results = None
        if workers_write > 0:
            results = elasticsearch.helpers.parallel_bulk(es, actions,
                                                          thread_count=workers_write,
                                                          queue_size=queue_write,
                                                          chunk_size=chunk_size)
        else:
            results = elasticsearch.helpers.streaming_bulk(es, actions,
                                                           chunk_size=chunk_size)

        for success, details in results:
            if not success:
                failcount += 1

        if failcount:
            raise RuntimeError("%s relations failed to index" % failcount)
Example #8
Source File: HPA.py From data_pipeline with Apache License 2.0 | 5 votes |
def store_data(self, dry_run):
    self.logger.info('store_data called')

    self.logger.debug('calling to create new expression index')

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(self.hpa_merged_table, dry_run, self.es_index)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                                                              thread_count=self.workers_write,
                                                              queue_size=self.queue_write,
                                                              chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                                                               chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s relations failed to index" % failcount)

        if failcount:
            raise RuntimeError("%s failed to index" % failcount)

    self.logger.info('missing tissues %s', str(_missing_tissues))
Example #9
Source File: esbtc.py From blockchain-elasticsearch with Apache License 2.0 | 5 votes |
def add_opreturn_files(self, data):
    errors = []
    for ok, item in elasticsearch.helpers.streaming_bulk(self.es, data, max_retries=2):
        if not ok:
            errors.append(item)

    return errors
Example #10
Source File: esbtc.py From blockchain-elasticsearch with Apache License 2.0 | 5 votes |
def update_opreturns(self, the_iter):
    errors = []
    for ok, item in elasticsearch.helpers.streaming_bulk(self.es, the_iter, max_retries=2):
        if not ok:
            errors.append(item)

    return errors
Example #11
Source File: esbtc.py From blockchain-elasticsearch with Apache License 2.0 | 5 votes |
def get_nonstandard_transactions(self):
    query = {
        "_source": ["hash", "vout.scriptPubKey.hex", "vout.scriptPubKey.type"],
        "query": {
            "match": {"vout.scriptPubKey.type": "nonstandard"}
        }
    }

    return elasticsearch.helpers.scan(self.es, index="btc-transactions-*", query=query, scroll='1m')
Example #12
Source File: DataDrivenRelation.py From data_pipeline with Apache License 2.0 | 5 votes |
def get_disease_labels(ids, es, index):
    res = elasticsearch.helpers.scan(client=es,
                                     query={
                                         "query": {
                                             "ids": {"values": ids}
                                         },
                                         '_source': 'label',
                                         'size': 1,
                                     },
                                     index=index)

    return dict((hit['_id'], hit['_source']['label']) for hit in res)
Example #13
Source File: injector.py From gransk with Apache License 2.0 | 5 votes |
def get_elasticsearch_helper(self):
    """
    Get helpers module for Elasticsearch. Used to bulk index documents.

    :returns: package ``elasticsearch.helpers``
    """
    return helpers
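A caller could then use the returned module for bulk indexing, roughly like this (the injector, client, and documents names are hypothetical, not part of the gransk source):

helpers = injector.get_elasticsearch_helper()  # hypothetical injector instance
helpers.bulk(client, documents)  # client and documents supplied by the caller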
Example #14
Source File: ECO.py From data_pipeline with Apache License 2.0 | 5 votes |
def _store_eco(self, dry_run):
    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(list(self.ecos.items()), self.es_index)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                                                              thread_count=self.workers_write,
                                                              queue_size=self.queue_write,
                                                              chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                                                               chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s relations failed to index" % failcount)
Example #15
Source File: Drug.py From data_pipeline with Apache License 2.0 | 5 votes |
def store(self, es, dry_run, data):
    self.logger.info("Starting drug storage")

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(list(data.items()), self.es_index)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                                                              thread_count=self.workers_write,
                                                              queue_size=self.queue_write,
                                                              chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                                                               chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s relations failed to index" % failcount)

    self.logger.debug("Completed storage")
Example #16
Source File: Evidences.py From data_pipeline with Apache License 2.0 | 5 votes |
def validation_on_start(eco_scores_uri, schema_uri, excluded_biotypes, datasources_to_datatypes,
                        es_hosts, es_index_gene, es_index_eco, es_index_efo,
                        cache_target, cache_target_u2e, cache_target_contains,
                        cache_eco, cache_efo, cache_efo_contains):
    logger = logging.getLogger(__name__)

    validator = opentargets_validator.helpers.generate_validator_from_schema(schema_uri)

    lookup_data = LookUpDataRetriever(new_es_client(es_hosts),
                                      gene_index=es_index_gene,
                                      gene_cache_size=cache_target,
                                      gene_cache_u2e_size=cache_target_u2e,
                                      gene_cache_contains_size=cache_target_contains,
                                      eco_index=es_index_eco,
                                      eco_cache_size=cache_efo_contains,
                                      efo_index=es_index_efo,
                                      efo_cache_size=cache_efo,
                                      efo_cache_contains_size=cache_efo_contains).lookup

    datasources_to_datatypes = datasources_to_datatypes
    evidence_manager = EvidenceManager(lookup_data, eco_scores_uri, excluded_biotypes, datasources_to_datatypes)

    return logger, validator, lookup_data, datasources_to_datatypes, evidence_manager
Example #17
Source File: json-parse.py From cve-analysis with GNU General Public License v3.0 | 5 votes |
def main():
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    else:
        print("Usage: %s <nvd-xml-file>" % (sys.argv[0]))
        sys.exit(1)

    # First let's see if the index exists
    if es.indices.exists('cve-index') is False:
        # We have to create it and add a mapping
        fh = open('cve-index-json-mapping.json')
        mapping = json.load(fh)
        es.indices.create('cve-index', body=mapping)

    fh = open(input_file)
    json_data = json.load(fh)

    the_cves = CVE()

    for i in json_data['CVE_Items']:
        # ['CVE_Items'][0]['cve']['CVE_data_meta']['ID']
        the_cves.add(i)
        #es.update(id=cve_id, index="cve-index", body={'doc' : cve, 'doc_as_upsert': True})

    for ok, item in elasticsearch.helpers.streaming_bulk(es, the_cves, max_retries=2):
        if not ok:
            print("ERROR:")
            print(item)
Example #18
Source File: metrics.py From rally with Apache License 2.0 | 5 votes |
def bulk_index(self, index, doc_type, items):
    # TODO #653: Remove version-specific support for metrics stores before 7.0.0.
    import elasticsearch.helpers
    if self._cluster_version[0] > 6:
        self.guarded(elasticsearch.helpers.bulk, self._client, items, index=index, chunk_size=5000)
    else:
        self.guarded(elasticsearch.helpers.bulk, self._client, items, index=index, doc_type=doc_type, chunk_size=5000)
Example #19
Source File: DataDrivenRelation.py From data_pipeline with Apache License 2.0 | 4 votes |
def get_disease_to_targets_vectors(threshold, evidence_count, es, index):
    '''
    Get all the association objects that are:
    - direct -> to avoid ontology inflation
    - > 3 evidence count -> remove noise
    - overall score > threshold -> remove very low quality noise
    :param threshold: minimum overall score threshold to consider for fetching association data
    :param evidence_count: minimum number of evidence to consider for fetching association data
    :return: two dictionaries mapping target to disease and the reverse
    '''
    res = elasticsearch.helpers.scan(client=es,
                                     query={
                                         "query": {
                                             "term": {"is_direct": True}
                                         },
                                         '_source': {'includes': ["target.id", 'disease.id', 'harmonic-sum', 'evidence_count']},
                                         'size': 1000,
                                     },
                                     index=index)

    target_results = dict()
    disease_results = dict()

    c = 0
    for hit in res:
        c += 1
        pair_id = str(hit['_id'])
        hit = hit['_source']
        if hit['evidence_count']['total'] >= evidence_count and \
                hit['harmonic-sum']['overall'] >= threshold:
            # store target associations
            if hit['target']['id'] not in target_results:
                target_results[hit['target']['id']] = SparseFloatDict()
            # TODO: return all counts and scores up to datasource level
            target_results[hit['target']['id']][hit['disease']['id']] = hit['harmonic-sum']['overall']

            # store disease associations
            if hit['disease']['id'] not in disease_results:
                disease_results[hit['disease']['id']] = SparseFloatDict()
            # TODO: return all counts and scores up to datasource level
            disease_results[hit['disease']['id']][hit['target']['id']] = hit['harmonic-sum']['overall']

    return target_results, disease_results
Example #20
Source File: Reactome.py From data_pipeline with Apache License 2.0 | 4 votes |
def process_all(self, dry_run):
    self.g.add_node('root', name="", species="")

    for row in self.downloader.get_pathway_data():
        self.g.add_node(row['id'], name=row['name'], species=row['species'])

    children = set()
    for row in self.downloader.get_pathway_relations():
        self.g.add_edge(row['id'], row['child'])
        children.add(row['child'])

    nodes_without_parent = set(self.g.nodes()) - children
    for node in nodes_without_parent:
        if node != 'root':
            self.g.add_edge('root', node)

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        docs = generate_documents(self.g)
        actions = elasticsearch_actions(docs, self.es_index)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                                                              thread_count=self.workers_write,
                                                              queue_size=self.queue_write,
                                                              chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                                                               chunk_size=chunk_size)

            for success, _ in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s failed to index" % failcount)