Python elasticsearch.helpers Examples
The following are 20 code examples of the elasticsearch.helpers module.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module
elasticsearch, or try the search function.
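Before the project examples, here is a minimal sketch of the two helpers that appear most often below, elasticsearch.helpers.bulk and elasticsearch.helpers.scan. The host URL and the "demo-index" index name are assumptions made up for this sketch, not taken from any of the projects.

from elasticsearch import Elasticsearch
import elasticsearch.helpers

# Hypothetical connection and index names; adjust for your own cluster.
es = Elasticsearch(["http://localhost:9200"])

# bulk() sends an iterable of action dicts to the _bulk endpoint in batches.
actions = (
    {"_index": "demo-index", "_id": str(i), "_source": {"value": i}}
    for i in range(10)
)
elasticsearch.helpers.bulk(es, actions)

# scan() lazily pages through every document that matches the query.
for hit in elasticsearch.helpers.scan(es, index="demo-index",
                                      query={"query": {"match_all": {}}}):
    print(hit["_id"], hit["_source"])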
Example #1
Source File: __init__.py From estnltk with GNU General Public License v2.0 | 6 votes |
def sentences(self, exclude_ids=None, query=None, return_estnltk_object=True, **kwargs):
    if query is None:
        query = {}
    if return_estnltk_object:
        if query.get('fields', None) is None:
            query['fields'] = ['estnltk_text_object']
        else:
            if 'estnltk_text_object' not in query['fields']:
                raise AssertionError('Query contained the "fields" parameter without the "estnltk_text_object" argument. '
                                     'Consider setting the "return_estnltk_object" parameter to False to disable response handling')
    if exclude_ids is None:
        for document in elasticsearch.helpers.scan(self.client, query=query, doc_type='sentence', **kwargs):
            if return_estnltk_object:
                yield Text(json.loads(document['fields']['estnltk_text_object'][0]))
            else:
                yield json.loads(document)
    else:
        raise NotImplementedError('ID exclusion is not implemented')
Example #2
Source File: elasticsearch_load.py From edx-analytics-pipeline with GNU Affero General Public License v3.0 | 6 votes |
def next_bulk_action_batch(self, document_iterator):
    """
    Read a batch of documents from the iterator and convert them into bulk index actions.

    Elasticsearch expects each document to actually be transmitted on two lines, the first of which details the
    action to take, and the second contains the actual document. See the
    `Cheaper in Bulk <https://www.elastic.co/guide/en/elasticsearch/guide/1.x/bulk.html>`_ guide.

    Arguments:
        document_iterator (iterator of dicts):

    Returns:
        A list of dicts that can be transmitted to elasticsearch using the "bulk" request.
    """
    bulk_action_batch = []
    for raw_data in islice(document_iterator, self.batch_size):
        action, data = elasticsearch.helpers.expand_action(raw_data)
        bulk_action_batch.append(action)
        if data is not None:
            bulk_action_batch.append(data)
    return bulk_action_batch
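For context, the two-line action/document format described in the docstring above looks roughly like this; the record contents are made up for illustration, and the exact keys returned can vary between client versions:

import elasticsearch.helpers

# Hypothetical source record; "_op_type" selects the bulk operation.
raw_data = {"_op_type": "index", "_index": "courses", "_id": "demo", "name": "Demo Course"}

action, data = elasticsearch.helpers.expand_action(raw_data)
# action is the action line, e.g. {"index": {"_index": "courses", "_id": "demo"}}
# data is the document line, e.g. {"name": "Demo Course"}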
Example #3
Source File: esbtc.py From blockchain-elasticsearch with Apache License 2.0 | 6 votes |
def get_opreturn_data(self, bottom=None, top=None):
    query = {
        "_source": ["tx", "height", "n", "txid", "vin.txid"],
        "query": {"match_all": {}}
    }

    if bottom is not None and top is not None:
        query['query'] = {"range": {"height": {"gte": bottom, "lte": top}}}

    return elasticsearch.helpers.scan(self.es, index="btc-opreturn", query=query, size=100, scroll='1m')
Example #4
Source File: esbtc.py From blockchain-elasticsearch with Apache License 2.0 | 6 votes |
def get_nulldata_transactions(self, index):
    # This is a mess. Apologies if you're looking at this
    query = {
        "_source": ["hash", "height", "txid", "vin.txid",
                    "vout.scriptPubKey.asm", "vout.scriptPubKey.type", "vout.n"],
        "query": {
            "bool": {
                "must": [
                    {"term": {"vout.scriptPubKey.type": "nulldata"}}
                ]
            }
        }
    }

    return elasticsearch.helpers.scan(self.es, index=index, query=query, scroll='5m')
Example #5
Source File: SearchObjects.py From data_pipeline with Apache License 2.0 | 6 votes |
def store_in_elasticsearch(so_it, dry_run, es, index, workers_write, queue_write):
    # write into elasticsearch
    chunk_size = 1000  # TODO make configurable
    actions = elasticsearch_actions(so_it, dry_run, index)
    failcount = 0

    if not dry_run:
        results = None
        if workers_write > 0:
            results = elasticsearch.helpers.parallel_bulk(es, actions,
                                                          thread_count=workers_write,
                                                          queue_size=queue_write,
                                                          chunk_size=chunk_size)
        else:
            results = elasticsearch.helpers.streaming_bulk(es, actions,
                                                           chunk_size=chunk_size)

        for success, details in results:
            if not success:
                failcount += 1

        if failcount:
            raise RuntimeError("%s relations failed to index" % failcount)
Example #6
Source File: DataDrivenRelation.py From data_pipeline with Apache License 2.0 | 6 votes |
def get_target_labels(ids, es, index):
    res = elasticsearch.helpers.scan(client=es,
                                     query={
                                         "query": {
                                             "ids": {"values": ids}
                                         },
                                         '_source': 'approved_symbol',
                                         'size': 1,
                                     },
                                     index=index)

    return dict((hit['_id'], hit['_source']['approved_symbol']) for hit in res)
Example #7
Source File: DataDrivenRelation.py From data_pipeline with Apache License 2.0 | 6 votes |
def store_in_elasticsearch(results, es, dry_run, workers_write, queue_write, index):
    chunk_size = 1000  # TODO make configurable
    actions = elasticsearch_actions(results, dry_run, index)
    failcount = 0

    if not dry_run:
        results = None
        if workers_write > 0:
            results = elasticsearch.helpers.parallel_bulk(es, actions,
                                                          thread_count=workers_write,
                                                          queue_size=queue_write,
                                                          chunk_size=chunk_size)
        else:
            results = elasticsearch.helpers.streaming_bulk(es, actions,
                                                           chunk_size=chunk_size)

        for success, details in results:
            if not success:
                failcount += 1

        if failcount:
            raise RuntimeError("%s relations failed to index" % failcount)
Example #8
Source File: HPA.py From data_pipeline with Apache License 2.0 | 5 votes |
def store_data(self, dry_run):
    self.logger.info('store_data called')

    self.logger.debug('calling to create new expression index')

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(self.hpa_merged_table, dry_run, self.es_index)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                                                              thread_count=self.workers_write,
                                                              queue_size=self.queue_write,
                                                              chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                                                               chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s relations failed to index" % failcount)

        if failcount:
            raise RuntimeError("%s failed to index" % failcount)

    self.logger.info('missing tissues %s', str(_missing_tissues))
Example #9
Source File: esbtc.py From blockchain-elasticsearch with Apache License 2.0 | 5 votes |
def add_opreturn_files(self, data):
    errors = []
    for ok, item in elasticsearch.helpers.streaming_bulk(self.es, data, max_retries=2):
        if not ok:
            errors.append(item)

    return errors
Example #10
Source File: esbtc.py From blockchain-elasticsearch with Apache License 2.0 | 5 votes |
def update_opreturns(self, the_iter):
    errors = []
    for ok, item in elasticsearch.helpers.streaming_bulk(self.es, the_iter, max_retries=2):
        if not ok:
            errors.append(item)

    return errors
Example #11
Source File: esbtc.py From blockchain-elasticsearch with Apache License 2.0 | 5 votes |
def get_nonstandard_transactions(self):
    query = {
        "_source": ["hash", "vout.scriptPubKey.hex", "vout.scriptPubKey.type"],
        "query": {
            "match": {"vout.scriptPubKey.type": "nonstandard"}
        }
    }

    return elasticsearch.helpers.scan(self.es, index="btc-transactions-*", query=query, scroll='1m')
Example #12
Source File: DataDrivenRelation.py From data_pipeline with Apache License 2.0 | 5 votes |
def get_disease_labels(ids, es, index):
    res = elasticsearch.helpers.scan(client=es,
                                     query={
                                         "query": {
                                             "ids": {"values": ids}
                                         },
                                         '_source': 'label',
                                         'size': 1,
                                     },
                                     index=index)

    return dict((hit['_id'], hit['_source']['label']) for hit in res)
Example #13
Source File: injector.py From gransk with Apache License 2.0 | 5 votes |
def get_elasticsearch_helper(self):
    """
    Get helpers module for Elasticsearch. Used to bulk index documents.

    :returns: package ``elasticsearch.helpers``
    """
    return helpers
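A caller could then use the returned module for bulk indexing, roughly like this (the injector, client, and documents names are hypothetical, not part of the gransk source):

helpers = injector.get_elasticsearch_helper()  # hypothetical injector instance
helpers.bulk(client, documents)  # client and documents supplied by the caller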
Example #14
Source File: ECO.py From data_pipeline with Apache License 2.0 | 5 votes |
def _store_eco(self, dry_run):
    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(list(self.ecos.items()), self.es_index)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                                                              thread_count=self.workers_write,
                                                              queue_size=self.queue_write,
                                                              chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                                                               chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s relations failed to index" % failcount)
Example #15
Source File: Drug.py From data_pipeline with Apache License 2.0 | 5 votes |
def store(self, es, dry_run, data):
    self.logger.info("Starting drug storage")

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(list(data.items()), self.es_index)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                                                              thread_count=self.workers_write,
                                                              queue_size=self.queue_write,
                                                              chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                                                               chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s relations failed to index" % failcount)

    self.logger.debug("Completed storage")
Example #16
Source File: Evidences.py From data_pipeline with Apache License 2.0 | 5 votes |
def validation_on_start(eco_scores_uri, schema_uri, excluded_biotypes, datasources_to_datatypes,
                        es_hosts, es_index_gene, es_index_eco, es_index_efo,
                        cache_target, cache_target_u2e, cache_target_contains,
                        cache_eco, cache_efo, cache_efo_contains):
    logger = logging.getLogger(__name__)

    validator = opentargets_validator.helpers.generate_validator_from_schema(schema_uri)

    lookup_data = LookUpDataRetriever(new_es_client(es_hosts),
                                      gene_index=es_index_gene,
                                      gene_cache_size=cache_target,
                                      gene_cache_u2e_size=cache_target_u2e,
                                      gene_cache_contains_size=cache_target_contains,
                                      eco_index=es_index_eco,
                                      eco_cache_size=cache_efo_contains,
                                      efo_index=es_index_efo,
                                      efo_cache_size=cache_efo,
                                      efo_cache_contains_size=cache_efo_contains).lookup

    datasources_to_datatypes = datasources_to_datatypes
    evidence_manager = EvidenceManager(lookup_data, eco_scores_uri, excluded_biotypes, datasources_to_datatypes)

    return logger, validator, lookup_data, datasources_to_datatypes, evidence_manager
Example #17
Source File: json-parse.py From cve-analysis with GNU General Public License v3.0 | 5 votes |
def main():
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    else:
        print("Usage: %s <nvd-xml-file>" % (sys.argv[0]))
        sys.exit(1)

    # First let's see if the index exists
    if es.indices.exists('cve-index') is False:
        # We have to create it and add a mapping
        fh = open('cve-index-json-mapping.json')
        mapping = json.load(fh)
        es.indices.create('cve-index', body=mapping)

    fh = open(input_file)
    json_data = json.load(fh)

    the_cves = CVE()

    for i in json_data['CVE_Items']:
        # ['CVE_Items'][0]['cve']['CVE_data_meta']['ID']
        the_cves.add(i)
        #es.update(id=cve_id, index="cve-index", body={'doc' : cve, 'doc_as_upsert': True})

    for ok, item in elasticsearch.helpers.streaming_bulk(es, the_cves, max_retries=2):
        if not ok:
            print("ERROR:")
            print(item)
Example #18
Source File: metrics.py From rally with Apache License 2.0 | 5 votes |
def bulk_index(self, index, doc_type, items):
    # TODO #653: Remove version-specific support for metrics stores before 7.0.0.
    import elasticsearch.helpers
    if self._cluster_version[0] > 6:
        self.guarded(elasticsearch.helpers.bulk, self._client, items, index=index, chunk_size=5000)
    else:
        self.guarded(elasticsearch.helpers.bulk, self._client, items, index=index, doc_type=doc_type, chunk_size=5000)
Example #19
Source File: DataDrivenRelation.py From data_pipeline with Apache License 2.0 | 4 votes |
def get_disease_to_targets_vectors(threshold, evidence_count, es, index):
    '''
    Get all the association objects that are:
    - direct -> to avoid ontology inflation
    - > 3 evidence count -> remove noise
    - overall score > threshold -> remove very low quality noise
    :param threshold: minimum overall score threshold to consider for fetching association data
    :param evidence_count: minimum number of evidence to consider for fetching association data
    :return: two dictionaries mapping target to disease and the reverse
    '''
    res = elasticsearch.helpers.scan(client=es,
                                     query={
                                         "query": {
                                             "term": {"is_direct": True}
                                         },
                                         '_source': {'includes': ["target.id", 'disease.id', 'harmonic-sum', 'evidence_count']},
                                         'size': 1000,
                                     },
                                     index=index)

    target_results = dict()
    disease_results = dict()

    c = 0
    for hit in res:
        c += 1
        pair_id = str(hit['_id'])
        hit = hit['_source']
        if hit['evidence_count']['total'] >= evidence_count and \
                hit['harmonic-sum']['overall'] >= threshold:
            # store target associations
            if hit['target']['id'] not in target_results:
                target_results[hit['target']['id']] = SparseFloatDict()
            # TODO: return all counts and scores up to datasource level
            target_results[hit['target']['id']][hit['disease']['id']] = hit['harmonic-sum']['overall']

            # store disease associations
            if hit['disease']['id'] not in disease_results:
                disease_results[hit['disease']['id']] = SparseFloatDict()
            # TODO: return all counts and scores up to datasource level
            disease_results[hit['disease']['id']][hit['target']['id']] = hit['harmonic-sum']['overall']

    return target_results, disease_results
Example #20
Source File: Reactome.py From data_pipeline with Apache License 2.0 | 4 votes |
def process_all(self, dry_run):
    self.g.add_node('root', name="", species="")

    for row in self.downloader.get_pathway_data():
        self.g.add_node(row['id'], name=row['name'], species=row['species'])

    children = set()
    for row in self.downloader.get_pathway_relations():
        self.g.add_edge(row['id'], row['child'])
        children.add(row['child'])

    nodes_without_parent = set(self.g.nodes()) - children
    for node in nodes_without_parent:
        if node != 'root':
            self.g.add_edge('root', node)

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        docs = generate_documents(self.g)
        actions = elasticsearch_actions(docs, self.es_index)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(es, actions,
                                                              thread_count=self.workers_write,
                                                              queue_size=self.queue_write,
                                                              chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(es, actions,
                                                               chunk_size=chunk_size)

            for success, _ in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s failed to index" % failcount)