Python elasticsearch_dsl.Q Examples

The following are 30 code examples of elasticsearch_dsl.Q(), taken from open source projects. The source file and originating project are listed above each example so you can refer back to the original code. You may also want to check out the other available functions and classes of the elasticsearch_dsl module.
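Before the project examples, here is a minimal sketch of the basics: Q() builds query objects that serialize to the Elasticsearch query DSL and combine with the &, | and ~ operators. The index and field names below are made up purely for illustration.

from elasticsearch_dsl import Q, Search

# Q() builds query objects; to_dict() shows the query DSL they serialize to.
match = Q("match", title="python")      # {'match': {'title': 'python'}}
term = Q("term", published=True)        # {'term': {'published': True}}

print((match & term).to_dict())   # bool query with both clauses under "must"
print((match | term).to_dict())   # bool query with both clauses under "should"
print((~term).to_dict())          # bool query with the term under "must_not"

# Queries attach to a Search object, which builds the full request body.
s = Search(index="my-index").query(match & term)
print(s.to_dict())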
Example #1
Source File: search_permissions.py    From invenio-app-ils with MIT License
def search_filter_record_permissions():
    """Filter list of results by `_access` and `restricted` fields."""
    if not has_request_context() or backoffice_permission().allows(g.identity):
        return Q()

    # A record is public if `restricted` field False or missing
    restricted_field_missing = ~Q("exists", field="restricted")
    is_restricted = restricted_field_missing | Q("term", restricted=False)

    combined_filter = is_restricted

    if current_app.config.get("ILS_RECORDS_EXPLICIT_PERMISSIONS_ENABLED"):
        # if `_access`, check `_access.read` against the user. It takes
        # precedence over `restricted`.
        # if not `_access`, check if open access as before.
        _access_field_exists = Q("exists", field="_access.read")
        provides = _get_user_provides()
        user_can_read = _access_field_exists & Q(
            "terms", **{"_access.read": provides}
        )
        combined_filter = user_can_read | (
            ~_access_field_exists & ~is_restricted
        )

    return Q("bool", filter=[combined_filter]) 
Example #2
Source File: elastic.py    From ivre with GNU General Public License v3.0
def searchport(port, protocol='tcp', state='open', neg=False):
        """Filters (if `neg` == True, filters out) records with
        specified protocol/port at required state. Be aware that when
        a host has a lot of ports filtered or closed, it will not
        report all of them, but only a summary, and thus the filter
        might not work as expected. This filter will always work to
        find open ports.

        """
        if port == "host":
            res = Q("nested", path="ports", query=Q("match", ports__port=-1))
        elif state == "open":
            res = Q("match", **{"openports.%s.ports" % protocol: port})
        else:
            res = Q("nested", path="ports", query=(
                Q("match", ports__port=port) &
                Q("match", ports__protocol=protocol) &
                Q("match", ports__state_state=state)
            ))
        if neg:
            return ~res
        return res 
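Two details of this example are worth calling out: field names built at runtime (such as "openports.tcp.ports") have to be passed via ** unpacking because they are not valid Python keyword arguments, and negating with ~ wraps the query in a bool must_not clause. A small sketch with illustrative values:

from elasticsearch_dsl import Q

protocol, port = "tcp", 443  # illustrative values

# Dotted, dynamically built field names go in via ** unpacking.
q = Q("match", **{"openports.%s.ports" % protocol: port})

# Negation yields a bool query with a must_not clause, e.g.
# {'bool': {'must_not': [{'match': {'openports.tcp.ports': 443}}]}}
print((~q).to_dict())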
Example #3
Source File: util.py    From open-ledger with MIT License
def correct_license_capitalization(self, provider='europeana', end=None):
        """[#186] Correct license capitalization"""
        s = Search()
        q = Q('term', provider=provider)
        s = s.query(q)
        response = s.execute()
        total = response.hits.total
        log.info("Using search engine instance %s", settings.ELASTICSEARCH_URL)
        mod_count = 0
        count = 0
        for r in s.scan():
            if not r.license.islower():
                img = search.Image.get(id=r.identifier)
                log.debug("[%d] Changing license %s to %s", count, img.license, img.license.lower())
                img.update(license=img.license.lower())
                mod_count += 1
            count += 1
        log.info("Modified %d records in search engine", mod_count) 
Example #4
Source File: es_search.py    From seqr with GNU Affero General Public License v3.0
def _pathogenicity_filter(pathogenicity):
    clinvar_filters = pathogenicity.get('clinvar', [])
    hgmd_filters = pathogenicity.get('hgmd', [])

    pathogenicity_filter = None
    if clinvar_filters:
        clinvar_clinical_significance_terms = set()
        for clinvar_filter in clinvar_filters:
            clinvar_clinical_significance_terms.update(CLINVAR_SIGNFICANCE_MAP.get(clinvar_filter, []))
        pathogenicity_filter = Q('terms', clinvar_clinical_significance=sorted(list(clinvar_clinical_significance_terms)))

    if hgmd_filters:
        hgmd_class = set()
        for hgmd_filter in hgmd_filters:
            hgmd_class.update(HGMD_CLASS_MAP.get(hgmd_filter, []))

        hgmd_q = Q('terms', hgmd_class=sorted(list(hgmd_class)))
        pathogenicity_filter = pathogenicity_filter | hgmd_q if pathogenicity_filter else hgmd_q

    return pathogenicity_filter 
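The `pathogenicity_filter | hgmd_q if pathogenicity_filter else hgmd_q` expression avoids OR-ing against None when no clinvar filters were requested. A hedged sketch of what the OR combination itself produces (the terms values are made up):

from elasticsearch_dsl import Q

clinvar_q = Q("terms", clinvar_clinical_significance=["Likely_pathogenic", "Pathogenic"])
hgmd_q = Q("terms", hgmd_class=["DM"])

# OR-ing two terms queries yields a bool query with both as "should" clauses, roughly:
# {'bool': {'should': [{'terms': {'clinvar_clinical_significance': [...]}},
#                      {'terms': {'hgmd_class': ['DM']}}]}}
print((clinvar_q | hgmd_q).to_dict())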
Example #5
Source File: util.py    From open-ledger with MIT License
def correct_orphan_records(self, provider='europeana', end=None):
        """[#185] Delete records from the search engine which aren't found in the database"""
        s = Search()
        q = Q('term', provider=provider)
        s = s.query(q)
        response = s.execute()
        total = response.hits.total
        # A file extracted from the production database listing all of the europeana identifiers
        identifier_file = '/tmp/europeana-identifiers.json'
        db_identifiers = set(json.load(open(identifier_file)))
        total_in_db = len(db_identifiers)
        log.info("Using search engine instance %s", settings.ELASTICSEARCH_URL)
        log.info("Total records: %d (search engine), %d (database) [diff=%d]", total, total_in_db, total - total_in_db)
        deleted_count = 0
        for r in s.scan():
            if r.identifier not in db_identifiers:
                img = search.Image.get(id=r.identifier)
                log.debug("Going to delete image %s", img)
                deleted_count += 1
        log.info("Deleted %d from search engine", deleted_count) 
Example #6
Source File: site_views.py    From open-ledger with MIT License
def about(request):
    """Information about the current site, its goals, and what content is loaded"""
    # Provider counts
    providers = cache.get_or_set(CACHE_STATS_NAME, [], CACHE_STATS_DURATION)
    if not providers:
        for provider in sorted(settings.PROVIDERS.keys()):
            s = Search()
            q = Q('term', provider=provider)
            s = s.query(q)
            response = s.execute()
            if response.hits.total > 0:
                data = settings.PROVIDERS[provider]
                total = intcomma(response.hits.total)
                data.update({'hits': total})
                providers.append(data)
        # All results
        s = Search()
        response = s.execute()
        total = intcomma(response.hits.total)
        providers.append({'display_name': 'Total', 'hits': total})
        cache.set(CACHE_STATS_NAME, providers)
    return render(request, "about.html", {'providers': providers}) 
Example #7
Source File: es_search.py    From seqr with GNU Affero General Public License v3.0
def filter_by_frequency(self, frequencies):
        q = Q()
        for pop, freqs in sorted(frequencies.items()):
            if freqs.get('af') is not None:
                filter_field = next(
                    (field_key for field_key in POPULATIONS[pop]['filter_AF']
                     if any(field_key in index_metadata['fields'] for index_metadata in self.index_metadata.values())),
                    POPULATIONS[pop]['AF'])
                q &= _pop_freq_filter(filter_field, freqs['af'])
            elif freqs.get('ac') is not None:
                q &= _pop_freq_filter(POPULATIONS[pop]['AC'], freqs['ac'])

            if freqs.get('hh') is not None:
                q &= _pop_freq_filter(POPULATIONS[pop]['Hom'], freqs['hh'])
                q &= _pop_freq_filter(POPULATIONS[pop]['Hemi'], freqs['hh'])
        self.filter(q) 
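Here Q() with no arguments gives a match_all query, which serves as a neutral starting value for the &= chain. A minimal sketch of that accumulation pattern, with made-up field names and thresholds in place of the project's _pop_freq_filter helper:

from elasticsearch_dsl import Q

q = Q()   # match_all, i.e. {'match_all': {}}

# AND-ing onto match_all keeps only the real constraints; each &= adds another clause.
q &= Q("range", **{"gnomad_af": {"lte": 0.01}})
q &= Q("range", **{"gnomad_ac": {"lte": 5}})

# Roughly a bool query with the two range queries under "must".
print(q.to_dict())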
Example #8
Source File: update_politician_pictures.py    From politicos with GNU Affero General Public License v3.0
def get_pictures(client):
    es_data = (
        Search(using=client, index='politicians')
        .query(Q('bool', must=[Q('match', ano_eleicao=ANO_ELEICAO)]))
        .source(['sq_candidato', 'sg_ue'])
        .scan()
    )
    data = []
    for hit in es_data:
        url = 'http://divulgacandcontas.tse.jus.br/divulga'
        url = f'{url}/rest/v1/candidatura/buscar'
        url = f'{url}/{ANO_ELEICAO}/{hit.sg_ue}/2022802018'
        url = f'{url}/candidato/{hit.sq_candidato}'
        data.append(
            {
                'doc_id': hit.meta.id,
                'url': url,
                'sg_ue': hit.sg_ue,
                'sq_candidato': hit.sq_candidato,
            }
        )
    return data 
Example #9
Source File: study_ceres_onion.py    From grimoirelab-elk with GNU General Public License v3.0
def process(self, items_block):
        """Process a DataFrame to compute Onion.

        :param items_block: items to be processed. Expects to find a pandas DataFrame.
        """

        logger.debug("{} Authors to process: {}".format(self.__log_prefix, len(items_block)))

        onion_enrich = Onion(items_block)
        df_onion = onion_enrich.enrich(member_column=ESOnionConnector.AUTHOR_UUID,
                                       events_column=ESOnionConnector.CONTRIBUTIONS)

        # Get and store Quarter as String
        df_onion['quarter'] = df_onion[ESOnionConnector.TIMEFRAME].map(lambda x: str(pandas.Period(x, 'Q')))

        # Add metadata: enriched on timestamp
        df_onion['metadata__enriched_on'] = datetime.utcnow().isoformat()
        df_onion['data_source'] = self.data_source
        df_onion['grimoire_creation_date'] = df_onion[ESOnionConnector.TIMEFRAME]

        logger.debug("{} Final new events: {}".format(self.__log_prefix, len(df_onion)))

        return self.ProcessResults(processed=len(df_onion), out_items=df_onion) 
Example #10
Source File: fields.py    From udata with GNU Affero General Public License v3.0
def get_value_filter(self, filter_value):
        '''
        Fix here until upstream PR is merged
        https://github.com/elastic/elasticsearch-dsl-py/pull/473
        '''
        self.validate_parameter(filter_value)
        f, t = self._ranges[filter_value]
        limits = {}
        # lt and gte to ensure non-overlapping ranges
        if f is not None:
            limits['gte'] = f
        if t is not None:
            limits['lt'] = t

        return Q('range', **{
            self._params['field']: limits
        }) 
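Since the field name comes out of self._params['field'] at runtime, the range body is passed with ** unpacking rather than as a keyword argument. An illustration with a made-up field:

from elasticsearch_dsl import Q

field, limits = "metrics.views", {"gte": 100, "lt": 1000}  # illustrative

q = Q("range", **{field: limits})
print(q.to_dict())   # {'range': {'metrics.views': {'gte': 100, 'lt': 1000}}}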
Example #11
Source File: test_querysets.py    From django-zombodb with MIT License
def test_dsl_search_no_limit(self):
        # duplicate tj and soleil
        self.tj.pk = None
        self.tj.save()
        self.soleil.pk = None
        self.soleil.save()

        results = Restaurant.objects.dsl_search(
            ElasticsearchQ('match', street='skillman'),
            sort=True,
            limit=None)

        self.assertEqual(len(results), 4)
        self.assertEqual(
            [r.name for r in results],
            [self.soleil.name, self.soleil.name, self.tj.name, self.tj.name]) 
Example #12
Source File: WASEQuery.py    From WASE with GNU General Public License v3.0
def query_missing(s, field, name, methods=None, responsecodes=None, invert=False):
    # main query
    q = Q("match", ** { field: name })
    if not invert:
        q = ~q
    s.query = q

    # add filters
    ## method
    if methods:
        s = s.filter("terms", ** { 'request.method': methods })
    ## response codes
    if responsecodes:
        for rc in responsecodes:
            rcrange = rc.split("-")
            if len(rcrange) == 2:
                s = s.filter("range", ** { 'response.status': { "gte": int(rcrange[0]), "lte": int(rcrange[1]) } })
            else:
                s = s.filter("term", ** { 'response.status': rc })

    print_debug(s.to_dict())
    return s 
Example #13
Source File: WASEQuery.py    From WASE with GNU General Public License v3.0
def query_vals(s, field, name, values, invert):
    # match documents where given field value name is present, if required
    if values:
        q = Q("nested", path=field, query=Q("wildcard", ** { field + ".value.keyword": values }))
        if invert:
            s.query = ~q
        else:
            s.query = q
    else:
        s.query = Q()

    # 1. descent into response.headers/request.parameters
    # 2. filter given header
    # 3. aggregate values
    # 4. jump back into main document
    # 5. aggregate URLs
    s.aggs.bucket("field", "nested", path=field)\
            .bucket("valuefilter", "filter", Q("match", ** { field + ".name": name }))\
            .bucket("values", "terms", field=field + ".value.keyword", size=args.size)\
            .bucket("main", "reverse_nested")\
            .bucket("urls", "terms", field="request.url.keyword", size=args.size)
    return s 
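Note that each .bucket() call returns the bucket it just created, so the chained calls above nest aggregations inside one another rather than adding siblings. A stripped-down sketch with illustrative names and a fixed size:

from elasticsearch_dsl import Search

s = Search()

# Each .bucket() returns the newly created bucket, so this nests a terms
# aggregation inside a nested aggregation (siblings would be added on s.aggs instead).
s.aggs.bucket("headers", "nested", path="response.headers") \
      .bucket("values", "terms", field="response.headers.value.keyword", size=10)

print(s.to_dict())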
Example #14
Source File: test_querysets.py    From django-zombodb with MIT License
def test_dsl_search(self):
        results = Restaurant.objects.dsl_search(ElasticsearchQ(
            'bool',
            must=[
                ElasticsearchQ('match', street='Skillman Ave'),
                ElasticsearchQ('match', categories='French')
            ]
        ))
        self.assertCountEqual(results, [self.soleil])

        results = Restaurant.objects.dsl_search(ElasticsearchQ(
            'bool',
            must=[
                ElasticsearchQ('match', street='Skillman Ave'),
                ElasticsearchQ('match', zip_code='11377')
            ]
        ))
        self.assertCountEqual(results, [self.tj, self.soleil])

        results = Restaurant.objects.dsl_search(Term(email='alcove@example.org'))
        self.assertCountEqual(results, [self.alcove]) 
Example #15
Source File: es_wrapper.py    From bitshares-explorer-api with MIT License
def get_trade_history(size=10, from_date='2015-10-10', to_date='now', sort_by='-operation_id_num',
                      search_after=None, base="1.3.0", quote="1.3.121"):

    s = Search(using=es, index="bitshares-*")

    s = s.extra(size=size)
    if search_after and search_after != '':
        s = s.extra(search_after=search_after.split(','))

    q = Q()
    q = q & Q("match", operation_type=4)
    q = q & Q("match", operation_history__op_object__is_maker=True)

    q = q & Q("match", operation_history__op_object__fill_price__base__asset_id=base)
    q = q & Q("match", operation_history__op_object__fill_price__quote__asset_id=quote)

    range_query = Q("range", block_data__block_time={'gte': from_date, 'lte': to_date})
    s.query = q & range_query

    s = s.sort(*sort_by.split(','))
    response = s.execute()
    verify_es_response(response)

    return [hit.to_dict() for hit in response] 
Example #16
Source File: test_search.py    From elasticsearch-dsl-py with Apache License 2.0
def test_query_can_be_assigned_to():
    s = search.Search()

    q = Q('match', title='python')
    s.query = q

    assert s.query._proxied is q 
Example #17
Source File: utils.py    From userline with BSD 3-Clause "New" or "Revised" License
def get_dsl_logoff_query(screen):
	q = None
	for evtid in config.EVENTS_LOGOFF:
		tmp = Q("match",event_identifier=evtid)
		if q is None:
			q = tmp
		else:
			q = q | tmp

	if screen is True:
		for evtid in config.EVENTS_LOGOFF_SCREEN:
			q = q | Q("match",event_identifier=evtid)

	return q 
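The loop builds the OR chain one event ID at a time, using None as the initial sentinel. An equivalent, slightly more compact pattern (not from the original project) folds the generated queries together with functools.reduce; the event IDs shown are illustrative:

from functools import reduce
from operator import or_

from elasticsearch_dsl import Q

event_ids = [4634, 4647]  # illustrative logoff event identifiers

# OR-combine one match query per event ID; reduce requires a non-empty sequence.
q = reduce(or_, (Q("match", event_identifier=evtid) for evtid in event_ids))
print(q.to_dict())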
Example #18
Source File: search.py    From nyaa with GNU General Public License v3.0
def _es_name_exact_phrase(literal):
    ''' Returns a Query for a phrase match on the display_name for a given literal '''
    return Q({
        'match_phrase': {
            'display_name.exact': {
                'query': literal,
                'analyzer': 'exact_analyzer'
            }
        }
    }) 
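The raw-dict form is used here because the field needs extra options (query plus analyzer). The same query should also be expressible with ** unpacking, though the dict form above is arguably clearer for deeply nested bodies:

from elasticsearch_dsl import Q

literal = "example phrase"  # illustrative

# Presumably equivalent to the dict form used in _es_name_exact_phrase().
q = Q("match_phrase", **{
    "display_name.exact": {"query": literal, "analyzer": "exact_analyzer"}
})
print(q.to_dict())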
Example #19
Source File: test_querysets.py    From django-zombodb with MIT License
def test_dsl_search_validate(self):
        query = ElasticsearchQ('bool')
        query.name = 'wrong'
        with self.assertRaises(InvalidElasticsearchQuery) as cm:
            Restaurant.objects.dsl_search(query, validate=True)
        self.assertRegex(
            str(cm.exception),
            "Invalid Elasticsearch query: (.+)") 
Example #20
Source File: test_querysets.py    From django-zombodb with MIT License
def test_dsl_search_score_attr(self):
        results = Restaurant.objects.dsl_search(
            ElasticsearchQ('match', street='skillman'),
            sort=True,
            score_attr='custom_score')

        self.assertEqual(len(results), 2)
        for r in results:
            self.assertTrue(hasattr(r, 'custom_score'))
            self.assertGreater(r.custom_score, 0) 
Example #21
Source File: crawl.py    From weapp-zhihulive with Apache License 2.0
async def parse_zhuanlan_link(self, response):
        posts = await response.json()

        if response.status == 200 and posts:
            for post in posts:
                cover = post['titleImage']
                if not cover:
                    continue
                s = Live.search()
                title = post['title']
                for sep in ('-', '—'):
                    if sep in title:
                        title = title.split(sep)[-1].strip()
                speaker_id = post['author']['hash']
                zid = post['url'].split('/')[-1]
                s = s.query(Q('match_phrase', subject=title))
                lives = await s.execute()
                for live in lives:
                    if live.speaker and live.speaker.speaker_id == speaker_id:
                        await self.update_live(zid, cover, live)
                        break
                else:
                    match = LIVE_REGEX.search(post['content'])
                    if match:
                        live_id = match.group(2)
                        try:
                            live = await Live.get(live_id)
                        except NotFoundError:
                            pass
                        else:
                            await self.update_live(zid, cover, live)

            return get_next_url(response.url) 
Example #22
Source File: more_like_this_validator.py    From texta with GNU General Public License v3.0
def validate_filter(filter: dict):
    try:
        query = Q(filter)
    except Exception as e:
        logging.getLogger(ERROR_LOGGER).exception("Could not parse filter query {}.".format(filter))
        raise serializers.ValidationError("Could not parse the filter, query. Make sure you have not included the top 'query' key!") 
Example #23
Source File: test_querysets.py    From django-zombodb with MIT License
def test_dsl_search_sort(self):
        results = Restaurant.objects.dsl_search(
            ElasticsearchQ(
                'bool',
                should=[
                    ElasticsearchQ('match', categories='sushi'),
                    ElasticsearchQ('match', categories='asian'),
                    ElasticsearchQ('match', categories='japanese'),
                    ElasticsearchQ('match', categories='french'),
                ],
                minimum_should_match=1
            ),
            validate=True,
            sort=True)
        self.assertEqual(list(results), [self.tj, self.soleil])

        results = Restaurant.objects.dsl_search(
            ElasticsearchQ(
                'bool',
                should=[
                    ElasticsearchQ('match', categories='french'),
                    ElasticsearchQ('match', categories='coffee'),
                    ElasticsearchQ('match', categories='european'),
                    ElasticsearchQ('match', categories='sushi'),
                ],
                minimum_should_match=1
            ),
            sort=True)
        self.assertEqual(list(results), [self.soleil, self.tj]) 
Example #24
Source File: tor_elasticsearch.py    From freshonions-torscraper with GNU Affero General Public License v3.0
def elasticsearch_retrieve_page_by_id(page_id):
    query = Search().filter(Q("term", nid=int(page_id)))[:1]
    result = query.execute()
    if result.hits.total == 0:
        return None
    return result.hits[0] 
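Search().filter(...) runs the query in non-scoring filter context by wrapping it in a bool filter clause, and the [:1] slice limits the response to a single hit. A hedged sketch of the resulting request body:

from elasticsearch_dsl import Q, Search

s = Search().filter(Q("term", nid=42))[:1]   # illustrative id

# Roughly: {'query': {'bool': {'filter': [{'term': {'nid': 42}}]}}, 'size': 1}
print(s.to_dict())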
Example #25
Source File: esnotifications.py    From stethoscope with Apache License 2.0
def create_query_for_email(self, search, email):
    return search.query(elasticsearch_dsl.Q({"match": {'email': email}})) 
Example #26
Source File: tor_elasticsearch.py    From freshonions-torscraper with GNU Affero General Public License v3.0
def elasticsearch_delete_old():
    _from = NEVER
    _to   = datetime.now() - timedelta(days=30)
    query = Search().filter(Q("range", visited_at={'from': _from, 'to': _to}))
    result = query.delete() 
Example #27
Source File: es_wrapper.py    From bitshares-explorer-api with MIT License
def get_trx(trx, from_=0, size=10):
    s = Search(using=es, index="bitshares-*", extra={"size": size, "from": from_})
    s.query = Q("match", block_data__trx_id=trx)

    response = s.execute()
    verify_es_response(response)

    return [hit.to_dict() for hit in response] 
Example #28
Source File: es_wrapper.py    From bitshares-explorer-api with MIT License
def is_alive():
    find_string = datetime.utcnow().strftime("%Y-%m")
    from_date = (datetime.utcnow() - timedelta(days=1)).strftime("%Y-%m-%d")

    s = Search(using=es, index="bitshares-" + find_string)
    s.query = Q("range", block_data__block_time={'gte': from_date, 'lte': "now"})
    s.aggs.metric("max_block_time", "max", field="block_data.block_time")

    json_response = {
        "server_time": datetime.utcnow(),
        "head_block_timestamp": None,
        "head_block_time": None
    }

    response = s.execute()
    verify_es_response(response)

    if response.aggregations.max_block_time.value is not None:
        json_response["head_block_time"] = str(response.aggregations.max_block_time.value_as_string)
        json_response["head_block_timestamp"] = response.aggregations.max_block_time.value
        json_response["deltatime"] = abs((datetime.utcfromtimestamp(json_response["head_block_timestamp"] / 1000) - json_response["server_time"]).total_seconds())
        if json_response["deltatime"] < 30:
            json_response["status"] = "ok"
        else:
            json_response["status"] = "out_of_sync"
            json_response["error"] = "last_block_too_old"
    else:
        json_response["status"] = "out_of_sync"
        json_response["deltatime"] = "Infinite"
        json_response["query_index"] = find_string
        json_response["query_from_date"] = from_date
        json_response["error"] = "no_blocks_last_24_hours"

    return json_response 
Example #29
Source File: es_wrapper.py    From bitshares-explorer-api with MIT License
def get_single_operation(operation_id):
    s = Search(using=es, index="bitshares-*")
    s.query = Q("match", account_history__operation_id=operation_id)

    response = s.execute()
    verify_es_response(response)

    return [hit.to_dict() for hit in response][0] 
Example #30
Source File: utils.py    From userline with BSD 3-Clause "New" or "Revised" License
def get_last_shutdown(index,maxtstamp,pattern):
	"""
	Look for the last shutdown event
	"""

	conn = connections.get_connection()

	q = [ \
		Q('match',data_type='windows:evtx:record') , \
		Q('match',event_identifier=config.EVENT_SHUTDOWN)
	]

	if pattern:
		q.append(Q('query_string',query=pattern,analyze_wildcard=True))

	s = Search(using=conn, index=index).query(Q('bool',must=q)).filter('range',datetime={'lte':maxtstamp}).sort('-datetime')[0:0]
	s.aggs.bucket('computer','terms',field='computer_name.keyword').bucket('shutdown','top_hits',size=1)

	res = s.execute()
	ret = {}
	for item in res.aggregations['computer']['buckets']:
		ret[item['key']] = item['shutdown']['hits']['hits'][0]

	if len(ret.keys()) == 0:
		ret = None

	return ret