Python urllib.request.url() Examples

The following are 21 code examples showing request.url in use. Note that url is an attribute of a framework request object (Flask, Bottle, Scrapy, and others), not a callable in urllib.request; several of the examples combine it with helpers from urllib.parse. You can go to the original project or source file by following the link above each example, or check out all available functions and classes of the urllib.request module.
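Before the individual examples, here is a minimal sketch of the pattern most of them share: reading request.url inside a view function. The sketch uses Flask; the route and response shape are illustrative only, not taken from any project below.

from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route('/echo')
def echo_url():
    # request.url is the full URL of the current request, query string included
    return jsonify({'url': request.url})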
Example #1
Source File: distributed_scheduler.py    From scrapy-cluster with MIT License
def request_to_dict(self, request):
    '''
    Convert Request object to a dict.
    modified from scrapy.utils.reqser
    '''
    req_dict = {
        # urls should be safe (safe_string_url)
        'url': to_unicode(request.url),
        'method': request.method,
        'headers': dict(request.headers),
        'body': request.body,
        'cookies': request.cookies,
        'meta': request.meta,
        '_encoding': request._encoding,
        'priority': request.priority,
        'dont_filter': request.dont_filter,
        # callback/errback are assumed to be a bound instance of the spider
        'callback': None if request.callback is None else request.callback.__name__,
        'errback': None if request.errback is None else request.errback.__name__,
    }
    return req_dict
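Since the dict above stores callbacks by name, the receiving side can rebuild the Request by resolving those names against the spider. A minimal sketch of the inverse, assuming a hypothetical helper dict_to_request (not part of scrapy-cluster):

from scrapy import Request

def dict_to_request(spider, d):
    # hypothetical inverse of request_to_dict: callback/errback were stored
    # as names, so look them up as bound methods on the spider
    return Request(
        url=d['url'],
        method=d['method'],
        headers=d['headers'],
        body=d['body'],
        cookies=d['cookies'],
        meta=d['meta'],
        encoding=d['_encoding'],
        priority=d['priority'],
        dont_filter=d['dont_filter'],
        callback=getattr(spider, d['callback']) if d['callback'] else None,
        errback=getattr(spider, d['errback']) if d['errback'] else None,
    )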
Example #2
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveAlertStats():
    """ Retrieve combined statistics
        AlertsLastMinute, AlertsLastHour,  AlertsLast24Hours
    """

    # get result from cache
    getCacheResult = getCache(request.url, "url")
    if getCacheResult is not False:
        return jsonify(getCacheResult)

    # query ES
    else:
        returnResult = formatAlertStats(queryAlertStats(checkCommunityIndex(request), getRelevantIndices(2)))
        setCache(request.url, returnResult, 13, "url")
        app.logger.debug('UNCACHED %s' % str(request.url))
        return jsonify(returnResult) 
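Most of the PEBA handlers on this page share the caching pattern seen here: the full request.url, query string included, is the cache key, so each distinct parameter combination is cached independently. A minimal sketch of such a key-by-URL cache, with a hypothetical in-memory store standing in for PEBA's getCache/setCache helpers:

import time

_store = {}  # hypothetical in-memory cache: url -> (expiry, value)

def get_cache(url):
    # return the cached value for this URL, or False (the sentinel PEBA checks)
    entry = _store.get(url)
    if entry is not None and entry[0] > time.time():
        return entry[1]
    return False

def set_cache(url, value, ttl_seconds):
    # cache a value under this URL for ttl_seconds
    _store[url] = (time.time() + ttl_seconds, value)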
Example #3
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveDatasetAlertTypesPerMonth():
    """ Retrieve the attacks / day in the last x days from elasticsearch,
        split by attack group
        and return as JSON for the last x months, defaults to last month,
        if no GET parameter days is given
    """

    # get result from cache
    getCacheResult = getCache(request.url, "url")
    if getCacheResult is not False:
        return jsonify(getCacheResult)

    # query ES
    else:
        if not request.args.get('days'):
            # Using default : within the last month (max 31 day indices)
            returnResult = formatDatasetAlertTypesPerMonth(queryDatasetAlertTypesPerMonth(None, checkCommunityIndex(request), getRelevantIndices(32)))
        else:
            if request.args.get('days').isdecimal() and int(request.args.get('days')) <= 31:
                indexDays = int(request.args.get('days')) + 1
            else:
                indexDays = 0
            returnResult = formatDatasetAlertTypesPerMonth(queryDatasetAlertTypesPerMonth(request.args.get('days'), checkCommunityIndex(request), getRelevantIndices(indexDays)))
        setCache(request.url, returnResult, 3600, "url")
        return jsonify(returnResult) 
Example #4
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveDatasetAlertsPerMonth():
    """ Retrieve the attacks / day in the last x days from elasticsearch
        and return as JSON for the last months, defaults to last month,
        if no GET parameter days is given
    """

    # get result from cache
    getCacheResult = getCache(request.url, "url")
    if getCacheResult is not False:
        return jsonify(getCacheResult)

    # query ES
    else:
        if not request.args.get('days'):
            # Using default : within the last month (max 31 day indices)
            returnResult = formatDatasetAlertsPerMonth(queryDatasetAlertsPerMonth(None, checkCommunityIndex(request), getRelevantIndices(32)))
        else:
            if request.args.get('days').isdecimal() and int(request.args.get('days')) <= 31:
                indexDays = int(request.args.get('days')) + 1
            else:
                indexDays = 0
            returnResult = formatDatasetAlertsPerMonth(queryDatasetAlertsPerMonth(request.args.get('days'), checkCommunityIndex(request), getRelevantIndices(indexDays)))
        setCache(request.url, returnResult, 600, "url")
        return jsonify(returnResult) 
Example #5
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveAlertsJson():
    """ Retrieve last 5 Alerts in JSON without IPs """

    # set cacheItem independent from url parameters, respect community index
    cacheEntry = request.url

    # get result from cache
    getCacheResult = getCache(cacheEntry, "url")
    if getCacheResult is not False:
        app.logger.debug('Returning /retrieveAlertsJson from Cache %s' % str(request.remote_addr))
        return jsonify(getCacheResult)

    # query ES
    else:
        numAlerts = 35
        # Retrieve last X Alerts from ElasticSearch and return JSON formatted with limited alert content
        returnResult = formatAlertsJson(queryAlertsWithoutIP(numAlerts, checkCommunityIndex(request), getRelevantIndices(2)))
        setCache(cacheEntry, returnResult, 25, "url")
        app.logger.debug('UNCACHED %s' % str(request.url))
        return jsonify(returnResult) 
Example #6
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveIPs15m():
    """ Retrieve IPs from the last 15mins from ElasticSearch and return formatted XML or JSON with IPs """

    if request.args.get('out') and request.args.get('out') == 'json':
        getCacheResult = getCache(request.url, "url")
        if getCacheResult is not False:
            return jsonify(getCacheResult)
        else:
            returnResult = formatBadIP(
                queryBadIPs(15, checkCommunityIndex(request), getRelevantIndices(2)), 'json')
            setCache(request.url, returnResult, 60, "url")
            return jsonify(returnResult)
    else:
        getCacheResult = getCache(request.url, "url")
        if getCacheResult is not False:
            return Response(getCacheResult, mimetype='text/xml')
        else:
            returnResult = formatBadIP(
                queryBadIPs(15, checkCommunityIndex(request), getRelevantIndices(2)), 'xml')
            setCache(request.url, returnResult, 60, "url")
            return Response(returnResult, mimetype='text/xml')

# Routes with JSON output 
Example #7
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveIPs():
    """ Retrieve IPs from ElasticSearch and return formatted XML or JSON with IPs """

    if request.args.get('out') and request.args.get('out') == 'json':
        getCacheResult = getCache(request.url, "url")
        if getCacheResult is not False:
            return jsonify(getCacheResult)
        else:
            returnResult = formatBadIP(
                queryBadIPs(app.config['BADIPTIMESPAN'], checkCommunityIndex(request), getRelevantIndices(2)), 'json')
            setCache(request.url, returnResult, 60, "url")
            return jsonify(returnResult)
    else:
        getCacheResult = getCache(request.url, "url")
        if getCacheResult is not False:
            return Response(getCacheResult, mimetype='text/xml')
        else:
            returnResult = formatBadIP(
                queryBadIPs(app.config['BADIPTIMESPAN'], checkCommunityIndex(request), getRelevantIndices(2)), 'xml')
            setCache(request.url, returnResult, 60, "url")
            return Response(returnResult, mimetype='text/xml') 
Example #8
Source File: peba.py    From PEBA with GNU General Public License v3.0
def querySingleIP():
    """ Retrieve Attack data from index about a single IP
    """

    # get result from cache
    getCacheResult = getCache(request.url, "url")
    if getCacheResult is not False:
        app.logger.debug('Returning /querySingleIP from Cache for %s' % str(request.remote_addr))
        return Response(getCacheResult)

    # query ES
    else:
        returnResult = formatSingleIP(queryForSingleIP(app.config['MAXALERTS'], request.args.get('ip'), checkCommunityIndex(request), getRelevantIndices(0)))
        setCache(request.url, returnResult, 60, "url")
        app.logger.debug('Returning /querySingleIP from ES for %s' % str(request.remote_addr))
        return Response(returnResult, mimetype='text/xml')

# Routes with both XML and JSON output 
Example #9
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveAlertsCyber():
    """ Retrieve Alerts from ElasticSearch and return formatted 
        XML with limited alert content
    """

    # get result from cache
    getCacheResult = getCache(request.url, "url")
    if getCacheResult is not False:
        app.logger.debug('Returning /retrieveAlertsCyber from Cache for %s' % str(request.remote_addr))
        return Response(getCacheResult)

    # query ES
    else:
        returnResult = formatAlertsXml(queryAlerts(app.config['MAXALERTS'], checkCommunityIndex(request), getRelevantIndices(2)))
        setCache(request.url, returnResult, 1, "url")
        app.logger.debug('Returning /retrieveAlertsCyber from ES for %s' % str(request.remote_addr))
        return Response(returnResult, mimetype='text/xml') 
Example #10
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveAlertsCount():
    """ Retrieve number of alerts in timeframe (GET-Parameter time as decimal or "day") """

    # Retrieve Number of Alerts from ElasticSearch and return as xml / json
    if not request.args.get('time'):
        app.logger.error('No time GET-parameter supplied in retrieveAlertsCount. Must be decimal number (in minutes) or string "day"')
        return app.config['DEFAULTRESPONSE']
    else:
        if request.args.get('out') and request.args.get('out') == 'json':
            # get result from cache
            getCacheResult = getCache(request.url, "url")
            if getCacheResult is not False:
                return jsonify(getCacheResult)
            else:
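                # 46080 minutes = 32 days; dividing by 1440 (minutes per day)
                # yields whole days, plus 2 extra day indices, presumably to
                # cover partial days at the range boundaries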
                if request.args.get('time').isdecimal() and int(request.args.get('time')) <= 46080:
                    indexDays = int(int(request.args.get('time')) / 1440) + 2
                elif request.args.get('time') == "day":
                    indexDays = 1
                else:
                    indexDays = 0
                returnResult = formatAlertsCount(queryAlertsCount(request.args.get('time'), checkCommunityIndex(request), getRelevantIndices(indexDays)), 'json')
                setCache(request.url, returnResult, 60, "url")
                return jsonify(returnResult)

        else:
            # get result from cache
            getCacheResult = getCache(request.url, "url")
            if getCacheResult is not False:
                return Response(getCacheResult, mimetype='text/xml')
            else:
                if request.args.get('time').isdecimal() and int(request.args.get('time')) <= 46080:
                    indexDays = int(int(request.args.get('time')) / 1440) + 2
                elif request.args.get('time') == "day":
                    indexDays = 1
                else:
                    indexDays = 0
                returnResult = formatAlertsCount(queryAlertsCount(request.args.get('time'), checkCommunityIndex(request), getRelevantIndices(indexDays)), 'xml')
                setCache(request.url, returnResult, 60, "url")
                return Response(returnResult, mimetype='text/xml') 
Example #11
Source File: app.py    From fb-feed-gen with GNU General Public License v2.0
def generate_feed():
    # app.logger.warning(request.args)

    param = request.args.get('username')
    if param:
        username = urllib.parse.unquote(param).strip()
        match, display = fetch.is_valid_username(username)

        if (match):
            # get posts
            site_url = fetch.build_site_url(username)
            data = fetch.get_remote_data(site_url)
            items = fetch.extract_items(username, data)

            if (items and len(items) > 0):
                # create feed
                feed = AtomFeed('{0} FB Posts'.format(display),
                                subtitle=site_url,
                                feed_url=request.url,
                                url=request.url_root)

                for post in items:
                    feed.add(post['title'],
                             post['article'],
                             content_type='html',
                             author=post['author'],
                             url=post['url'],
                             updated=post['date'],
                             published=post['date'])

                return feed.get_response()
            else:
                return 'No posts found. Are you sure you put in the correct username?'
        else:
            return 'Invalid username provided'
    else:
        return 'No username provided in query string'


# launch 
Example #12
Source File: web.py    From mailur with GNU General Public License v3.0
def proxy_by_nginx(url):
    url = '/.proxy?url=%s' % url
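    # nginx intercepts X-Accel-Redirect on the response and internally
    # re-serves the given URI in place of this (empty) body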
    response.set_header('X-Accel-Redirect', url)
    return '' 
Example #13
Source File: web.py    From mailur with GNU General Public License v3.0
def redirect(url, code=None):
    if not code:
        code = 303 if request.get('SERVER_PROTOCOL') == 'HTTP/1.1' else 302
    response.status = code
    response.body = ''
    response.set_header('Location', urllib.parse.urljoin(request.url, url))
    return response 
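The urljoin call above is what makes the Location header absolute: a relative target is resolved against the current request.url, while an already-absolute target passes through unchanged. For example:

from urllib.parse import urljoin

urljoin('https://example.com/app/page', '/login')   # 'https://example.com/login'
urljoin('https://example.com/app/page', 'next')     # 'https://example.com/app/next'
urljoin('https://example.com/app/page', 'https://other.org/x')  # 'https://other.org/x'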
Example #14
Source File: web.py    From mailur with GNU General Public License v3.0
def proxy():
    url = request.query.get('url')
    if not url:
        return abort(400)

    return proxy_by_nginx(url) 
Example #15
Source File: web.py    From mailur with GNU General Public License v3.0
def avatars():
    hashes = set(request.query['hashes'].split(','))
    size = request.query.get('size', 20)
    default = request.query.get('default', 'identicon')
    cls = request.query.get('cls', '.pic-%s')

    response.content_type = 'text/css'
    return '\n'.join((
        '%s {background-image: url(data:image/gif;base64,%s);}'
        % ((cls % h), i.decode())
    ) for h, i in fetch_avatars(hashes, size, default)) 
Example #16
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveTopCountriesAttacks():
    """ Retrieve the Top X countries and their attacks within month
    """

    # get result from cache
    getCacheResult = getCache(request.url, "url")
    if getCacheResult is not False:
        return jsonify(getCacheResult)

    # query ES
    else:
        if not request.args.get('monthOffset'):
            # Using default : within the last month
            offset = None
        else:
            offset = request.args.get('monthOffset')

        if not request.args.get('topx'):
            # Using default top 10
            topx = None
        else:
            topx = request.args.get('topx')
        returnResult = formatTopCountriesAttacks(queryTopCountriesAttacks(offset, topx, checkCommunityIndex(request), getRelevantIndices(0)))
        setCache(request.url, returnResult, 60, "url")
        app.logger.debug('UNCACHED %s' % str(request.url))
        return jsonify(returnResult) 
Example #17
Source File: peba.py    From PEBA with GNU General Public License v3.0
def tpotstats():
    """ Retrieve statistics on tpot community installations.
    """
    today = str(datetime.date.today()).replace("-", "")

    # get result from cache
    getCacheResult = getCache(request.url, "url")
    if getCacheResult is not False:
        return jsonify(getCacheResult)

    # query ES
    else:

        if not request.args.get('day'):
            # Using default : today
            offset = None
        else:
            offset = request.args.get('day')

        returnResult = getTPotAlertStatsJson(app, es, getRelevantIndices(0), offset)

        if not returnResult:
            return app.config['DEFAULTRESPONSE']

        if request.args.get('day') != today:
            setCache(request.url, returnResult, 60*1440*28, "url")
            return jsonify(returnResult)
        else:
            return jsonify(returnResult) 
Example #18
Source File: http_helpers.py    From shavar with Mozilla Public License 2.0
def proxy(request, scheme, netloc, timeout=5):
    """Proxies and return the result from the other server.

    - scheme: http or https
    - netloc: proxy location
    """
    parsed = urlparse(request.url)
    path = parsed.path
    params = parsed.params
    query = parsed.query
    fragment = parsed.fragment
    url = urlunparse((scheme, netloc, path, params, query, fragment))
    method = request.method
    data = request.body

    # copying all X- headers
    xheaders = {}
    for header, value in list(request.headers.items()):
        if not header.startswith('X-'):
            continue
        xheaders[header] = value

    if 'X-Forwarded-For' not in request.headers:
        xheaders['X-Forwarded-For'] = request.remote_addr

    if hasattr(request, '_authorization'):
        xheaders['Authorization'] = request._authorization

    status, headers, body = get_url(url, method, data, timeout=timeout,
                                    extra_headers=xheaders)

    return Response(body, status, list(headers.items())) 
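The urlparse/urlunparse pair above swaps only the scheme and network location while preserving path, params, query, and fragment. A quick illustration with a made-up proxy host:

from urllib.parse import urlparse, urlunparse

parsed = urlparse('https://api.example.com/v1/items?limit=5#frag')
urlunparse(('http', 'proxy.internal:8080', parsed.path,
            parsed.params, parsed.query, parsed.fragment))
# 'http://proxy.internal:8080/v1/items?limit=5#frag'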
Example #19
Source File: distributed_scheduler.py    From scrapy-cluster with MIT License
def enqueue_request(self, request):
    '''
    Pushes a request from the spider into the proper throttled queue
    '''
    if not request.dont_filter and self.dupefilter.request_seen(request):
        self.logger.debug("Request not added back to redis")
        return
    req_dict = self.request_to_dict(request)

    if not self.is_blacklisted(req_dict['meta']['appid'],
                               req_dict['meta']['crawlid']):
        # grab the tld of the request
        ex_res = self.extract(req_dict['url'])
        key = "{sid}:{dom}.{suf}:queue".format(
            sid=req_dict['meta']['spiderid'],
            dom=ex_res.domain,
            suf=ex_res.suffix)

        curr_time = time.time()

        domain = "{d}.{s}".format(d=ex_res.domain, s=ex_res.suffix)

        # allow only if we want all requests or we want
        # everything but blacklisted domains
        # insert if crawl never expires (0) or time < expires
        if (self.backlog_blacklist or
                (not self.backlog_blacklist and
                 domain not in self.black_domains)) and \
                (req_dict['meta']['expires'] == 0 or
                 curr_time < req_dict['meta']['expires']):
            # we may already have the queue in memory
            if key in self.queue_keys:
                self.queue_dict[key][0].push(req_dict,
                                             req_dict['meta']['priority'])
            else:
                # shoving into a new redis queue, negative b/c of sorted sets
                # this will populate ourself and other schedulers when
                # they call create_queues
                self.redis_conn.zadd(key, ujson.dumps(req_dict),
                                     -req_dict['meta']['priority'])
            self.logger.debug("Crawlid: '{id}' Appid: '{appid}' added to queue"
                              .format(appid=req_dict['meta']['appid'],
                                      id=req_dict['meta']['crawlid']))
        else:
            self.logger.debug("Crawlid: '{id}' Appid: '{appid}' expired"
                              .format(appid=req_dict['meta']['appid'],
                                      id=req_dict['meta']['crawlid']))
    else:
        self.logger.debug("Crawlid: '{id}' Appid: '{appid}' blacklisted"
                          .format(appid=req_dict['meta']['appid'],
                                  id=req_dict['meta']['crawlid']))
Example #20
Source File: peba.py    From PEBA with GNU General Public License v3.0
def stats():
    """ Retrieve detailed statistics of community installations.
    """
    # get result from cache
    getCacheResult = getCache(urllib.parse.quote_plus(request.url), "url")
    if getCacheResult is not False:
        return jsonify(getCacheResult)

    else:
        queryValue = []
        if not request.args.get('values'):
            # Using default : none
            queryValue=[]
        else:
            for i in urllib.parse.unquote_plus(request.args.get('values')).split(','):
                queryValue.append(i)

        # check start / end times
        # gte
        if not request.args.get('gte'):
            gte = (datetime.datetime.utcnow()+datetime.timedelta(days=-1)).strftime('%Y-%m-%d %H:%M:%S')
            app.logger.error("getStats: no gte value given, setting to default now-24h")

        else:

            try:
                datetime.datetime.strptime(urllib.parse.unquote_plus(request.args.get('gte')), '%Y-%m-%d %H:%M:%S')
                gte = urllib.parse.unquote_plus(request.args.get('gte'))
            except ValueError:
                app.logger.debug("getStats: Incorrect date format for gte, falling back to default gte")
                gte = (datetime.datetime.utcnow() + datetime.timedelta(days=-1)).strftime('%Y-%m-%d %H:%M:%S')
        # lt
        if not request.args.get('lt'):
            lt = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
            app.logger.error("getStats: no lt value given, setting to default now()")
        else:
            try:
                datetime.datetime.strptime(urllib.parse.unquote_plus(request.args.get('lt')), '%Y-%m-%d %H:%M:%S')
                lt = urllib.parse.unquote_plus(request.args.get('lt'))
            except ValueError:
                app.logger.debug("getStats: Incorrect date format for lt, falling back to default lt")
                lt = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

        returnResult = getStats(app, es, statisticIndex, gte, lt, queryValue)

        if not returnResult:
            return app.config['DEFAULTRESPONSE']

        else:
            setCache(urllib.parse.quote_plus(request.url), returnResult, 60*30, "url")
            return jsonify(returnResult) 
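Unlike the earlier handlers, this one percent-encodes request.url with quote_plus before using it as a cache key, which keeps the key free of raw '/', '?', and spaces; unquote_plus reverses it. With a made-up URL:

from urllib.parse import quote_plus, unquote_plus

key = quote_plus('https://host/stats?values=a,b&gte=2023-01-01 00:00:00')
# 'https%3A%2F%2Fhost%2Fstats%3Fvalues%3Da%2Cb%26gte%3D2023-01-01+00%3A00%3A00'
unquote_plus(key)  # recovers the original URL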
Example #21
Source File: peba.py    From PEBA with GNU General Public License v3.0
def topx():
    """ Retrieve the top x URLs/ports and gather their timeline .
    """

    # get result from cache
    getCacheResult = getCache(urllib.parse.quote_plus(request.url), "url")
    if getCacheResult is not False:
        return jsonify(getCacheResult)

    else:
        # get topx
        if not request.args.get('topx'):
            topnumber = 10
        elif request.args.get('topx').isdecimal() and int(request.args.get('topx')) <= 30:
            topnumber = request.args.get('topx')
        else:
            return app.config['DEFAULTRESPONSE']

        # check Type
        if not request.args.get('type'):
            return app.config['DEFAULTRESPONSE']
        else:
            if request.args.get('type') in ['destports', 'urls']:
                toptype = request.args.get('type')
            else:
                return app.config['DEFAULTRESPONSE']

        # check timespan
        # days
        if not request.args.get('days'):
            days = 1
            indices = getRelevantIndices(days + 1)
        elif request.args.get('days') in ["1", "7", "28"]:
            days = int(request.args.get('days'))
            if days == 28:
                indices = getRelevantIndices(0)
            else:
                indices = getRelevantIndices(days + 1)
        else:
            return app.config['DEFAULTRESPONSE']

        returnResult = getTops(app, es, indices, days, toptype, topnumber)

        if not returnResult:
            return app.config['DEFAULTRESPONSE']

        else:
            setCache(urllib.parse.quote_plus(request.url), returnResult, 3600*2, "url")
            return jsonify(returnResult)


# PUT Service