Python urllib.request.url() Examples

The following are 21 code examples showing request.url in use. Note that url is an attribute of a framework request object (Flask, Bottle, Scrapy, and others), not a callable in urllib.request; several of the examples combine it with helpers from urllib.parse. You can go to the original project or source file by following the link above each example, or check out all available functions and classes of the urllib.request module.
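Before the individual examples, here is a minimal sketch of the pattern most of them share: reading request.url inside a view function. The sketch uses Flask; the route and response shape are illustrative only, not taken from any project below.

from flask import Flask, jsonify, request

app = Flask(__name__)

@app.route('/echo')
def echo_url():
    # request.url is the full URL of the current request, query string included
    return jsonify({'url': request.url})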
Example #1
Source File: distributed_scheduler.py    From scrapy-cluster with MIT License
def request_to_dict(self, request):
    '''
    Convert Request object to a dict.
    modified from scrapy.utils.reqser
    '''
    req_dict = {
        # urls should be safe (safe_string_url)
        'url': to_unicode(request.url),
        'method': request.method,
        'headers': dict(request.headers),
        'body': request.body,
        'cookies': request.cookies,
        'meta': request.meta,
        '_encoding': request._encoding,
        'priority': request.priority,
        'dont_filter': request.dont_filter,
        # callback/errback are assumed to be a bound instance of the spider
        'callback': None if request.callback is None else request.callback.__name__,
        'errback': None if request.errback is None else request.errback.__name__,
    }
    return req_dict
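Since the dict above stores callbacks by name, the receiving side can rebuild the Request by resolving those names against the spider. A minimal sketch of the inverse, assuming a hypothetical helper dict_to_request (not part of scrapy-cluster):

from scrapy import Request

def dict_to_request(spider, d):
    # hypothetical inverse of request_to_dict: callback/errback were stored
    # as names, so look them up as bound methods on the spider
    return Request(
        url=d['url'],
        method=d['method'],
        headers=d['headers'],
        body=d['body'],
        cookies=d['cookies'],
        meta=d['meta'],
        encoding=d['_encoding'],
        priority=d['priority'],
        dont_filter=d['dont_filter'],
        callback=getattr(spider, d['callback']) if d['callback'] else None,
        errback=getattr(spider, d['errback']) if d['errback'] else None,
    )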
Example #2
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveAlertStats():
    """ Retrieve combined statistics
        AlertsLastMinute, AlertsLastHour,  AlertsLast24Hours
    """

    # get result from cache
    getCacheResult = getCache(request.url, "url")
    if getCacheResult is not False:
        return jsonify(getCacheResult)

    # query ES
    else:
        returnResult = formatAlertStats(queryAlertStats(checkCommunityIndex(request), getRelevantIndices(2)))
        setCache(request.url, returnResult, 13, "url")
        app.logger.debug('UNCACHED %s' % str(request.url))
        return jsonify(returnResult) 
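Most of the PEBA handlers on this page share the caching pattern seen here: the full request.url, query string included, is the cache key, so each distinct parameter combination is cached independently. A minimal sketch of such a key-by-URL cache, with a hypothetical in-memory store standing in for PEBA's getCache/setCache helpers:

import time

_store = {}  # hypothetical in-memory cache: url -> (expiry, value)

def get_cache(url):
    # return the cached value for this URL, or False (the sentinel PEBA checks)
    entry = _store.get(url)
    if entry is not None and entry[0] > time.time():
        return entry[1]
    return False

def set_cache(url, value, ttl_seconds):
    # cache a value under this URL for ttl_seconds
    _store[url] = (time.time() + ttl_seconds, value)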
Example #3
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveDatasetAlertTypesPerMonth():
    """ Retrieve the attacks / day in the last x days from elasticsearch,
        split by attack group
        and return as JSON for the last x months, defaults to last month,
        if no GET parameter days is given
    """

    # get result from cache
    getCacheResult = getCache(request.url, "url")
    if getCacheResult is not False:
        return jsonify(getCacheResult)

    # query ES
    else:
        if not request.args.get('days'):
            # Using default : within the last month (max 31 day indices)
            returnResult = formatDatasetAlertTypesPerMonth(queryDatasetAlertTypesPerMonth(None, checkCommunityIndex(request), getRelevantIndices(32)))
        else:
            if request.args.get('days').isdecimal() and int(request.args.get('days')) <= 31:
                indexDays = int(request.args.get('days')) + 1
            else:
                indexDays = 0
            returnResult = formatDatasetAlertTypesPerMonth(queryDatasetAlertTypesPerMonth(request.args.get('days'), checkCommunityIndex(request), getRelevantIndices(indexDays)))
        setCache(request.url, returnResult, 3600, "url")
        return jsonify(returnResult) 
Example #4
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveDatasetAlertsPerMonth():
    """ Retrieve the attacks / day in the last x days from elasticsearch
        and return as JSON for the last months, defaults to last month,
        if no GET parameter days is given
    """

    # get result from cache
    getCacheResult = getCache(request.url, "url")
    if getCacheResult is not False:
        return jsonify(getCacheResult)

    # query ES
    else:
        if not request.args.get('days'):
            # Using default : within the last month (max 31 day indices)
            returnResult = formatDatasetAlertsPerMonth(queryDatasetAlertsPerMonth(None, checkCommunityIndex(request), getRelevantIndices(32)))
        else:
            if request.args.get('days').isdecimal() and int(request.args.get('days')) <= 31:
                indexDays = int(request.args.get('days')) + 1
            else:
                indexDays = 0
            returnResult = formatDatasetAlertsPerMonth(queryDatasetAlertsPerMonth(request.args.get('days'), checkCommunityIndex(request), getRelevantIndices(indexDays)))
        setCache(request.url, returnResult, 600, "url")
        return jsonify(returnResult) 
Example #5
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveAlertsJson():
    """ Retrieve last 5 Alerts in JSON without IPs """

    # set cacheItem independent from url parameters, respect community index
    cacheEntry = request.url

    # get result from cache
    getCacheResult = getCache(cacheEntry, "url")
    if getCacheResult is not False:
        app.logger.debug('Returning /retrieveAlertsJson from Cache %s' % str(request.remote_addr))
        return jsonify(getCacheResult)

    # query ES
    else:
        numAlerts = 35
        # Retrieve last X Alerts from ElasticSearch and return JSON formatted with limited alert content
        returnResult = formatAlertsJson(queryAlertsWithoutIP(numAlerts, checkCommunityIndex(request), getRelevantIndices(2)))
        setCache(cacheEntry, returnResult, 25, "url")
        app.logger.debug('UNCACHED %s' % str(request.url))
        return jsonify(returnResult) 
Example #6
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveIPs15m():
    """ Retrieve IPs from the last 15mins from ElasticSearch and return formatted XML or JSON with IPs """

    if request.args.get('out') and request.args.get('out') == 'json':
        getCacheResult = getCache(request.url, "url")
        if getCacheResult is not False:
            return jsonify(getCacheResult)
        else:
            returnResult = formatBadIP(
                queryBadIPs(15, checkCommunityIndex(request), getRelevantIndices(2)), 'json')
            setCache(request.url, returnResult, 60, "url")
            return jsonify(returnResult)
    else:
        getCacheResult = getCache(request.url, "url")
        if getCacheResult is not False:
            return Response(getCacheResult, mimetype='text/xml')
        else:
            returnResult = formatBadIP(
                queryBadIPs(15, checkCommunityIndex(request), getRelevantIndices(2)), 'xml')
            setCache(request.url, returnResult, 60, "url")
            return Response(returnResult, mimetype='text/xml')

# Routes with JSON output 
Example #7
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveIPs():
    """ Retrieve IPs from ElasticSearch and return formatted XML or JSON with IPs """

    if request.args.get('out') and request.args.get('out') == 'json':
        getCacheResult = getCache(request.url, "url")
        if getCacheResult is not False:
            return jsonify(getCacheResult)
        else:
            returnResult = formatBadIP(
                queryBadIPs(app.config['BADIPTIMESPAN'], checkCommunityIndex(request), getRelevantIndices(2)), 'json')
            setCache(request.url, returnResult, 60, "url")
            return jsonify(returnResult)
    else:
        getCacheResult = getCache(request.url, "url")
        if getCacheResult is not False:
            return Response(getCacheResult, mimetype='text/xml')
        else:
            returnResult = formatBadIP(
                queryBadIPs(app.config['BADIPTIMESPAN'], checkCommunityIndex(request), getRelevantIndices(2)), 'xml')
            setCache(request.url, returnResult, 60, "url")
            return Response(returnResult, mimetype='text/xml') 
Example #8
Source File: peba.py    From PEBA with GNU General Public License v3.0
def querySingleIP():
    """ Retrieve Attack data from index about a single IP
    """

    # get result from cache
    getCacheResult = getCache(request.url, "url")
    if getCacheResult is not False:
        app.logger.debug('Returning /querySingleIP from Cache for %s' % str(request.remote_addr))
        return Response(getCacheResult)

    # query ES
    else:
        returnResult = formatSingleIP(queryForSingleIP(app.config['MAXALERTS'], request.args.get('ip'), checkCommunityIndex(request), getRelevantIndices(0)))
        setCache(request.url, returnResult, 60, "url")
        app.logger.debug('Returning /querySingleIP from ES for %s' % str(request.remote_addr))
        return Response(returnResult, mimetype='text/xml')

# Routes with both XML and JSON output 
Example #9
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveAlertsCyber():
    """ Retrieve Alerts from ElasticSearch and return formatted 
        XML with limited alert content
    """

    # get result from cache
    getCacheResult = getCache(request.url, "url")
    if getCacheResult is not False:
        app.logger.debug('Returning /retrieveAlertsCyber from Cache for %s' % str(request.remote_addr))
        return Response(getCacheResult)

    # query ES
    else:
        returnResult = formatAlertsXml(queryAlerts(app.config['MAXALERTS'], checkCommunityIndex(request), getRelevantIndices(2)))
        setCache(request.url, returnResult, 1, "url")
        app.logger.debug('Returning /retrieveAlertsCyber from ES for %s' % str(request.remote_addr))
        return Response(returnResult, mimetype='text/xml') 
Example #10
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveAlertsCount():
    """ Retrieve number of alerts in timeframe (GET-Parameter time as decimal or "day") """

    # Retrieve Number of Alerts from ElasticSearch and return as xml / json
    if not request.args.get('time'):
        app.logger.error('No time GET-parameter supplied in retrieveAlertsCount. Must be decimal number (in minutes) or string "day"')
        return app.config['DEFAULTRESPONSE']
    else:
        if request.args.get('out') and request.args.get('out') == 'json':
            # get result from cache
            getCacheResult = getCache(request.url, "url")
            if getCacheResult is not False:
                return jsonify(getCacheResult)
            else:
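                # 46080 minutes = 32 days; dividing by 1440 (minutes per day)
                # yields whole days, plus 2 extra day indices, presumably to
                # cover partial days at the range boundaries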
                if request.args.get('time').isdecimal() and int(request.args.get('time')) <= 46080:
                    indexDays = int(int(request.args.get('time')) / 1440) + 2
                elif request.args.get('time') == "day":
                    indexDays = 1
                else:
                    indexDays = 0
                returnResult = formatAlertsCount(queryAlertsCount(request.args.get('time'), checkCommunityIndex(request), getRelevantIndices(indexDays)), 'json')
                setCache(request.url, returnResult, 60, "url")
                return jsonify(returnResult)

        else:
            # get result from cache
            getCacheResult = getCache(request.url, "url")
            if getCacheResult is not False:
                return Response(getCacheResult, mimetype='text/xml')
            else:
                if request.args.get('time').isdecimal() and int(request.args.get('time')) <= 46080:
                    indexDays = int(int(request.args.get('time')) / 1440) + 2
                elif request.args.get('time') == "day":
                    indexDays = 1
                else:
                    indexDays = 0
                returnResult = formatAlertsCount(queryAlertsCount(request.args.get('time'), checkCommunityIndex(request), getRelevantIndices(indexDays)), 'xml')
                setCache(request.url, returnResult, 60, "url")
                return Response(returnResult, mimetype='text/xml') 
Example #11
Source File: app.py    From fb-feed-gen with GNU General Public License v2.0
def generate_feed():
    # app.logger.warning(request.args)

    param = request.args.get('username')
    if param:
        username = urllib.parse.unquote(param).strip()
        match, display = fetch.is_valid_username(username)

        if (match):
            # get posts
            site_url = fetch.build_site_url(username)
            data = fetch.get_remote_data(site_url)
            items = fetch.extract_items(username, data)

            if (items and len(items) > 0):
                # create feed
                feed = AtomFeed('{0} FB Posts'.format(display),
                                subtitle=site_url,
                                feed_url=request.url,
                                url=request.url_root)

                for post in items:
                    feed.add(post['title'],
                             post['article'],
                             content_type='html',
                             author=post['author'],
                             url=post['url'],
                             updated=post['date'],
                             published=post['date'])

                return feed.get_response()
            else:
                return 'No posts found. Are you sure you put in the correct username?'
        else:
            return 'Invalid username provided'
    else:
        return 'No username provided in query string'


# launch 
Example #12
Source File: web.py    From mailur with GNU General Public License v3.0
def proxy_by_nginx(url):
    url = '/.proxy?url=%s' % url
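    # nginx intercepts X-Accel-Redirect on the response and internally
    # re-serves the given URI in place of this (empty) body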
    response.set_header('X-Accel-Redirect', url)
    return '' 
Example #13
Source File: web.py    From mailur with GNU General Public License v3.0
def redirect(url, code=None):
    if not code:
        code = 303 if request.get('SERVER_PROTOCOL') == 'HTTP/1.1' else 302
    response.status = code
    response.body = ''
    response.set_header('Location', urllib.parse.urljoin(request.url, url))
    return response 
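The urljoin call above is what makes the Location header absolute: a relative target is resolved against the current request.url, while an already-absolute target passes through unchanged. For example:

from urllib.parse import urljoin

urljoin('https://example.com/app/page', '/login')   # 'https://example.com/login'
urljoin('https://example.com/app/page', 'next')     # 'https://example.com/app/next'
urljoin('https://example.com/app/page', 'https://other.org/x')  # 'https://other.org/x'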
Example #14
Source File: web.py    From mailur with GNU General Public License v3.0
def proxy():
    url = request.query.get('url')
    if not url:
        return abort(400)

    return proxy_by_nginx(url) 
Example #15
Source File: web.py    From mailur with GNU General Public License v3.0
def avatars():
    hashes = set(request.query['hashes'].split(','))
    size = request.query.get('size', 20)
    default = request.query.get('default', 'identicon')
    cls = request.query.get('cls', '.pic-%s')

    response.content_type = 'text/css'
    return '\n'.join((
        '%s {background-image: url(data:image/gif;base64,%s);}'
        % ((cls % h), i.decode())
    ) for h, i in fetch_avatars(hashes, size, default)) 
Example #16
Source File: peba.py    From PEBA with GNU General Public License v3.0
def retrieveTopCountriesAttacks():
    """ Retrieve the Top X countries and their attacks within month
    """

    # get result from cache
    getCacheResult = getCache(request.url, "url")
    if getCacheResult is not False:
        return jsonify(getCacheResult)

    # query ES
    else:
        if not request.args.get('monthOffset'):
            # Using default : within the last month
            offset = None
        else:
            offset = request.args.get('monthOffset')

        if not request.args.get('topx'):
            # Using default top 10
            topx = None
        else:
            topx = request.args.get('topx')
        returnResult = formatTopCountriesAttacks(queryTopCountriesAttacks(offset, topx, checkCommunityIndex(request), getRelevantIndices(0)))
        setCache(request.url, returnResult, 60, "url")
        app.logger.debug('UNCACHED %s' % str(request.url))
        return jsonify(returnResult) 
Example #17
Source File: peba.py    From PEBA with GNU General Public License v3.0
def tpotstats():
    """ Retrieve statistics on tpot community installations.
    """
    today = str(datetime.date.today()).replace("-", "")

    # get result from cache
    getCacheResult = getCache(request.url, "url")
    if getCacheResult is not False:
        return jsonify(getCacheResult)

    # query ES
    else:

        if not request.args.get('day'):
            # Using default : today
            offset = None
        else:
            offset = request.args.get('day')

        returnResult = getTPotAlertStatsJson(app, es, getRelevantIndices(0), offset)

        if not returnResult:
            return app.config['DEFAULTRESPONSE']

        if request.args.get('day') != today:
            setCache(request.url, returnResult, 60*1440*28, "url")
            return jsonify(returnResult)
        else:
            return jsonify(returnResult) 
Example #18
Source File: http_helpers.py    From shavar with Mozilla Public License 2.0
def proxy(request, scheme, netloc, timeout=5):
    """Proxies and return the result from the other server.

    - scheme: http or https
    - netloc: proxy location
    """
    parsed = urlparse(request.url)
    path = parsed.path
    params = parsed.params
    query = parsed.query
    fragment = parsed.fragment
    url = urlunparse((scheme, netloc, path, params, query, fragment))
    method = request.method
    data = request.body

    # copying all X- headers
    xheaders = {}
    for header, value in list(request.headers.items()):
        if not header.startswith('X-'):
            continue
        xheaders[header] = value

    if 'X-Forwarded-For' not in request.headers:
        xheaders['X-Forwarded-For'] = request.remote_addr

    if hasattr(request, '_authorization'):
        xheaders['Authorization'] = request._authorization

    status, headers, body = get_url(url, method, data, timeout=timeout,
                                    extra_headers=xheaders)

    return Response(body, status, list(headers.items())) 
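The urlparse/urlunparse pair above swaps only the scheme and network location while preserving path, params, query, and fragment. A quick illustration with a made-up proxy host:

from urllib.parse import urlparse, urlunparse

parsed = urlparse('https://api.example.com/v1/items?limit=5#frag')
urlunparse(('http', 'proxy.internal:8080', parsed.path,
            parsed.params, parsed.query, parsed.fragment))
# 'http://proxy.internal:8080/v1/items?limit=5#frag'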
Example #19
Source File: distributed_scheduler.py    From scrapy-cluster with MIT License
def enqueue_request(self, request):
    '''
    Pushes a request from the spider into the proper throttled queue
    '''
    if not request.dont_filter and self.dupefilter.request_seen(request):
        self.logger.debug("Request not added back to redis")
        return
    req_dict = self.request_to_dict(request)

    if not self.is_blacklisted(req_dict['meta']['appid'],
                               req_dict['meta']['crawlid']):
        # grab the tld of the request
        ex_res = self.extract(req_dict['url'])
        key = "{sid}:{dom}.{suf}:queue".format(
            sid=req_dict['meta']['spiderid'],
            dom=ex_res.domain,
            suf=ex_res.suffix)

        curr_time = time.time()

        domain = "{d}.{s}".format(d=ex_res.domain, s=ex_res.suffix)

        # allow only if we want all requests or we want
        # everything but blacklisted domains
        # insert if crawl never expires (0) or time < expires
        if (self.backlog_blacklist or
                (not self.backlog_blacklist and
                 domain not in self.black_domains)) and \
                (req_dict['meta']['expires'] == 0 or
                 curr_time < req_dict['meta']['expires']):
            # we may already have the queue in memory
            if key in self.queue_keys:
                self.queue_dict[key][0].push(req_dict,
                                             req_dict['meta']['priority'])
            else:
                # shoving into a new redis queue, negative b/c of sorted sets
                # this will populate ourself and other schedulers when
                # they call create_queues
                self.redis_conn.zadd(key, ujson.dumps(req_dict),
                                     -req_dict['meta']['priority'])
            self.logger.debug("Crawlid: '{id}' Appid: '{appid}' added to queue"
                              .format(appid=req_dict['meta']['appid'],
                                      id=req_dict['meta']['crawlid']))
        else:
            self.logger.debug("Crawlid: '{id}' Appid: '{appid}' expired"
                              .format(appid=req_dict['meta']['appid'],
                                      id=req_dict['meta']['crawlid']))
    else:
        self.logger.debug("Crawlid: '{id}' Appid: '{appid}' blacklisted"
                          .format(appid=req_dict['meta']['appid'],
                                  id=req_dict['meta']['crawlid']))
Example #20
Source File: peba.py    From PEBA with GNU General Public License v3.0
def stats():
    """ Retrieve detailed statistics of community installations.
    """
    # get result from cache
    getCacheResult = getCache(urllib.parse.quote_plus(request.url), "url")
    if getCacheResult is not False:
        return jsonify(getCacheResult)

    else:
        queryValue = []
        if not request.args.get('values'):
            # Using default : none
            queryValue=[]
        else:
            for i in urllib.parse.unquote_plus(request.args.get('values')).split(','):
                queryValue.append(i)

        # check start / end times
        # gte
        if not request.args.get('gte'):
            gte = (datetime.datetime.utcnow()+datetime.timedelta(days=-1)).strftime('%Y-%m-%d %H:%M:%S')
            app.logger.error("getStats: no gte value given, setting to default now-24h")

        else:

            try:
                datetime.datetime.strptime(urllib.parse.unquote_plus(request.args.get('gte')), '%Y-%m-%d %H:%M:%S')
                gte = urllib.parse.unquote_plus(request.args.get('gte'))
            except ValueError:
                app.logger.debug("getStats: Incorrect date format for gte, falling back to default gte")
                gte = (datetime.datetime.utcnow() + datetime.timedelta(days=-1)).strftime('%Y-%m-%d %H:%M:%S')
        # lt
        if not request.args.get('lt'):
            lt = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
            app.logger.error("getStats: no lt value given, setting to default now()")
        else:
            try:
                datetime.datetime.strptime(urllib.parse.unquote_plus(request.args.get('lt')), '%Y-%m-%d %H:%M:%S')
                lt = urllib.parse.unquote_plus(request.args.get('lt'))
            except ValueError:
                app.logger.debug("getStats: Incorrect date format for lt, falling back to default lt")
                lt = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

        returnResult = getStats(app, es, statisticIndex, gte, lt, queryValue)

        if not returnResult:
            return app.config['DEFAULTRESPONSE']

        else:
            setCache(urllib.parse.quote_plus(request.url), returnResult, 60*30, "url")
            return jsonify(returnResult) 
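Unlike the earlier handlers, this one percent-encodes request.url with quote_plus before using it as a cache key, which keeps the key free of raw '/', '?', and spaces; unquote_plus reverses it. With a made-up URL:

from urllib.parse import quote_plus, unquote_plus

key = quote_plus('https://host/stats?values=a,b&gte=2023-01-01 00:00:00')
# 'https%3A%2F%2Fhost%2Fstats%3Fvalues%3Da%2Cb%26gte%3D2023-01-01+00%3A00%3A00'
unquote_plus(key)  # recovers the original URL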
Example #21
Source File: peba.py    From PEBA with GNU General Public License v3.0
def topx():
    """ Retrieve the top x URLs/ports and gather their timeline .
    """

    # get result from cache
    getCacheResult = getCache(urllib.parse.quote_plus(request.url), "url")
    if getCacheResult is not False:
        return jsonify(getCacheResult)

    else:
        # get topx
        if not request.args.get('topx'):
            topnumber = 10
        elif request.args.get('topx').isdecimal() and int(request.args.get('topx')) <= 30:
            topnumber = request.args.get('topx')
        else:
            return app.config['DEFAULTRESPONSE']

        # check Type
        if not request.args.get('type'):
            return app.config['DEFAULTRESPONSE']
        else:
            if request.args.get('type') in ['destports', 'urls']:
                toptype = request.args.get('type')
            else:
                return app.config['DEFAULTRESPONSE']

        # check timespan
        # days
        if not request.args.get('days'):
            days = 1
            indices = getRelevantIndices(days + 1)
        elif request.args.get('days') in ["1", "7", "28"]:
            days = int(request.args.get('days'))
            if days == 28:
                indices = getRelevantIndices(0)
            else:
                indices = getRelevantIndices(days + 1)
        else:
            return app.config['DEFAULTRESPONSE']

        returnResult = getTops(app, es, indices, days, toptype, topnumber)

        if not returnResult:
            return app.config['DEFAULTRESPONSE']

        else:
            setCache(urllib.parse.quote_plus(request.url), returnResult, 3600*2, "url")
            return jsonify(returnResult)


# PUT Service