Python scrapy.utils.python.to_bytes() Examples
The following are 26 code examples of scrapy.utils.python.to_bytes(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.utils.python, or try the search function.
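Before the examples, a quick orientation: to_bytes(text, encoding=None, errors='strict') returns the binary representation of text. Bytes pass through unchanged, str values are encoded (UTF-8 by default), and anything else raises TypeError. The sketch below is a minimal re-implementation for illustration, not the library's exact source:

def to_bytes(text, encoding=None, errors='strict'):
    # bytes are returned as-is; str is encoded; other types are rejected
    if isinstance(text, bytes):
        return text
    if not isinstance(text, str):
        raise TypeError('to_bytes must receive a str or bytes object, '
                        'got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)

Typical behaviour:

>>> from scrapy.utils.python import to_bytes
>>> to_bytes('café')                      # str -> bytes, UTF-8 by default
b'caf\xc3\xa9'
>>> to_bytes(b'abc')                      # bytes pass through unchanged
b'abc'
>>> to_bytes('café', encoding='latin-1')
b'caf\xe9'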
Example #1
Source File: response.py From learn_python3_spider with MIT License | 6 votes |
def open_in_browser(response, _openfunc=webbrowser.open):
    """Open the given response in a local web browser, populating the <base>
    tag for external links to work
    """
    from scrapy.http import HtmlResponse, TextResponse
    # XXX: this implementation is a bit dirty and could be improved
    body = response.body
    if isinstance(response, HtmlResponse):
        if b'<base' not in body:
            repl = '<head><base href="%s">' % response.url
            body = body.replace(b'<head>', to_bytes(repl))
        ext = '.html'
    elif isinstance(response, TextResponse):
        ext = '.txt'
    else:
        raise TypeError("Unsupported response type: %s" %
                        response.__class__.__name__)
    fd, fname = tempfile.mkstemp(ext)
    os.write(fd, body)
    os.close(fd)
    return _openfunc("file://%s" % fname)
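A common way to use this helper is for debugging inside a spider callback; a sketch (the callback name parse is an assumption):

from scrapy.utils.response import open_in_browser

def parse(self, response):
    open_in_browser(response)  # view exactly what Scrapy received, with <base> fixed up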
Example #2
Source File: httpcache.py From learn_python3_spider with MIT License | 6 votes |
def store_response(self, spider, request, response):
    """Store the given response in the cache."""
    rpath = self._get_request_path(spider, request)
    if not os.path.exists(rpath):
        os.makedirs(rpath)
    metadata = {
        'url': request.url,
        'method': request.method,
        'status': response.status,
        'response_url': response.url,
        'timestamp': time(),
    }
    with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
        f.write(to_bytes(repr(metadata)))
    with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
        pickle.dump(metadata, f, protocol=2)
    with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
        f.write(headers_dict_to_raw(response.headers))
    with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
        f.write(response.body)
    with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
        f.write(headers_dict_to_raw(request.headers))
    with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
        f.write(request.body)
Example #3
Source File: http11.py From learn_python3_spider with MIT License | 6 votes |
def tunnel_request_data(host, port, proxy_auth_header=None):
    r"""
    Return binary content of a CONNECT request.

    >>> from scrapy.utils.python import to_native_str as s
    >>> s(tunnel_request_data("example.com", 8080))
    'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\n\r\n'
    >>> s(tunnel_request_data("example.com", 8080, b"123"))
    'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\nProxy-Authorization: 123\r\n\r\n'
    >>> s(tunnel_request_data(b"example.com", "8090"))
    'CONNECT example.com:8090 HTTP/1.1\r\nHost: example.com:8090\r\n\r\n'
    """
    host_value = to_bytes(host, encoding='ascii') + b':' + to_bytes(str(port))
    tunnel_req = b'CONNECT ' + host_value + b' HTTP/1.1\r\n'
    tunnel_req += b'Host: ' + host_value + b'\r\n'
    if proxy_auth_header:
        tunnel_req += b'Proxy-Authorization: ' + proxy_auth_header + b'\r\n'
    tunnel_req += b'\r\n'
    return tunnel_req
Example #4
Source File: http11.py From learn_python3_spider with MIT License | 6 votes |
def _get_agent(self, request, timeout):
    bindaddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    if proxy:
        _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        scheme = _parse(request.url)[0]
        proxyHost = to_unicode(proxyHost)
        omitConnectTunnel = b'noconnect' in proxyParams
        if scheme == b'https' and not omitConnectTunnel:
            proxyConf = (proxyHost, proxyPort,
                         request.headers.get(b'Proxy-Authorization', None))
            return self._TunnelingAgent(reactor, proxyConf,
                contextFactory=self._contextFactory, connectTimeout=timeout,
                bindAddress=bindaddress, pool=self._pool)
        else:
            return self._ProxyAgent(reactor,
                proxyURI=to_bytes(proxy, encoding='ascii'),
                connectTimeout=timeout, bindAddress=bindaddress,
                pool=self._pool)

    return self._Agent(reactor, contextFactory=self._contextFactory,
        connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)
Example #5
Source File: utils.py From scrapy-autounit with BSD 3-Clause "New" or "Revised" License | 6 votes |
def binary_check(fx_obj, cb_obj, encoding):
    if isinstance(cb_obj, (dict, Item)):
        fx_obj = {
            key: binary_check(value, cb_obj[key], encoding)
            for key, value in fx_obj.items()
        }

    if isinstance(cb_obj, list):
        fx_obj = [
            binary_check(fxitem, cbitem, encoding)
            for fxitem, cbitem in zip(fx_obj, cb_obj)
        ]

    if isinstance(cb_obj, Request):
        headers = {}
        for key, value in fx_obj['headers'].items():
            key = to_bytes(key, encoding)
            headers[key] = [to_bytes(v, encoding) for v in value]
        fx_obj['headers'] = headers
        fx_obj['body'] = to_bytes(fx_obj['body'], encoding)

    if isinstance(cb_obj, six.binary_type):
        fx_obj = fx_obj.encode(encoding)

    return fx_obj
Example #6
Source File: middlewares.py From oh-my-rss with MIT License | 6 votes |
def process_request(self, request, spider):
    # Called for each request that goes through the downloader middleware.
    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    if spider.browser:
        request.meta['browser'] = self.browser  # to access driver from response
        self.browser.get(request.url)
        # wait for js eval
        time.sleep(15)
        body = to_bytes(self.browser.page_source)  # body must be of type bytes
        return HtmlResponse(self.browser.current_url, body=body,
                            encoding='utf-8', request=request)
    else:
        return None
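For context, a downloader middleware like this one has to be enabled in the project settings. The dotted path below is hypothetical, chosen only to illustrate the wiring; substitute your own module and class:

# settings.py (sketch; the module path is an assumption, not from the source)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeleniumMiddleware': 543,
}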
Example #7
Source File: main.py From python-examples with MIT License | 6 votes |
def file_path(self, request, response=None, info=None):
    import hashlib
    import datetime
    from scrapy.utils.python import to_bytes

    folder = request.meta['folder']
    image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
    #YEAR = 2018
    #filename = 'realty-sc/%s/%s/%s/%s.jpg' % (YEAR, image_guid[:2], image_guid[2:4], image_guid)
    filename = datetime.datetime.now().strftime(
        'images/%Y.%m.%d-%H.%M/{}/{}.jpg'.format(folder, image_guid))
    return filename

# --- it runs without project and saves in `output.csv` ---
Example #8
Source File: ftp.py From learn_python3_spider with MIT License | 5 votes |
def _failed(self, result, request):
    message = result.getErrorMessage()
    if result.type == CommandFailed:
        m = _CODE_RE.search(message)
        if m:
            ftpcode = m.group()
            httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
            return Response(url=request.url, status=httpcode,
                            body=to_bytes(message))
    raise result.type(result.value)
Example #9
Source File: request.py From learn_python3_spider with MIT License | 5 votes |
def request_httprepr(request):
    """Return the raw HTTP representation (as bytes) of the given request.
    This is provided only for reference since it's not the actual stream of
    bytes that will be sent when performing the request (that's controlled
    by Twisted).
    """
    parsed = urlparse_cached(request)
    path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
    s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
    s += b"Host: " + to_bytes(parsed.hostname or b'') + b"\r\n"
    if request.headers:
        s += request.headers.to_string() + b"\r\n"
    s += b"\r\n"
    s += request.body
    return s
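As a usage sketch, this representation is handy for low-level request logging. The middleware context below is an assumption for illustration, not part of the original example:

from scrapy.utils.request import request_httprepr

class RawRequestLoggerMiddleware:
    def process_request(self, request, spider):
        # request_httprepr() returns bytes; latin-1 decodes any byte value
        spider.logger.debug(request_httprepr(request).decode('latin-1'))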
Example #10
Source File: responsetypes.py From learn_python3_spider with MIT License | 5 votes |
def from_body(self, body):
    """Try to guess the appropriate response type based on the body content.
    This method is a bit magic and could be improved in the future, but
    it's not meant to be used except for special cases where response types
    cannot be guessed using more straightforward methods."""
    chunk = body[:5000]
    chunk = to_bytes(chunk)
    if not binary_is_text(chunk):
        return self.from_mimetype('application/octet-stream')
    elif b"<html>" in chunk.lower():
        return self.from_mimetype('text/html')
    elif b"<?xml" in chunk.lower():
        return self.from_mimetype('text/xml')
    else:
        return self.from_mimetype('text')
Example #11
Source File: exporters.py From learn_python3_spider with MIT License | 5 votes |
def _serialize_value(self, value):
    if isinstance(value, BaseItem):
        return self.export_item(value)
    if isinstance(value, dict):
        return dict(self._serialize_dict(value))
    if is_listlike(value):
        return [self._serialize_value(v) for v in value]
    encode_func = to_bytes if self.binary else to_unicode
    if isinstance(value, (six.text_type, bytes)):
        return encode_func(value, encoding=self.encoding)
    return value
Example #12
Source File: writer.py From scrapy-kafka-export with MIT License | 5 votes |
def write(self, key, msg):
    key = None if key is None else to_bytes(key)
    return self._send_message(key, msg, self.topic)
Example #13
Source File: cache.py From PyFeeds with GNU Affero General Public License v3.0 | 5 votes |
def _get_key_path(self, spider, key):
    key = hashlib.sha1(to_bytes(key)).hexdigest()
    return os.path.join(self.cachedir, spider.name, key[0:2], key)
Example #14
Source File: cache.py From PyFeeds with GNU Affero General Public License v3.0 | 5 votes |
def _write_meta_to_path(self, path, metadata):
    with self._open(os.path.join(path, "meta"), "wb") as f:
        f.write(to_bytes(repr(metadata)))
    with self._open(os.path.join(path, "pickled_meta"), "wb") as f:
        pickle.dump(metadata, f, protocol=2)
Example #15
Source File: python - scrapy.py From python-examples with MIT License | 5 votes |
def file_path(self, request, response=None, info=None):
    '''Changing file name - adding folder name with date and time'''
    import hashlib
    import datetime
    from scrapy.utils.python import to_bytes

    # from original function file_path
    image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
    # add folder name with date and time
    filename = datetime.datetime.now().strftime(
        'images/%Y.%m.%d-%H.%M.%S/{}.jpg'.format(image_guid))
    return filename

# --- run without project and save in `output.csv` ---
# scrapy runspider script.py -s USER_AGENT="Mozilla/5.0" -o output.csv -a urls="http://quotes.toscrape.com/tag/love/;http://quotes.toscrape.com/tag/inspirational/;http://quotes.toscrape.com/tag/life/"

# --- run without project and save in `output.csv` ---
#import scrapy.cmdline
#start_urls = "http://quotes.toscrape.com/tag/love/;http://quotes.toscrape.com/tag/inspirational/;http://quotes.toscrape.com/tag/life/"
#scrapy.cmdline.execute(['scrapy', 'crawl', 'myspider', '-o', 'output.csv', '-a', 'urls=' + start_urls])

# --- run without project and save in `output.csv` ---
# python script.py
#start_urls = [
#    'http://quotes.toscrape.com/tag/love/',
#    'http://quotes.toscrape.com/tag/inspirational/',
#    'http://quotes.toscrape.com/tag/life/',
#]
Example #16
Source File: python - scrapy.py From python-examples with MIT License | 5 votes |
def file_path(self, request, response=None, info=None):
    import hashlib
    import datetime
    from scrapy.utils.python import to_bytes

    image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
    filename = datetime.datetime.now().strftime(
        'images/%Y.%m.%d-%H.%M.%S/{}.jpg'.format(image_guid))
    return filename

# --- run without project and save in `output.csv` ---
# scrapy runspider script.py -s USER_AGENT="Mozilla/5.0" -o output.csv -a urls="http://quotes.toscrape.com/tag/love/;http://quotes.toscrape.com/tag/inspirational/;http://quotes.toscrape.com/tag/life/"

# ---
#import scrapy.cmdline
#start_urls = "http://quotes.toscrape.com/tag/love/;http://quotes.toscrape.com/tag/inspirational/;http://quotes.toscrape.com/tag/life/"
#scrapy.cmdline.execute(['scrapy', 'crawl', 'myspider', '-o', 'output.csv', '-a', 'urls=' + start_urls])

# --- run without project and save in `output.csv` ---
# python script.py
#start_urls = [
#    'http://quotes.toscrape.com/tag/love/',
#    'http://quotes.toscrape.com/tag/inspirational/',
#    'http://quotes.toscrape.com/tag/life/',
#]
Example #17
Source File: ftp.py From learn_python3_spider with MIT License | 5 votes |
def _build_response(self, result, request, protocol):
    self.result = result
    respcls = responsetypes.from_args(url=request.url)
    protocol.close()
    body = protocol.filename or protocol.body.read()
    headers = {"local filename": protocol.filename or '', "size": protocol.size}
    return respcls(url=request.url, status=200, body=to_bytes(body),
                   headers=headers)
Example #18
Source File: exporters.py From learn_python3_spider with MIT License | 5 votes |
def export_item(self, item):
    itemdict = dict(self._get_serialized_fields(item))
    data = self.encoder.encode(itemdict) + '\n'
    self.file.write(to_bytes(data, self.encoding))
Example #19
Source File: images.py From learn_python3_spider with MIT License | 5 votes |
def thumb_path(self, request, thumb_id, response=None, info=None):
    thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
    return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)
Example #20
Source File: images.py From learn_python3_spider with MIT License | 5 votes |
def file_path(self, request, response=None, info=None):
    image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
    return 'full/%s.jpg' % (image_guid)
Example #21
Source File: files.py From learn_python3_spider with MIT License | 5 votes |
def file_path(self, request, response=None, info=None):
    media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
    media_ext = os.path.splitext(request.url)[1]
    return 'full/%s%s' % (media_guid, media_ext)
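The pipeline methods in the last three examples all derive a deterministic file name from the request URL, so downloading the same URL twice maps to the same path. A quick standalone check of the scheme (the URL is illustrative):

import hashlib

url = b'http://example.com/full/some-image.jpg'
print(hashlib.sha1(url).hexdigest())  # 40 hex chars, identical on every run for this URL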
Example #22
Source File: httpproxy.py From learn_python3_spider with MIT License | 5 votes |
def _basic_auth_header(self, username, password):
    user_pass = to_bytes(
        '%s:%s' % (unquote(username), unquote(password)),
        encoding=self.auth_encoding)
    return base64.b64encode(user_pass)
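For reference, the value produced is standard HTTP Basic auth: base64 over 'username:password', with the caller prepending the b'Basic ' prefix when setting the Proxy-Authorization header. A minimal round-trip:

>>> import base64
>>> base64.b64encode(b'user:pass')
b'dXNlcjpwYXNz'
>>> b'Basic ' + base64.b64encode(b'user:pass')   # final header value
b'Basic dXNlcjpwYXNz'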
Example #23
Source File: httpcache.py From learn_python3_spider with MIT License | 5 votes |
def store_response(self, spider, request, response):
    key = self._request_key(request)
    data = {
        'status': response.status,
        'url': response.url,
        'headers': dict(response.headers),
        'body': response.body,
    }
    batch = self._leveldb.WriteBatch()
    batch.Put(key + b'_data', pickle.dumps(data, protocol=2))
    batch.Put(key + b'_time', to_bytes(str(time())))
    self.db.Write(batch)
Example #24
Source File: httpcache.py From learn_python3_spider with MIT License | 5 votes |
def __init__(self, settings):
    self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
    self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
    self.ignore_response_cache_controls = [
        to_bytes(cc) for cc in
        settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')]
    self._cc_parsed = WeakKeyDictionary()
Example #25
Source File: link.py From learn_python3_spider with MIT License | 5 votes |
def __init__(self, url, text='', fragment='', nofollow=False):
    if not isinstance(url, str):
        if six.PY2:
            warnings.warn("Link urls must be str objects. "
                          "Assuming utf-8 encoding (which could be wrong)")
            url = to_bytes(url, encoding='utf8')
        else:
            got = url.__class__.__name__
            raise TypeError("Link urls must be str objects, got %s" % got)
    self.url = url
    self.text = text
    self.fragment = fragment
    self.nofollow = nofollow
Example #26
Source File: form.py From learn_python3_spider with MIT License | 5 votes |
def _urlencode(seq, enc):
    values = [(to_bytes(k, enc), to_bytes(v, enc))
              for k, vs in seq
              for v in (vs if is_listlike(vs) else [vs])]
    return urlencode(values, doseq=1)
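Because list-like values are flattened into one pair per element, a call like the following would produce (the input values are illustrative):

>>> _urlencode([('a', ['1', '2']), ('b', 'x')], 'utf8')
'a=1&a=2&b=x'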