Python scrapy.utils.python.to_bytes() Examples
The following are 26 code examples of scrapy.utils.python.to_bytes(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.utils.python, or try the search function.
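Before the examples, a quick orientation: to_bytes(text, encoding=None, errors='strict') returns the binary representation of text. Bytes pass through unchanged, str values are encoded (UTF-8 by default), and anything else raises TypeError. The sketch below is a minimal re-implementation for illustration, not the library's exact source:

def to_bytes(text, encoding=None, errors='strict'):
    # bytes are returned as-is; str is encoded; other types are rejected
    if isinstance(text, bytes):
        return text
    if not isinstance(text, str):
        raise TypeError('to_bytes must receive a str or bytes object, '
                        'got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)

Typical behaviour:

>>> from scrapy.utils.python import to_bytes
>>> to_bytes('café')                      # str -> bytes, UTF-8 by default
b'caf\xc3\xa9'
>>> to_bytes(b'abc')                      # bytes pass through unchanged
b'abc'
>>> to_bytes('café', encoding='latin-1')
b'caf\xe9'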
Example #1
Source File: response.py From learn_python3_spider with MIT License | 6 votes |
def open_in_browser(response, _openfunc=webbrowser.open):
    """Open the given response in a local web browser, populating the <base>
    tag for external links to work
    """
    from scrapy.http import HtmlResponse, TextResponse
    # XXX: this implementation is a bit dirty and could be improved
    body = response.body
    if isinstance(response, HtmlResponse):
        if b'<base' not in body:
            repl = '<head><base href="%s">' % response.url
            body = body.replace(b'<head>', to_bytes(repl))
        ext = '.html'
    elif isinstance(response, TextResponse):
        ext = '.txt'
    else:
        raise TypeError("Unsupported response type: %s" %
                        response.__class__.__name__)
    fd, fname = tempfile.mkstemp(ext)
    os.write(fd, body)
    os.close(fd)
    return _openfunc("file://%s" % fname)
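A common way to use this helper is for debugging inside a spider callback; a sketch (the callback name parse is an assumption):

from scrapy.utils.response import open_in_browser

def parse(self, response):
    open_in_browser(response)  # view exactly what Scrapy received, with <base> fixed up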
Example #2
Source File: httpcache.py From learn_python3_spider with MIT License | 6 votes |
def store_response(self, spider, request, response):
    """Store the given response in the cache."""
    rpath = self._get_request_path(spider, request)
    if not os.path.exists(rpath):
        os.makedirs(rpath)
    metadata = {
        'url': request.url,
        'method': request.method,
        'status': response.status,
        'response_url': response.url,
        'timestamp': time(),
    }
    with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
        f.write(to_bytes(repr(metadata)))
    with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
        pickle.dump(metadata, f, protocol=2)
    with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
        f.write(headers_dict_to_raw(response.headers))
    with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
        f.write(response.body)
    with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
        f.write(headers_dict_to_raw(request.headers))
    with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
        f.write(request.body)
Example #3
Source File: http11.py From learn_python3_spider with MIT License | 6 votes |
def tunnel_request_data(host, port, proxy_auth_header=None):
    r"""
    Return binary content of a CONNECT request.

    >>> from scrapy.utils.python import to_native_str as s
    >>> s(tunnel_request_data("example.com", 8080))
    'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\n\r\n'
    >>> s(tunnel_request_data("example.com", 8080, b"123"))
    'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\nProxy-Authorization: 123\r\n\r\n'
    >>> s(tunnel_request_data(b"example.com", "8090"))
    'CONNECT example.com:8090 HTTP/1.1\r\nHost: example.com:8090\r\n\r\n'
    """
    host_value = to_bytes(host, encoding='ascii') + b':' + to_bytes(str(port))
    tunnel_req = b'CONNECT ' + host_value + b' HTTP/1.1\r\n'
    tunnel_req += b'Host: ' + host_value + b'\r\n'
    if proxy_auth_header:
        tunnel_req += b'Proxy-Authorization: ' + proxy_auth_header + b'\r\n'
    tunnel_req += b'\r\n'
    return tunnel_req
Example #4
Source File: http11.py From learn_python3_spider with MIT License | 6 votes |
def _get_agent(self, request, timeout):
    bindaddress = request.meta.get('bindaddress') or self._bindAddress
    proxy = request.meta.get('proxy')
    if proxy:
        _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
        scheme = _parse(request.url)[0]
        proxyHost = to_unicode(proxyHost)
        omitConnectTunnel = b'noconnect' in proxyParams
        if scheme == b'https' and not omitConnectTunnel:
            proxyConf = (proxyHost, proxyPort,
                         request.headers.get(b'Proxy-Authorization', None))
            return self._TunnelingAgent(reactor, proxyConf,
                contextFactory=self._contextFactory, connectTimeout=timeout,
                bindAddress=bindaddress, pool=self._pool)
        else:
            return self._ProxyAgent(reactor,
                proxyURI=to_bytes(proxy, encoding='ascii'),
                connectTimeout=timeout, bindAddress=bindaddress,
                pool=self._pool)

    return self._Agent(reactor, contextFactory=self._contextFactory,
        connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)
Example #5
Source File: utils.py From scrapy-autounit with BSD 3-Clause "New" or "Revised" License | 6 votes |
def binary_check(fx_obj, cb_obj, encoding):
    if isinstance(cb_obj, (dict, Item)):
        fx_obj = {
            key: binary_check(value, cb_obj[key], encoding)
            for key, value in fx_obj.items()
        }

    if isinstance(cb_obj, list):
        fx_obj = [
            binary_check(fxitem, cbitem, encoding)
            for fxitem, cbitem in zip(fx_obj, cb_obj)
        ]

    if isinstance(cb_obj, Request):
        headers = {}
        for key, value in fx_obj['headers'].items():
            key = to_bytes(key, encoding)
            headers[key] = [to_bytes(v, encoding) for v in value]
        fx_obj['headers'] = headers
        fx_obj['body'] = to_bytes(fx_obj['body'], encoding)

    if isinstance(cb_obj, six.binary_type):
        fx_obj = fx_obj.encode(encoding)

    return fx_obj
Example #6
Source File: middlewares.py From oh-my-rss with MIT License | 6 votes |
def process_request(self, request, spider):
    # Called for each request that goes through the downloader middleware.
    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    if spider.browser:
        request.meta['browser'] = self.browser  # to access driver from response
        self.browser.get(request.url)
        # wait for js eval
        time.sleep(15)
        body = to_bytes(self.browser.page_source)  # body must be of type bytes
        return HtmlResponse(self.browser.current_url, body=body,
                            encoding='utf-8', request=request)
    else:
        return None
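For context, a downloader middleware like this one has to be enabled in the project settings. The dotted path below is hypothetical, chosen only to illustrate the wiring; substitute your own module and class:

# settings.py (sketch; the module path is an assumption, not from the source)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeleniumMiddleware': 543,
}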
Example #7
Source File: main.py From python-examples with MIT License | 6 votes |
def file_path(self, request, response=None, info=None):
    import hashlib
    import datetime
    from scrapy.utils.python import to_bytes

    folder = request.meta['folder']
    image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
    #YEAR = 2018
    #filename = 'realty-sc/%s/%s/%s/%s.jpg' % (YEAR, image_guid[:2], image_guid[2:4], image_guid)
    filename = datetime.datetime.now().strftime(
        'images/%Y.%m.%d-%H.%M/{}/{}.jpg'.format(folder, image_guid))
    return filename

# --- it runs without project and saves in `output.csv` ---
Example #8
Source File: ftp.py From learn_python3_spider with MIT License | 5 votes |
def _failed(self, result, request):
    message = result.getErrorMessage()
    if result.type == CommandFailed:
        m = _CODE_RE.search(message)
        if m:
            ftpcode = m.group()
            httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
            return Response(url=request.url, status=httpcode,
                            body=to_bytes(message))
    raise result.type(result.value)
Example #9
Source File: request.py From learn_python3_spider with MIT License | 5 votes |
def request_httprepr(request):
    """Return the raw HTTP representation (as bytes) of the given request.
    This is provided only for reference since it's not the actual stream of
    bytes that will be sent when performing the request (that's controlled
    by Twisted).
    """
    parsed = urlparse_cached(request)
    path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
    s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
    s += b"Host: " + to_bytes(parsed.hostname or b'') + b"\r\n"
    if request.headers:
        s += request.headers.to_string() + b"\r\n"
    s += b"\r\n"
    s += request.body
    return s
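As a usage sketch, this representation is handy for low-level request logging. The middleware context below is an assumption for illustration, not part of the original example:

from scrapy.utils.request import request_httprepr

class RawRequestLoggerMiddleware:
    def process_request(self, request, spider):
        # request_httprepr() returns bytes; latin-1 decodes any byte value
        spider.logger.debug(request_httprepr(request).decode('latin-1'))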
Example #10
Source File: responsetypes.py From learn_python3_spider with MIT License | 5 votes |
def from_body(self, body):
    """Try to guess the appropriate response type based on the body content.
    This method is a bit magic and could be improved in the future, but
    it's not meant to be used except for special cases where response types
    cannot be guessed using more straightforward methods."""
    chunk = body[:5000]
    chunk = to_bytes(chunk)
    if not binary_is_text(chunk):
        return self.from_mimetype('application/octet-stream')
    elif b"<html>" in chunk.lower():
        return self.from_mimetype('text/html')
    elif b"<?xml" in chunk.lower():
        return self.from_mimetype('text/xml')
    else:
        return self.from_mimetype('text')
Example #11
Source File: exporters.py From learn_python3_spider with MIT License | 5 votes |
def _serialize_value(self, value):
    if isinstance(value, BaseItem):
        return self.export_item(value)
    if isinstance(value, dict):
        return dict(self._serialize_dict(value))
    if is_listlike(value):
        return [self._serialize_value(v) for v in value]
    encode_func = to_bytes if self.binary else to_unicode
    if isinstance(value, (six.text_type, bytes)):
        return encode_func(value, encoding=self.encoding)
    return value
Example #12
Source File: writer.py From scrapy-kafka-export with MIT License | 5 votes |
def write(self, key, msg):
    key = None if key is None else to_bytes(key)
    return self._send_message(key, msg, self.topic)
Example #13
Source File: cache.py From PyFeeds with GNU Affero General Public License v3.0 | 5 votes |
def _get_key_path(self, spider, key):
    key = hashlib.sha1(to_bytes(key)).hexdigest()
    return os.path.join(self.cachedir, spider.name, key[0:2], key)
Example #14
Source File: cache.py From PyFeeds with GNU Affero General Public License v3.0 | 5 votes |
def _write_meta_to_path(self, path, metadata):
    with self._open(os.path.join(path, "meta"), "wb") as f:
        f.write(to_bytes(repr(metadata)))
    with self._open(os.path.join(path, "pickled_meta"), "wb") as f:
        pickle.dump(metadata, f, protocol=2)
Example #15
Source File: python - scrapy.py From python-examples with MIT License | 5 votes |
def file_path(self, request, response=None, info=None):
    '''Changing file name - adding folder name with date and time'''
    import hashlib
    import datetime
    from scrapy.utils.python import to_bytes

    # from original function file_path
    image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
    # add folder name with date and time
    filename = datetime.datetime.now().strftime(
        'images/%Y.%m.%d-%H.%M.%S/{}.jpg'.format(image_guid))
    return filename

# --- run without project and save in `output.csv` ---
# scrapy runspider script.py -s USER_AGENT="Mozilla/5.0" -o output.csv -a urls="http://quotes.toscrape.com/tag/love/;http://quotes.toscrape.com/tag/inspirational/;http://quotes.toscrape.com/tag/life/"

# --- run without project and save in `output.csv` ---
#import scrapy.cmdline
#start_urls = "http://quotes.toscrape.com/tag/love/;http://quotes.toscrape.com/tag/inspirational/;http://quotes.toscrape.com/tag/life/"
#scrapy.cmdline.execute(['scrapy', 'crawl', 'myspider', '-o', 'output.csv', '-a', 'urls=' + start_urls])

# --- run without project and save in `output.csv` ---
# python script.py
#start_urls = [
#    'http://quotes.toscrape.com/tag/love/',
#    'http://quotes.toscrape.com/tag/inspirational/',
#    'http://quotes.toscrape.com/tag/life/',
#]
Example #16
Source File: python - scrapy.py From python-examples with MIT License | 5 votes |
def file_path(self, request, response=None, info=None):
    import hashlib
    import datetime
    from scrapy.utils.python import to_bytes

    image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
    filename = datetime.datetime.now().strftime(
        'images/%Y.%m.%d-%H.%M.%S/{}.jpg'.format(image_guid))
    return filename

# --- run without project and save in `output.csv` ---
# scrapy runspider script.py -s USER_AGENT="Mozilla/5.0" -o output.csv -a urls="http://quotes.toscrape.com/tag/love/;http://quotes.toscrape.com/tag/inspirational/;http://quotes.toscrape.com/tag/life/"

# ---
#import scrapy.cmdline
#start_urls = "http://quotes.toscrape.com/tag/love/;http://quotes.toscrape.com/tag/inspirational/;http://quotes.toscrape.com/tag/life/"
#scrapy.cmdline.execute(['scrapy', 'crawl', 'myspider', '-o', 'output.csv', '-a', 'urls=' + start_urls])

# --- run without project and save in `output.csv` ---
# python script.py
#start_urls = [
#    'http://quotes.toscrape.com/tag/love/',
#    'http://quotes.toscrape.com/tag/inspirational/',
#    'http://quotes.toscrape.com/tag/life/',
#]
Example #17
Source File: ftp.py From learn_python3_spider with MIT License | 5 votes |
def _build_response(self, result, request, protocol):
    self.result = result
    respcls = responsetypes.from_args(url=request.url)
    protocol.close()
    body = protocol.filename or protocol.body.read()
    headers = {"local filename": protocol.filename or '', "size": protocol.size}
    return respcls(url=request.url, status=200, body=to_bytes(body),
                   headers=headers)
Example #18
Source File: exporters.py From learn_python3_spider with MIT License | 5 votes |
def export_item(self, item):
    itemdict = dict(self._get_serialized_fields(item))
    data = self.encoder.encode(itemdict) + '\n'
    self.file.write(to_bytes(data, self.encoding))
Example #19
Source File: images.py From learn_python3_spider with MIT License | 5 votes |
def thumb_path(self, request, thumb_id, response=None, info=None):
    thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
    return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)
Example #20
Source File: images.py From learn_python3_spider with MIT License | 5 votes |
def file_path(self, request, response=None, info=None):
    image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
    return 'full/%s.jpg' % (image_guid)
Example #21
Source File: files.py From learn_python3_spider with MIT License | 5 votes |
def file_path(self, request, response=None, info=None):
    media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
    media_ext = os.path.splitext(request.url)[1]
    return 'full/%s%s' % (media_guid, media_ext)
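The pipeline methods in the last three examples all derive a deterministic file name from the request URL, so downloading the same URL twice maps to the same path. A quick standalone check of the scheme (the URL is illustrative):

import hashlib

url = b'http://example.com/full/some-image.jpg'
print(hashlib.sha1(url).hexdigest())  # 40 hex chars, identical on every run for this URL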
Example #22
Source File: httpproxy.py From learn_python3_spider with MIT License | 5 votes |
def _basic_auth_header(self, username, password):
    user_pass = to_bytes(
        '%s:%s' % (unquote(username), unquote(password)),
        encoding=self.auth_encoding)
    return base64.b64encode(user_pass)
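For reference, the value produced is standard HTTP Basic auth: base64 over 'username:password', with the caller prepending the b'Basic ' prefix when setting the Proxy-Authorization header. A minimal round-trip:

>>> import base64
>>> base64.b64encode(b'user:pass')
b'dXNlcjpwYXNz'
>>> b'Basic ' + base64.b64encode(b'user:pass')   # final header value
b'Basic dXNlcjpwYXNz'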
Example #23
Source File: httpcache.py From learn_python3_spider with MIT License | 5 votes |
def store_response(self, spider, request, response):
    key = self._request_key(request)
    data = {
        'status': response.status,
        'url': response.url,
        'headers': dict(response.headers),
        'body': response.body,
    }
    batch = self._leveldb.WriteBatch()
    batch.Put(key + b'_data', pickle.dumps(data, protocol=2))
    batch.Put(key + b'_time', to_bytes(str(time())))
    self.db.Write(batch)
Example #24
Source File: httpcache.py From learn_python3_spider with MIT License | 5 votes |
def __init__(self, settings):
    self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
    self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
    self.ignore_response_cache_controls = [
        to_bytes(cc) for cc in
        settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')]
    self._cc_parsed = WeakKeyDictionary()
Example #25
Source File: link.py From learn_python3_spider with MIT License | 5 votes |
def __init__(self, url, text='', fragment='', nofollow=False):
    if not isinstance(url, str):
        if six.PY2:
            warnings.warn("Link urls must be str objects. "
                          "Assuming utf-8 encoding (which could be wrong)")
            url = to_bytes(url, encoding='utf8')
        else:
            got = url.__class__.__name__
            raise TypeError("Link urls must be str objects, got %s" % got)
    self.url = url
    self.text = text
    self.fragment = fragment
    self.nofollow = nofollow
Example #26
Source File: form.py From learn_python3_spider with MIT License | 5 votes |
def _urlencode(seq, enc):
    values = [(to_bytes(k, enc), to_bytes(v, enc))
              for k, vs in seq
              for v in (vs if is_listlike(vs) else [vs])]
    return urlencode(values, doseq=1)
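Because list-like values are flattened into one pair per element, a call like the following would produce (the input values are illustrative):

>>> _urlencode([('a', ['1', '2']), ('b', 'x')], 'utf8')
'a=1&a=2&b=x'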