Python scrapy.http.TextResponse() Examples
The following are 14 code examples of scrapy.http.TextResponse(). Each example lists the project and source file it was taken from. You may also want to check out the other available functions and classes of the scrapy.http module.
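All of the examples below construct or handle TextResponse, the text-aware Response subclass that deals with character encoding and exposes a decoded .text attribute plus the .xpath()/.css() selector API. As a baseline, here is a minimal sketch of building one by hand (the URL and body are invented for illustration):

from scrapy.http import TextResponse

response = TextResponse(
    url='http://example.com',                 # hypothetical URL
    body=b'<html><body>hello</body></html>',  # raw bytes
    encoding='utf-8',
)
print(response.text)                          # decoded unicode body
print(response.xpath('//body/text()').get())  # 'hello'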
Example #1
Source File: test_pagestorage.py From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License
def test_save_response_with_trim(self):
    self.instance._writer.maxitemsize = 26
    self.instance.hsref.job.key = '123/45/67'
    resp = TextResponse(
        'http://resp', request=Request('http://req'), encoding='cp1251',
        body='\r\n\r\n<html><body></body></html>\r\n \0\0\0\0\0')
    with mock.patch.object(Spider, 'logger') as log:
        spider = Spider('default')
        self.instance.save_response(resp, self.spider)
        log.warning.assert_called_with(
            "Page not saved, body too large: <http://resp>")
        self.instance.trim_html = True
        self.instance.save_response(resp, spider)
        self.instance._writer.write.assert_called_with(
            {u'body': u'<html><body></body></html>',
             u'_encoding': u'cp1251',
             u'_type': u'_pageitem',
             u'_key': u'9b4bed7e56103ddf63455ed39145f61f53b3c702',
             u'url': u'http://resp',
             '_jobid': '123/45/67'})
Example #2
Source File: iterators.py From learn_python3_spider with MIT License
def _body_or_str(obj, unicode=True):
    expected_types = (Response, six.text_type, six.binary_type)
    assert isinstance(obj, expected_types), \
        "obj must be %s, not %s" % (
            " or ".join(t.__name__ for t in expected_types),
            type(obj).__name__)
    if isinstance(obj, Response):
        if not unicode:
            return obj.body
        elif isinstance(obj, TextResponse):
            return obj.text
        else:
            return obj.body.decode('utf-8')
    elif isinstance(obj, six.text_type):
        return obj if unicode else obj.encode('utf-8')
    else:
        return obj.decode('utf-8') if unicode else obj
Example #3
Source File: response.py From learn_python3_spider with MIT License
def open_in_browser(response, _openfunc=webbrowser.open):
    """Open the given response in a local web browser, populating the
    <base> tag for external links to work
    """
    from scrapy.http import HtmlResponse, TextResponse
    # XXX: this implementation is a bit dirty and could be improved
    body = response.body
    if isinstance(response, HtmlResponse):
        if b'<base' not in body:
            repl = '<head><base href="%s">' % response.url
            body = body.replace(b'<head>', to_bytes(repl))
        ext = '.html'
    elif isinstance(response, TextResponse):
        ext = '.txt'
    else:
        raise TypeError("Unsupported response type: %s" %
                        response.__class__.__name__)
    fd, fname = tempfile.mkstemp(ext)
    os.write(fd, body)
    os.close(fd)
    return _openfunc("file://%s" % fname)
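open_in_browser is mostly a debugging aid: dropped into a spider callback, it shows exactly what Scrapy received. A minimal usage sketch (the spider name and URL are invented):

import scrapy
from scrapy.utils.response import open_in_browser

class DebugSpider(scrapy.Spider):
    name = 'debug'                       # hypothetical spider
    start_urls = ['http://example.com']  # hypothetical URL

    def parse(self, response):
        # writes the body to a temp file and opens it in the default browser
        open_in_browser(response)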
Example #4
Source File: httpcompression.py From learn_python3_spider with MIT License
def process_response(self, request, response, spider):
    if request.method == 'HEAD':
        return response
    if isinstance(response, Response):
        content_encoding = response.headers.getlist('Content-Encoding')
        if content_encoding:
            encoding = content_encoding.pop()
            decoded_body = self._decode(response.body, encoding.lower())
            respcls = responsetypes.from_args(
                headers=response.headers, url=response.url, body=decoded_body)
            kwargs = dict(cls=respcls, body=decoded_body)
            if issubclass(respcls, TextResponse):
                # force recalculating the encoding until we make sure the
                # responsetypes guessing is reliable
                kwargs['encoding'] = None
            response = response.replace(**kwargs)
            if not content_encoding:
                del response.headers['Content-Encoding']
    return response
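The responsetypes.from_args() call above picks the most specific Response subclass it can infer from the headers, URL and body, which is why the decompressed response can come back as a TextResponse. A quick illustration of the guessing (URLs invented):

from scrapy.responsetypes import responsetypes

responsetypes.from_args(url='http://example.com/page.html')  # -> HtmlResponse
responsetypes.from_args(url='http://example.com/feed.xml')   # -> XmlResponse
responsetypes.from_args(url='http://example.com/blob.bin')   # -> Response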
Example #5
Source File: datauri.py From learn_python3_spider with MIT License
def download_request(self, request, spider):
    uri = parse_data_uri(request.url)
    respcls = responsetypes.from_mimetype(uri.media_type)
    resp_kwargs = {}
    if (issubclass(respcls, TextResponse) and
            uri.media_type.split('/')[0] == 'text'):
        charset = uri.media_type_parameters.get('charset')
        resp_kwargs['encoding'] = charset
    return respcls(url=request.url, body=uri.data, **resp_kwargs)
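The parse_data_uri helper used above comes from w3lib. A minimal sketch of what it returns for a text media type:

from w3lib.url import parse_data_uri

uri = parse_data_uri('data:text/plain;charset=utf-8,Hello')
uri.media_type             # 'text/plain'
uri.media_type_parameters  # {'charset': 'utf-8'}
uri.data                   # b'Hello'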
Example #6
Source File: test_pagestorage.py From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License
def test_save_response(self):
    self.instance._writer = mock.MagicMock()
    self.instance._writer.maxitemsize = 10
    # wrong response type
    self.instance.save_response(
        Response('http://resp', request=Request('http://req')), self.spider)
    assert not self.instance._writer.write.called
    # get request with large body
    resp1 = TextResponse('http://resp1', request=Request('http://req1'),
                         body='looong loong body', encoding='cp1251')
    self.instance.save_response(resp1, self.spider)
    assert not self.instance._writer.write.called
    # get request with ok-body
    self.instance.hsref = mock.Mock()
    self.instance.hsref.job.key = '123/45/67'
    resp2 = TextResponse('http://resp2', request=Request('http://req2'),
                         body='body', encoding='cp1251',
                         headers={'Set-Cookie': [b'coo1=test;abc=1',
                                                 b'coo2=tes1;cbd=2']})
    self.instance.save_response(resp2, self.spider)
    self.instance._writer.write.assert_called_with(
        {'body': u'body', '_encoding': 'cp1251', '_type': '_pageitem',
         '_key': 'bad42100b1d34e29973a79e512aabb4db885b712',
         'cookies': ['coo1=test', 'coo2=tes1'],
         'url': 'http://resp2', '_jobid': '123/45/67'})
Example #7
Source File: scrapy_pagestorage.py From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License
def save_response(self, response, spider):
    if isinstance(response, TextResponse):
        fp = request_fingerprint(response.request)
        payload = {
            "_key": fp,
            "_jobid": self.hsref.job.key,
            "_type": "_pageitem",
            "_encoding": response.encoding,
            "url": response.url,
        }
        self._set_cookies(payload, response)
        if response.request.method == 'POST':
            payload["postdata"] = dict(parse_qsl(response.request.body.decode()))
        payload["body"] = response.body_as_unicode()
        if self.trim_html:
            payload['body'] = payload['body'].strip(' \r\n\0')
        if len(payload['body']) > self._writer.maxitemsize:
            spider.logger.warning(
                "Page not saved, body too large: <%s>" % response.url)
            return
        try:
            self._writer.write(payload)
        except ValueTooLarge as exc:
            spider.logger.warning(
                "Page not saved, %s: <%s>" % (exc, response.url))
Example #8
Source File: middlewares.py From hq-proxies with MIT License
def process_exception(self, request, exception, spider):
    if isinstance(exception, self.DONT_RETRY_ERRORS):
        return TextResponse(url=request.meta['proxy'])
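Note the downloader-middleware contract at work here: when process_exception returns a Response object, Scrapy stops calling the remaining process_exception handlers and routes the response into the process_response chain, which is how this snippet suppresses further handling for errors it considers fatal. A sketch of how the surrounding class might look (the error tuple below is an assumption; the original project defines its own DONT_RETRY_ERRORS):

from scrapy.http import TextResponse
from twisted.internet.error import ConnectError, TimeoutError

class ProxyGuardMiddleware:
    # hypothetical set of errors that should mark a proxy dead
    DONT_RETRY_ERRORS = (TimeoutError, ConnectError)

    def process_exception(self, request, exception, spider):
        if isinstance(exception, self.DONT_RETRY_ERRORS):
            # an empty response stops exception processing here;
            # its url records which proxy failed
            return TextResponse(url=request.meta['proxy'])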
Example #9
Source File: test_utils.py From scrapy-poet with BSD 3-Clause "New" or "Revised" License
def __init__(self, response: TextResponse):
    self.response = response
Example #10
Source File: test_utils.py From scrapy-poet with BSD 3-Clause "New" or "Revised" License
def test_is_provider_using_response():
    assert is_provider_using_response(PageObjectInputProvider) is False
    assert is_provider_using_response(ResponseDataProvider) is True
    # TextProductProvider wrongly annotates response dependency as
    # TextResponse, instead of using the Response type.
    assert is_provider_using_response(TextProductProvider) is False
    assert is_provider_using_response(DummyProductProvider) is False
    assert is_provider_using_response(FakeProductProvider) is False
    assert is_provider_using_response(StringProductProvider) is False
Example #11
Source File: test_utils.py From scrapy-poet with BSD 3-Clause "New" or "Revised" License
def parse12(self, response: TextResponse, book_page: DummyProductPage):
    pass
Example #12
Source File: test_utils.py From scrapy-poet with BSD 3-Clause "New" or "Revised" License
def parse11(self, response: TextResponse):
    pass
Example #13
Source File: iterators.py From learn_python3_spider with MIT License
def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
    """ Returns an iterator of dictionaries from the given csv object

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    delimiter is the character used to separate fields on the given obj.

    headers is an iterable that when provided offers the keys
    for the returned dictionaries, if not the first row is used.

    quotechar is the character used to enclosure fields on the given obj.
    """
    encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or 'utf-8'

    def row_to_unicode(row_):
        return [to_unicode(field, encoding) for field in row_]

    # Python 3 csv reader input object needs to return strings
    if six.PY3:
        lines = StringIO(_body_or_str(obj, unicode=True))
    else:
        lines = BytesIO(_body_or_str(obj, unicode=False))

    kwargs = {}
    if delimiter:
        kwargs["delimiter"] = delimiter
    if quotechar:
        kwargs["quotechar"] = quotechar
    csv_r = csv.reader(lines, **kwargs)

    if not headers:
        try:
            row = next(csv_r)
        except StopIteration:
            return
        headers = row_to_unicode(row)

    for row in csv_r:
        row = row_to_unicode(row)
        if len(row) != len(headers):
            logger.warning("ignoring row %(csvlnum)d (length: %(csvrow)d, "
                           "should be: %(csvheader)d)",
                           {'csvlnum': csv_r.line_num, 'csvrow': len(row),
                            'csvheader': len(headers)})
            continue
        else:
            yield dict(zip(headers, row))
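A short usage sketch for csviter with a TextResponse (the CSV payload and URL are invented):

from scrapy.http import TextResponse
from scrapy.utils.iterators import csviter

response = TextResponse(
    url='http://example.com/data.csv',  # hypothetical URL
    body=b'name,price\nwidget,10\ngadget,20',
    encoding='utf-8',
)
for row in csviter(response):
    print(row)  # {'name': 'widget', 'price': '10'}, then the gadget row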
Example #14
Source File: collector.py From collectors with MIT License
def collect(conf, conn):
    """Collect ICD-XX-CM conditions.
    """
    # For more information see:
    # https://www.cms.gov/Medicare/Coding/ICD10/2016-ICD-10-CM-and-GEMs.html
    URL = 'https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-CM-Code-Tables-and-Index.zip'
    FILE = 'Tabular.xml'
    VERSION = 'ICD-10-CM'
    LAST_UPDATED = '2015-10-01'

    # Prepare xml
    zip = requests.get(URL).content
    xml = zipfile.ZipFile(io.BytesIO(zip)).open(FILE).read()
    res = TextResponse(url=URL, body=xml, encoding='utf-8')

    count = 0
    for diag in res.xpath('//diag'):

        # We need only leafs: skip nodes that still have child <diag> elements
        childs = diag.xpath('./diag')
        if childs:
            continue

        # Get data
        data = {
            'name': diag.xpath('./name/text()').extract_first(),
            'desc': diag.xpath('./desc/text()').extract_first(),
            'terms': diag.xpath('.//note/text()').extract(),
            'version': VERSION,
            'last_updated': LAST_UPDATED,
        }

        # Create record
        record = Record.create(URL, data)

        # Write record
        record.write(conf, conn)

        # Log info
        count += 1
        if not count % 100:
            logger.info('Collected %s "%s" conditions', count, record.table)
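This last example shows a pattern worth remembering: wrapping bytes fetched outside a crawl (here via requests) in a TextResponse purely to get Scrapy's selector API. A minimal sketch of the same trick (the payload and file path are invented):

from scrapy.http import TextResponse

xml = b'<root><diag><name>A00</name><desc>Cholera</desc></diag></root>'
res = TextResponse(url='file:///tmp/tabular.xml', body=xml, encoding='utf-8')
res.xpath('//diag/name/text()').get()  # 'A00'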