Python scrapy.http.HtmlResponse() Examples
The following are 30 code examples of scrapy.http.HtmlResponse(), drawn from open-source projects. The original project, source file, and license are noted above each example. You may also want to check out the other available functions and classes of the scrapy.http module.
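Before the project examples, here is a minimal sketch of building an HtmlResponse by hand, roughly the way the tests and middlewares below do. It is not taken from any of these projects; the URL and markup are made up, and the .get() call assumes a reasonably recent Scrapy (older code uses extract_first(), as several examples below do):

from scrapy.http import HtmlResponse

# A hand-built response, as a unit test or a downloader middleware would create one.
body = b'<html><head></head><body><a href="/next">Next page</a></body></html>'
response = HtmlResponse(url='http://example.com/', body=body, encoding='utf-8')

# The usual Scrapy selector API works on the constructed response.
link = response.css('a::attr(href)').get()   # '/next'
print(response.urljoin(link))                # 'http://example.com/next'

Passing the body as bytes together with an explicit encoding is the pattern most of the examples below follow.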
Example #1
Source File: middlewares.py From Python24 with MIT License | 9 votes |
def process_request(self, request, spider):
    """Use Selenium to simulate clicks and run the JavaScript, so process_request has to be overridden."""
    # Get the URL handed over by the scheduler
    url = request.url

    if 'month=' in url:
        # Open Chrome, send the request and let the JS run
        driver = webdriver.Chrome()
        driver.get(url=url)
        # Wait a moment so the page can finish loading
        time.sleep(4)

        data = driver.page_source.encode()
        driver.close()

        # Return the data to the engine
        resp = HtmlResponse(
            url=url,
            body=data,
            request=request,
            encoding='utf8'
        )
        return resp
Example #2
Source File: ajaxcrawl.py From learn_python3_spider with MIT License | 6 votes |
def process_response(self, request, response, spider):
    if not isinstance(response, HtmlResponse) or response.status != 200:
        return response

    if request.method != 'GET':
        # other HTTP methods are either not safe or don't have a body
        return response

    if 'ajax_crawlable' in request.meta:  # prevent loops
        return response

    if not self._has_ajax_crawlable_variant(response):
        return response

    # scrapy already handles #! links properly
    ajax_crawl_request = request.replace(url=request.url + '#!')
    logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
                 {'ajax_crawl_request': ajax_crawl_request, 'request': request},
                 extra={'spider': spider})

    ajax_crawl_request.meta['ajax_crawlable'] = True
    return ajax_crawl_request
Example #3
Source File: common_spider.py From ajax_crawler with MIT License | 6 votes |
def parse_multi_items(self, hxs, node, item, response, index, count):
    if node.restrict_xpaths:
        for child in node.children:
            if child.xpaths:
                restrict_xpath = '|'.join([restrict_xpath.replace("<<", "").replace(">>", "")
                                           for restrict_xpath in node.restrict_xpaths])
                try:
                    # NOTE: 'xpath' is not defined in this scope, so this raises NameError
                    # and the bare except leaves restrict_xpath unchanged.
                    m = re.search(r'<<(.+)&(.*)>>', xpath)
                    restrict_xpath = m.group(1)
                except:
                    pass
                restrict_selectors = hxs.select(restrict_xpath)
                # fetch multi items from one page
                if index != None and len(restrict_selectors) > index and len(restrict_selectors) == count:
                    try:
                        XmlXPathSelector = Selector
                    except:
                        pass
                    restrict_hxs = XmlXPathSelector(HtmlResponse(
                        response.url,
                        body=re.sub('[\n\r\t]+', '', restrict_selectors[index].extract()),
                        encoding='utf8'))
                    # restrict_hxs = restrict_selectors[index]
                    self.parse_item_xpaths(restrict_hxs, child.xpaths, item, response.url, child.name, True, False)
Example #4
Source File: offline.py From openslack-crawler with Apache License 2.0 | 6 votes |
def do_test(self, meta_object, text, expected_raw, expected_requests):
    request = Request(url='http://www.drudgereport.com', meta=meta_object)
    response = HtmlResponse('drudge.url', body=text, request=request)

    raw_item_count = 0
    request_count = 0

    for x in self.spider.parse(response):
        if isinstance(x, RawResponseItem):
            raw_item_count = raw_item_count + 1
        elif isinstance(x, Request):
            request_count = request_count + 1

    self.assertEqual(raw_item_count, expected_raw)
    self.assertEqual(request_count, expected_requests)
Example #5
Source File: wienerlinien_at.py From PyFeeds with GNU Affero General Public License v3.0 | 6 votes |
def parse(self, response):
    # Wiener Linien returns HTML with an XML content type which creates an
    # XmlResponse.
    response = HtmlResponse(url=response.url, body=response.body)
    for item in response.css(".block-news-item"):
        il = FeedEntryItemLoader(
            response=response,
            timezone="Europe/Vienna",
            ignoretz=True,
            base_url="https://www.{}".format(self.name),
        )
        link = response.urljoin(item.css("a::attr(href)").extract_first())
        il.add_value("link", link)
        il.add_value("title", item.css("h3::text").extract_first())
        il.add_value("updated", item.css(".date::text").extract_first())
        yield scrapy.Request(link, self.parse_item, meta={"il": il})
Example #6
Source File: link.py From scrapy-bench with MIT License | 6 votes |
def main():
    url = 'http://scrapinghub.com/'
    link_extractor = LinkExtractor()
    total = 0
    time = 0

    tar = tarfile.open("sites.tar.gz")
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()

        start = timer()
        response = HtmlResponse(url=url, body=html, encoding='utf8')
        links = link_extractor.extract_links(response)
        end = timer()

        total = total + len(links)
        time = time + end - start

    print("\nTotal number of links extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} links/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time))))
Example #7
Source File: lwv_chicago.py From In2ItChicago with GNU General Public License v3.0 | 6 votes |
def parse(self, response):
    feed_url = response.css('a.feed-icon::attr(href)').extract()[0]
    feed = feedparser.parse(feed_url)
    for entry in feed['entries']:
        detail = HtmlResponse(url='string', body=entry['summary'], encoding='utf-8')
        description = detail.css('.body.text-secondary p::text').extract()
        address = detail.css('[itemprop="streetAddress"]::text').extract()
        yield {
            'address': address[0] if len(address) > 0 else '',
            'url': entry.link,
            'title': entry.title,
            'event_time': {
                'date': detail.css('span.date-display-single::attr("content")').extract()[0].split('T')[0],
                'time_range': detail.css('span.date-display-single::attr("content")').extract()[0].split('T')[1]
            },
            'description': description[0] if len(description) > 0 else ''
        }
Example #8
Source File: pixiv_spider.py From scrapy-picture-spider with Apache License 2.0 | 6 votes |
def _requests_to_follow(self, response):
    if not isinstance(response, HtmlResponse):
        return
    seen = set()
    self.headers['Referer'] = response.url
    for n, rule in enumerate(self._rules):
        links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
        if links and rule.process_links:
            links = rule.process_links(links)
        for link in links:
            seen.add(link)
            r = Request(
                url=link.url,
                callback=self._response_downloaded,
                headers=self.headers,
                dont_filter=True
            )
            # keep cookie
            r.meta.update(
                rule=n,
                link_text=link.text,
                cookiejar=response.meta['cookiejar']
            )
            yield rule.process_request(r)
Example #9
Source File: ajaxcrawl.py From learn_python3_spider with MIT License | 6 votes |
def process_response(self, request, response, spider):
    if not isinstance(response, HtmlResponse) or response.status != 200:
        return response

    if request.method != 'GET':
        # other HTTP methods are either not safe or don't have a body
        return response

    if 'ajax_crawlable' in request.meta:  # prevent loops
        return response

    if not self._has_ajax_crawlable_variant(response):
        return response

    # scrapy already handles #! links properly
    ajax_crawl_request = request.replace(url=request.url + '#!')
    logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
                 {'ajax_crawl_request': ajax_crawl_request, 'request': request},
                 extra={'spider': spider})

    ajax_crawl_request.meta['ajax_crawlable'] = True
    return ajax_crawl_request
Example #10
Source File: test_link_spider.py From scrapy-cluster with MIT License | 6 votes |
def evaluate(self, meta_object, text, expected_raw, expected_requests):
    request = Request(url='http://www.drudgereport.com', meta=meta_object)
    response = HtmlResponse('drudge.url', body=text, request=request, encoding='utf8')

    raw_item_count = 0
    request_count = 0

    for x in self.spider.parse(response):
        if isinstance(x, RawResponseItem):
            raw_item_count = raw_item_count + 1
        elif isinstance(x, Request):
            request_count = request_count + 1

    self.assertEqual(raw_item_count, expected_raw)
    self.assertEqual(request_count, expected_requests)
Example #11
Source File: response.py From learn_python3_spider with MIT License | 6 votes |
def open_in_browser(response, _openfunc=webbrowser.open):
    """Open the given response in a local web browser, populating the <base>
    tag for external links to work
    """
    from scrapy.http import HtmlResponse, TextResponse
    # XXX: this implementation is a bit dirty and could be improved
    body = response.body
    if isinstance(response, HtmlResponse):
        if b'<base' not in body:
            repl = '<head><base href="%s">' % response.url
            body = body.replace(b'<head>', to_bytes(repl))
        ext = '.html'
    elif isinstance(response, TextResponse):
        ext = '.txt'
    else:
        raise TypeError("Unsupported response type: %s" %
                        response.__class__.__name__)
    fd, fname = tempfile.mkstemp(ext)
    os.write(fd, body)
    os.close(fd)
    return _openfunc("file://%s" % fname)
Example #12
Source File: conftest.py From collectors with MIT License | 6 votes |
def get_url(betamax_session):
    def _get_url(url, request_kwargs={}):
        '''Returns a scrapy.html.HtmlResponse with the contents of the received url.

        Note that the session is kept intact among multiple calls to this
        method (i.e. cookies are passed over).

        We also don't verify SSL certificates, because Takeda's certificate is
        invalid. If they become valid, we can resume verifying the
        certificates.
        '''
        response = betamax_session.get(url, verify=False)
        scrapy_response = HtmlResponse(
            url=str(response.url),
            body=response.content,
        )
        scrapy_response.request = Request(url, **request_kwargs)

        return scrapy_response

    return _get_url
Example #13
Source File: test_fda_dap.py From collectors with MIT License | 6 votes |
def test_parse_drug_details_or_overview_generates_new_request_if_redirected_to_search_page(self):
    url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.Search_Drug_Name'
    meta = {
        'original_url': 'http://www.accessdata.fda.gov/somewhere.cfm',
        'original_cookies': {
            'foo': 'bar',
        },
    }
    mock_response = HtmlResponse(url=url)
    mock_response.request = Request(url, meta=meta)

    with mock.patch('random.random', return_value='random_cookiejar'):
        spider = Spider()
        request = spider.parse_drug_details_or_overview(mock_response)

    assert request.url == meta['original_url']
    assert request.cookies == meta['original_cookies']
    assert request.dont_filter
    assert request.callback == spider.parse_drug_details_or_overview
    assert request.meta['cookiejar'] == 'random_cookiejar'
Example #14
Source File: test_wandering_spider.py From scrapy-cluster with MIT License | 6 votes |
def evaluate(self, meta_object, text, expected_raw, expected_requests):
    request = Request(url='http://www.drudgereport.com', meta=meta_object)
    response = HtmlResponse('drudge.url', body=text, request=request, encoding='utf8')

    raw_item_count = 0
    request_count = 0

    for x in self.spider.parse(response):
        if isinstance(x, RawResponseItem):
            raw_item_count = raw_item_count + 1
        elif isinstance(x, Request):
            request_count = request_count + 1

    self.assertEqual(raw_item_count, expected_raw)
    self.assertEqual(request_count, expected_requests)
Example #15
Source File: middlewares.py From oh-my-rss with MIT License | 6 votes |
def process_request(self, request, spider):
    # Called for each request that goes through the downloader
    # middleware.

    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    if spider.browser:
        request.meta['browser'] = self.browser  # to access driver from response
        self.browser.get(request.url)
        # wait js eval
        time.sleep(15)
        body = to_bytes(self.browser.page_source)  # body must be of type bytes
        return HtmlResponse(self.browser.current_url, body=body, encoding='utf-8', request=request)
    else:
        return None
Example #16
Source File: middlewares.py From vrequest with MIT License | 6 votes |
def process_request(self, request, spider):
    try:
        self.webdriver.get(url=request.url)
        time.sleep(2)
        # Optional explicit-wait code to make the browser usage more efficient:
        # from selenium.webdriver.common.by import By
        # from selenium.webdriver.support import expected_conditions as EC
        # from selenium.webdriver.support.wait import WebDriverWait as wbw
        # locator = (By.XPATH, '//img[@class="focus-item-img"]')
        # # wbw(self.webdriver,10).until(EC.presence_of_element_located(locator))  # wait until the element is attached to the DOM tree
        # wbw(self.webdriver,10).until(EC.visibility_of_element_located(locator))  # wait until the element is in the DOM and visible, i.e. width and height are both > 0
        current_url = self.webdriver.current_url
        page_source = self.webdriver.page_source
    except Exception as e:
        return self._parse_selenium_temp_exceptions(request, spider, e)
    # If the request ran into an exceptional case (a captcha, a forced re-login, etc.),
    # check here whether page_source reflects that state and handle the re-login or other recovery.
    h = HtmlResponse(
        url=current_url,
        headers={'Selenium': 'Selenium cannot get a certain headers, This is the information created automatically by middleware.'},
        body=page_source,
        encoding='utf-8',
        request=request
    )
    return h
Example #17
Source File: wemp.py From oh-my-rss with MIT License | 6 votes |
def parse_ershicimi_page(rsp):
    """
    Parse https://www.ershicimi.com/p/3e250905e46b0827af501c19c1c3f2ed

    :param rsp:
    :return:
    """
    response = HtmlResponse(url=rsp.url, body=rsp.text, encoding='utf8')

    title = response.selector.xpath('//h1[@class="article-title"]/text()').extract_first().strip()
    author = response.selector.xpath('//div[@class="article-sub"]//a/text()').extract_first().strip()

    try:
        content = response.selector.xpath('//div[@id="js_content"]').extract_first().strip()
    except:
        content = response.selector.xpath('//div[@class="abstract"]').extract_first().strip()

    return title, author, content
Example #18
Source File: middleware.py From daywatch with MIT License | 5 votes |
def process_request(self, request, spider):
    if spider.USE_SELENIUM:
        url = request._get_url()
        self.driver.get(url)
        return HtmlResponse(url, body=self.driver.page_source, encoding='utf-8')
Example #19
Source File: middlewares.py From SourceCodeOfBook with MIT License | 5 votes |
def process_request(self, request, spider):
    if spider.name == 'seleniumSpider':
        self.driver.get(request.url)
        time.sleep(2)
        body = self.driver.page_source
        return HtmlResponse(self.driver.current_url, body=body, encoding='utf-8', request=request)
Example #20
Source File: pyppeteer.py From Gerapy with MIT License | 5 votes |
def process_request(self, request, spider):
    """
    :param request: request object
    :param spider: spider object
    :return: HtmlResponse
    """
    if request.meta.get('render'):
        try:
            html, result, status = self.render(request.url, **self.args)
            return HtmlResponse(url=request.url, body=html, request=request,
                                encoding='utf-8', status=status)
        except websockets.exceptions.ConnectionClosed:
            pass
Example #21
Source File: test_magicfields.py From scrapy-magicfields with BSD 3-Clause "New" or "Revised" License | 5 votes |
def setUp(self):
    self.environ = os.environ.copy()
    self.spider = Spider('myspider', arg1='val1', start_urls=["http://example.com"])

    def _log(x):
        print(x)

    self.spider.log = _log

    self.response = HtmlResponse(body=b"<html></html>", url="http://www.example.com/product/8798732")
    self.item = TestItem({'nom': 'myitem', 'prix': "56.70 euros",
                          "url": "http://www.example.com/product.html?item_no=345"})
Example #22
Source File: cssbench.py From scrapy-bench with MIT License | 5 votes |
def main():
    total = 0
    time = 0

    tar = tarfile.open("bookfiles.tar.gz")
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()

        response = HtmlResponse(url="local", body=html, encoding='utf8')

        start = timer()
        rating = response.css(
            'p.star-rating::attr(class)').extract_first().split(' ')[-1]
        title = response.css('.product_main h1::text').extract_first()
        price = response.css(
            '.product_main p.price_color::text').re_first('£(.*)')
        stock = ''.join(
            response.css('.product_main .instock.availability ::text').re('(\d+)'))
        category = ''.join(
            response.css('ul.breadcrumb li:nth-last-child(2) ::text').extract()).strip()
        end = timer()

        page = [rating, title, price, stock, category]

        total = total + 1
        time = time + end - start

    print("\nTotal number of pages extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} pages/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time))))
Example #23
Source File: broadspider.py From scrapy-bench with MIT License | 5 votes |
def _extract_requests(self, response):
    r = []
    if isinstance(response, HtmlResponse):
        links = self.link_extractor.extract_links(response)
        r.extend(Request(x.url, callback=self.parse) for x in links)
    return r
Example #24
Source File: xpathbench.py From scrapy-bench with MIT License | 5 votes |
def main():
    total = 0
    time = 0

    tar = tarfile.open("bookfiles.tar.gz")
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()

        response = HtmlResponse(url="local", body=html, encoding='utf8')

        start = timer()
        rating = response.xpath(
            "//*[@id='content_inner']/article/div[1]/div[2]/p[3]/i[1]").extract(),
        # .split(' ')[-1],
        title = response.xpath(
            "//*[@id=('content_inner')]/article/div[1]/div[2]/h1").extract(),
        price = response.xpath(
            "//*[@id=('content_inner')]/article/div[1]/div[2]/p[1]"),
        stock = ''.join(response.xpath(
            "//*[@id=('content_inner')]/article/div[1]/div[2]/p[2]").re('(\d+)')),
        end = timer()

        page = [rating, title, price, stock]

        total = total + 1
        time = time + end - start

    print("\nTotal number of pages extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} pages/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time))))
Example #25
Source File: linksys.py From scraper with MIT License | 5 votes |
def parse_kb(self, response):
    mib = None

    # need to perform some nasty segmentation because different firmware
    # versions are not clearly separated
    # reverse order to get MIB before firmware items
    for entry in reversed(response.xpath(
            "//div[@id='support-article-downloads']/div/p")):
        for segment in reversed(entry.extract().split("<br><br>")):
            resp = HtmlResponse(
                url=response.url, body=segment, encoding=response.encoding)
            for href in resp.xpath("//a/@href").extract():
                text = resp.xpath("//text()").extract()

                if "MIBs" in href:
                    mib = href
                elif "firmware" in href:
                    text = resp.xpath("//text()").extract()
                    item = FirmwareLoader(
                        item=FirmwareImage(), response=resp,
                        date_fmt=["%m/%d/%Y"])
                    item.add_value("date", item.find_date(text))
                    item.add_xpath("url", "//a/@href")
                    item.add_value("mib", mib)
                    item.add_value("product", response.meta["product"])
                    item.add_value("vendor", self.name)
                    item.add_value(
                        "version", FirmwareLoader.find_version_period(text))
                    yield item.load_item()
Example #26
Source File: itemloader.py From scrapy-bench with MIT License | 5 votes |
def main():
    total = 0
    time = 0

    tar = tarfile.open("bookfiles.tar.gz")
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()

        response = HtmlResponse(url="local", body=html, encoding='utf8')

        for i in xrange(0, 10):
            start = timer()
            loader = ItemLoader(item=ItemloaderItem(), response=response)
            loader.add_xpath(
                'rating', '//*[@id="content_inner"]/article/div[1]/div[2]/p[3]/i[1]')
            loader.add_xpath(
                'title', '//*[@id=("content_inner")]/article/div[1]/div[2]/h1')
            loader.add_xpath(
                'price', '//*[@id=("content_inner")]/article/div[1]/div[2]/p[1]')
            loader.add_css('stock', '.product_main .instock.availability ::text')
            loader.add_css('category', 'ul.breadcrumb li:nth-last-child(2) ::text')
            loader.add_value('name', 'item {}'.format(i))
            loader.add_value('url', 'http://site.com/item{}'.format(i))
            product = loader.load_item()
            end = timer()

            total += 1
            time = time + end - start

    print("\nTotal number of items extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} items/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time))))
Example #27
Source File: followall.py From scrapy-bench with MIT License | 5 votes |
def _extract_requests(self, response):
    r = []
    if isinstance(response, HtmlResponse):
        links = self.link_extractor.extract_links(response)
        r.extend(Request(x.url, callback=self.parse) for x in links)
    return r
Example #28
Source File: middlewares.py From Spiders with Apache License 2.0 | 5 votes |
def process_request(self, request, spider):
    if isinstance(request, FlhhkkRequest):
        html = spider.scrapper.get(request.url).text
        # Build the Response.
        # This Response is what your spider will go on to process.
        return HtmlResponse(url=request.url, request=request, body=html.encode(), encoding="utf-8")
    return None
Example #29
Source File: test_fda_dap.py From collectors with MIT License | 5 votes |
def test_parse_drug_details_or_overview_delegates_to_parse_drug_details_when_response_in_drug_overview(self):
    url = 'http://www.accessdata.fda.gov/scripts/cder/drugsatfda/index.cfm?fuseaction=Search.Overview&DrugName=E-BASE'
    mock_response = HtmlResponse(url=url)
    expected_result = 'expected_result'

    with mock.patch.object(Spider, 'parse_drug_overview', return_value=expected_result) as mock_method:
        spider = Spider()
        result = spider.parse_drug_details_or_overview(mock_response)

    mock_method.assert_called_once_with(mock_response)
    assert result == expected_result
Example #30
Source File: belkin.py From scraper with MIT License | 5 votes |
def parse_kb(self, response):
    # initial html tokenization to find regions segmented by e.g. "======"
    # or "------"
    filtered = response.xpath(
        "//div[@class='sfdc_richtext']").extract()[0].split("=-")

    for entry in [x and x.strip() for x in filtered]:
        resp = HtmlResponse(url=response.url, body=entry,
                            encoding=response.encoding)
        for link in resp.xpath("//a"):
            href = link.xpath("@href").extract()[0]
            if "cache-www" in href:
                text = resp.xpath("//text()").extract()
                text_next = link.xpath("following::text()").extract()

                item = FirmwareLoader(item=FirmwareImage(), response=response,
                                      date_fmt=["%b %d, %Y", "%B %d, %Y", "%m/%d/%Y"])

                version = FirmwareLoader.find_version_period(text_next)
                if not version:
                    version = FirmwareLoader.find_version_period(text)

                item.add_value("version", version)
                item.add_value("date", item.find_date(text))
                item.add_value("url", href)
                item.add_value("product", response.meta["product"])
                item.add_value("vendor", self.name)
                yield item.load_item()