Python scrapy.exceptions.IgnoreRequest() Examples
The following are 30 code examples of scrapy.exceptions.IgnoreRequest(). You can vote up the ones you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.exceptions, or try the search function.
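Before the examples, here is a minimal sketch of the typical usage pattern: a downloader middleware raises IgnoreRequest from process_request to make Scrapy silently drop a request (the request's errback, if any, then receives the exception). The SkipSeenUrlsMiddleware class and its seen_urls set below are illustrative names only, not taken from any of the projects listed here.

from scrapy.exceptions import IgnoreRequest


class SkipSeenUrlsMiddleware:
    """Illustrative downloader middleware: drop requests whose URL was seen before."""

    def __init__(self):
        self.seen_urls = set()

    def process_request(self, request, spider):
        if request.url in self.seen_urls:
            # Raising IgnoreRequest tells Scrapy to discard this request;
            # the request's errback (if defined) is called with the exception.
            raise IgnoreRequest('Already seen: %s' % request.url)
        self.seen_urls.add(request.url)
        return None  # let other middlewares and the downloader proceed

A middleware like this would be enabled through the DOWNLOADER_MIDDLEWARES setting; the examples below show how real projects raise and handle IgnoreRequest in middlewares, pipelines, and tests.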
Example #1
Source File: middlewares.py From freshonions-torscraper with GNU Affero General Public License v3.0 | 6 votes |
def process_request(self, request, spider):
    parsed_url = urlparse.urlparse(request.url)
    if not self.test_mode or not parsed_url.path in ["/", ""]:
        return None
    if not Domain.is_onion_url(request.url):
        return None
    d = Domain.find_by_url(request.url)
    if d is None:
        return None
    now = datetime.now()
    if now > d.next_scheduled_check:
        return None
    else:
        raise IgnoreRequest('FilterNotScheduledMiddleware: %s is not scheduled to check' % d.host)
Example #2
Source File: middlewares.py From freshonions-torscraper with GNU Affero General Public License v3.0 | 6 votes |
def process_request(self, request, spider):
    # don't use this middleware while testing is site is up
    if hasattr(spider, "test") and spider.test == "yes":
        #logger = logging.getLogger()
        #logger.info("Testing mode, dead domains disabled")
        return None
    if not Domain.is_onion_url(request.url):
        return None
    domain = Domain.find_by_url(request.url)
    if not domain or domain.is_up:
        return None
    raise IgnoreRequest('Domain %s is dead, skipping' % domain.host)
Example #3
Source File: redirect.py From learn_python3_spider with MIT License | 6 votes |
def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1

    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + \
            [reason]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                     {'reason': reason, 'redirected': redirected, 'request': request},
                     extra={'spider': spider})
        return redirected
    else:
        logger.debug("Discarding %(request)s: max redirections reached",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest("max redirections reached")
Example #4
Source File: middlewares.py From scrapy-autoextract with BSD 3-Clause "New" or "Revised" License | 6 votes |
def process_exception(self, request, exception, spider):
    if isinstance(exception, (IgnoreRequest, DropItem)):
        return
    if not self._is_enabled_for_request(request):
        return

    autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
    stop_time = time.time()
    latency = time.time() - autoextract['timing']['start_ts']
    autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})

    # Make sure to log all unknown failures
    logger.warning('AutoExtract failure after %.3fs for %s: %s',
                   latency,
                   autoextract['original_url'],
                   repr(exception),
                   extra={'spider': spider})

    request.meta['autoextract'] = autoextract
    ex_class = global_object_name(exception.__class__)
    self.inc_metric('autoextract/errors/total_count', spider=spider)
    self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider)
Example #5
Source File: shell.py From learn_python3_spider with MIT License | 6 votes |
def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
    if isinstance(request_or_url, Request):
        request = request_or_url
    else:
        url = any_to_uri(request_or_url)
        request = Request(url, dont_filter=True, **kwargs)
        if redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True
    response = None
    try:
        response, spider = threads.blockingCallFromThread(
            reactor, self._schedule, request, spider)
    except IgnoreRequest:
        pass
    self.populate_vars(response, request, spider)
Example #6
Source File: scraper.py From learn_python3_spider with MIT License | 6 votes |
def _log_download_errors(self, spider_failure, download_failure, request, spider):
    """Log and silence errors that come from the engine (typically download
    errors that got propagated thru here)
    """
    if (isinstance(download_failure, Failure) and
            not download_failure.check(IgnoreRequest)):
        if download_failure.frames:
            logger.error('Error downloading %(request)s',
                         {'request': request},
                         exc_info=failure_to_exc_info(download_failure),
                         extra={'spider': spider})
        else:
            errmsg = download_failure.getErrorMessage()
            if errmsg:
                logger.error('Error downloading %(request)s: %(errmsg)s',
                             {'request': request, 'errmsg': errmsg},
                             extra={'spider': spider})

    if spider_failure is not download_failure:
        return spider_failure
Example #7
Source File: redirect.py From learn_python3_spider with MIT License | 6 votes |
def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1

    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + \
            [reason]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                     {'reason': reason, 'redirected': redirected, 'request': request},
                     extra={'spider': spider})
        return redirected
    else:
        logger.debug("Discarding %(request)s: max redirections reached",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest("max redirections reached")
Example #8
Source File: shell.py From learn_python3_spider with MIT License | 6 votes |
def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
    if isinstance(request_or_url, Request):
        request = request_or_url
    else:
        url = any_to_uri(request_or_url)
        request = Request(url, dont_filter=True, **kwargs)
        if redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True
    response = None
    try:
        response, spider = threads.blockingCallFromThread(
            reactor, self._schedule, request, spider)
    except IgnoreRequest:
        pass
    self.populate_vars(response, request, spider)
Example #9
Source File: robotstxt.py From learn_python3_spider with MIT License | 5 votes |
def _robots_error(self, failure, netloc):
    if failure.type is not IgnoreRequest:
        key = 'robotstxt/exception_count/{}'.format(failure.type)
        self.crawler.stats.inc_value(key)
    rp_dfd = self._parsers[netloc]
    self._parsers[netloc] = None
    rp_dfd.callback(None)
Example #10
Source File: middlewares.py From freshonions-torscraper with GNU Affero General Public License v3.0 | 5 votes |
def process_request(self, request, spider):
    parsed_url = urlparse.urlparse(request.url)
    host = parsed_url.hostname
    if self.counter[host] < self.max_pages:
        self.counter[host] += 1
        spider.logger.info('Page count is %d for %s' % (self.counter[host], host))
        return None
    else:
        raise IgnoreRequest('MAX_PAGES_PER_DOMAIN reached, filtered %s' % request.url)
Example #11
Source File: middlewares.py From freshonions-torscraper with GNU Affero General Public License v3.0 | 5 votes |
def process_request(self, request, spider):
    if not Domain.is_onion_url(request.url):
        return None
    parsed_url = urlparse.urlparse(request.url)
    host = parsed_url.hostname
    subdomains = host.count(".")
    if subdomains > 2:
        raise IgnoreRequest('Too many subdomains (%d > 2)' % subdomains)
    return None
Example #12
Source File: test_pagestorage.py From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_process_spider_exception(self):
    assert self.instance.counters == {'all': 0, 'error': 0}
    self.instance.save_response = mock.Mock()
    # all conditions are true
    self.instance.on_error_enabled = True
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 1}
    # on_error flag is disabled, skipping
    self.instance.on_error_enabled = False
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 1}
    # exceeded error limit
    self.instance.on_error_enabled = True
    self.instance.counters['error'] = 11
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 11}
    # skip IgnoreRequest
    self.instance.limits['error'] = 12
    self.instance.process_spider_exception(
        'err-response', IgnoreRequest(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 11}
    # all conditions are true again
    self.instance.limits['all'] = 12
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 12}
Example #13
Source File: scrapy_pagestorage.py From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def process_spider_exception(self, response, exception, spider):
    if (self.on_error_enabled and
            not isinstance(exception, IgnoreRequest) and
            self.counters['error'] < self.limits['error']):
        self.counters['error'] += 1
        self.save_response(response, spider)
Example #14
Source File: middlewares.py From scrapy-crawl-once with MIT License | 5 votes |
def process_request(self, request, spider):
    if not request.meta.get('crawl_once', self.default):
        return
    if self._get_key(request) in self.db:
        self.stats.inc_value('crawl_once/ignored')
        raise IgnoreRequest()
Example #15
Source File: middlewares.py From NewsCrawler with MIT License | 5 votes |
def process_request(self, request, spider):
    if request.url not in spider.start_urls and (redis_conn.hexists(redis_url_key, request.url) or
                                                 redis_conn.hexists(redis_invalid_url_key, request.url)):
        logger.info("Skip URL: %s, has been crawled" % request.url)
        raise IgnoreRequest("URL %s has been crawled" % request.url)
Example #16
Source File: files.py From learn_python3_spider with MIT License | 5 votes |
def media_failed(self, failure, request, info):
    if not isinstance(failure.value, IgnoreRequest):
        referer = referer_str(request)
        logger.warning(
            'File (unknown-error): Error downloading %(medianame)s from '
            '%(request)s referred in <%(referer)s>: %(exception)s',
            {'medianame': self.MEDIA_NAME, 'request': request,
             'referer': referer, 'exception': failure.value},
            extra={'spider': info.spider}
        )

    raise FileException
Example #17
Source File: httpcache.py From learn_python3_spider with MIT License | 5 votes |
def process_request(self, request, spider):
    if request.meta.get('dont_cache', False):
        return

    # Skip uncacheable requests
    if not self.policy.should_cache_request(request):
        request.meta['_dont_cache'] = True  # flag as uncacheable
        return

    # Look for cached response and check if expired
    cachedresponse = self.storage.retrieve_response(spider, request)
    if cachedresponse is None:
        self.stats.inc_value('httpcache/miss', spider=spider)
        if self.ignore_missing:
            self.stats.inc_value('httpcache/ignore', spider=spider)
            raise IgnoreRequest("Ignored request not in cache: %s" % request)
        return  # first time request

    # Return cached response only if not expired
    cachedresponse.flags.append('cached')
    if self.policy.is_cached_response_fresh(cachedresponse, request):
        self.stats.inc_value('httpcache/hit', spider=spider)
        return cachedresponse

    # Keep a reference to cached response to avoid a second cache lookup on
    # process_response hook
    request.meta['cached_response'] = cachedresponse
Example #18
Source File: middleware.py From sozlukcrawler with GNU General Public License v2.0 | 5 votes |
def process_request(self, request, spider):
    if 'x-ignore-request' in request.url:
        raise IgnoreRequest()
    elif 'x-error-request' in request.url:
        _ = 1 / 0
Example #19
Source File: robotstxt.py From learn_python3_spider with MIT License | 5 votes |
def _logerror(self, failure, request, spider):
    if failure.type is not IgnoreRequest:
        logger.error("Error downloading %(request)s: %(f_exception)s",
                     {'request': request, 'f_exception': failure.value},
                     exc_info=failure_to_exc_info(failure),
                     extra={'spider': spider})
    return failure
Example #20
Source File: defer.py From learn_python3_spider with MIT License | 5 votes |
def mustbe_deferred(f, *args, **kw):
    """Same as twisted.internet.defer.maybeDeferred, but delay calling
    callback/errback to next reactor loop
    """
    try:
        result = f(*args, **kw)
    # FIXME: Hack to avoid introspecting tracebacks. This to speed up
    # processing of IgnoreRequest errors which are, by far, the most common
    # exception in Scrapy - see #125
    except IgnoreRequest as e:
        return defer_fail(failure.Failure(e))
    except Exception:
        return defer_fail(failure.Failure())
    else:
        return defer_result(result)
Example #21
Source File: files.py From learn_python3_spider with MIT License | 5 votes |
def media_failed(self, failure, request, info):
    if not isinstance(failure.value, IgnoreRequest):
        referer = referer_str(request)
        logger.warning(
            'File (unknown-error): Error downloading %(medianame)s from '
            '%(request)s referred in <%(referer)s>: %(exception)s',
            {'medianame': self.MEDIA_NAME, 'request': request,
             'referer': referer, 'exception': failure.value},
            extra={'spider': info.spider}
        )

    raise FileException
Example #22
Source File: robotstxt.py From learn_python3_spider with MIT License | 5 votes |
def _robots_error(self, failure, netloc):
    if failure.type is not IgnoreRequest:
        key = 'robotstxt/exception_count/{}'.format(failure.type)
        self.crawler.stats.inc_value(key)
    rp_dfd = self._parsers[netloc]
    self._parsers[netloc] = None
    rp_dfd.callback(None)
Example #23
Source File: robotstxt.py From learn_python3_spider with MIT License | 5 votes |
def _logerror(self, failure, request, spider):
    if failure.type is not IgnoreRequest:
        logger.error("Error downloading %(request)s: %(f_exception)s",
                     {'request': request, 'f_exception': failure.value},
                     exc_info=failure_to_exc_info(failure),
                     extra={'spider': spider})
    return failure
Example #24
Source File: robotstxt.py From learn_python3_spider with MIT License | 5 votes |
def process_request_2(self, rp, request, spider):
    if rp is None:
        return
    if not rp.can_fetch(to_native_str(self._useragent), request.url):
        logger.debug("Forbidden by robots.txt: %(request)s",
                     {'request': request}, extra={'spider': spider})
        self.crawler.stats.inc_value('robotstxt/forbidden')
        raise IgnoreRequest("Forbidden by robots.txt")
Example #25
Source File: defer.py From learn_python3_spider with MIT License | 5 votes |
def mustbe_deferred(f, *args, **kw):
    """Same as twisted.internet.defer.maybeDeferred, but delay calling
    callback/errback to next reactor loop
    """
    try:
        result = f(*args, **kw)
    # FIXME: Hack to avoid introspecting tracebacks. This to speed up
    # processing of IgnoreRequest errors which are, by far, the most common
    # exception in Scrapy - see #125
    except IgnoreRequest as e:
        return defer_fail(failure.Failure(e))
    except Exception:
        return defer_fail(failure.Failure())
    else:
        return defer_result(result)
Example #26
Source File: anti_spider.py From news_spider with MIT License | 5 votes |
def process_request(self, request, spider):
    # Handle WeChat anti-crawling (anti-spider mechanism 1, sogou)
    if spider.name in ['weixin'] and 'antispider' in request.url:
        # Get the referring URLs
        redirect_urls = request.meta['redirect_urls']
        # Clear the invalid cookies
        cookies_id = request.meta['cookiejar']
        del_cookies(spider.name, cookies_id)
        # spider.log(message='AntiSpider cookies_id: %s; url: %s' % (cookies_id, redirect_urls[0]))
        raise IgnoreRequest(
            'Spider: %s, AntiSpider cookies_id: %s; url: %s' % (spider.name, cookies_id, redirect_urls[0]))
Example #27
Source File: de_duplication_request.py From news_spider with MIT License | 5 votes |
def process_request(self, request, spider):
    if not request.url:
        return None
    channel_id = request.meta.get('channel_id', 0)
    # Handle detail pages (skip list pages), in coordination with the pipeline
    if is_dup_detail(request.url, spider.name, channel_id):
        raise IgnoreRequest("Spider: %s, DeDuplicationRequest: %s" % (spider.name, request.url))
Example #28
Source File: middleware.py From sozlukcrawler with GNU General Public License v2.0 | 5 votes |
def process_response(self, request, response, spider):
    if 'x-ignore-response' in request.url:
        raise IgnoreRequest()
    elif 'x-error-response' in request.url:
        _ = 1 / 0
    else:
        return response
Example #29
Source File: test_middleware.py From MaybeDont with MIT License | 4 votes |
def test_middleware():
    Rq = lambda path: Request(
        'http://example.com{}'.format(path),
        meta={'avoid_dup_content': True})
    Rs = lambda req, body: HtmlResponse(
        req.url, body=body.encode(), request=req)
    mw = AvoidDupContentMiddleware(
        initial_queue_limit=1, threshold=0.5, exploration=0.00)
    spider = Spider()
    req = Rq('/')
    mw.process_request(req, spider)
    mw.process_response(req, Rs(req, ''), spider)
    assert mw.dupe_predictor
    n_dropped = 0
    for i in range(10):
        req = Rq('/viewtopic.php?topic_id={}'.format(i))
        mw.process_request(req, spider)
        mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
        req = Rq('/viewtopic.php?topic_id={}&start=0'.format(i))
        try:
            mw.process_request(req, spider)
        except IgnoreRequest:
            n_dropped += 1
        else:
            mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
    mw.dupe_predictor.log_dupstats(min_dup=0)
    assert n_dropped == 5
    # one request in different order
    req = Rq('/viewtopic.php?topic_id=100&start=0')
    mw.process_request(req, spider)
    mw.process_response(req, Rs(req, ''), spider)
    mw.process_request(Rq('/viewtopic.php?topic_id=200'), spider)
    with pytest.raises(IgnoreRequest):
        mw.process_request(Rq('/viewtopic.php?topic_id=100'), spider)
    # test exploration
    mw.exploration = 0.5
    n_dropped = 0
    n_requests = 0
    for i in range(150, 170):
        req = Rq('/viewtopic.php?topic_id={}'.format(i))
        mw.process_request(req, spider)
        mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
        req = Rq('/viewtopic.php?topic_id={}&start=0'.format(i))
        n_requests += 1
        try:
            mw.process_request(req, spider)
        except IgnoreRequest:
            n_dropped += 1
        else:
            mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
    assert n_dropped > 0
    assert n_dropped < n_requests
Example #30
Source File: test_middleware.py From scrapy-crawl-once with MIT License | 4 votes |
def test_crawl(tmpdir):
    settings = {'CRAWL_ONCE_PATH': str(tmpdir)}
    crawler = get_crawler(settings_dict=settings)
    req1 = scrapy.Request('http://example.com/1', meta={'crawl_once': True})
    req2 = scrapy.Request('http://example.com/2')
    req3 = scrapy.Request('http://example.com/3', meta={'crawl_once': True})
    resp1 = Response(req1.url, request=req1)
    resp2 = Response(req2.url, request=req2)

    with opened_middleware(crawler) as mw:
        # 1. check spider middleware interface
        assert len(mw.db) == 0
        assert crawler.stats.get_value('crawl_once/initial') == 0
        output = [{}, scrapy.Request('http://example.com')]

        # crawl_once is False
        res = list(mw.process_spider_output(resp2, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 0

        # crawl_once is True
        res = list(mw.process_spider_output(resp1, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0
        assert crawler.stats.get_value('crawl_once/stored') == 1

        # 2. check downloader middleware interface
        assert mw.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0

        with pytest.raises(IgnoreRequest):
            mw.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1

        assert mw.process_request(req3, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0

    crawler = get_crawler(settings_dict=settings)
    with opened_middleware(crawler) as mw2:
        # it reuses the same file, so there are records
        assert len(mw2.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 1
        assert mw2.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0
        with pytest.raises(IgnoreRequest):
            mw2.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert mw2.process_request(req3, crawler.spider) is None