Python scrapy.exceptions.IgnoreRequest() Examples
The following are 30 code examples of scrapy.exceptions.IgnoreRequest(). You can vote up the ones you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.exceptions, or try the search function.
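Before the examples, here is a minimal sketch of the typical usage pattern: a downloader middleware raises IgnoreRequest from process_request to make Scrapy silently drop a request (the request's errback, if any, then receives the exception). The SkipSeenUrlsMiddleware class and its seen_urls set below are illustrative names only, not taken from any of the projects listed here.

from scrapy.exceptions import IgnoreRequest


class SkipSeenUrlsMiddleware:
    """Illustrative downloader middleware: drop requests whose URL was seen before."""

    def __init__(self):
        self.seen_urls = set()

    def process_request(self, request, spider):
        if request.url in self.seen_urls:
            # Raising IgnoreRequest tells Scrapy to discard this request;
            # the request's errback (if defined) is called with the exception.
            raise IgnoreRequest('Already seen: %s' % request.url)
        self.seen_urls.add(request.url)
        return None  # let other middlewares and the downloader proceed

A middleware like this would be enabled through the DOWNLOADER_MIDDLEWARES setting; the examples below show how real projects raise and handle IgnoreRequest in middlewares, pipelines, and tests.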
Example #1
Source File: middlewares.py From freshonions-torscraper with GNU Affero General Public License v3.0 | 6 votes |
def process_request(self, request, spider):
    parsed_url = urlparse.urlparse(request.url)
    if not self.test_mode or not parsed_url.path in ["/", ""]:
        return None
    if not Domain.is_onion_url(request.url):
        return None
    d = Domain.find_by_url(request.url)
    if d is None:
        return None
    now = datetime.now()
    if now > d.next_scheduled_check:
        return None
    else:
        raise IgnoreRequest('FilterNotScheduledMiddleware: %s is not scheduled to check' % d.host)
Example #2
Source File: middlewares.py From freshonions-torscraper with GNU Affero General Public License v3.0 | 6 votes |
def process_request(self, request, spider):
    # don't use this middleware while testing is site is up
    if hasattr(spider, "test") and spider.test == "yes":
        #logger = logging.getLogger()
        #logger.info("Testing mode, dead domains disabled")
        return None
    if not Domain.is_onion_url(request.url):
        return None
    domain = Domain.find_by_url(request.url)
    if not domain or domain.is_up:
        return None
    raise IgnoreRequest('Domain %s is dead, skipping' % domain.host)
Example #3
Source File: redirect.py From learn_python3_spider with MIT License | 6 votes |
def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1

    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + \
            [reason]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                     {'reason': reason, 'redirected': redirected, 'request': request},
                     extra={'spider': spider})
        return redirected
    else:
        logger.debug("Discarding %(request)s: max redirections reached",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest("max redirections reached")
Example #4
Source File: middlewares.py From scrapy-autoextract with BSD 3-Clause "New" or "Revised" License | 6 votes |
def process_exception(self, request, exception, spider):
    if isinstance(exception, (IgnoreRequest, DropItem)):
        return
    if not self._is_enabled_for_request(request):
        return

    autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
    stop_time = time.time()
    latency = time.time() - autoextract['timing']['start_ts']
    autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})

    # Make sure to log all unknown failures
    logger.warning('AutoExtract failure after %.3fs for %s: %s',
                   latency,
                   autoextract['original_url'],
                   repr(exception),
                   extra={'spider': spider})

    request.meta['autoextract'] = autoextract
    ex_class = global_object_name(exception.__class__)
    self.inc_metric('autoextract/errors/total_count', spider=spider)
    self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider)
Example #5
Source File: shell.py From learn_python3_spider with MIT License | 6 votes |
def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
    if isinstance(request_or_url, Request):
        request = request_or_url
    else:
        url = any_to_uri(request_or_url)
        request = Request(url, dont_filter=True, **kwargs)
        if redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True
    response = None
    try:
        response, spider = threads.blockingCallFromThread(
            reactor, self._schedule, request, spider)
    except IgnoreRequest:
        pass
    self.populate_vars(response, request, spider)
Example #6
Source File: scraper.py From learn_python3_spider with MIT License | 6 votes |
def _log_download_errors(self, spider_failure, download_failure, request, spider):
    """Log and silence errors that come from the engine (typically download
    errors that got propagated thru here)
    """
    if (isinstance(download_failure, Failure) and
            not download_failure.check(IgnoreRequest)):
        if download_failure.frames:
            logger.error('Error downloading %(request)s',
                         {'request': request},
                         exc_info=failure_to_exc_info(download_failure),
                         extra={'spider': spider})
        else:
            errmsg = download_failure.getErrorMessage()
            if errmsg:
                logger.error('Error downloading %(request)s: %(errmsg)s',
                             {'request': request, 'errmsg': errmsg},
                             extra={'spider': spider})

    if spider_failure is not download_failure:
        return spider_failure
Example #7
Source File: redirect.py From learn_python3_spider with MIT License | 6 votes |
def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1

    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + \
            [reason]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                     {'reason': reason, 'redirected': redirected, 'request': request},
                     extra={'spider': spider})
        return redirected
    else:
        logger.debug("Discarding %(request)s: max redirections reached",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest("max redirections reached")
Example #8
Source File: shell.py From learn_python3_spider with MIT License | 6 votes |
def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
    if isinstance(request_or_url, Request):
        request = request_or_url
    else:
        url = any_to_uri(request_or_url)
        request = Request(url, dont_filter=True, **kwargs)
        if redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True
    response = None
    try:
        response, spider = threads.blockingCallFromThread(
            reactor, self._schedule, request, spider)
    except IgnoreRequest:
        pass
    self.populate_vars(response, request, spider)
Example #9
Source File: robotstxt.py From learn_python3_spider with MIT License | 5 votes |
def _robots_error(self, failure, netloc):
    if failure.type is not IgnoreRequest:
        key = 'robotstxt/exception_count/{}'.format(failure.type)
        self.crawler.stats.inc_value(key)
    rp_dfd = self._parsers[netloc]
    self._parsers[netloc] = None
    rp_dfd.callback(None)
Example #10
Source File: middlewares.py From freshonions-torscraper with GNU Affero General Public License v3.0 | 5 votes |
def process_request(self, request, spider):
    parsed_url = urlparse.urlparse(request.url)
    host = parsed_url.hostname
    if self.counter[host] < self.max_pages:
        self.counter[host] += 1
        spider.logger.info('Page count is %d for %s' % (self.counter[host], host))
        return None
    else:
        raise IgnoreRequest('MAX_PAGES_PER_DOMAIN reached, filtered %s' % request.url)
Example #11
Source File: middlewares.py From freshonions-torscraper with GNU Affero General Public License v3.0 | 5 votes |
def process_request(self, request, spider):
    if not Domain.is_onion_url(request.url):
        return None
    parsed_url = urlparse.urlparse(request.url)
    host = parsed_url.hostname
    subdomains = host.count(".")
    if subdomains > 2:
        raise IgnoreRequest('Too many subdomains (%d > 2)' % subdomains)
    return None
Example #12
Source File: test_pagestorage.py From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_process_spider_exception(self):
    assert self.instance.counters == {'all': 0, 'error': 0}
    self.instance.save_response = mock.Mock()
    # all conditions are true
    self.instance.on_error_enabled = True
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 1}
    # on_error flag is disabled, skipping
    self.instance.on_error_enabled = False
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 1}
    # exceeded error limit
    self.instance.on_error_enabled = True
    self.instance.counters['error'] = 11
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 11}
    # skip IgnoreRequest
    self.instance.limits['error'] = 12
    self.instance.process_spider_exception(
        'err-response', IgnoreRequest(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 11}
    # all conditions are true again
    self.instance.limits['all'] = 12
    self.instance.process_spider_exception(
        'err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 12}
Example #13
Source File: scrapy_pagestorage.py From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License | 5 votes |
def process_spider_exception(self, response, exception, spider):
    if (self.on_error_enabled and
            not isinstance(exception, IgnoreRequest) and
            self.counters['error'] < self.limits['error']):
        self.counters['error'] += 1
        self.save_response(response, spider)
Example #14
Source File: middlewares.py From scrapy-crawl-once with MIT License | 5 votes |
def process_request(self, request, spider):
    if not request.meta.get('crawl_once', self.default):
        return
    if self._get_key(request) in self.db:
        self.stats.inc_value('crawl_once/ignored')
        raise IgnoreRequest()
Example #15
Source File: middlewares.py From NewsCrawler with MIT License | 5 votes |
def process_request(self, request, spider):
    if request.url not in spider.start_urls and (redis_conn.hexists(redis_url_key, request.url) or
                                                 redis_conn.hexists(redis_invalid_url_key, request.url)):
        logger.info("Skip URL: %s, has been crawled" % request.url)
        raise IgnoreRequest("URL %s has been crawled" % request.url)
Example #16
Source File: files.py From learn_python3_spider with MIT License | 5 votes |
def media_failed(self, failure, request, info):
    if not isinstance(failure.value, IgnoreRequest):
        referer = referer_str(request)
        logger.warning(
            'File (unknown-error): Error downloading %(medianame)s from '
            '%(request)s referred in <%(referer)s>: %(exception)s',
            {'medianame': self.MEDIA_NAME, 'request': request,
             'referer': referer, 'exception': failure.value},
            extra={'spider': info.spider}
        )

    raise FileException
Example #17
Source File: httpcache.py From learn_python3_spider with MIT License | 5 votes |
def process_request(self, request, spider):
    if request.meta.get('dont_cache', False):
        return

    # Skip uncacheable requests
    if not self.policy.should_cache_request(request):
        request.meta['_dont_cache'] = True  # flag as uncacheable
        return

    # Look for cached response and check if expired
    cachedresponse = self.storage.retrieve_response(spider, request)
    if cachedresponse is None:
        self.stats.inc_value('httpcache/miss', spider=spider)
        if self.ignore_missing:
            self.stats.inc_value('httpcache/ignore', spider=spider)
            raise IgnoreRequest("Ignored request not in cache: %s" % request)
        return  # first time request

    # Return cached response only if not expired
    cachedresponse.flags.append('cached')
    if self.policy.is_cached_response_fresh(cachedresponse, request):
        self.stats.inc_value('httpcache/hit', spider=spider)
        return cachedresponse

    # Keep a reference to cached response to avoid a second cache lookup on
    # process_response hook
    request.meta['cached_response'] = cachedresponse
Example #18
Source File: middleware.py From sozlukcrawler with GNU General Public License v2.0 | 5 votes |
def process_request(self, request, spider):
    if 'x-ignore-request' in request.url:
        raise IgnoreRequest()
    elif 'x-error-request' in request.url:
        _ = 1 / 0
Example #19
Source File: robotstxt.py From learn_python3_spider with MIT License | 5 votes |
def _logerror(self, failure, request, spider):
    if failure.type is not IgnoreRequest:
        logger.error("Error downloading %(request)s: %(f_exception)s",
                     {'request': request, 'f_exception': failure.value},
                     exc_info=failure_to_exc_info(failure),
                     extra={'spider': spider})
    return failure
Example #20
Source File: defer.py From learn_python3_spider with MIT License | 5 votes |
def mustbe_deferred(f, *args, **kw):
    """Same as twisted.internet.defer.maybeDeferred, but delay calling
    callback/errback to next reactor loop
    """
    try:
        result = f(*args, **kw)
    # FIXME: Hack to avoid introspecting tracebacks. This to speed up
    # processing of IgnoreRequest errors which are, by far, the most common
    # exception in Scrapy - see #125
    except IgnoreRequest as e:
        return defer_fail(failure.Failure(e))
    except Exception:
        return defer_fail(failure.Failure())
    else:
        return defer_result(result)
Example #21
Source File: files.py From learn_python3_spider with MIT License | 5 votes |
def media_failed(self, failure, request, info):
    if not isinstance(failure.value, IgnoreRequest):
        referer = referer_str(request)
        logger.warning(
            'File (unknown-error): Error downloading %(medianame)s from '
            '%(request)s referred in <%(referer)s>: %(exception)s',
            {'medianame': self.MEDIA_NAME, 'request': request,
             'referer': referer, 'exception': failure.value},
            extra={'spider': info.spider}
        )

    raise FileException
Example #22
Source File: robotstxt.py From learn_python3_spider with MIT License | 5 votes |
def _robots_error(self, failure, netloc):
    if failure.type is not IgnoreRequest:
        key = 'robotstxt/exception_count/{}'.format(failure.type)
        self.crawler.stats.inc_value(key)
    rp_dfd = self._parsers[netloc]
    self._parsers[netloc] = None
    rp_dfd.callback(None)
Example #23
Source File: robotstxt.py From learn_python3_spider with MIT License | 5 votes |
def _logerror(self, failure, request, spider):
    if failure.type is not IgnoreRequest:
        logger.error("Error downloading %(request)s: %(f_exception)s",
                     {'request': request, 'f_exception': failure.value},
                     exc_info=failure_to_exc_info(failure),
                     extra={'spider': spider})
    return failure
Example #24
Source File: robotstxt.py From learn_python3_spider with MIT License | 5 votes |
def process_request_2(self, rp, request, spider):
    if rp is None:
        return
    if not rp.can_fetch(to_native_str(self._useragent), request.url):
        logger.debug("Forbidden by robots.txt: %(request)s",
                     {'request': request}, extra={'spider': spider})
        self.crawler.stats.inc_value('robotstxt/forbidden')
        raise IgnoreRequest("Forbidden by robots.txt")
Example #25
Source File: defer.py From learn_python3_spider with MIT License | 5 votes |
def mustbe_deferred(f, *args, **kw):
    """Same as twisted.internet.defer.maybeDeferred, but delay calling
    callback/errback to next reactor loop
    """
    try:
        result = f(*args, **kw)
    # FIXME: Hack to avoid introspecting tracebacks. This to speed up
    # processing of IgnoreRequest errors which are, by far, the most common
    # exception in Scrapy - see #125
    except IgnoreRequest as e:
        return defer_fail(failure.Failure(e))
    except Exception:
        return defer_fail(failure.Failure())
    else:
        return defer_result(result)
Example #26
Source File: anti_spider.py From news_spider with MIT License | 5 votes |
def process_request(self, request, spider):
    # Handle WeChat anti-crawling (anti-spider mechanism 1, sogou)
    if spider.name in ['weixin'] and 'antispider' in request.url:
        # Get the referring URLs
        redirect_urls = request.meta['redirect_urls']
        # Clear the invalid cookies
        cookies_id = request.meta['cookiejar']
        del_cookies(spider.name, cookies_id)
        # spider.log(message='AntiSpider cookies_id: %s; url: %s' % (cookies_id, redirect_urls[0]))
        raise IgnoreRequest(
            'Spider: %s, AntiSpider cookies_id: %s; url: %s' % (spider.name, cookies_id, redirect_urls[0]))
Example #27
Source File: de_duplication_request.py From news_spider with MIT License | 5 votes |
def process_request(self, request, spider):
    if not request.url:
        return None
    channel_id = request.meta.get('channel_id', 0)
    # Handle detail pages (skip list pages), in coordination with the pipeline
    if is_dup_detail(request.url, spider.name, channel_id):
        raise IgnoreRequest("Spider: %s, DeDuplicationRequest: %s" % (spider.name, request.url))
Example #28
Source File: middleware.py From sozlukcrawler with GNU General Public License v2.0 | 5 votes |
def process_response(self, request, response, spider):
    if 'x-ignore-response' in request.url:
        raise IgnoreRequest()
    elif 'x-error-response' in request.url:
        _ = 1 / 0
    else:
        return response
Example #29
Source File: test_middleware.py From MaybeDont with MIT License | 4 votes |
def test_middleware():
    Rq = lambda path: Request(
        'http://example.com{}'.format(path),
        meta={'avoid_dup_content': True})
    Rs = lambda req, body: HtmlResponse(
        req.url, body=body.encode(), request=req)
    mw = AvoidDupContentMiddleware(
        initial_queue_limit=1, threshold=0.5, exploration=0.00)
    spider = Spider()
    req = Rq('/')
    mw.process_request(req, spider)
    mw.process_response(req, Rs(req, ''), spider)
    assert mw.dupe_predictor
    n_dropped = 0
    for i in range(10):
        req = Rq('/viewtopic.php?topic_id={}'.format(i))
        mw.process_request(req, spider)
        mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
        req = Rq('/viewtopic.php?topic_id={}&start=0'.format(i))
        try:
            mw.process_request(req, spider)
        except IgnoreRequest:
            n_dropped += 1
        else:
            mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
    mw.dupe_predictor.log_dupstats(min_dup=0)
    assert n_dropped == 5
    # one request in different order
    req = Rq('/viewtopic.php?topic_id=100&start=0')
    mw.process_request(req, spider)
    mw.process_response(req, Rs(req, ''), spider)
    mw.process_request(Rq('/viewtopic.php?topic_id=200'), spider)
    with pytest.raises(IgnoreRequest):
        mw.process_request(Rq('/viewtopic.php?topic_id=100'), spider)
    # test exploration
    mw.exploration = 0.5
    n_dropped = 0
    n_requests = 0
    for i in range(150, 170):
        req = Rq('/viewtopic.php?topic_id={}'.format(i))
        mw.process_request(req, spider)
        mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
        req = Rq('/viewtopic.php?topic_id={}&start=0'.format(i))
        n_requests += 1
        try:
            mw.process_request(req, spider)
        except IgnoreRequest:
            n_dropped += 1
        else:
            mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
    assert n_dropped > 0
    assert n_dropped < n_requests
Example #30
Source File: test_middleware.py From scrapy-crawl-once with MIT License | 4 votes |
def test_crawl(tmpdir):
    settings = {'CRAWL_ONCE_PATH': str(tmpdir)}
    crawler = get_crawler(settings_dict=settings)
    req1 = scrapy.Request('http://example.com/1', meta={'crawl_once': True})
    req2 = scrapy.Request('http://example.com/2')
    req3 = scrapy.Request('http://example.com/3', meta={'crawl_once': True})
    resp1 = Response(req1.url, request=req1)
    resp2 = Response(req2.url, request=req2)

    with opened_middleware(crawler) as mw:
        # 1. check spider middleware interface
        assert len(mw.db) == 0
        assert crawler.stats.get_value('crawl_once/initial') == 0
        output = [{}, scrapy.Request('http://example.com')]

        # crawl_once is False
        res = list(mw.process_spider_output(resp2, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 0

        # crawl_once is True
        res = list(mw.process_spider_output(resp1, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0
        assert crawler.stats.get_value('crawl_once/stored') == 1

        # 2. check downloader middleware interface
        assert mw.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0

        with pytest.raises(IgnoreRequest):
            mw.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1

        assert mw.process_request(req3, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0

    crawler = get_crawler(settings_dict=settings)
    with opened_middleware(crawler) as mw2:
        # it reuses the same file, so there are records
        assert len(mw2.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 1
        assert mw2.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0
        with pytest.raises(IgnoreRequest):
            mw2.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert mw2.process_request(req3, crawler.spider) is None