Python scrapy.spiders.Spider() Examples

The following are 16 code examples of scrapy.spiders.Spider(). Each example links back to its original project and source file, shown above the code. You may also want to check out all available functions/classes of the module scrapy.spiders, or try the search function.
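For reference, scrapy.spiders.Spider is the base class that every Scrapy spider extends; a minimal subclass (the spider name and URL below are placeholders) looks roughly like this:

from scrapy.spiders import Spider


class QuotesSpider(Spider):
    # 'name' is required; the spider loader skips classes without one.
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # Yield one item per quote block found on the page.
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').get()}
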
Example #1
Source File: test_retry_middleware.py    From scrapy-fake-useragent with BSD 3-Clause "New" or "Revised" License
def retry_middleware_response(request):
    """
    Fixture to simplify creating a crawler
    with an activated middleware and going through
    the request-response cycle.

    Executes process_response() method of the middleware.
    """
    settings, status = request.param

    crawler = get_crawler(Spider, settings_dict=settings)
    spider = crawler._create_spider('foo')
    mw = RetryUserAgentMiddleware.from_crawler(crawler)

    req = Request('http://www.scrapytest.org/')
    rsp = Response(req.url, body=b'', status=status)

    yield mw.process_response(req, rsp, spider) 
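In a test module, a fixture like this would typically be consumed through pytest's indirect parametrization; a minimal sketch, with illustrative settings and status values:

import pytest
from scrapy.http import Request, Response

@pytest.mark.parametrize(
    'retry_middleware_response',
    [({'RETRY_TIMES': 2}, 503)],  # (settings, status) pairs are examples only
    indirect=True,
)
def test_process_response_result(retry_middleware_response):
    # The middleware either schedules a retry Request or returns the Response.
    assert isinstance(retry_middleware_response, (Request, Response))
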
Example #2
Source File: test_retry_middleware.py    From scrapy-fake-useragent with BSD 3-Clause "New" or "Revised" License
def retry_middleware_exception(request):
    """
    Fixture to simplify creating a crawler
    with an activated retry middleware and going through
    the request-response cycle.

    Executes process_exception() method of the middleware.
    """
    settings, exception = request.param

    crawler = get_crawler(Spider, settings_dict=settings)
    spider = crawler._create_spider('foo')
    mw = RetryUserAgentMiddleware.from_crawler(crawler)

    req = Request('http://www.scrapytest.org/')

    yield mw.process_exception(req, exception, spider) 
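The exception-side fixture can be exercised the same way; a hedged sketch, assuming the middleware behaves like Scrapy's standard RetryMiddleware and using one of the connection errors it retries by default:

import pytest
from scrapy.http import Request
from twisted.internet.error import ConnectionRefusedError

@pytest.mark.parametrize(
    'retry_middleware_exception',
    [({'RETRY_TIMES': 2}, ConnectionRefusedError())],  # illustrative values
    indirect=True,
)
def test_process_exception_result(retry_middleware_exception):
    # process_exception() is expected to return either a retry Request or None.
    assert retry_middleware_exception is None or isinstance(
        retry_middleware_exception, Request)
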
Example #3
Source File: spider.py    From learn_python3_spider with MIT License
def spidercls_for_request(spider_loader, request, default_spidercls=None,
                          log_none=False, log_multiple=False):
    """Return a spider class that handles the given Request.

    This will look for the spiders that can handle the given request (using
    the spider loader) and return a Spider class if (and only if) there is
    only one Spider able to handle the Request.

    If multiple spiders (or no spider) are found, it will return the
    default_spidercls passed. It can optionally log if multiple or no spiders
    are found.
    """
    snames = spider_loader.find_by_request(request)
    if len(snames) == 1:
        return spider_loader.load(snames[0])

    if len(snames) > 1 and log_multiple:
        logger.error('More than one spider can handle: %(request)s - %(snames)s',
                     {'request': request, 'snames': ', '.join(snames)})

    if len(snames) == 0 and log_none:
        logger.error('Unable to find spider that handles: %(request)s',
                     {'request': request})

    return default_spidercls 
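As a usage sketch (assuming a regular Scrapy project whose spiders are discoverable by SpiderLoader), the helper can resolve which spider class should handle an arbitrary URL:

from scrapy.http import Request
from scrapy.spiderloader import SpiderLoader
from scrapy.spiders import Spider
from scrapy.utils.project import get_project_settings

# Build a loader from the project settings and ask which spider matches the URL;
# fall back to the plain Spider base class if the match is ambiguous or empty.
settings = get_project_settings()
spider_loader = SpiderLoader.from_settings(settings)
request = Request('http://www.example.com/')
spidercls = spidercls_for_request(spider_loader, request,
                                  default_spidercls=Spider,
                                  log_none=True, log_multiple=True)
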
Example #4
Source File: spider.py    From learn_python3_spider with MIT License
def spidercls_for_request(spider_loader, request, default_spidercls=None,
                          log_none=False, log_multiple=False):
    """Return a spider class that handles the given Request.

    This will look for the spiders that can handle the given request (using
    the spider loader) and return a Spider class if (and only if) there is
    only one Spider able to handle the Request.

    If multiple spiders (or no spider) are found, it will return the
    default_spidercls passed. It can optionally log if multiple or no spiders
    are found.
    """
    snames = spider_loader.find_by_request(request)
    if len(snames) == 1:
        return spider_loader.load(snames[0])

    if len(snames) > 1 and log_multiple:
        logger.error('More than one spider can handle: %(request)s - %(snames)s',
                     {'request': request, 'snames': ', '.join(snames)})

    if len(snames) == 0 and log_none:
        logger.error('Unable to find spider that handles: %(request)s',
                     {'request': request})

    return default_spidercls 
Example #5
Source File: test_random_ua_middleware.py    From scrapy-fake-useragent with BSD 3-Clause "New" or "Revised" License
def middleware_request(request):
    crawler = get_crawler(Spider, settings_dict=request.param)
    spider = crawler._create_spider('foo')
    mw = RandomUserAgentMiddleware.from_crawler(crawler)

    req = Request('http://www.scrapytest.org/')

    mw.process_request(req, spider)

    yield req 
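A test built on top of this fixture might simply check that the middleware injected a User-Agent header; a minimal sketch (the empty settings dict is illustrative):

import pytest

@pytest.mark.parametrize('middleware_request', [{}], indirect=True)
def test_random_user_agent_is_set(middleware_request):
    # After process_request() the request should carry a User-Agent header.
    assert middleware_request.headers.get('User-Agent')
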
Example #6
Source File: py_my_scrapy_redis_server.py    From vrequest with MIT License
def open_spider(self, spider): 
        spider.logger.info('Spider RedisStatsCollector opened. curr pcmac:{}.'.format(self._pc_mac)) 
Example #7
Source File: py_my_scrapy_redis_server.py    From vrequest with MIT License
def spider_opened(self, spider): 
        spider.logger.info('Spider RedisCoreStats opened.') 
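Callbacks like spider_opened are normally hooked up through Scrapy's signal system in from_crawler; a minimal, illustrative extension skeleton (the class name is a placeholder):

from scrapy import signals

class RedisCoreStatsSketch:
    @classmethod
    def from_crawler(cls, crawler):
        # Connect the spider_opened signal so the callback below fires
        # whenever a spider is opened by the crawler.
        ext = cls()
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        return ext

    def spider_opened(self, spider):
        spider.logger.info('Spider RedisCoreStats opened.')
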
Example #8
Source File: py_my_scrapy_redis_server.py    From vrequest with MIT License
def spider_opened(self, spider):
        spider.logger.info('Spider VSpiderMiddleware opened.')

# The downloader middleware needs to hook a small piece of logic into process_response to work around a distributed-crawling exception
Example #9
Source File: py_my_scrapy_redis_server.py    From vrequest with MIT License
def spider_opened(self, spider):
        spider.logger.info('Spider VDownloaderMiddleware opened.')

# Tail of the item middleware: whether the scraped data is stored in redis depends on whether the _b2b89079b2f7befcf4691a98a3f0a2a2 field is present in the item
Example #10
Source File: spider.py    From learn_python3_spider with MIT License
def iter_spider_classes(module):
    """Return an iterator over all spider classes defined in the given module
    that can be instantiated (i.e. which have a name)
    """
    # this needs to be imported here until we get rid of the spider manager
    # singleton in scrapy.spider.spiders
    from scrapy.spiders import Spider

    for obj in six.itervalues(vars(module)):
        if inspect.isclass(obj) and \
           issubclass(obj, Spider) and \
           obj.__module__ == module.__name__ and \
           getattr(obj, 'name', None):
            yield obj 
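Usage is straightforward: import a project module and iterate over the instantiable spider classes it defines. A small sketch (the module path is a placeholder):

import importlib

# 'myproject.spiders.products' stands in for a real spiders module.
module = importlib.import_module('myproject.spiders.products')
for spidercls in iter_spider_classes(module):
    print(spidercls.name)
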
Example #11
Source File: test.py    From learn_python3_spider with MIT License
def get_crawler(spidercls=None, settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used to populate the crawler settings with a project level
    priority.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    runner = CrawlerRunner(settings_dict)
    return runner.create_crawler(spidercls or Spider) 
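In a test, the returned crawler is typically used to read back merged settings or to create a spider instance; a brief sketch:

# Settings passed through settings_dict are applied with project priority.
crawler = get_crawler(settings_dict={'RETRY_TIMES': 1})
assert crawler.settings.getint('RETRY_TIMES') == 1
spider = crawler._create_spider('example')  # mirrors the fixtures above
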
Example #12
Source File: spider.py    From learn_python3_spider with MIT License
def iter_spider_classes(module):
    """Return an iterator over all spider classes defined in the given module
    that can be instantiated (i.e. which have a name)
    """
    # this needs to be imported here until we get rid of the spider manager
    # singleton in scrapy.spider.spiders
    from scrapy.spiders import Spider

    for obj in six.itervalues(vars(module)):
        if inspect.isclass(obj) and \
           issubclass(obj, Spider) and \
           obj.__module__ == module.__name__ and \
           getattr(obj, 'name', None):
            yield obj 
Example #13
Source File: test_autoextract.py    From scrapy-autoextract with BSD 3-Clause "New" or "Revised" License
def setup_module(module):
    global spider
    spider = Spider('spidr') 
Example #14
Source File: test_magicfields.py    From scrapy-magicfields with BSD 3-Clause "New" or "Revised" License
def setUp(self):
        self.environ = os.environ.copy()
        self.spider = Spider('myspider', arg1='val1', start_urls=["http://example.com"])

        def _log(x):
            print(x)

        self.spider.log = _log
        self.response = HtmlResponse(body=b"<html></html>", url="http://www.example.com/product/8798732")
        self.item = TestItem({'nom': 'myitem', 'prix': "56.70 euros", "url": "http://www.example.com/product.html?item_no=345"}) 
Example #15
Source File: test_magicfields.py    From scrapy-magicfields with BSD 3-Clause "New" or "Revised" License
def test_spidername_time(self):
        formatted = _format("Spider: $spider:name. Item scraped at $time", self.spider, self.response, self.item, {})
        self.assertRegexpMatches(formatted, r'Spider: myspider. Item scraped at \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$')
Example #16
Source File: py_my_scrapy_redis_server.py    From vrequest with MIT License
def _stop_clear(self):
        taskids = []
        spider_tids_shot = self.spider_tids.copy()
        for taskid in spider_tids_shot:
            taskids.append(taskid)
            # After a fixed interval, process a snapshot of the task's stats; if the snapshot is unchanged, count it.
            # Once the same snapshot is seen more than N times, the task is assumed to collect no more data, so it is stopped and its stop time recorded (the longer the interval, the more accurate; around ten minutes is usually enough).
            if self.spider_tids[taskid]['check_times'] != self.limit_check:
                self.spider_tids[taskid]['check_times'] += 1
            else:
                self.spider_tids[taskid]['check_times'] = 0
                stat_key = self._spider_id_task_format.format(taskid) % {'spider': self.name}

                snapshot, enqueue, dequeue = self._get_snapshot(stat_key)
                snapshot_e2d = enqueue == dequeue
                snapshot_md5 = hmac.new(b'',str(snapshot).encode(),'md5').hexdigest()
                if snapshot_md5 != self.spider_tids[taskid]['stat_snapshot'] or not snapshot_e2d:
                    self.spider_tids[taskid]['stat_snapshot'] = snapshot_md5
                    self.spider_tids[taskid]['same_snapshot_times'] = 0
                else:
                    self.spider_tids[taskid]['same_snapshot_times'] += 1
                    if self.spider_tids[taskid]['same_snapshot_times'] >= self.limit_same:
                        # From here on, wrap up the task that has finished:
                        # delete the redis keys that are no longer needed in order to free up space,
                        # and also clean up the stop-detection markers created when the program started.
                        if self._clear_debug_pc:
                            stat_pckey = self._spider_id_debg_format % {'spider': self.name}
                            self.server.delete(stat_pckey)
                        if self._clear_dupefilter:
                            dupefilter = self._spider_id_dupk_format.format(taskid) % {'spider': self.name}
                            self.server.delete(dupefilter)
                        module_name = self.spider_tids[taskid]['module_name']
                        # The only thing that must stay resident in redis is the task script,
                        # because each task script is hashed and stored under the hash of its name as the redis key,
                        # so even a large number of duplicate tasks only ever store a single copy of the script.
                        # The spider objects are likewise kept in a dict inside the running process, keyed by the script hash.
                        # Since duplicate tasks are possible, deleting an object (which another task may still be using) when a task ends is risky and hard to get right,
                        # and these objects are cheap to keep, so they simply stay resident in memory;
                        # after a restart they are not re-instantiated unless a previous task's script is actually needed, which keeps overhead low.
                        # There is one more bad case: the program shuts down unexpectedly before the stop check has run.
                        # Possible impact: the dupefilter pool is not cleared, finish_time is not written, and a few in-flight requests are lost;
                        # other running tasks are essentially unaffected, so this case is not handled.
                        del self.spider_tids[taskid]
                        self.log_stat(taskid, 'finish_time')
                        snapshot, _, _ = self._get_snapshot(stat_key)
                        self.logger.info('Task {} is Stopped.\n'.format(taskid) + pprint.pformat(snapshot))
                        taskids.remove(taskid)
        if len(taskids) == 0:
            self.logger.info("Spider Task is Empty.")
        else:
            if len(taskids) > self.limit_log:
                fmt_log = '{}'.format(taskids[:self.limit_log]).replace(']',', ...][num:{}]'.format(len(taskids)))
            else:
                fmt_log = '{}'.format(taskids)
            self.logger.info("Check Task Stoping {}.".format(fmt_log))