Python scrapy.signals.spider_idle() Examples

The following are 21 code examples of scrapy.signals.spider_idle(), collected from open-source projects. You can go to the original project or source file by following the link above each example. You may also want to check out all available functions/classes of the module scrapy.signals.
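Before the individual examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the pattern most of them implement: connect a handler to signals.spider_idle, feed the engine a new request whenever it runs out of work, and raise DontCloseSpider to keep the crawl alive. The IdleAwareSpider name and the pending_urls list are hypothetical stand-ins for a real external queue, and engine.crawl(request, spider=...) follows the older Scrapy API used throughout these examples (recent Scrapy versions drop the spider argument).

import scrapy
from scrapy import signals
from scrapy.exceptions import DontCloseSpider


class IdleAwareSpider(scrapy.Spider):
    name = 'idle_aware'

    def __init__(self, *args, **kwargs):
        super(IdleAwareSpider, self).__init__(*args, **kwargs)
        # Hypothetical stand-in for an external queue (Redis, Kafka, RabbitMQ, ...).
        self.pending_urls = ['http://example.com']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(IdleAwareSpider, cls).from_crawler(crawler, *args, **kwargs)
        # Ask Scrapy to call spider.spider_idle each time the engine goes idle.
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        """Feed the engine another request, or let the spider close normally."""
        if self.pending_urls:
            url = self.pending_urls.pop()
            self.crawler.engine.crawl(
                scrapy.Request(url, callback=self.parse), spider=self)
            # Raising DontCloseSpider keeps the crawl alive for another loop.
            raise DontCloseSpider

    def parse(self, response):
        self.logger.info('Visited %s', response.url)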
Example #1
Source File: engine.py    From learn_python3_spider with MIT License 12 votes
def _spider_idle(self, spider):
        """Called when a spider gets idle. This function is called when there
        are no remaining pages to download or schedule. It can be called
        multiple times. If some extension raises a DontCloseSpider exception
        (in the spider_idle signal handler) the spider is not closed until the
        next loop and this function is guaranteed to be called (at least) once
        again for this spider.
        """
        res = self.signals.send_catch_log(signal=signals.spider_idle,
                                          spider=spider, dont_log=DontCloseSpider)
        if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
               for _, x in res):
            return

        if self.spider_is_idle(spider):
            self.close_spider(spider, reason='finished') 
Example #2
Source File: py_my_scrapy_redis_server.py    From vrequest with MIT License 10 votes
def spider_idle(self):
        raise DontCloseSpider 
Example #3
Source File: redis_spiders.py    From haipproxy with MIT License 9 votes
def setup_redis(self, crawler):
        """send signals when the spider is free"""
        self.redis_batch_size = SPIDER_FEED_SIZE
        self.redis_con = get_redis_conn()

        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) 
Example #4
Source File: spiders.py    From openslack-crawler with Apache License 2.0 9 votes
def setup_rabbitmq(self):
        """ Setup RabbitMQ connection.

            Call this method after spider has set its crawler object.
        :return: None
        """

        if not self.rabbitmq_key:
            self.rabbitmq_key = '{}:start_urls'.format(self.name)

        self.server = connection.from_settings(self.crawler.settings)
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped) 
Example #5
Source File: spiders.py    From openslack-crawler with Apache License 2.0 9 votes
def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        self.schedule_next_request()
        raise DontCloseSpider 
Example #6
Source File: redis_spider.py    From openslack-crawler with Apache License 2.0 9 votes
def spider_idle(self):
        raise DontCloseSpider 
Example #7
Source File: redis_spider.py    From openslack-crawler with Apache License 2.0 9 votes
def set_crawler(self, crawler):
        super(RedisSpider, self).set_crawler(crawler)
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle) 
Example #8
Source File: spiders.py    From scrapy-kafka with Apache License 2.0 7 votes
def setup_kafka(self, settings):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.

        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = KafkaClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                       auto_commit=True, iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.kafka_topic) 
Example #9
Source File: spiders.py    From openslack-crawler with Apache License 2.0 7 votes
def setup_kafka(self, settings):
        """Setup redis connection and idle signal.
        This should be called after the spider has set its crawler object.
        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = KafkaClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                       auto_commit=True, iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.kafka_topic) 
Example #10
Source File: spider.py    From Gerapy with MIT License 7 votes
def start_requests(self):
        self.crawler.signals.connect(self.make_requests, signal=signals.spider_idle)
        return [] 
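start_requests here returns an empty list and defers all scheduling to the idle handler. The connected make_requests method is not part of this snippet; a hypothetical sketch of such a handler (the pending_urls attribute is an assumption for illustration, not Gerapy's actual implementation):

def make_requests(self, spider):
    # spider_idle handlers receive the idle spider as an argument.
    if self.pending_urls:  # assumed attribute holding deferred URLs
        url = self.pending_urls.pop()
        self.crawler.engine.crawl(
            scrapy.Request(url, callback=self.parse), spider=spider)
        raise DontCloseSpider  # keep the spider alive while work remains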
Example #11
Source File: spiders.py    From openslack-crawler with Apache License 2.0 6 votes
def setup_redis(self):
        """Setup redis connection and idle signal.
        This should be called after the spider has set its crawler object.
        """
        if not self.redis_key:
            self.redis_key = '%s:start_urls' % self.name

        self.server = connection.from_settings(self.crawler.settings)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from redis list '%s'" % self.redis_key) 
Example #12
Source File: spider.py    From stock with Apache License 2.0 6 votes
def __init__(self):
    self.spider = HqSpider()
    self.crawler = crawler = Crawler(get_project_settings())
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(self.spider)
    dispatcher.connect(self._dont_close_me, signals.spider_idle)
    self.thread = None
    self._started = False
    self._stopped = False 
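The _dont_close_me handler connected above keeps the long-running HqSpider alive between on-demand requests; its body is presumably nothing more than the usual veto, e.g. (a sketch, assuming the standard pattern):

def _dont_close_me(self, spider):
    # Veto the close so requests can keep being injected into the idle spider.
    raise DontCloseSpider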
Example #13
Source File: spiders.py    From scrapy-rabbitmq with MIT License 6 votes
def spider_idle(self):
        """ Waits for request to be scheduled.

        :return: None
        """
        self.schedule_next_request()
        raise DontCloseSpider 
Example #14
Source File: spiders.py    From scrapy-rabbitmq with MIT License 6 votes
def setup_rabbitmq(self):
        """ Setup RabbitMQ connection.

            Call this method after spider has set its crawler object.
        :return: None
        """

        if not self.rabbitmq_key:
            self.rabbitmq_key = '{}:start_urls'.format(self.name)

        self.server = connection.from_settings(self.crawler.settings)
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped) 
Example #15
Source File: spiders.py    From learn_python3_spider with MIT License 6 votes
def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        raise DontCloseSpider 
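In scrapy-redis (which learn_python3_spider vendors here), the schedule_next_requests helper that this handler calls is roughly the following: next_requests() pops up to redis_batch_size URLs from the configured Redis key and yields a Request for each one.

def schedule_next_requests(self):
    """Schedules a batch of requests, if there is any work left in Redis."""
    for req in self.next_requests():
        self.crawler.engine.crawl(req, spider=self)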
Example #16
Source File: redis_spider.py    From scrapy-cluster with MIT License 6 votes
def spider_idle(self):
        raise DontCloseSpider 
Example #17
Source File: redis_spider.py    From scrapy-cluster with MIT License 6 votes
def _set_crawler(self, crawler):
        super(RedisSpider, self)._set_crawler(crawler)
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle) 
Example #18
Source File: spiders.py    From scrapy-kafka with Apache License 2.0 6 votes
def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        self.schedule_next_request()
        raise DontCloseSpider 
Example #19
Source File: redis_spiders.py    From haipproxy with MIT License 6 votes
def spider_idle(self):
        self.schedule_next_requests()
        raise DontCloseSpider 
Example #20
Source File: spiders.py    From learn_python3_spider with MIT License 5 votes
def setup_redis(self, crawler=None):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.
        """
        if self.server is not None:
            return

        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s",
                         self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # The idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle) 
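For context on how setup_redis gets invoked: in scrapy-redis the mixin defining it is combined with a regular spider, and from_crawler calls it once the crawler is available, roughly like this (condensed from upstream; RedisMixin is the class that defines setup_redis above):

from scrapy.spiders import Spider


class RedisSpider(RedisMixin, Spider):
    """Spider that reads start URLs from a Redis key instead of start_urls."""

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        obj = super(RedisSpider, cls).from_crawler(crawler, *args, **kwargs)
        # setup_redis needs the crawler to read settings and connect spider_idle.
        obj.setup_redis(crawler)
        return obj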
Example #21
Source File: py_my_scrapy_redis_server.py    From vrequest with MIT License 5 votes
def setup_redis(self, crawler=None):
        if self.server is not None:
            return
        if crawler is None:
            crawler = getattr(self, 'crawler', None)
        if crawler is None:
            raise ValueError("crawler is required")
        settings = crawler.settings
        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )
        self.redis_key = self.redis_key % {'name': self.name}
        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")
        if self.redis_batch_size is None:
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )
        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")
        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)
        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s",
                         self.__dict__)
        self.server = connection.from_settings(crawler.settings)
        # From here on, task scheduling no longer happens only when the spider is idle;
        # it runs continuously (to support multiple concurrent tasks), so no task is left
        # unstarted. The spider_idle handler therefore no longer calls
        # schedule_next_requests and only raises the DontCloseSpider exception.
        # Instead, a schedule_next_requests polling job is started to pick up new tasks,
        # and a _stop_clear polling job is started to detect tasks that should stop.
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        # Copy the key/log format templates onto this object; later functions need them.
        self._clear_debug_pc   = crawler.settings.getbool('CLEAR_DEBUG_PC')
        self._clear_dupefilter = crawler.settings.getbool('CLEAR_DUPEFILTER')
        self._spider_id_debg_format = crawler.settings.get('DEBUG_PC_FORMAT')
        self._spider_id_task_format = crawler.settings.get('TASK_ID_FORMAT')
        self._spider_id_dupk_format = crawler.settings.get('SCHEDULER_DUPEFILTER_KEY')
        # Bind two periodic, never-ending jobs to this spider:
        # 1/ check for tasks that have already stopped and clean up their task space.
        # 2/ fetch new start_urls to initialize new tasks and manage their task space.
        self.limit_check = 0 # stagger the check timing of different tasks so they don't all pile into a single _stop_clear iteration
        self.limit_same  = 2 # number of consecutive identical log snapshots required
        self.interval    = 5 # seconds between each check for tasks to close
        # (in theory the average time to detect a stopped task is about
        # (limit_check+1) * (limit_same+1) * interval, i.e. 1 * 3 * 5 = 15s with these defaults;
        # lower these values during testing to make framework issues easier to observe)
        self.interval_s  = 2 # seconds between each check for tasks to start
        self.limit_log   = 8 # extra setting: cap how many tasks the "check stopping" log prints, so hundreds of tasks aren't all dumped every time
        crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)