Python scrapy.signals.spider_idle() Examples
The following are 23 code examples of scrapy.signals.spider_idle(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module scrapy.signals.
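Before the examples, here is a minimal, self-contained sketch of the pattern most of them follow: connect a handler to signals.spider_idle and raise DontCloseSpider to keep the spider alive. The spider name and class below are illustrative, not taken from any of the projects listed.

from scrapy import Spider, signals
from scrapy.exceptions import DontCloseSpider


class KeepAliveSpider(Spider):
    name = 'keep_alive_example'  # illustrative name

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(KeepAliveSpider, cls).from_crawler(crawler, *args, **kwargs)
        # Run spider_idle() whenever the engine has no requests left.
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        # Raising DontCloseSpider tells the engine not to close the spider,
        # so new requests can still be scheduled later (e.g. from an
        # external queue such as Redis, Kafka or RabbitMQ).
        raise DontCloseSpider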
Example #1
Source File: engine.py From learn_python3_spider with MIT License | 12 votes |
def _spider_idle(self, spider):
    """Called when a spider gets idle.

    This function is called when there are no remaining pages to download
    or schedule. It can be called multiple times. If some extension raises
    a DontCloseSpider exception (in the spider_idle signal handler) the
    spider is not closed until the next loop and this function is
    guaranteed to be called (at least) once again for this spider.
    """
    res = self.signals.send_catch_log(signal=signals.spider_idle,
                                      spider=spider, dont_log=DontCloseSpider)
    if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
           for _, x in res):
        return

    if self.spider_is_idle(spider):
        self.close_spider(spider, reason='finished')
Example #2
Source File: engine.py From learn_python3_spider with MIT License | 11 votes |
def _spider_idle(self, spider):
    """Called when a spider gets idle.

    This function is called when there are no remaining pages to download
    or schedule. It can be called multiple times. If some extension raises
    a DontCloseSpider exception (in the spider_idle signal handler) the
    spider is not closed until the next loop and this function is
    guaranteed to be called (at least) once again for this spider.
    """
    res = self.signals.send_catch_log(signal=signals.spider_idle,
                                      spider=spider, dont_log=DontCloseSpider)
    if any(isinstance(x, Failure) and isinstance(x.value, DontCloseSpider)
           for _, x in res):
        return

    if self.spider_is_idle(spider):
        self.close_spider(spider, reason='finished')
Example #3
Source File: py_my_scrapy_redis_server.py From vrequest with MIT License | 10 votes |
def spider_idle(self):
    raise DontCloseSpider
Example #4
Source File: redis_spiders.py From haipproxy with MIT License | 9 votes |
def setup_redis(self, crawler):
    """Send signals when the spider is free."""
    self.redis_batch_size = SPIDER_FEED_SIZE
    self.redis_con = get_redis_conn()
    crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
Example #5
Source File: spiders.py From openslack-crawler with Apache License 2.0 | 9 votes |
def setup_rabbitmq(self):
    """Setup RabbitMQ connection.

    Call this method after spider has set its crawler object.
    :return: None
    """
    if not self.rabbitmq_key:
        self.rabbitmq_key = '{}:start_urls'.format(self.name)
    self.server = connection.from_settings(self.crawler.settings)
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
Example #6
Source File: spiders.py From openslack-crawler with Apache License 2.0 | 9 votes |
def spider_idle(self): """Schedules a request if available, otherwise waits.""" self.schedule_next_request() raise DontCloseSpider
Example #7
Source File: redis_spider.py From openslack-crawler with Apache License 2.0 | 9 votes |
def spider_idle(self):
    raise DontCloseSpider
Example #8
Source File: redis_spider.py From openslack-crawler with Apache License 2.0 | 9 votes |
def set_crawler(self, crawler):
    super(RedisSpider, self).set_crawler(crawler)
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
Example #9
Source File: spiders.py From scrapy-kafka with Apache License 2.0 | 7 votes |
def setup_kafka(self, settings):
    """Setup the Kafka consumer and idle signal.

    This should be called after the spider has set its crawler object.
    :param settings: The current Scrapy settings being used
    :type settings: scrapy.settings.Settings
    """
    if not hasattr(self, 'topic') or not self.topic:
        self.topic = '%s-starturls' % self.name

    hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
    consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
    _kafka = KafkaClient(hosts)
    # wait at most 1 sec for more messages, otherwise continue
    self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                   auto_commit=True, iter_timeout=1.0)

    # The idle signal is called when the spider has no requests left;
    # that's when we will schedule new requests from the kafka topic.
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    self.log("Reading URLs from kafka topic '%s'" % self.topic)
Example #10
Source File: spiders.py From openslack-crawler with Apache License 2.0 | 7 votes |
def setup_kafka(self, settings):
    """Setup the Kafka consumer and idle signal.

    This should be called after the spider has set its crawler object.
    :param settings: The current Scrapy settings being used
    :type settings: scrapy.settings.Settings
    """
    if not hasattr(self, 'topic') or not self.topic:
        self.topic = '%s-starturls' % self.name

    hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
    consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
    _kafka = KafkaClient(hosts)
    # wait at most 1 sec for more messages, otherwise continue
    self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                   auto_commit=True, iter_timeout=1.0)

    # The idle signal is called when the spider has no requests left;
    # that's when we will schedule new requests from the kafka topic.
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    self.log("Reading URLs from kafka topic '%s'" % self.topic)
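The schedule_next_request method that these Kafka-backed spiders rely on (it is called from the idle handler shown in Example #20) is not part of this listing. A hedged sketch of the usual pattern follows; the exact consumer call and request construction may differ from the projects' actual code, and DontCloseSpider/Request imports come from scrapy as in the other examples.

def schedule_next_request(self):
    # Pull one message from the Kafka topic and feed it to the engine
    # as a new request, so the idle spider has work again.
    message = self.consumer.get_message(block=True)
    if message:
        url = message.message.value
        self.crawler.engine.crawl(self.make_requests_from_url(url), spider=self)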
Example #11
Source File: spider.py From Gerapy with MIT License | 7 votes |
def start_requests(self):
    self.crawler.signals.connect(self.make_requests, signal=signals.spider_idle)
    return []
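The make_requests callback that Example #11 connects to the idle signal is not shown in this listing. As a rough, hypothetical sketch (the pending_requests attribute is an assumption, and the engine.crawl signature has changed across Scrapy versions), it would schedule outstanding requests and keep the spider open:

def make_requests(self, spider):
    # Hypothetical idle handler: push any pending requests back into the
    # engine each time the spider goes idle, then keep the spider alive.
    for request in getattr(self, 'pending_requests', []):  # assumed attribute
        self.crawler.engine.crawl(request, spider)
    raise DontCloseSpider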
Example #12
Source File: spiders.py From openslack-crawler with Apache License 2.0 | 6 votes |
def spider_idle(self): """Schedules a request if available, otherwise waits.""" self.schedule_next_request() raise DontCloseSpider
Example #13
Source File: spiders.py From openslack-crawler with Apache License 2.0 | 6 votes |
def setup_redis(self):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if not self.redis_key:
        self.redis_key = '%s:start_urls' % self.name
    self.server = connection.from_settings(self.crawler.settings)
    # The idle signal is called when the spider has no requests left;
    # that's when we will schedule new requests from the redis queue.
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    self.log("Reading URLs from redis list '%s'" % self.redis_key)
Example #14
Source File: spider.py From stock with Apache License 2.0 | 6 votes |
def __init__(self):
    self.spider = HqSpider()
    self.crawler = crawler = Crawler(get_project_settings())
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(self.spider)
    dispatcher.connect(self._dont_close_me, signals.spider_idle)
    self.thread = None
    self._started = False
    self._stopped = False
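The _dont_close_me handler referenced above is not included in this listing. Judging from its name and the pattern in the other examples, it presumably does nothing more than raise DontCloseSpider; the body below is a hedged guess, not the project's verbatim code.

def _dont_close_me(self, spider):
    # Keep the wrapped HqSpider running even when it has no requests left,
    # so the controlling thread can keep feeding it work.
    raise DontCloseSpider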
Example #15
Source File: spiders.py From scrapy-rabbitmq with MIT License | 6 votes |
def spider_idle(self): """ Waits for request to be scheduled. :return: None """ self.schedule_next_request() raise DontCloseSpider
Example #16
Source File: spiders.py From scrapy-rabbitmq with MIT License | 6 votes |
def setup_rabbitmq(self):
    """Setup RabbitMQ connection.

    Call this method after spider has set its crawler object.
    :return: None
    """
    if not self.rabbitmq_key:
        self.rabbitmq_key = '{}:start_urls'.format(self.name)
    self.server = connection.from_settings(self.crawler.settings)
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
Example #17
Source File: spiders.py From learn_python3_spider with MIT License | 6 votes |
def spider_idle(self): """Schedules a request if available, otherwise waits.""" # XXX: Handle a sentinel to close the spider. self.schedule_next_requests() raise DontCloseSpider
Example #18
Source File: redis_spider.py From scrapy-cluster with MIT License | 6 votes |
def spider_idle(self):
    raise DontCloseSpider
Example #19
Source File: redis_spider.py From scrapy-cluster with MIT License | 6 votes |
def _set_crawler(self, crawler):
    super(RedisSpider, self)._set_crawler(crawler)
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
Example #20
Source File: spiders.py From scrapy-kafka with Apache License 2.0 | 6 votes |
def spider_idle(self): """Schedules a request if available, otherwise waits.""" self.schedule_next_request() raise DontCloseSpider
Example #21
Source File: redis_spiders.py From haipproxy with MIT License | 6 votes |
def spider_idle(self):
    self.schedule_next_requests()
    raise DontCloseSpider
Example #22
Source File: spiders.py From learn_python3_spider with MIT License | 5 votes |
def setup_redis(self, crawler=None):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if self.server is not None:
        return

    if crawler is None:
        # We allow optional crawler argument to keep backwards
        # compatibility.
        # XXX: Raise a deprecation warning.
        crawler = getattr(self, 'crawler', None)

    if crawler is None:
        raise ValueError("crawler is required")

    settings = crawler.settings

    if self.redis_key is None:
        self.redis_key = settings.get(
            'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
        )

    self.redis_key = self.redis_key % {'name': self.name}

    if not self.redis_key.strip():
        raise ValueError("redis_key must not be empty")

    if self.redis_batch_size is None:
        # TODO: Deprecate this setting (REDIS_START_URLS_BATCH_SIZE).
        self.redis_batch_size = settings.getint(
            'REDIS_START_URLS_BATCH_SIZE',
            settings.getint('CONCURRENT_REQUESTS'),
        )

    try:
        self.redis_batch_size = int(self.redis_batch_size)
    except (TypeError, ValueError):
        raise ValueError("redis_batch_size must be an integer")

    if self.redis_encoding is None:
        self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

    self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                     "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                     self.__dict__)

    self.server = connection.from_settings(crawler.settings)
    # The idle signal is called when the spider has no requests left;
    # that's when we will schedule new requests from the redis queue.
    crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
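Example #17 shows the spider_idle handler that calls schedule_next_requests when this Redis-backed spider runs out of work. For context, a simplified sketch of what that method typically does in scrapy-redis-style spiders follows; it is an approximation of the pattern, not necessarily the project's exact code.

def schedule_next_requests(self):
    # Pop up to redis_batch_size URLs from the Redis key and feed the
    # resulting requests to the engine so the idle spider has work again.
    for req in self.next_requests():  # generator over queued start URLs
        self.crawler.engine.crawl(req, spider=self)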
Example #23
Source File: py_my_scrapy_redis_server.py From vrequest with MIT License | 5 votes |
def setup_redis(self, crawler=None):
    if self.server is not None:
        return
    if crawler is None:
        crawler = getattr(self, 'crawler', None)
    if crawler is None:
        raise ValueError("crawler is required")
    settings = crawler.settings
    if self.redis_key is None:
        self.redis_key = settings.get(
            'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
        )
    self.redis_key = self.redis_key % {'name': self.name}
    if not self.redis_key.strip():
        raise ValueError("redis_key must not be empty")
    if self.redis_batch_size is None:
        self.redis_batch_size = settings.getint(
            'REDIS_START_URLS_BATCH_SIZE',
            settings.getint('CONCURRENT_REQUESTS'),
        )
    try:
        self.redis_batch_size = int(self.redis_batch_size)
    except (TypeError, ValueError):
        raise ValueError("redis_batch_size must be an integer")
    if self.redis_encoding is None:
        self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)
    self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                     "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                     self.__dict__)
    self.server = connection.from_settings(crawler.settings)
    # From here on, work is no longer assigned only when the spider is idle;
    # scheduling runs all the time (to support multiple concurrent tasks), so no
    # task is left waiting to start. Therefore spider_idle no longer calls
    # schedule_next_requests and only raises DontCloseSpider. A separate
    # schedule_next_requests polling loop picks up tasks to launch, and a
    # separate _stop_clear polling loop detects tasks that should stop.
    crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    # Keep the key/log templates on this object; later methods need them.
    self._clear_debug_pc = crawler.settings.getbool('CLEAR_DEBUG_PC')
    self._clear_dupefilter = crawler.settings.getbool('CLEAR_DUPEFILTER')
    self._spider_id_debg_format = crawler.settings.get('DEBUG_PC_FORMAT')
    self._spider_id_task_format = crawler.settings.get('TASK_ID_FORMAT')
    self._spider_id_dupk_format = crawler.settings.get('SCHEDULER_DUPEFILTER_KEY')
    # Two periodic, never-stopping loops are bound when this task starts:
    # 1. check tasks that have already stopped and clean up their space;
    # 2. pick up new start_urls, initialise new tasks and manage the task space.
    self.limit_check = 0  # stagger the check timing of different tasks slightly,
                          # so they don't all pile into one _stop_clear iteration
    self.limit_same = 2   # number of consecutive identical log snapshots
    self.interval = 5     # seconds between runs of the stop-detection check
    # (In theory the average time to detect a stopped task is roughly
    # (limit_check+1) * (limit_same+1) * interval.) For testing, these values
    # can be reduced to make framework issues easier to observe.
    self.interval_s = 2   # seconds between runs of the start-detection check
    self.limit_log = 8    # limit how many tasks the stop check prints, so that
                          # hundreds of tasks are not dumped on every iteration
    crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)