Python scrapy.signals.item_scraped() Examples
The following are 26 code examples of scrapy.signals.item_scraped().
You can go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module scrapy.signals, or try the search function.
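Before the examples, here is a minimal sketch of how the signal is typically used: an extension connects a handler with crawler.signals.connect, and Scrapy calls that handler with the scraped item, the response it came from, and the spider. The ItemCountLogger class and the ITEMCOUNT_LOGGER_ENABLED setting below are hypothetical names chosen for illustration, not taken from any project on this page.

# Minimal sketch of connecting a handler to signals.item_scraped.
# ItemCountLogger and ITEMCOUNT_LOGGER_ENABLED are illustrative names.
from scrapy import signals
from scrapy.exceptions import NotConfigured


class ItemCountLogger:
    """Hypothetical extension that logs a running count of scraped items."""

    def __init__(self, crawler):
        self.count = 0
        # item_scraped is sent after an item has passed the item pipeline
        crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.getbool('ITEMCOUNT_LOGGER_ENABLED', True):
            raise NotConfigured
        return cls(crawler)

    def item_scraped(self, item, response, spider):
        self.count += 1
        spider.logger.info("Scraped %d items so far", self.count)

To use an extension like this in a real project, it would also need to be listed in the EXTENSIONS setting; the examples below show the same connect-a-handler pattern as it appears in real spiders, extensions, and pipelines.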
Example #1
Source File: spiders.py From openslack-crawler with Apache License 2.0 | 9 votes |
def setup_rabbitmq(self):
    """ Setup RabbitMQ connection.

    Call this method after spider has set its crawler object.
    :return: None
    """
    if not self.rabbitmq_key:
        self.rabbitmq_key = '{}:start_urls'.format(self.name)

    self.server = connection.from_settings(self.crawler.settings)

    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
Example #2
Source File: spiders.py From openslack-crawler with Apache License 2.0 | 7 votes |
def setup_kafka(self, settings):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    :param settings: The current Scrapy settings being used
    :type settings: scrapy.settings.Settings
    """
    if not hasattr(self, 'topic') or not self.topic:
        self.topic = '%s-starturls' % self.name

    hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
    consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
    _kafka = KafkaClient(hosts)
    # wait at most 1sec for more messages. Otherwise continue
    self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                   auto_commit=True, iter_timeout=1.0)

    # idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from kafka topic
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    self.log("Reading URLs from kafka topic '%s'" % self.kafka_topic)
Example #3
Source File: spiders.py From scrapy-kafka with Apache License 2.0 | 7 votes |
def setup_kafka(self, settings):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    :param settings: The current Scrapy settings being used
    :type settings: scrapy.settings.Settings
    """
    if not hasattr(self, 'topic') or not self.topic:
        self.topic = '%s-starturls' % self.name

    hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
    consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
    _kafka = KafkaClient(hosts)
    # wait at most 1sec for more messages. Otherwise continue
    self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                   auto_commit=True, iter_timeout=1.0)

    # idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from kafka topic
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    self.log("Reading URLs from kafka topic '%s'" % self.kafka_topic)
Example #4
Source File: closespider.py From learn_python3_spider with MIT License | 6 votes |
def __init__(self, crawler):
    self.crawler = crawler

    self.close_on = {
        'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
        'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
        'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
        'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
    }

    if not any(self.close_on.values()):
        raise NotConfigured

    self.counter = defaultdict(int)

    if self.close_on.get('errorcount'):
        crawler.signals.connect(self.error_count, signal=signals.spider_error)
    if self.close_on.get('pagecount'):
        crawler.signals.connect(self.page_count, signal=signals.response_received)
    if self.close_on.get('timeout'):
        crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
    if self.close_on.get('itemcount'):
        crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
Example #5
Source File: spiders.py From openslack-crawler with Apache License 2.0 | 6 votes |
def setup_redis(self):
    """Setup redis connection and idle signal.

    This should be called after the spider has set its crawler object.
    """
    if not self.redis_key:
        self.redis_key = '%s:start_urls' % self.name
    self.server = connection.from_settings(self.crawler.settings)
    # idle signal is called when the spider has no requests left,
    # that's when we will schedule new requests from redis queue
    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    self.log("Reading URLs from redis list '%s'" % self.redis_key)
Example #6
Source File: scraper.py From learn_python3_spider with MIT License | 6 votes |
def _itemproc_finished(self, output, item, response, spider):
    """ItemProcessor finished for the given ``item`` and returned ``output``
    """
    self.slot.itemproc_size -= 1
    if isinstance(output, Failure):
        ex = output.value
        if isinstance(ex, DropItem):
            logkws = self.logformatter.dropped(item, ex, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_dropped, item=item, response=response,
                spider=spider, exception=output.value)
        else:
            logger.error('Error processing %(item)s', {'item': item},
                         exc_info=failure_to_exc_info(output),
                         extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_error, item=item, response=response,
                spider=spider, failure=output)
    else:
        logkws = self.logformatter.scraped(output, response, spider)
        logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
        return self.signals.send_catch_log_deferred(
            signal=signals.item_scraped, item=output, response=response,
            spider=spider)
Example #7
Source File: closespider.py From learn_python3_spider with MIT License | 6 votes |
def __init__(self, crawler):
    self.crawler = crawler

    self.close_on = {
        'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
        'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
        'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
        'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
    }

    if not any(self.close_on.values()):
        raise NotConfigured

    self.counter = defaultdict(int)

    if self.close_on.get('errorcount'):
        crawler.signals.connect(self.error_count, signal=signals.spider_error)
    if self.close_on.get('pagecount'):
        crawler.signals.connect(self.page_count, signal=signals.response_received)
    if self.close_on.get('timeout'):
        crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
    if self.close_on.get('itemcount'):
        crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
    crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
Example #8
Source File: scraper.py From learn_python3_spider with MIT License | 6 votes |
def _itemproc_finished(self, output, item, response, spider):
    """ItemProcessor finished for the given ``item`` and returned ``output``
    """
    self.slot.itemproc_size -= 1
    if isinstance(output, Failure):
        ex = output.value
        if isinstance(ex, DropItem):
            logkws = self.logformatter.dropped(item, ex, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_dropped, item=item, response=response,
                spider=spider, exception=output.value)
        else:
            logger.error('Error processing %(item)s', {'item': item},
                         exc_info=failure_to_exc_info(output),
                         extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_error, item=item, response=response,
                spider=spider, failure=output)
    else:
        logkws = self.logformatter.scraped(output, response, spider)
        logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
        return self.signals.send_catch_log_deferred(
            signal=signals.item_scraped, item=output, response=response,
            spider=spider)
Example #9
Source File: spiders.py From scrapy-rabbitmq with MIT License | 6 votes |
def setup_rabbitmq(self):
    """ Setup RabbitMQ connection.

    Call this method after spider has set its crawler object.
    :return: None
    """
    if not self.rabbitmq_key:
        self.rabbitmq_key = '{}:start_urls'.format(self.name)

    self.server = connection.from_settings(self.crawler.settings)

    self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
    self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
Example #10
Source File: feedexport.py From learn_python3_spider with MIT License | 5 votes |
def item_scraped(self, item, spider):
    slot = self.slot
    if not self._exporting:
        slot.exporter.start_exporting()
        self._exporting = True
    slot.exporter.export_item(item)
    slot.itemcount += 1
    return item
Example #11
Source File: spiders.py From openslack-crawler with Apache License 2.0 | 5 votes |
def item_scraped(self, *args, **kwargs):
    """ Avoid waiting for spider.

    :param args:
    :param kwargs:
    :return: None
    """
    self.schedule_next_request()
Example #12
Source File: spiders.py From openslack-crawler with Apache License 2.0 | 5 votes |
def item_scraped(self, *args, **kwargs):
    """Avoids waiting for the spider to idle before scheduling the next request"""
    self.schedule_next_request()
Example #13
Source File: pipelines.py From company2vec with MIT License | 5 votes |
def item_scraped(self, item, response, spider):
    """
    Append each individual item scraped

    :param item: scrapy item
    :type item: cls
    :return: None
    """
    self.items.append(item)
Example #14
Source File: pipelines.py From company2vec with MIT License | 5 votes |
def crawl(self, crawler_or_spidercls, *args, **kwargs):
    """
    Launch a crawl and return output as deferred

    :param crawler_or_spidercls: scrapy crawler
    :type crawler_or_spidercls: cls
    :return: deferred object with crawled output
    """
    # keep all items scraped
    self.items = []
    # create crawler (same as in base CrawlerProcess)
    crawler = self.create_crawler(crawler_or_spidercls)
    # handle each item scraped
    crawler.signals.connect(self.item_scraped, signals.item_scraped)
    # create Twisted Deferred launching the crawl
    dfd = self._crawl(crawler, *args, **kwargs)
    # add callback - when crawl is done, call return_items
    dfd.addCallback(self.return_items)
    return dfd
Example #15
Source File: scrapyutils.py From autologin with Apache License 2.0 | 5 votes |
def __init__(self, crawl_d, crawler):
    self.crawl_d = crawl_d
    self.crawler = crawler
    crawler.signals.connect(self._on_item_scraped, signals.item_scraped)
    crawler.signals.connect(self._on_error, signals.spider_error)
    crawl_d.addCallback(self._on_finished)
    crawl_d.addErrback(self._on_error)
    self.closed = False
    self._items_available = Deferred()
    self._items = collections.deque()
Example #16
Source File: extensions.py From scrapy-kafka-export with MIT License | 5 votes |
def __init__(self, crawler):
    self.crawler = crawler
    settings = crawler.settings
    if not settings.getbool('KAFKA_EXPORT_ENABLED', False):
        raise NotConfigured
    logger.debug('Kafka export extension is enabled')

    self.kafka_brokers = settings.getlist('KAFKA_BROKERS')
    self.kafka_topic = settings.get('KAFKA_TOPIC')
    self.batch_size = settings.getint('KAFKA_BATCH_SIZE', 100)

    ssl_module_name = settings.get('KAFKA_SSL_CONFIG_MODULE')
    if ssl_module_name:
        def _load(key):
            return resource_filename(ssl_module_name, settings.get(key))
        self.ssl_config = get_ssl_config(
            cafile=_load('KAFKA_SSL_CACERT_FILE'),
            certfile=_load('KAFKA_SSL_CLIENTCERT_FILE'),
            keyfile=_load('KAFKA_SSL_CLIENTKEY_FILE'),
        )
    else:
        self.ssl_config = {}

    self.item_writer = None

    crawler.signals.connect(self.spider_opened, signals.spider_opened)
    crawler.signals.connect(self.spider_closed, signals.spider_closed)
    crawler.signals.connect(self.process_item_scraped, signals.item_scraped)
Example #17
Source File: spiders.py From scrapy-rabbitmq with MIT License | 5 votes |
def item_scraped(self, *args, **kwargs):
    """ Avoid waiting for spider.

    :param args:
    :param kwargs:
    :return: None
    """
    self.schedule_next_request()
Example #18
Source File: closespider.py From learn_python3_spider with MIT License | 5 votes |
def item_scraped(self, item, spider):
    self.counter['itemcount'] += 1
    if self.counter['itemcount'] == self.close_on['itemcount']:
        self.crawler.engine.close_spider(spider, 'closespider_itemcount')
Example #19
Source File: corestats.py From learn_python3_spider with MIT License | 5 votes |
def item_scraped(self, item, spider):
    self.stats.inc_value('item_scraped_count', spider=spider)
Example #20
Source File: corestats.py From learn_python3_spider with MIT License | 5 votes |
def from_crawler(cls, crawler):
    o = cls(crawler.stats)
    crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
    crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
    crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
    crawler.signals.connect(o.response_received, signal=signals.response_received)
    return o
Example #21
Source File: feedexport.py From learn_python3_spider with MIT License | 5 votes |
def from_crawler(cls, crawler):
    o = cls(crawler.settings)
    o.crawler = crawler
    crawler.signals.connect(o.open_spider, signals.spider_opened)
    crawler.signals.connect(o.close_spider, signals.spider_closed)
    crawler.signals.connect(o.item_scraped, signals.item_scraped)
    return o
Example #22
Source File: closespider.py From learn_python3_spider with MIT License | 5 votes |
def item_scraped(self, item, spider):
    self.counter['itemcount'] += 1
    if self.counter['itemcount'] == self.close_on['itemcount']:
        self.crawler.engine.close_spider(spider, 'closespider_itemcount')
Example #23
Source File: corestats.py From learn_python3_spider with MIT License | 5 votes |
def item_scraped(self, item, spider):
    self.stats.inc_value('item_scraped_count', spider=spider)
Example #24
Source File: corestats.py From learn_python3_spider with MIT License | 5 votes |
def from_crawler(cls, crawler):
    o = cls(crawler.stats)
    crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
    crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
    crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
    crawler.signals.connect(o.response_received, signal=signals.response_received)
    return o
Example #25
Source File: pixiv-beta.py From Pixiv-Crawler with GNU General Public License v3.0 | 5 votes |
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super(pixivSpider, cls).from_crawler(crawler, *args, **kwargs)
    crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
    crawler.signals.connect(cls.update_collection_set, signal=signals.item_scraped)
    return spider

# allowed_domains = []
Example #26
Source File: spiders.py From scrapy-kafka with Apache License 2.0 | 5 votes |
def item_scraped(self, *args, **kwargs):
    """Avoids waiting for the spider to idle before scheduling the next request"""
    self.schedule_next_request()