Python scrapy.signals.item_scraped() Examples

The following are 26 code examples of scrapy.signals.item_scraped(). The original project and source file are noted above each example. You may also want to check out the other functions and classes available in the scrapy.signals module.
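Before the examples, a minimal sketch of the shared pattern may help: a component connects a handler to signals.item_scraped (usually in from_crawler), and Scrapy then calls that handler with item, response, and spider keyword arguments each time an item has passed through the item pipelines. The extension class and handler names below are illustrative only and are not taken from any of the listed projects.

from scrapy import signals


class ItemCountLogger:
    """Illustrative extension that counts scraped items."""

    def __init__(self):
        self.count = 0

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        # item_scraped is sent after an item has passed through all
        # item pipelines without being dropped.
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        return ext

    def item_scraped(self, item, response, spider):
        self.count += 1
        spider.logger.debug("Items scraped so far: %d", self.count)

An extension like this would be activated through the EXTENSIONS setting; spiders, middlewares, and pipelines can connect to the same signal in exactly the same way, as the examples below show.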
Example #1
Source File: spiders.py    From openslack-crawler with Apache License 2.0
def setup_rabbitmq(self):
        """ Setup RabbitMQ connection.

            Call this method after the spider has set its crawler object.
        :return: None
        """

        if not self.rabbitmq_key:
            self.rabbitmq_key = '{}:start_urls'.format(self.name)

        self.server = connection.from_settings(self.crawler.settings)
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped) 
Example #2
Source File: spiders.py    From openslack-crawler with Apache License 2.0
def setup_kafka(self, settings):
        """Setup redis connection and idle signal.
        This should be called after the spider has set its crawler object.
        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = KafkaClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                       auto_commit=True, iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.topic)
Example #3
Source File: spiders.py    From scrapy-kafka with Apache License 2.0
def setup_kafka(self, settings):
        """Setup redis connection and idle signal.

        This should be called after the spider has set its crawler object.

        :param settings: The current Scrapy settings being used
        :type settings: scrapy.settings.Settings
        """
        if not hasattr(self, 'topic') or not self.topic:
            self.topic = '%s-starturls' % self.name

        hosts = settings.get('SCRAPY_KAFKA_HOSTS', ['localhost:9092'])
        consumer_group = settings.get('SCRAPY_KAFKA_SPIDER_CONSUMER_GROUP', 'scrapy-kafka')
        _kafka = KafkaClient(hosts)
        # wait at most 1sec for more messages. Otherwise continue
        self.consumer = SimpleConsumer(_kafka, consumer_group, self.topic,
                                       auto_commit=True, iter_timeout=1.0)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from kafka topic
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from kafka topic '%s'" % self.topic)
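Examples #1-#3 (and the Redis variant in Example #5) share one idea: the spider is fed from an external queue, and connecting item_scraped to schedule_next_request keeps that queue draining without waiting for the spider_idle signal. The condensed sketch below shows the pattern with the queue access stubbed out; pop_url_from_queue is a hypothetical stand-in for the Kafka/Redis/RabbitMQ consumer, and the engine.crawl call reflects the older Scrapy API these projects target.

import scrapy
from scrapy import signals
from scrapy.exceptions import DontCloseSpider


class QueueFedSpider(scrapy.Spider):
    name = 'queue_fed'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # Same wiring as setup_kafka()/setup_rabbitmq() above.
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        crawler.signals.connect(spider.item_scraped, signal=signals.item_scraped)
        return spider

    def pop_url_from_queue(self):
        """Hypothetical stand-in for the external queue consumer."""
        raise NotImplementedError

    def schedule_next_request(self):
        url = self.pop_url_from_queue()
        if url:
            # Older Scrapy versions expect (request, spider);
            # newer ones take only the request.
            self.crawler.engine.crawl(scrapy.Request(url, callback=self.parse), self)

    def spider_idle(self):
        self.schedule_next_request()
        raise DontCloseSpider  # keep the spider alive while the queue is in use

    def item_scraped(self, *args, **kwargs):
        # Top up immediately instead of waiting for the next idle signal.
        self.schedule_next_request()

    def parse(self, response):
        yield {'url': response.url}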
Example #4
Source File: closespider.py    From learn_python3_spider with MIT License
def __init__(self, crawler):
        self.crawler = crawler

        self.close_on = {
            'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
            'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
            'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
            'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
            }

        if not any(self.close_on.values()):
            raise NotConfigured

        self.counter = defaultdict(int)

        if self.close_on.get('errorcount'):
            crawler.signals.connect(self.error_count, signal=signals.spider_error)
        if self.close_on.get('pagecount'):
            crawler.signals.connect(self.page_count, signal=signals.response_received)
        if self.close_on.get('timeout'):
            crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
        if self.close_on.get('itemcount'):
            crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed) 
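The CloseSpider extension above never has to be instantiated by hand; it is driven entirely by settings, and its item_scraped handler (shown later in Examples #18 and #22) simply counts items until the configured threshold is reached. A minimal sketch of the relevant settings (the values are arbitrary):

# settings.py
CLOSESPIDER_ITEMCOUNT = 100    # stop after 100 scraped items
CLOSESPIDER_TIMEOUT = 3600     # or after one hour, whichever comes first

When the item counter reaches the threshold, the engine closes the spider with the reason 'closespider_itemcount'.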
Example #5
Source File: spiders.py    From openslack-crawler with Apache License 2.0
def setup_redis(self):
        """Setup redis connection and idle signal.
        This should be called after the spider has set its crawler object.
        """
        if not self.redis_key:
            self.redis_key = '%s:start_urls' % self.name

        self.server = connection.from_settings(self.crawler.settings)
        # idle signal is called when the spider has no requests left,
        # that's when we will schedule new requests from redis queue
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        self.log("Reading URLs from redis list '%s'" % self.redis_key) 
Example #6
Source File: scraper.py    From learn_python3_spider with MIT License
def _itemproc_finished(self, output, item, response, spider):
        """ItemProcessor finished for the given ``item`` and returned ``output``
        """
        self.slot.itemproc_size -= 1
        if isinstance(output, Failure):
            ex = output.value
            if isinstance(ex, DropItem):
                logkws = self.logformatter.dropped(item, ex, response, spider)
                logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                return self.signals.send_catch_log_deferred(
                    signal=signals.item_dropped, item=item, response=response,
                    spider=spider, exception=output.value)
            else:
                logger.error('Error processing %(item)s', {'item': item},
                             exc_info=failure_to_exc_info(output),
                             extra={'spider': spider})
                return self.signals.send_catch_log_deferred(
                    signal=signals.item_error, item=item, response=response,
                    spider=spider, failure=output)
        else:
            logkws = self.logformatter.scraped(output, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_scraped, item=output, response=response,
                spider=spider) 
Example #7
Source File: closespider.py    From learn_python3_spider with MIT License
def __init__(self, crawler):
        self.crawler = crawler

        self.close_on = {
            'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
            'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
            'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
            'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
            }

        if not any(self.close_on.values()):
            raise NotConfigured

        self.counter = defaultdict(int)

        if self.close_on.get('errorcount'):
            crawler.signals.connect(self.error_count, signal=signals.spider_error)
        if self.close_on.get('pagecount'):
            crawler.signals.connect(self.page_count, signal=signals.response_received)
        if self.close_on.get('timeout'):
            crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
        if self.close_on.get('itemcount'):
            crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed) 
Example #8
Source File: scraper.py    From learn_python3_spider with MIT License
def _itemproc_finished(self, output, item, response, spider):
        """ItemProcessor finished for the given ``item`` and returned ``output``
        """
        self.slot.itemproc_size -= 1
        if isinstance(output, Failure):
            ex = output.value
            if isinstance(ex, DropItem):
                logkws = self.logformatter.dropped(item, ex, response, spider)
                logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
                return self.signals.send_catch_log_deferred(
                    signal=signals.item_dropped, item=item, response=response,
                    spider=spider, exception=output.value)
            else:
                logger.error('Error processing %(item)s', {'item': item},
                             exc_info=failure_to_exc_info(output),
                             extra={'spider': spider})
                return self.signals.send_catch_log_deferred(
                    signal=signals.item_error, item=item, response=response,
                    spider=spider, failure=output)
        else:
            logkws = self.logformatter.scraped(output, response, spider)
            logger.log(*logformatter_adapter(logkws), extra={'spider': spider})
            return self.signals.send_catch_log_deferred(
                signal=signals.item_scraped, item=output, response=response,
                spider=spider) 
Example #9
Source File: spiders.py    From scrapy-rabbitmq with MIT License
def setup_rabbitmq(self):
        """ Setup RabbitMQ connection.

            Call this method after the spider has set its crawler object.
        :return: None
        """

        if not self.rabbitmq_key:
            self.rabbitmq_key = '{}:start_urls'.format(self.name)

        self.server = connection.from_settings(self.crawler.settings)
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped) 
Example #10
Source File: feedexport.py    From learn_python3_spider with MIT License
def item_scraped(self, item, spider):
        slot = self.slot
        if not self._exporting:
            slot.exporter.start_exporting()
            self._exporting = True
        slot.exporter.export_item(item)
        slot.itemcount += 1
        return item 
Example #11
Source File: spiders.py    From openslack-crawler with Apache License 2.0
def item_scraped(self, *args, **kwargs):
        """ Avoid waiting for spider.
        :param args:
        :param kwargs:
        :return: None
        """
        self.schedule_next_request() 
Example #12
Source File: spiders.py    From openslack-crawler with Apache License 2.0
def item_scraped(self, *args, **kwargs):
        """Avoids waiting for the spider to  idle before scheduling the next request"""
        self.schedule_next_request() 
Example #13
Source File: pipelines.py    From company2vec with MIT License
def item_scraped(self, item, response, spider):

        """
        Append each individual item scraped

        :param item: scrapy item
        :type item: cls

        :return: None
        """

        self.items.append(item) 
Example #14
Source File: pipelines.py    From company2vec with MIT License
def crawl(self, crawler_or_spidercls, *args, **kwargs):

        """
        Launch a crawl and return output as deferred

        :param crawler_or_spidercls: scrapy crawler
        :type crawler_or_spidercls: cls

        :return: deferred object with crawled output
        """

        # keep all items scraped
        self.items = []

        # create crawler (same as in base CrawlerProcess)
        crawler = self.create_crawler(crawler_or_spidercls)

        # handle each item scraped
        crawler.signals.connect(self.item_scraped, signals.item_scraped)

        # create Twisted.Deferred launching crawl
        dfd = self._crawl(crawler, *args, **kwargs)

        # add callback - when the crawl is done, call return_items
        dfd.addCallback(self.return_items)
        return dfd 
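Because crawl() returns a Deferred that eventually fires with the collected items (return_items presumably just hands back self.items), a caller can launch the crawl from inside the Twisted reactor and receive everything the item_scraped handler accumulated. A heavily hedged usage sketch follows; ItemCollectingRunner and MySpider are placeholder names, and the class is assumed to behave like Scrapy's CrawlerRunner.

from twisted.internet import reactor
from scrapy.utils.project import get_project_settings

runner = ItemCollectingRunner(get_project_settings())  # placeholder for the class above
deferred = runner.crawl(MySpider)                      # MySpider is a placeholder spider

def on_items(items):
    print('collected %d items' % len(items))
    reactor.stop()

deferred.addCallback(on_items)
reactor.run()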
Example #15
Source File: scrapyutils.py    From autologin with Apache License 2.0
def __init__(self, crawl_d, crawler):
        self.crawl_d = crawl_d
        self.crawler = crawler

        crawler.signals.connect(self._on_item_scraped, signals.item_scraped)
        crawler.signals.connect(self._on_error, signals.spider_error)

        crawl_d.addCallback(self._on_finished)
        crawl_d.addErrback(self._on_error)

        self.closed = False
        self._items_available = Deferred()
        self._items = collections.deque() 
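Only the constructor is shown here; the connected handlers are not part of this example. A hypothetical _on_item_scraped consistent with the deque and the _items_available Deferred set up above might look like this (not the project's actual implementation):

def _on_item_scraped(self, item):
    # Queue the item and wake up whoever is waiting on _items_available.
    self._items.append(item)
    if not self._items_available.called:
        self._items_available.callback(None)
        # A real implementation would re-arm the Deferred afterwards so the
        # next batch of items can be awaited as well.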
Example #16
Source File: extensions.py    From scrapy-kafka-export with MIT License
def __init__(self, crawler):
        self.crawler = crawler
        settings = crawler.settings
        if not settings.getbool('KAFKA_EXPORT_ENABLED', False):
            raise NotConfigured
        logger.debug('Kafka export extension is enabled')

        self.kafka_brokers = settings.getlist('KAFKA_BROKERS')
        self.kafka_topic = settings.get('KAFKA_TOPIC')
        self.batch_size = settings.getint('KAFKA_BATCH_SIZE', 100)
        ssl_module_name = settings.get('KAFKA_SSL_CONFIG_MODULE')
        if ssl_module_name:
            def _load(key):
                return resource_filename(ssl_module_name, settings.get(key))

            self.ssl_config = get_ssl_config(
                cafile=_load('KAFKA_SSL_CACERT_FILE'),
                certfile=_load('KAFKA_SSL_CLIENTCERT_FILE'),
                keyfile=_load('KAFKA_SSL_CLIENTKEY_FILE'),
            )
        else:
            self.ssl_config = {}

        self.item_writer = None
        crawler.signals.connect(self.spider_opened, signals.spider_opened)
        crawler.signals.connect(self.spider_closed, signals.spider_closed)
        crawler.signals.connect(self.process_item_scraped,
                                signals.item_scraped) 
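The constructor above only wires the signal handlers; their bodies are not shown in this example. A very rough, hypothetical sketch of their shape follows; the writer object and its methods are assumptions for illustration, not the actual scrapy-kafka-export API.

def spider_opened(self, spider):
    # Hypothetical: create the writer that batches items to the Kafka topic.
    self.item_writer = self._create_item_writer()  # assumed helper, not the real API

def process_item_scraped(self, item, response, spider):
    # Hypothetical: hand every scraped item to the writer.
    self.item_writer.write(dict(item))

def spider_closed(self, spider):
    # Hypothetical: flush buffered items and release the connection.
    self.item_writer.close()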
Example #17
Source File: spiders.py    From scrapy-rabbitmq with MIT License
def item_scraped(self, *args, **kwargs):
        """ Avoid waiting for spider.
        :param args:
        :param kwargs:
        :return: None
        """
        self.schedule_next_request() 
Example #18
Source File: closespider.py    From learn_python3_spider with MIT License
def item_scraped(self, item, spider):
        self.counter['itemcount'] += 1
        if self.counter['itemcount'] == self.close_on['itemcount']:
            self.crawler.engine.close_spider(spider, 'closespider_itemcount') 
Example #19
Source File: corestats.py    From learn_python3_spider with MIT License
def item_scraped(self, item, spider):
        self.stats.inc_value('item_scraped_count', spider=spider) 
Example #20
Source File: corestats.py    From learn_python3_spider with MIT License
def from_crawler(cls, crawler):
        o = cls(crawler.stats)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
        crawler.signals.connect(o.response_received, signal=signals.response_received)
        return o 
Example #21
Source File: feedexport.py    From learn_python3_spider with MIT License
def from_crawler(cls, crawler):
        o = cls(crawler.settings)
        o.crawler = crawler
        crawler.signals.connect(o.open_spider, signals.spider_opened)
        crawler.signals.connect(o.close_spider, signals.spider_closed)
        crawler.signals.connect(o.item_scraped, signals.item_scraped)
        return o 
Example #22
Source File: closespider.py    From learn_python3_spider with MIT License
def item_scraped(self, item, spider):
        self.counter['itemcount'] += 1
        if self.counter['itemcount'] == self.close_on['itemcount']:
            self.crawler.engine.close_spider(spider, 'closespider_itemcount') 
Example #23
Source File: corestats.py    From learn_python3_spider with MIT License
def item_scraped(self, item, spider):
        self.stats.inc_value('item_scraped_count', spider=spider) 
Example #24
Source File: corestats.py    From learn_python3_spider with MIT License
def from_crawler(cls, crawler):
        o = cls(crawler.stats)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
        crawler.signals.connect(o.response_received, signal=signals.response_received)
        return o 
Example #25
Source File: pixiv-beta.py    From Pixiv-Crawler with GNU General Public License v3.0
def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(pixivSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        crawler.signals.connect(cls.update_collection_set, signal=signals.item_scraped)
        return spider

    # allowed_domains = [] 
Example #26
Source File: spiders.py    From scrapy-kafka with Apache License 2.0
def item_scraped(self, *args, **kwargs):
        """Avoids waiting for the spider to  idle before scheduling the next request"""
        self.schedule_next_request()