Python scrapy.crawler.Crawler() Examples

The following are 13 code examples of scrapy.crawler.Crawler(), drawn from open-source projects. You can go to the original project or source file by following the attribution line above each example. You may also want to check out all available functions and classes of the module scrapy.crawler, or try the search function.
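As a quick orientation before the examples: on Scrapy >= 1.0, the constructor takes a spider class plus settings. A minimal sketch (the QuotesSpider class and its URL are hypothetical placeholders, not from any project below):

from scrapy import Spider
from scrapy.crawler import Crawler

class QuotesSpider(Spider):  # hypothetical spider, for illustration only
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com"]

# Crawler wraps a spider class together with its settings; a plain dict
# is converted to a Settings object internally.
crawler = Crawler(QuotesSpider, {"LOG_ENABLED": False})
print(crawler.settings.getbool("LOG_ENABLED"))  # -> False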
Example #1
Source File: TorSplashCrawler.py, from AIL-framework (GNU Affero General Public License v3.0)
def __init__(self, splash_url, crawler_options):
    self.process = CrawlerProcess({'LOG_ENABLED': True})
    self.crawler = Crawler(self.TorSplashSpider, {
        'USER_AGENT': crawler_options['user_agent'],
        'SPLASH_URL': splash_url,
        'ROBOTSTXT_OBEY': False,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'HTTPERROR_ALLOW_ALL': True,
        'RETRY_TIMES': 2,
        'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
        'DEPTH_LIMIT': crawler_options['depth_limit'],
        'SPLASH_COOKIES_DEBUG': False,
    })
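This snippet only builds the wrapped crawler; presumably the surrounding class (not shown here) hands it to the CrawlerProcess created above, roughly like this sketch:

# Hypothetical continuation, since the rest of the class is not shown;
# CrawlerProcess.crawl() accepts a Crawler instance as well as a spider class.
self.process.crawl(self.crawler, splash_url=splash_url)
self.process.start()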
Example #2
Source File: spider.py, from stock (Apache License 2.0)
def __init__(self):
    self.spider = HqSpider()
    self.crawler = crawler = Crawler(get_project_settings())
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(self.spider)
    dispatcher.connect(self._dont_close_me, signals.spider_idle)
    self.thread = None
    self._started = False
    self._stopped = False 
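Note that this example targets the pre-1.0 Scrapy API: Crawler(settings), crawler.configure(), and pydispatch signal wiring were all removed around Scrapy 1.0. A rough modern equivalent, assuming the same HqSpider class:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# CrawlerProcess manages the Twisted reactor itself, so no manual
# reactor.stop() wiring is needed.
process = CrawlerProcess(get_project_settings())
process.crawl(HqSpider)
process.start()  # blocks until the crawl finishes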
Example #3
Source File: spider.py, from fp-server (MIT License)
def build_crawler(self, spider):
    """
    Apply spider-specific settings and return the wrapped crawler.

    :param spider: spider class
    :return: crawler
    """
    # TODO: specify settings
    settings = crawler_runner.settings

    # FIXME !!!
    # conf = {}
    # log_file = crawler_runner.settings.get('LOG_FILE')
    # if log_file:
    #     conf['LOG_FILE'] = '%s.%s' % (log_file, spider.name)
    #     conf['LOG_FILE'] = None
    #     conf['LOG_FORMAT'] = ('%(levelname)1.1s [%(asctime)s]'
    #                           ' [spider-{spider}]'
    #                           ' %(message)s'
    #                           ).format(spider=spider.name)
    #     settings = updated_crawler_settings(settings, conf)
    # configure_logging(settings)

    return Crawler(spider, settings)
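The returned crawler can then be scheduled on the module-level crawler_runner; a minimal usage sketch, where builder stands for whatever object exposes build_crawler and MySpider is a hypothetical spider class:

# CrawlerRunner.crawl() accepts a Crawler instance and returns a Deferred.
d = crawler_runner.crawl(builder.build_crawler(MySpider))
d.addBoth(lambda _: print("crawl finished"))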
Example #4
Source File: base.py, from invana-bot (MIT License)
def start_job(self, job=None, callback_fn=None):
    print(job)
    spider_job = job['spider_job']
    runner = job['runner']
    spider_cls = spider_job['spider_cls']
    spider_settings = spider_job['spider_settings']
    spider_kwargs = spider_job['spider_kwargs']

    def engine_stopped_callback():
        runner.transform_and_index(callback_fn=callback_fn)

    if callback_fn:
        print("""
==========================================================
WARNING: callback_fn is {}
==========================================================
Since start_job is called with callback_fn, make sure you end the reactor if you want the spider
process to stop after the callback function is executed. By default, callback_fn=None will close
the reactor.

To write a custom callback_fn:

def callback_fn():
    print("Write your own callback logic")
    from twisted.internet import reactor
    reactor.stop()
==========================================================
        """.format(callback_fn))

    # The variable holds a Crawler, not a spider, so name it accordingly.
    crawler = Crawler(spider_cls, Settings(spider_settings))
    crawler.signals.connect(engine_stopped_callback, signals.engine_stopped)
    self.runner.crawl(crawler, **spider_kwargs)
    """
    d = runner.crawl(crawler, **spider_kwargs)
    # d.addBoth(engine_stopped_callback)
    """
    reactor.run()
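The expected shape of the job dict can be read off the lookups above; a hypothetical invocation sketch (the names manager, runner, and MySpider are placeholders, not part of the original project):

job = {
    'spider_job': {
        'spider_cls': MySpider,                  # hypothetical spider class
        'spider_settings': {'LOG_ENABLED': True},
        'spider_kwargs': {},
    },
    'runner': runner,  # an object exposing transform_and_index()
}
manager.start_job(job=job)  # blocks inside reactor.run()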
Example #5
Source File: conftest.py, from spidermon (BSD 3-Clause "New" or "Revised" License)
def get_crawler():
    def _crawler(extended_settings={}):
        settings = {
            "SPIDERMON_ENABLED": True,
            "EXTENSIONS": {"spidermon.contrib.scrapy.extensions.Spidermon": 500},
        }
        settings.update(extended_settings)
        crawler = Crawler(Spider, settings=settings)
        crawler.spider = Spider("dummy")
        return crawler

    return _crawler 
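In a test, the fixture factory might be used like this sketch (assuming the usual pytest fixture wiring injects get_crawler):

def test_spidermon_is_enabled(get_crawler):
    crawler = get_crawler({"LOG_LEVEL": "INFO"})  # extra settings are merged in
    assert crawler.settings.getbool("SPIDERMON_ENABLED")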
Example #6
Source File: test_monitors.py, from spidermon (BSD 3-Clause "New" or "Revised" License)
def make_data(request):
    def _make_data(settings=None):
        crawler = Crawler(Spider, settings=settings)
        spider = Spider("dummy")
        return {
            "stats": crawler.stats.get_stats(),
            "crawler": crawler,
            "spider": spider,
            "runner": SpiderMonitorRunner(spider=spider),
            "job": None,
        }

    return _make_data 
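A usage sketch inside a test, with make_data injected by pytest and the settings argument optional:

def test_monitor_data(make_data):
    data = make_data({"LOG_ENABLED": False})
    assert data["crawler"].settings.getbool("LOG_ENABLED") is False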
Example #7
Source File: crawl_with_status.py, from zulip (Apache License 2.0)
def run(self, args: List[str], opts: optparse.Values) -> None:
    crawlers = []
    real_create_crawler = self.crawler_process.create_crawler

    def create_crawler(crawler_or_spidercls: Union[Crawler, str]) -> Crawler:
        crawler = real_create_crawler(crawler_or_spidercls)
        crawlers.append(crawler)
        return crawler

    self.crawler_process.create_crawler = create_crawler
    super().run(args, opts)
    if any(crawler.stats.get_value("log_count/ERROR") for crawler in crawlers):
        self.exitcode = 1
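Custom commands like this are registered through Scrapy's COMMANDS_MODULE setting; a sketch, with the package path being a placeholder:

# settings.py
COMMANDS_MODULE = "myproject.commands"  # hypothetical package containing crawl_with_status.py

# The command is then invoked like the built-in crawl command:
#   scrapy crawl_with_status myspider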
Example #8
Source File: utils.py, from scrapy-poet (BSD 3-Clause "New" or "Revised" License)
def make_crawler(spider_cls, settings):
    if not getattr(spider_cls, 'name', None):
        class Spider(spider_cls):
            name = 'test_spider'
        Spider.__name__ = spider_cls.__name__
        Spider.__module__ = spider_cls.__module__
        spider_cls = Spider
    return Crawler(spider_cls, settings) 
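A quick usage sketch: spider classes without a name attribute get the synthetic 'test_spider' name before being wrapped (MySpider is a placeholder):

import scrapy

class MySpider(scrapy.Spider):  # note: no `name` attribute
    pass

crawler = make_crawler(MySpider, {"LOG_ENABLED": False})
assert crawler.spidercls.name == 'test_spider'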
Example #9
Source File: utils.py, from scrapy-autounit (BSD 3-Clause "New" or "Revised" License)
def prepare_callback_replay(fixture_path, encoding="utf-8"):
    with open(str(fixture_path), 'rb') as f:
        raw_data = f.read()

    fixture_info = unpickle_data(decompress_data(raw_data), encoding)
    if 'fixture_version' in fixture_info:
        encoding = fixture_info['encoding']
        data = unpickle_data(fixture_info['data'], encoding)
    else:
        data = fixture_info  # legacy tests

    settings = get_project_settings()

    spider_name = data.get('spider_name')
    if not spider_name:  # legacy tests
        spider_name = os.path.basename(
            os.path.dirname(
                os.path.dirname(fixture_path)
            )
        )

    spider_cls = get_spider_class(spider_name, settings)
    spider_cls.update_settings(settings)

    for k, v in data.get('settings', {}).items():
        settings.set(k, v, 50)

    crawler = Crawler(spider_cls, settings)
    spider_args_in = data.get('spider_args', data.get('spider_args_in', {}))
    spider = spider_cls.from_crawler(crawler, **spider_args_in)
    crawler.spider = spider

    return data, crawler, spider, settings 
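A hypothetical call site, with the fixture path being a placeholder:

data, crawler, spider, settings = prepare_callback_replay(
    'autounit/my_spider/my_callback/fixture1.bin')  # hypothetical fixture path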
Example #10
Source File: TorSplashCrawler.py, from AIL-framework (GNU Affero General Public License v3.0)
def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
    self.domain_type = type
    self.requested_mode = requested_mode
    self.original_item = original_item
    self.root_key = None
    self.start_urls = url
    self.domains = [domain]
    self.port = str(port)
    date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
    self.full_date = date['date_day']
    self.date_month = date['date_month']
    self.date_epoch = int(date['epoch'])

    self.png = crawler_options['png']
    self.har = crawler_options['har']
    self.cookies = cookies

    config_section = 'Crawler'
    self.p = Process(config_section)
    self.item_dir = os.path.join(self.p.config.get("Directories", "crawled"), date_str)
    self.har_dir = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str)
    self.r_serv_log_submit = redis.StrictRedis(
        host=self.p.config.get("Redis_Log_submit", "host"),
        port=self.p.config.getint("Redis_Log_submit", "port"),
        db=self.p.config.getint("Redis_Log_submit", "db"),
        decode_responses=True)
Example #11
Source File: spiders.py, from daywatch (MIT License)
def run_spider_instance(spider_class, site_id, main_url):
    """Run a spider given its spider class. For example, importing the TestSpider
and passing it to this function will run it."""
    spider = spider_class(site_id=site_id, main_url=main_url)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # Scrapy uses a deprecated Twisted interface. Until the fix makes it to a
    # new version (>0.24.4), we'll use this so deprecation warnings don't
    # clutter the output
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    crawler.crawl(spider)
    crawler.start()
    reactor.run() 
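This is again the pre-1.0 API: Crawler(settings) plus configure()/crawl()/start(). A rough modern equivalent of the same flow, under that assumption:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_spider_instance(spider_class, site_id, main_url):
    # CrawlerProcess handles the reactor itself, so the manual signal
    # and warning plumbing above disappears.
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider_class, site_id=site_id, main_url=main_url)
    process.start()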
Example #12
Source File: utils.py, from legco-watch (MIT License)
def list_spiders():
    settings = get_project_settings()
    crawler = Crawler(settings)
    return crawler.spiders.list() 
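On modern Scrapy, Crawler no longer exposes a spiders attribute; spider discovery moved to SpiderLoader. An equivalent sketch:

from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings

def list_spiders():
    loader = SpiderLoader.from_settings(get_project_settings())
    return loader.list()  # names of all spiders found via SPIDER_MODULES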
Example #13
Source File: tasks.py, from legco-watch (MIT License)
def do_scrape(spider_name):
    """
    Asynchronous task for individual scrapes that is executed by Celery workers.
    :param spider_name: str name of the spider that should be run
    :return: the full path of the jsonlines output file to which results are stored
    """
    # create and configure the spider
    crawl_settings = get_project_settings()
    # configure the output
    # Technically don't need this unless we actually do the scrape, but need to put
    # up here before the crawler is instantiated so the FEED_URI override is active
    output_name = generate_scrape_name(spider_name)
    output_path = os.path.join(crawl_settings.get('DATA_DIR_BASE'), 'scrapes', output_name)
    crawl_settings.overrides['FEED_URI'] = output_path
    crawler = Crawler(crawl_settings)
    crawler.configure()
    try:
        spider = crawler.spiders.create(spider_name)
    except KeyError as e:
        # No spider found.
        raise RuntimeError('Could not find spider with name {}'.format(spider_name)) from e

    # Check to see if we're already running a scrape by looking for open ScrapeJobs
    is_scraping = is_spider_scraping(spider_name)
    if is_scraping is False:
        logger.info('Starting new scrape of {}'.format(spider_name))
        # Create the ScrapeJob record
        job_id = do_scrape.request.id
        if job_id is None:
            # Case if called directly without using Celery, put in a dummy job id
            timestamp = datetime.now().strftime('%y%m%d%H%M')
            job_id = 'MANUAL_RUN{}'.format(timestamp)
        job = ScrapeJob.objects.create(
            spider=spider_name,
            scheduled=datetime.now(),
            # see http://stackoverflow.com/questions/18872854/getting-task-id-inside-a-celery-task
            job_id=job_id,
            raw_response=output_path
        )
        # and set up the callback for updating it
        complete_cb = complete_job(job.id)

        # Connect the signals and logging, then start it up
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.signals.connect(complete_cb, signal=signals.spider_closed)
        log.start(loglevel=log.INFO, logstdout=True)
        crawler.crawl(spider)
        crawler.start()
        reactor.run()
    else:
        logger.info('Pending job found for spider {}'.format(spider_name))
        job = is_scraping

    return job.raw_response
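As with the other legco-watch snippets, this relies on the pre-1.0 API: settings.overrides, crawler.spiders.create(), and the old scrapy.log module were all removed around Scrapy 1.0. On current versions, the settings override would look roughly like this sketch:

# Modern settings override (replaces crawl_settings.overrides['FEED_URI'] = ...):
crawl_settings.set('FEED_URI', output_path, priority='cmdline')

# Spider lookup goes through SpiderLoader instead of crawler.spiders,
# as shown in the modern sketch under Example #12.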