Python scrapy.crawler.Crawler() Examples

The following are 13 code examples of scrapy.crawler.Crawler(), drawn from open-source projects. You can go to the original project or source file by following the attribution line above each example. You may also want to check out all available functions and classes of the module scrapy.crawler, or try the search function.
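As a quick orientation before the examples: on Scrapy >= 1.0, the constructor takes a spider class plus settings. A minimal sketch (the QuotesSpider class and its URL are hypothetical placeholders, not from any project below):

from scrapy import Spider
from scrapy.crawler import Crawler

class QuotesSpider(Spider):  # hypothetical spider, for illustration only
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com"]

# Crawler wraps a spider class together with its settings; a plain dict
# is converted to a Settings object internally.
crawler = Crawler(QuotesSpider, {"LOG_ENABLED": False})
print(crawler.settings.getbool("LOG_ENABLED"))  # -> False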
Example #1
Source File: TorSplashCrawler.py, from AIL-framework (GNU Affero General Public License v3.0)
def __init__(self, splash_url, crawler_options):
    self.process = CrawlerProcess({'LOG_ENABLED': True})
    self.crawler = Crawler(self.TorSplashSpider, {
        'USER_AGENT': crawler_options['user_agent'],
        'SPLASH_URL': splash_url,
        'ROBOTSTXT_OBEY': False,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'HTTPERROR_ALLOW_ALL': True,
        'RETRY_TIMES': 2,
        'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
        'DEPTH_LIMIT': crawler_options['depth_limit'],
        'SPLASH_COOKIES_DEBUG': False,
    })
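This snippet only builds the wrapped crawler; presumably the surrounding class (not shown here) hands it to the CrawlerProcess created above, roughly like this sketch:

# Hypothetical continuation, since the rest of the class is not shown;
# CrawlerProcess.crawl() accepts a Crawler instance as well as a spider class.
self.process.crawl(self.crawler, splash_url=splash_url)
self.process.start()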
Example #2
Source File: spider.py, from stock (Apache License 2.0)
def __init__(self):
    self.spider = HqSpider()
    self.crawler = crawler = Crawler(get_project_settings())
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(self.spider)
    dispatcher.connect(self._dont_close_me, signals.spider_idle)
    self.thread = None
    self._started = False
    self._stopped = False 
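Note that this example targets the pre-1.0 Scrapy API: Crawler(settings), crawler.configure(), and pydispatch signal wiring were all removed around Scrapy 1.0. A rough modern equivalent, assuming the same HqSpider class:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# CrawlerProcess manages the Twisted reactor itself, so no manual
# reactor.stop() wiring is needed.
process = CrawlerProcess(get_project_settings())
process.crawl(HqSpider)
process.start()  # blocks until the crawl finishes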
Example #3
Source File: spider.py, from fp-server (MIT License)
def build_crawler(self, spider):
    """
    Apply spider-specific settings and return the wrapped crawler.

    :param spider: spider class
    :return: crawler
    """
    # TODO: specify settings
    settings = crawler_runner.settings

    # FIXME !!!
    # conf = {}
    # log_file = crawler_runner.settings.get('LOG_FILE')
    # if log_file:
    #     conf['LOG_FILE'] = '%s.%s' % (log_file, spider.name)
    #     conf['LOG_FILE'] = None
    #     conf['LOG_FORMAT'] = ('%(levelname)1.1s [%(asctime)s]'
    #                           ' [spider-{spider}]'
    #                           ' %(message)s'
    #                           ).format(spider=spider.name)
    #     settings = updated_crawler_settings(settings, conf)
    # configure_logging(settings)

    return Crawler(spider, settings)
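The returned crawler can then be scheduled on the module-level crawler_runner; a minimal usage sketch, where builder stands for whatever object exposes build_crawler and MySpider is a hypothetical spider class:

# CrawlerRunner.crawl() accepts a Crawler instance and returns a Deferred.
d = crawler_runner.crawl(builder.build_crawler(MySpider))
d.addBoth(lambda _: print("crawl finished"))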
Example #4
Source File: base.py, from invana-bot (MIT License)
def start_job(self, job=None, callback_fn=None):
    print(job)
    spider_job = job['spider_job']
    runner = job['runner']
    spider_cls = spider_job['spider_cls']
    spider_settings = spider_job['spider_settings']
    spider_kwargs = spider_job['spider_kwargs']

    def engine_stopped_callback():
        runner.transform_and_index(callback_fn=callback_fn)

    if callback_fn:
        print("""
==========================================================
WARNING: callback_fn is {}
==========================================================
Since start_job is called with callback_fn, make sure you end the reactor if you want the spider
process to stop after the callback function is executed. By default, callback_fn=None will close
the reactor.

To write a custom callback_fn:

def callback_fn():
    print("Write your own callback logic")
    from twisted.internet import reactor
    reactor.stop()
==========================================================
        """.format(callback_fn))

    # The variable holds a Crawler, not a spider, so name it accordingly.
    crawler = Crawler(spider_cls, Settings(spider_settings))
    crawler.signals.connect(engine_stopped_callback, signals.engine_stopped)
    self.runner.crawl(crawler, **spider_kwargs)
    """
    d = runner.crawl(crawler, **spider_kwargs)
    # d.addBoth(engine_stopped_callback)
    """
    reactor.run()
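The expected shape of the job dict can be read off the lookups above; a hypothetical invocation sketch (the names manager, runner, and MySpider are placeholders, not part of the original project):

job = {
    'spider_job': {
        'spider_cls': MySpider,                  # hypothetical spider class
        'spider_settings': {'LOG_ENABLED': True},
        'spider_kwargs': {},
    },
    'runner': runner,  # an object exposing transform_and_index()
}
manager.start_job(job=job)  # blocks inside reactor.run()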
Example #5
Source File: conftest.py, from spidermon (BSD 3-Clause "New" or "Revised" License)
def get_crawler():
    def _crawler(extended_settings={}):
        settings = {
            "SPIDERMON_ENABLED": True,
            "EXTENSIONS": {"spidermon.contrib.scrapy.extensions.Spidermon": 500},
        }
        settings.update(extended_settings)
        crawler = Crawler(Spider, settings=settings)
        crawler.spider = Spider("dummy")
        return crawler

    return _crawler 
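In a test, the fixture factory might be used like this sketch (assuming the usual pytest fixture wiring injects get_crawler):

def test_spidermon_is_enabled(get_crawler):
    crawler = get_crawler({"LOG_LEVEL": "INFO"})  # extra settings are merged in
    assert crawler.settings.getbool("SPIDERMON_ENABLED")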
Example #6
Source File: test_monitors.py, from spidermon (BSD 3-Clause "New" or "Revised" License)
def make_data(request):
    def _make_data(settings=None):
        crawler = Crawler(Spider, settings=settings)
        spider = Spider("dummy")
        return {
            "stats": crawler.stats.get_stats(),
            "crawler": crawler,
            "spider": spider,
            "runner": SpiderMonitorRunner(spider=spider),
            "job": None,
        }

    return _make_data 
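A usage sketch inside a test, with make_data injected by pytest and the settings argument optional:

def test_monitor_data(make_data):
    data = make_data({"LOG_ENABLED": False})
    assert data["crawler"].settings.getbool("LOG_ENABLED") is False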
Example #7
Source File: crawl_with_status.py, from zulip (Apache License 2.0)
def run(self, args: List[str], opts: optparse.Values) -> None:
    crawlers = []
    real_create_crawler = self.crawler_process.create_crawler

    def create_crawler(crawler_or_spidercls: Union[Crawler, str]) -> Crawler:
        crawler = real_create_crawler(crawler_or_spidercls)
        crawlers.append(crawler)
        return crawler

    self.crawler_process.create_crawler = create_crawler
    super().run(args, opts)
    if any(crawler.stats.get_value("log_count/ERROR") for crawler in crawlers):
        self.exitcode = 1
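Custom commands like this are registered through Scrapy's COMMANDS_MODULE setting; a sketch, with the package path being a placeholder:

# settings.py
COMMANDS_MODULE = "myproject.commands"  # hypothetical package containing crawl_with_status.py

# The command is then invoked like the built-in crawl command:
#   scrapy crawl_with_status myspider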
Example #8
Source File: utils.py, from scrapy-poet (BSD 3-Clause "New" or "Revised" License)
def make_crawler(spider_cls, settings):
    if not getattr(spider_cls, 'name', None):
        class Spider(spider_cls):
            name = 'test_spider'
        Spider.__name__ = spider_cls.__name__
        Spider.__module__ = spider_cls.__module__
        spider_cls = Spider
    return Crawler(spider_cls, settings) 
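A quick usage sketch: spider classes without a name attribute get the synthetic 'test_spider' name before being wrapped (MySpider is a placeholder):

import scrapy

class MySpider(scrapy.Spider):  # note: no `name` attribute
    pass

crawler = make_crawler(MySpider, {"LOG_ENABLED": False})
assert crawler.spidercls.name == 'test_spider'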
Example #9
Source File: utils.py, from scrapy-autounit (BSD 3-Clause "New" or "Revised" License)
def prepare_callback_replay(fixture_path, encoding="utf-8"):
    with open(str(fixture_path), 'rb') as f:
        raw_data = f.read()

    fixture_info = unpickle_data(decompress_data(raw_data), encoding)
    if 'fixture_version' in fixture_info:
        encoding = fixture_info['encoding']
        data = unpickle_data(fixture_info['data'], encoding)
    else:
        data = fixture_info  # legacy tests

    settings = get_project_settings()

    spider_name = data.get('spider_name')
    if not spider_name:  # legacy tests
        spider_name = os.path.basename(
            os.path.dirname(
                os.path.dirname(fixture_path)
            )
        )

    spider_cls = get_spider_class(spider_name, settings)
    spider_cls.update_settings(settings)

    for k, v in data.get('settings', {}).items():
        settings.set(k, v, 50)

    crawler = Crawler(spider_cls, settings)
    spider_args_in = data.get('spider_args', data.get('spider_args_in', {}))
    spider = spider_cls.from_crawler(crawler, **spider_args_in)
    crawler.spider = spider

    return data, crawler, spider, settings 
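A hypothetical call site, with the fixture path being a placeholder:

data, crawler, spider, settings = prepare_callback_replay(
    'autounit/my_spider/my_callback/fixture1.bin')  # hypothetical fixture path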
Example #10
Source File: TorSplashCrawler.py, from AIL-framework (GNU Affero General Public License v3.0)
def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
    self.domain_type = type
    self.requested_mode = requested_mode
    self.original_item = original_item
    self.root_key = None
    self.start_urls = url
    self.domains = [domain]
    self.port = str(port)
    date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
    self.full_date = date['date_day']
    self.date_month = date['date_month']
    self.date_epoch = int(date['epoch'])

    self.png = crawler_options['png']
    self.har = crawler_options['har']
    self.cookies = cookies

    config_section = 'Crawler'
    self.p = Process(config_section)
    self.item_dir = os.path.join(self.p.config.get("Directories", "crawled"), date_str)
    self.har_dir = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str)
    self.r_serv_log_submit = redis.StrictRedis(
        host=self.p.config.get("Redis_Log_submit", "host"),
        port=self.p.config.getint("Redis_Log_submit", "port"),
        db=self.p.config.getint("Redis_Log_submit", "db"),
        decode_responses=True)
Example #11
Source File: spiders.py, from daywatch (MIT License)
def run_spider_instance(spider_class, site_id, main_url):
    """Run a spider given its spider class. For example, importing the TestSpider
and passing it to this function will run it."""
    spider = spider_class(site_id=site_id, main_url=main_url)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()
    # Scrapy uses a deprecated Twisted interface. Until the fix makes it to a
    # new version (>0.24.4), we'll use this so deprecation warnings don't
    # clutter the output
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    crawler.crawl(spider)
    crawler.start()
    reactor.run() 
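This is again the pre-1.0 API: Crawler(settings) plus configure()/crawl()/start(). A rough modern equivalent of the same flow, under that assumption:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_spider_instance(spider_class, site_id, main_url):
    # CrawlerProcess handles the reactor itself, so the manual signal
    # and warning plumbing above disappears.
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider_class, site_id=site_id, main_url=main_url)
    process.start()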
Example #12
Source File: utils.py, from legco-watch (MIT License)
def list_spiders():
    settings = get_project_settings()
    crawler = Crawler(settings)
    return crawler.spiders.list() 
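On modern Scrapy, Crawler no longer exposes a spiders attribute; spider discovery moved to SpiderLoader. An equivalent sketch:

from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings

def list_spiders():
    loader = SpiderLoader.from_settings(get_project_settings())
    return loader.list()  # names of all spiders found via SPIDER_MODULES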
Example #13
Source File: tasks.py, from legco-watch (MIT License)
def do_scrape(spider_name):
    """
    Asynchronous task for individual scrapes that is executed by Celery workers.
    :param spider_name: str name of the spider that should be run
    :return: the full path of the jsonlines output file to which results are stored
    """
    # create and configure the spider
    crawl_settings = get_project_settings()
    # configure the output
    # Technically don't need this unless we actually do the scrape, but need to put
    # up here before the crawler is instantiated so the FEED_URI override is active
    output_name = generate_scrape_name(spider_name)
    output_path = os.path.join(crawl_settings.get('DATA_DIR_BASE'), 'scrapes', output_name)
    crawl_settings.overrides['FEED_URI'] = output_path
    crawler = Crawler(crawl_settings)
    crawler.configure()
    try:
        spider = crawler.spiders.create(spider_name)
    except KeyError as e:
        # No spider found.
        raise RuntimeError('Could not find spider with name {}'.format(spider_name)) from e

    # Check to see if we're already running a scrape by looking for open ScrapeJobs
    is_scraping = is_spider_scraping(spider_name)
    if is_scraping is False:
        logger.info('Starting new scrape of {}'.format(spider_name))
        # Create the ScrapeJob record
        job_id = do_scrape.request.id
        if job_id is None:
            # Case if called directly without using Celery, put in a dummy job id
            timestamp = datetime.now().strftime('%y%m%d%H%M')
            job_id = 'MANUAL_RUN{}'.format(timestamp)
        job = ScrapeJob.objects.create(
            spider=spider_name,
            scheduled=datetime.now(),
            # see http://stackoverflow.com/questions/18872854/getting-task-id-inside-a-celery-task
            job_id=job_id,
            raw_response=output_path
        )
        # and set up the callback for updating it
        complete_cb = complete_job(job.id)

        # Connect the signals and logging, then start it up
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.signals.connect(complete_cb, signal=signals.spider_closed)
        log.start(loglevel=log.INFO, logstdout=True)
        crawler.crawl(spider)
        crawler.start()
        reactor.run()
    else:
        logger.info('Pending job found for spider {}'.format(spider_name))
        job = is_scraping

    return job.raw_response
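As with the other legco-watch snippets, this relies on the pre-1.0 API: settings.overrides, crawler.spiders.create(), and the old scrapy.log module were all removed around Scrapy 1.0. On current versions, the settings override would look roughly like this sketch:

# Modern settings override (replaces crawl_settings.overrides['FEED_URI'] = ...):
crawl_settings.set('FEED_URI', output_path, priority='cmdline')

# Spider lookup goes through SpiderLoader instead of crawler.spiders,
# as shown in the modern sketch under Example #12.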