Python scrapy.crawler.CrawlerRunner() Examples
The following are 11 code examples of scrapy.crawler.CrawlerRunner(). Each example is taken from an open source project; the source file, project, and license are noted above it.
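Most of the examples below follow the same pattern: build a CrawlerRunner from the project settings, schedule one or more crawls, and drive the Twisted reactor until the returned deferred fires. Here is a minimal sketch of that pattern; the spider name 'myspider' is a placeholder and is not taken from any of the examples.

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

def run_spider():
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    d = runner.crawl('myspider')         # placeholder spider name from the local project
    d.addBoth(lambda _: reactor.stop())  # stop the reactor whether the crawl succeeds or fails
    reactor.run()                        # block until the crawl is finished

Unlike CrawlerProcess, CrawlerRunner does not start or stop the reactor for you, which is why every example below manages the reactor (or crochet) explicitly.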
Example #1
Source File: spiders.py From autologin with Apache License 2.0

def crawl_runner(extra_settings=None):
    settings = base_settings.copy()
    if extra_settings is not None:
        settings.update(extra_settings, priority='cmdline')
    if settings.get('SPLASH_URL'):
        settings['DUPEFILTER_CLASS'] = 'scrapy_splash.SplashAwareDupeFilter'
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression'
            '.HttpCompressionMiddleware': 810,
        })
    else:
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'autologin.middleware.ExposeCookiesMiddleware': 700,
        })
    return CrawlerRunner(settings)
Example #2
Source File: scheduler.py From haipproxy with MIT License

def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw.
    If you don't assign any tasks, all these spiders will run.
    """
    if usage == 'crawler':
        maps = CRAWLER_TASK_MAPS
        origin_spiders = DEFAULT_CRAWLERS
    else:
        maps = TEMP_TASK_MAPS
        origin_spiders = DEFAULT_VALIDATORS

    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
        if not spiders:
            # crawler_logger.warning('no spider starts up, please check your task input')
            return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Example #3
Source File: online.py From scrapy-cluster with MIT License

def test_crawler_process(self):
    runner = CrawlerRunner(self.settings)
    d = runner.crawl(CustomSpider)
    d.addBoth(lambda _: reactor.stop())

    # add crawl to redis
    key = "test-spider:dmoztools.net:queue"
    self.redis_conn.zadd(key, self.example_feed, -99)

    # run the spider, give 20 seconds to see the url, crawl it,
    # and send to kafka. Then we kill the reactor
    def thread_func():
        time.sleep(20)
        reactor.stop()

    thread = threading.Thread(target=thread_func)
    thread.start()

    reactor.run()

    message_count = 0
    m = next(self.consumer)
    if m is None:
        pass
    else:
        the_dict = json.loads(m.value)
        if the_dict is not None and the_dict['appid'] == 'test' \
                and the_dict['crawlid'] == 'abc12345':
            message_count += 1

    self.assertEquals(message_count, 1)
Example #4
Source File: full_analysis.py From jd_analysis with GNU Lesser General Public License v3.0

def runspider(self):
    configure_logging(install_root_handler=False)
    s = get_project_settings()
    runner = CrawlerRunner(settings=s)

    @defer.inlineCallbacks
    def crawl(**spargs):
        yield runner.crawl(JDItemInfoSpider, **spargs)
        yield runner.crawl(JDCommentSpider, **spargs)
        reactor.stop()

    crawl(**self.spargs)
    reactor.run()  # the script will block here until the last crawl call is finished

# schedule the analysis
Example #5
Source File: run.py From PythonScrapyBasicSetup with MIT License

def run():
    configure_logging()

    # importing project settings for further usage
    # mainly because of the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # running spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run()  # block until the last call
Example #6
Source File: test.py From learn_python3_spider with MIT License

def get_crawler(spidercls=None, settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given,
    it will be used to populate the crawler settings with a project level
    priority.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    runner = CrawlerRunner(settings_dict)
    return runner.create_crawler(spidercls or Spider)
Example #7
Source File: pipelines.py From company2vec with MIT License

def return_spider_output(output):
    """
    Turns scrapy output into dictionaries
    :param output: items scraped by CrawlerRunner
    :type output: dict
    :return: json with list of items
    """
    # this just turns items into dictionaries
    return [dict(item) for item in output]
Example #8
Source File: parser.py From Gerapy with MIT License

def __init__(self, settings, spider, args):
    """
    init parser
    :param settings:
    :param spider:
    :param args:
    """
    self.args = args
    self.spider = spider
    self.crawler_process = CrawlerRunner(settings)
    self.spider_loader = self.crawler_process.spider_loader
    self.spidercls = self.spider_loader.load(self.spider)
Example #9
Source File: parser.py From Gerapy with MIT License

def get_start_requests(project_path, spider_name):
    """
    get start requests
    :param project_path: project path
    :param spider_name: spider name
    :return:
    """
    work_cwd = os.getcwd()
    try:
        # change work dir
        os.chdir(project_path)
        # load settings
        settings = get_project_settings()
        check_deprecated_settings(settings)
        runner = CrawlerRunner(settings=settings)
        # add crawler
        spider_cls = runner.spider_loader.load(spider_name)
        runner.crawl(spider_cls)
        # get crawler
        crawler = list(runner.crawlers)[0]
        # get spider by crawler
        spider = crawler.spider
        # get start requests
        requests = list(spider.start_requests())
        if not requests and hasattr(spider, 'start'):
            requests = list(spider.start())
        requests = list(map(lambda r: process_request(r), requests))
        return {'finished': True, 'requests': requests}
    finally:
        os.chdir(work_cwd)
Example #10
Source File: crawler.py From fp-server with MIT License

def init_crawler_runner():
    crochet.setup()
    init_scrapy_env()
    settings = get_project_settings()
    global CRAWLER_RUNNER
    CRAWLER_RUNNER = CrawlerRunner(settings)
    logger.info('Initialized crawler runner: %s' % CRAWLER_RUNNER)

# TODO: move these to config file?
Example #11
Source File: run.py From openslack-crawler with Apache License 2.0

def run_spider2(spider, *args):
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    runner.crawl(spider, *args)
    runner.crawl(spider, *args)
    d = runner.join()
    # d = runner.crawl(spider, *args)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # the script will block here until all crawling jobs are finished