Python scrapy.crawler.Crawler() Examples
The following are 13 code examples of scrapy.crawler.Crawler(). You can go to the original project or source file by following the link above each example, or check out the other available functions and classes of the scrapy.crawler module. A minimal sketch of the basic Crawler usage pattern is shown below, followed by the project examples.
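Before the project examples, here is a minimal sketch of the constructor these snippets revolve around, as used in Scrapy 1.0 and later, where Crawler wraps a spider class together with its settings. The DummySpider class, its start URL, and the settings values are placeholders for illustration only; they do not come from any of the examples below.

import scrapy
from scrapy.crawler import Crawler, CrawlerProcess
from scrapy.settings import Settings


class DummySpider(scrapy.Spider):
    # Placeholder spider, used only to show how a Crawler is constructed.
    name = "dummy"
    start_urls = ["https://example.com"]

    def parse(self, response):
        yield {"url": response.url}


settings = Settings({"LOG_ENABLED": False})

# Crawler binds a spider class to a settings object; it is normally created for
# you by CrawlerProcess/CrawlerRunner, but it can also be instantiated directly.
crawler = Crawler(DummySpider, settings)

# CrawlerProcess.crawl() accepts either a spider class or an existing Crawler.
process = CrawlerProcess(settings)
process.crawl(crawler)
process.start()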
Example #1
Source File: TorSplashCrawler.py (from AIL-framework, GNU Affero General Public License v3.0)

def __init__(self, splash_url, crawler_options):
    self.process = CrawlerProcess({'LOG_ENABLED': True})
    self.crawler = Crawler(self.TorSplashSpider, {
        'USER_AGENT': crawler_options['user_agent'],
        'SPLASH_URL': splash_url,
        'ROBOTSTXT_OBEY': False,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'HTTPERROR_ALLOW_ALL': True,
        'RETRY_TIMES': 2,
        'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
        'DEPTH_LIMIT': crawler_options['depth_limit'],
        'SPLASH_COOKIES_DEBUG': False,
    })
Example #2
Source File: spider.py (from stock, Apache License 2.0)

def __init__(self):
    self.spider = HqSpider()
    self.crawler = crawler = Crawler(get_project_settings())
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(self.spider)
    dispatcher.connect(self._dont_close_me, signals.spider_idle)
    self.thread = None
    self._started = False
    self._stopped = False
Example #3
Source File: spider.py (from fp-server, MIT License)

def build_crawler(self, spider):
    """
    do some specific settings for spider
    and return the wrapped crawler

    :param spider: spider class
    :return: crawler
    """
    # TODO: specify settings
    settings = crawler_runner.settings

    # FIXME !!!
    # conf = {}
    # log_file = crawler_runner.settings.get('LOG_FILE')
    # if log_file:
    #     conf['LOG_FILE'] = '%s.%s' % (log_file, spider.name)
    #     conf['LOG_FILE'] = None
    #     conf['LOG_FORMAT'] = ('%(levelname)1.1s [%(asctime)s]'
    #                           ' [spider-{spider}]'
    #                           ' %(message)s'
    #                           ).format(spider=spider.name)
    # settings = updated_crawler_settings(settings, conf)
    # configure_logging(settings)

    return Crawler(spider, settings)
Example #4
Source File: base.py (from invana-bot, MIT License)

def start_job(self, job=None, callback_fn=None):
    print(job)
    spider_job = job['spider_job']
    runner = job['runner']
    spider_cls = spider_job['spider_cls']
    spider_settings = spider_job['spider_settings']
    spider_kwargs = spider_job['spider_kwargs']

    def engine_stopped_callback():
        runner.transform_and_index(callback_fn=callback_fn)

    if callback_fn:
        print("""
==========================================================
WARNING: callback_fn is {}
==========================================================
Since start_job is called with callback_fn, make sure you end the
reactor if you want the spider process to stop after the callback
function is executed. By default callback_fn=None will close the
reactor.

To write a custom callback_fn:

def callback_fn():
    print ("Write your own callback logic")
    from twisted.internet import reactor
    reactor.stop()
==========================================================
""".format(callback_fn))

    spider = Crawler(spider_cls, Settings(spider_settings))
    spider.signals.connect(engine_stopped_callback, signals.engine_stopped)
    self.runner.crawl(spider, **spider_kwargs)
    """
    d = runner.crawl(spider, **spider_kwargs)
    # d.addBoth(engine_stopped_callback)
    """
    reactor.run()
Example #5
Source File: conftest.py (from spidermon, BSD 3-Clause "New" or "Revised" License)

def get_crawler():
    def _crawler(extended_settings={}):
        settings = {
            "SPIDERMON_ENABLED": True,
            "EXTENSIONS": {"spidermon.contrib.scrapy.extensions.Spidermon": 500},
        }
        settings.update(extended_settings)
        crawler = Crawler(Spider, settings=settings)
        crawler.spider = Spider("dummy")
        return crawler

    return _crawler
Example #6
Source File: test_monitors.py (from spidermon, BSD 3-Clause "New" or "Revised" License)

def make_data(request):
    def _make_data(settings=None):
        crawler = Crawler(Spider, settings=settings)
        spider = Spider("dummy")
        return {
            "stats": crawler.stats.get_stats(),
            "crawler": crawler,
            "spider": spider,
            "runner": SpiderMonitorRunner(spider=spider),
            "job": None,
        }

    return _make_data
Example #7
Source File: crawl_with_status.py (from zulip, Apache License 2.0)

def run(self, args: List[str], opts: optparse.Values) -> None:
    crawlers = []
    real_create_crawler = self.crawler_process.create_crawler

    def create_crawler(crawler_or_spidercls: Union[Crawler, str]) -> Crawler:
        crawler = real_create_crawler(crawler_or_spidercls)
        crawlers.append(crawler)
        return crawler

    self.crawler_process.create_crawler = create_crawler
    super().run(args, opts)

    if any(crawler.stats.get_value("log_count/ERROR") for crawler in crawlers):
        self.exitcode = 1
Example #8
Source File: utils.py (from scrapy-poet, BSD 3-Clause "New" or "Revised" License)

def make_crawler(spider_cls, settings):
    if not getattr(spider_cls, 'name', None):
        class Spider(spider_cls):
            name = 'test_spider'

        Spider.__name__ = spider_cls.__name__
        Spider.__module__ = spider_cls.__module__
        spider_cls = Spider
    return Crawler(spider_cls, settings)
Example #9
Source File: utils.py (from scrapy-autounit, BSD 3-Clause "New" or "Revised" License)

def prepare_callback_replay(fixture_path, encoding="utf-8"):
    with open(str(fixture_path), 'rb') as f:
        raw_data = f.read()

    fixture_info = unpickle_data(decompress_data(raw_data), encoding)
    if 'fixture_version' in fixture_info:
        encoding = fixture_info['encoding']
        data = unpickle_data(fixture_info['data'], encoding)
    else:
        data = fixture_info  # legacy tests

    settings = get_project_settings()

    spider_name = data.get('spider_name')
    if not spider_name:  # legacy tests
        spider_name = os.path.basename(
            os.path.dirname(
                os.path.dirname(fixture_path)
            )
        )

    spider_cls = get_spider_class(spider_name, settings)
    spider_cls.update_settings(settings)

    for k, v in data.get('settings', {}).items():
        settings.set(k, v, 50)

    crawler = Crawler(spider_cls, settings)
    spider_args_in = data.get('spider_args', data.get('spider_args_in', {}))
    spider = spider_cls.from_crawler(crawler, **spider_args_in)
    crawler.spider = spider

    return data, crawler, spider, settings
Example #10
Source File: TorSplashCrawler.py (from AIL-framework, GNU Affero General Public License v3.0)

def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs):
    self.domain_type = type
    self.requested_mode = requested_mode
    self.original_item = original_item
    self.root_key = None
    self.start_urls = url
    self.domains = [domain]
    self.port = str(port)
    date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8])
    self.full_date = date['date_day']
    self.date_month = date['date_month']
    self.date_epoch = int(date['epoch'])

    self.png = crawler_options['png']
    self.har = crawler_options['har']
    self.cookies = cookies

    config_section = 'Crawler'
    self.p = Process(config_section)
    self.item_dir = os.path.join(self.p.config.get("Directories", "crawled"), date_str)
    self.har_dir = os.path.join(os.environ['AIL_HOME'],
                                self.p.config.get("Directories", "crawled_screenshot"), date_str)
    self.r_serv_log_submit = redis.StrictRedis(
        host=self.p.config.get("Redis_Log_submit", "host"),
        port=self.p.config.getint("Redis_Log_submit", "port"),
        db=self.p.config.getint("Redis_Log_submit", "db"),
        decode_responses=True)
    self.root_key = None
Example #11
Source File: spiders.py (from daywatch, MIT License)

def run_spider_instance(spider_class, site_id, main_url):
    """Run a spider given its spider class. For example, importing the
    TestSpider and passing it to this function will run it."""
    spider = spider_class(site_id=site_id, main_url=main_url)
    settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.configure()

    # Scrapy uses a deprecated Twisted interface. Until the fix makes it to a
    # new version (>0.24.4), we'll use this so deprecation warnings don't
    # clutter the output
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    crawler.crawl(spider)
    crawler.start()
    reactor.run()
Example #12
Source File: utils.py (from legco-watch, MIT License)

def list_spiders():
    settings = get_project_settings()
    crawler = Crawler(settings)
    return crawler.spiders.list()
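Note that Examples #2, #11, #12, and #13 build the crawler as Crawler(settings) and rely on crawler.configure() and crawler.spiders, which is the API of Scrapy versions before 1.0. On Scrapy 1.0 and later, listing spider names (as Example #12 does) goes through SpiderLoader instead; a minimal sketch, assuming a standard Scrapy project layout:

from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings

# Build the loader from the project settings and list the registered spider names.
settings = get_project_settings()
spider_loader = SpiderLoader.from_settings(settings)
print(spider_loader.list())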
Example #13
Source File: tasks.py (from legco-watch, MIT License)

def do_scrape(spider_name):
    """
    Asynchronous task for individual scrapes that is executed by Celery workers.
    :param spider_name: str name of the spider that should be run
    :return: the full path of the jsonlines output file to which results are stored
    """
    # create and configure the spider
    crawl_settings = get_project_settings()
    # configure the output
    # Technically don't need this unless we actually do the scrape, but need to put
    # up here before the crawler is instantiated so the FEED_URI override is active
    output_name = generate_scrape_name(spider_name)
    output_path = os.path.join(crawl_settings.get('DATA_DIR_BASE'), 'scrapes', output_name)
    crawl_settings.overrides['FEED_URI'] = output_path
    crawler = Crawler(crawl_settings)
    crawler.configure()
    try:
        spider = crawler.spiders.create(spider_name)
    except KeyError as e:
        # No spider found.
        raise RuntimeError('Could not find spider with name {}'.format(spider_name))

    # Check to see if we're already running a scrape by looking for open ScrapeJobs
    is_scraping = is_spider_scraping(spider_name)
    if is_scraping is False:
        logger.info('Starting new scrape of {}'.format(spider_name))
        # Create the ScrapeJob record
        job_id = do_scrape.request.id
        if job_id is None:
            # Case if called directly without using Celery, put in a dummy job id
            timestamp = datetime.now().strftime('%y%m%d%H%M')
            job_id = 'MANUAL_RUN{}'.format(timestamp)
        job = ScrapeJob.objects.create(
            spider=spider_name,
            scheduled=datetime.now(),
            # see http://stackoverflow.com/questions/18872854/getting-task-id-inside-a-celery-task
            job_id=job_id,
            raw_response=output_path
        )
        # and set up the callback for updating it
        complete_cb = complete_job(job.id)

        # Connect the signals and logging, then start it up
        crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        crawler.signals.connect(complete_cb, signal=signals.spider_closed)
        log.start(loglevel=log.INFO, logstdout=True)
        crawler.crawl(spider)
        crawler.start()
        reactor.run()
    else:
        logger.info('Pending job found for spider {}'.format(spider_name))
        job = is_scraping

    return job.raw_response