Python scrapy.crawler.CrawlerProcess() Examples
The following are 30 code examples of scrapy.crawler.CrawlerProcess(), taken from open source projects and ranked by user votes. The project and source file for each example are listed above it, so you can trace every snippet back to its original code. You may also want to check out the other available functions and classes of the scrapy.crawler module.
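Before the project-specific examples, here is a minimal, self-contained sketch of the pattern most of them follow: create a CrawlerProcess with some settings, register one or more spiders with crawl(), and call start(), which blocks until crawling has finished. The QuotesSpider class and the quotes.toscrape.com URL are placeholders chosen for illustration only; they do not come from any of the examples below.

import scrapy
from scrapy.crawler import CrawlerProcess


class QuotesSpider(scrapy.Spider):
    # Placeholder spider used only for this sketch.
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        for quote in response.css("div.quote span.text::text"):
            yield {"text": quote.get()}


if __name__ == "__main__":
    # CrawlerProcess installs its own Twisted reactor and shutdown signal
    # handlers, so it is meant for standalone scripts rather than code that
    # already runs inside a reactor (use CrawlerRunner there).
    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(QuotesSpider)  # schedule the spider
    process.start()              # block until crawling is finished

The examples that follow vary mainly in where the settings come from (get_project_settings(), a hand-built Settings object, or a plain dict) and in how many spiders they schedule before calling start().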
Example #1
Source File: run_spider.py From IPProxyTool with MIT License | 8 votes |
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )

    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)
Example #2
Source File: runner.py From In2ItChicago with GNU General Public License v3.0 | 7 votes |
def run():
    config.connect_to_client()
    print('Running event processor...')

    crawlerProcess = CrawlerProcess(get_project_settings())
    settings = project.get_project_settings()
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    spiders = spider_loader.list()
    classes = [
        s for s in (spider_loader.load(name) for name in spiders
                    if config.spider_name == None or name == config.spider_name)
        if s.enabled
    ]

    crawlerProcess = CrawlerProcess(get_project_settings())
    for spider_class in classes:
        crawlerProcess.crawl(spider_class)
    crawlerProcess.start()
    crawlerProcess.join()

    print('Event processor completed')

    session = HttpUtils.get_session()
    events = session.get(config.get_events, params={})
    if len(events.json()) > 0:
        print('Data retrieved successfully')
    else:
        print('No data retrieved')
Example #3
Source File: real_time_analysis.py From jd_analysis with GNU Lesser General Public License v3.0 | 6 votes |
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.ERROR
    )
    print "get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES']

    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('进入爬虫')  # "entering the spider"
        process.crawl(name, **spargs)
        process.start()
    except Exception, e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e.message))
Example #4
Source File: crawl.py From hoaxy-backend with GNU General Public License v3.0 | 6 votes |
def fetch_html(cls, session, url_tuples):
    """Actual method to do fetch html action.

    Parameters
    ----------
    session : object
        a SQLAlchemy session object.
    url_tuples : list
        a list of url tuple (id, raw, status_code).
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES',
                 {'hoaxy.crawl.pipelines.HtmlPipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    logger.warning('Number of url to fetch html is: %s', len(url_tuples))
    process.crawl(
        HtmlSpider,
        session=session,
        url_tuples=url_tuples,
        excluded_domains=cls.conf['crawl']['excluded_domains'])
    process.start()
Example #5
Source File: crawl.py From hoaxy-backend with GNU General Public License v3.0 | 6 votes |
def fetch_url(cls, session, msites, platform_id, purpose):
    """Actual method to do fetch url action.

    Parameters
    ----------
    msites : list
        a list of Site model class, contains info to build spiders.
    platform_id : int
        id of platform, bind fetched url with this id.
    purpose : {'update', 'archive'}
        indicate which url to fetch.
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES',
                 {'hoaxy.crawl.pipelines.UrlPipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    for ms in msites:
        for sm in build_spiders_iter(ms, purpose):
            sm['kwargs']['session'] = session
            sm['kwargs']['platform_id'] = platform_id
            process.crawl(sm['cls'], *sm['args'], **sm['kwargs'])
    process.start()
Example #6
Source File: cli.py From PyFeeds with GNU Affero General Public License v3.0 | 6 votes |
def cleanup(ctx):
    """
    Cleanup old cache entries.

    By default, entries older than 90 days will be removed. This value can
    be overridden in the config file.
    """
    settings = ctx.obj["settings"]

    # Manually configure logging since we don't have a CrawlerProcess which
    # would take care of that.
    configure_logging(settings)

    if not settings.getbool("HTTPCACHE_ENABLED"):
        logger.error("Cache is disabled, will not clean up cache dir.")
        return 1

    run_cleanup_cache(settings)
Example #7
Source File: single_crawler.py From news-please with Apache License 2.0 | 6 votes |
def load_crawler(self, crawler, url, ignore_regex):
    """
    Loads the given crawler with the given url.

    :param class crawler: class of the crawler to load
    :param str url: url to start the crawler with
    :param regex ignore_regex: to be able to ignore urls that match this
        regex code
    """
    self.process = CrawlerProcess(self.cfg.get_scrapy_options())
    self.process.crawl(
        crawler,
        self.helper,
        url=url,
        config=self.cfg,
        ignore_regex=ignore_regex)
Example #8
Source File: scrapyctl.py From kmanga with GNU General Public License v3.0 | 6 votes |
def __init__(self, accounts, loglevel, remote=False):
    self.accounts = settings.SCRAPY_ACCOUNTS
    if accounts:
        self.accounts.update(accounts)
    self.loglevel = loglevel
    self.settings = self._get_settings()
    # Values for `loglevel`: CRITICAL, ERROR, WARNING, INFO, DEBUG.
    self.settings.set('LOG_LEVEL', loglevel)
    if remote:
        # Configure remote logging and disable the scrapy logging.
        self.settings.set('LOG_ENABLED', False)
        logger = logging.getLogger()
        handler = ScrapySocketHandler(
            'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT)
        handler.setLevel(loglevel)
        logger.addHandler(handler)

    self.process = CrawlerProcess(self.settings)
Example #9
Source File: TorSplashCrawler.py From AIL-framework with GNU Affero General Public License v3.0 | 6 votes |
def __init__(self, splash_url, crawler_options):
    self.process = CrawlerProcess({'LOG_ENABLED': True})
    self.crawler = Crawler(self.TorSplashSpider, {
        'USER_AGENT': crawler_options['user_agent'],
        'SPLASH_URL': splash_url,
        'ROBOTSTXT_OBEY': False,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'HTTPERROR_ALLOW_ALL': True,
        'RETRY_TIMES': 2,
        'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
        'DEPTH_LIMIT': crawler_options['depth_limit'],
        'SPLASH_COOKIES_DEBUG': False
    })
Example #10
Source File: __init__.py From fooltrader with MIT License | 6 votes |
def crawl(spider, setting):
    process = CrawlerProcess({**get_project_settings(), **setting})
    process.crawl(spider)
    process.start()
Example #11
Source File: EuropythonSpyder.py From Learning-Python-Networking-Second-Edition with MIT License | 6 votes |
def main(): """Main routine for the execution of the Spider""" # set up signal to catch items scraped def catch_item(sender, item, **kwargs): print("Item extracted:", item) dispatcher.connect(catch_item, signal=signals.item_passed) settings = Settings() settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36") settings.set("LOG_ENABLED",False) # setup crawler from scrapy.crawler import CrawlerProcess crawler = CrawlerProcess(settings) # define the spider for the crawler crawler.crawl(EuropythonSpyder()) # start scrapy print("STARTING ENGINE") crawler.start() #iniciar el crawler llamando al spider definido print("ENGINE STOPPED")
Example #12
Source File: run_spider.py From jd_analysis with GNU Lesser General Public License v3.0 | 6 votes |
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')
    guid = spargs.get('guid')
    product_id = spargs.get('product_id')

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.ERROR
    )
    print "get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES']

    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('进入爬虫')  # "entering the spider"
        process.crawl(name, **spargs)
        process.start()
    except Exception, e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e.message))
Example #13
Source File: collector.py From collectors with MIT License | 5 votes |
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
Example #14
Source File: crawl.py From hoaxy-backend with GNU General Public License v3.0 | 5 votes |
def parse_article(cls, session, url_tuples):
    """Actual method to do parse to article action.

    Parameters
    ----------
    session : object
        a SQLAlchemy session object.
    url_tuples : list
        a list of url tuple (id, created_at, date_published,
        canonical, site_id)
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES',
                 {'hoaxy.crawl.pipelines.ArticlePipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    logger.info('Number of url to parse is: %s', len(url_tuples))
    process.crawl(
        ArticleParserSpider,
        session=session,
        url_tuples=url_tuples,
        node_path=cls.conf['crawl']['article_parser']['node_installation_path'],
        mercury_parser_path=cls.conf['crawl']['article_parser']['parse_with_mercury_js_path'],
    )
    process.start()
Example #15
Source File: crawler.py From Sitadel with GNU General Public License v3.0 | 5 votes |
def crawl(url, user_agent):
    try:
        output = Services.get("output")

        # Settings for the crawler
        settings = get_project_settings()
        settings.set("USER_AGENT", user_agent)
        settings.set("LOG_LEVEL", "CRITICAL")
        settings.set("RETRY_ENABLED", False)
        settings.set("CONCURRENT_REQUESTS", 15)

        # Create the process that will perform the crawl
        output.info("Start crawling the target website")
        process = CrawlerProcess(settings)
        allowed_domains.append(str(urlparse(url).hostname))
        process.crawl(
            SitadelSpider, start_urls=[str(url)], allowed_domains=allowed_domains
        )
        process.start()

        # Clean the results
        clean_urls = []
        for u in urls:
            try:
                new_url = urlparse(u).geturl()
                clean_urls.append(new_url)
            except ValueError:
                continue
        return clean_urls

    except KeyboardInterrupt:
        process.stop()
        raise
Example #16
Source File: run.py From openslack-crawler with Apache License 2.0 | 5 votes |
def run_spider(spider, *args):
    print spider
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl(spider, 0, 0, 0)
    process.start()
Example #17
Source File: collector.py From collectors with MIT License | 5 votes |
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start()
Example #18
Source File: cli.py From PyFeeds with GNU Affero General Public License v3.0 | 5 votes |
def list():
    """List all available spiders."""
    settings = get_project_settings()
    settings["LOG_ENABLED"] = False
    process = CrawlerProcess(settings)
    for s in sorted(process.spider_loader.list()):
        print(s)
Example #19
Source File: cli.py From PyFeeds with GNU Affero General Public License v3.0 | 5 votes |
def crawl(ctx, spiders, stats):
    """
    Crawl one or many or all pages.

    What spider(s) to run is determined in the following order:

      1. Spider(s) given as argument(s)
      2. Spider(s) specified in the configuration file

    Note that if a spider is given as an argument, the spiders in the
    configuration file are ignored.

    All available spiders will be used to crawl if no arguments are given
    and no spiders are configured.
    """
    settings = ctx.obj["settings"]
    if stats:
        settings.set("STATS_CLASS", "scrapy.statscollectors.MemoryStatsCollector")

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error("Please specify what spiders you want to run!")
    else:
        for spider in spiders:
            logger.info("Starting crawl of {} ...".format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool("HTTPCACHE_ENABLED"):
        run_cleanup_cache(settings)
Example #20
Source File: __main__.py From wayback-machine-scraper with ISC License | 5 votes |
def main():
    # configure the settings for the crawler and spider
    args = parse_args()
    config = {
        'domains': args.domains,
        'directory': args.output,
        'allow': args.allow,
        'deny': args.deny,
        'unix': args.unix,
    }
    settings = Settings({
        'USER_AGENT': (
            'Wayback Machine Scraper/{0} '
            '(+https://github.com/sangaline/scrapy-wayback-machine)'
        ).format(get_distribution('wayback-machine-scraper').version),
        'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_DEBUG': args.verbose,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': args.concurrency,
        'WAYBACK_MACHINE_TIME_RANGE': (getattr(args, 'from'), args.to),
    })

    # start the crawler
    process = CrawlerProcess(settings)
    process.crawl(MirrorSpider, **config)
    process.start()
Example #21
Source File: parser.py From dtp-stat with GNU General Public License v2.0 | 5 votes |
def download_dtp():
    if os.path.exists("data/dtp.json"):
        os.remove("data/dtp.json")

    settings = Settings()
    os.environ['SCRAPY_SETTINGS_MODULE'] = 'parser.dtpparser.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')

    process = CrawlerProcess(settings)
    process.crawl(DtpSpider)
    process.start()
Example #22
Source File: parser.py From dtp-stat with GNU General Public License v2.0 | 5 votes |
def download_regions():
    if os.path.exists("data/regions.json"):
        os.remove("data/regions.json")

    settings = Settings()
    os.environ['SCRAPY_SETTINGS_MODULE'] = 'parser.dtpparser.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')

    process = CrawlerProcess(settings)
    process.crawl(RegionSpider)
    process.start()
Example #23
Source File: cli.py From StrepHit with GNU General Public License v3.0 | 5 votes |
def crawl(spider_name, results_dir):
    """ Run one or more spiders """
    settings = get_project_settings()
    # prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)

    process = CrawlerProcess(settings)
    for s in spider_name:
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')
        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start()
Example #24
Source File: main.py From imagebot with MIT License | 5 votes |
def start_spider(args):
    settings.LOG_LEVEL = args.log_level
    project_settings = Settings()
    project_settings.setmodule(settings)

    process = CrawlerProcess(project_settings)
    process.crawl(ImageSpider,
                  domains=args.domains,
                  start_urls=args.start_urls,
                  jobname=args.jobname,
                  stay_under=args.stay_under,
                  monitor=args.monitor,
                  user_agent=args.user_agent,
                  minsize=args.min_size,
                  no_cache=args.no_cache,
                  images_store=args.images_store,
                  depth_limit=args.depth_limit,
                  url_regex=args.url_regex,
                  no_cdns=args.no_cdns,
                  auto_throttle=args.auto_throttle,
                  log_level=args.log_level)
    process.start()
Example #25
Source File: crawler.py From ws-backend-community with GNU General Public License v3.0 | 5 votes |
def __crawl(self, spider_kwargs=None, settings=None):
    """
    Perform a crawl based on the contents of self._crawling_config.
    :param spider_kwargs: Keyword arguments to use to create a spider class.
    :param settings: Scrapy settings to use to crawl the remote endpoint.
    :return: None
    """
    print("SPIDER KWARGS ARE %s." % (spider_kwargs,))
    config.globals["%s-hostname" % (os.getpid(),)] = spider_kwargs["input_hostname"]
    spider = self.get_spider_class_for_domain(**spider_kwargs)
    process = CrawlerProcess(settings)
    process.crawl(spider)
    process.start()
Example #26
Source File: collector.py From collectors with MIT License | 5 votes |
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
Example #27
Source File: cmdline.py From learn_python3_spider with MIT License | 5 votes |
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Example #28
Source File: collector.py From collectors with MIT License | 4 votes |
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
Example #29
Source File: collector.py From collectors with MIT License | 4 votes |
def collect(conf, conn, page_from=None, page_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, page_from=page_from, page_to=page_to)
    process.start()
Example #30
Source File: collector.py From collectors with MIT License | 4 votes |
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start()