Python scrapy.crawler.CrawlerProcess() Examples
The following are 30 code examples of scrapy.crawler.CrawlerProcess(), taken from open source projects and ranked by user votes. The project and source file for each example are listed above it, so you can trace every snippet back to its original code. You may also want to check out the other available functions and classes of the scrapy.crawler module.
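Before the project-specific examples, here is a minimal, self-contained sketch of the pattern most of them follow: create a CrawlerProcess with some settings, register one or more spiders with crawl(), and call start(), which blocks until crawling has finished. The QuotesSpider class and the quotes.toscrape.com URL are placeholders chosen for illustration only; they do not come from any of the examples below.

import scrapy
from scrapy.crawler import CrawlerProcess


class QuotesSpider(scrapy.Spider):
    # Placeholder spider used only for this sketch.
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        for quote in response.css("div.quote span.text::text"):
            yield {"text": quote.get()}


if __name__ == "__main__":
    # CrawlerProcess installs its own Twisted reactor and shutdown signal
    # handlers, so it is meant for standalone scripts rather than code that
    # already runs inside a reactor (use CrawlerRunner there).
    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(QuotesSpider)  # schedule the spider
    process.start()              # block until crawling is finished

The examples that follow vary mainly in where the settings come from (get_project_settings(), a hand-built Settings object, or a plain dict) and in how many spiders they schedule before calling start().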
Example #1
Source File: run_spider.py From IPProxyTool with MIT License | 8 votes |
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )

    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)
Example #2
Source File: runner.py From In2ItChicago with GNU General Public License v3.0 | 7 votes |
def run():
    config.connect_to_client()
    print('Running event processor...')

    crawlerProcess = CrawlerProcess(get_project_settings())
    settings = project.get_project_settings()
    spider_loader = spiderloader.SpiderLoader.from_settings(settings)
    spiders = spider_loader.list()
    classes = [
        s for s in (spider_loader.load(name) for name in spiders
                    if config.spider_name == None or name == config.spider_name)
        if s.enabled
    ]

    crawlerProcess = CrawlerProcess(get_project_settings())
    for spider_class in classes:
        crawlerProcess.crawl(spider_class)
    crawlerProcess.start()
    crawlerProcess.join()

    print('Event processor completed')

    session = HttpUtils.get_session()
    events = session.get(config.get_events, params={})
    if len(events.json()) > 0:
        print('Data retrieved successfully')
    else:
        print('No data retrieved')
Example #3
Source File: real_time_analysis.py From jd_analysis with GNU Lesser General Public License v3.0 | 6 votes |
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.ERROR
    )
    print "get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES']

    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('进入爬虫')  # "entering the spider"
        process.crawl(name, **spargs)
        process.start()
    except Exception, e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e.message))
Example #4
Source File: crawl.py From hoaxy-backend with GNU General Public License v3.0 | 6 votes |
def fetch_html(cls, session, url_tuples):
    """Actual method to do fetch html action.

    Parameters
    ----------
    session : object
        a SQLAlchemy session object.
    url_tuples : list
        a list of url tuple (id, raw, status_code).
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES',
                 {'hoaxy.crawl.pipelines.HtmlPipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    logger.warning('Number of url to fetch html is: %s', len(url_tuples))
    process.crawl(
        HtmlSpider,
        session=session,
        url_tuples=url_tuples,
        excluded_domains=cls.conf['crawl']['excluded_domains'])
    process.start()
Example #5
Source File: crawl.py From hoaxy-backend with GNU General Public License v3.0 | 6 votes |
def fetch_url(cls, session, msites, platform_id, purpose):
    """Actual method to do fetch url action.

    Parameters
    ----------
    msites : list
        a list of Site model class, contains info to build spiders.
    platform_id : int
        id of platform, bind fetched url with this id.
    purpose : {'update', 'archive'}
        indicate which url to fetch.
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES',
                 {'hoaxy.crawl.pipelines.UrlPipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    for ms in msites:
        for sm in build_spiders_iter(ms, purpose):
            sm['kwargs']['session'] = session
            sm['kwargs']['platform_id'] = platform_id
            process.crawl(sm['cls'], *sm['args'], **sm['kwargs'])
    process.start()
Example #6
Source File: cli.py From PyFeeds with GNU Affero General Public License v3.0 | 6 votes |
def cleanup(ctx):
    """
    Cleanup old cache entries.

    By default, entries older than 90 days will be removed. This value can
    be overridden in the config file.
    """
    settings = ctx.obj["settings"]

    # Manually configure logging since we don't have a CrawlerProcess which
    # would take care of that.
    configure_logging(settings)

    if not settings.getbool("HTTPCACHE_ENABLED"):
        logger.error("Cache is disabled, will not clean up cache dir.")
        return 1

    run_cleanup_cache(settings)
Example #7
Source File: single_crawler.py From news-please with Apache License 2.0 | 6 votes |
def load_crawler(self, crawler, url, ignore_regex):
    """
    Loads the given crawler with the given url.

    :param class crawler: class of the crawler to load
    :param str url: url to start the crawler with
    :param regex ignore_regex: to be able to ignore urls that match this
        regex code
    """
    self.process = CrawlerProcess(self.cfg.get_scrapy_options())
    self.process.crawl(
        crawler,
        self.helper,
        url=url,
        config=self.cfg,
        ignore_regex=ignore_regex)
Example #8
Source File: scrapyctl.py From kmanga with GNU General Public License v3.0 | 6 votes |
def __init__(self, accounts, loglevel, remote=False):
    self.accounts = settings.SCRAPY_ACCOUNTS
    if accounts:
        self.accounts.update(accounts)
    self.loglevel = loglevel
    self.settings = self._get_settings()
    # Values for `loglevel`: CRITICAL, ERROR, WARNING, INFO, DEBUG.
    self.settings.set('LOG_LEVEL', loglevel)
    if remote:
        # Configure remote logging and disable the scrapy logging.
        self.settings.set('LOG_ENABLED', False)
        logger = logging.getLogger()
        handler = ScrapySocketHandler(
            'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT)
        handler.setLevel(loglevel)
        logger.addHandler(handler)

    self.process = CrawlerProcess(self.settings)
Example #9
Source File: TorSplashCrawler.py From AIL-framework with GNU Affero General Public License v3.0 | 6 votes |
def __init__(self, splash_url, crawler_options):
    self.process = CrawlerProcess({'LOG_ENABLED': True})
    self.crawler = Crawler(self.TorSplashSpider, {
        'USER_AGENT': crawler_options['user_agent'],
        'SPLASH_URL': splash_url,
        'ROBOTSTXT_OBEY': False,
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
        'HTTPERROR_ALLOW_ALL': True,
        'RETRY_TIMES': 2,
        'CLOSESPIDER_PAGECOUNT': crawler_options['closespider_pagecount'],
        'DEPTH_LIMIT': crawler_options['depth_limit'],
        'SPLASH_COOKIES_DEBUG': False
    })
Example #10
Source File: __init__.py From fooltrader with MIT License | 6 votes |
def crawl(spider, setting):
    process = CrawlerProcess({**get_project_settings(), **setting})
    process.crawl(spider)
    process.start()
Example #11
Source File: EuropythonSpyder.py From Learning-Python-Networking-Second-Edition with MIT License | 6 votes |
def main(): """Main routine for the execution of the Spider""" # set up signal to catch items scraped def catch_item(sender, item, **kwargs): print("Item extracted:", item) dispatcher.connect(catch_item, signal=signals.item_passed) settings = Settings() settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36") settings.set("LOG_ENABLED",False) # setup crawler from scrapy.crawler import CrawlerProcess crawler = CrawlerProcess(settings) # define the spider for the crawler crawler.crawl(EuropythonSpyder()) # start scrapy print("STARTING ENGINE") crawler.start() #iniciar el crawler llamando al spider definido print("ENGINE STOPPED")
Example #12
Source File: run_spider.py From jd_analysis with GNU Lesser General Public License v3.0 | 6 votes |
def runspider(spargs):
    url = spargs.get('url')
    name = spargs.get('name', 'jd')
    guid = spargs.get('guid')
    product_id = spargs.get('product_id')

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.ERROR
    )
    print "get_project_settings().attributes:", get_project_settings().attributes['SPIDER_MODULES']

    process = CrawlerProcess(get_project_settings())
    start_time = time.time()
    try:
        logging.info('进入爬虫')  # "entering the spider"
        process.crawl(name, **spargs)
        process.start()
    except Exception, e:
        process.stop()
        logging.error("url:%s, errorMsg:%s" % (url, e.message))
Example #13
Source File: collector.py From collectors with MIT License | 5 votes |
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
Example #14
Source File: crawl.py From hoaxy-backend with GNU General Public License v3.0 | 5 votes |
def parse_article(cls, session, url_tuples):
    """Actual method to do parse to article action.

    Parameters
    ----------
    session : object
        a SQLAlchemy session object.
    url_tuples : list
        a list of url tuple (id, created_at, date_published,
        canonical, site_id)
    """
    settings = Settings(cls.conf['crawl']['scrapy'])
    settings.set('ITEM_PIPELINES',
                 {'hoaxy.crawl.pipelines.ArticlePipeline': 300})
    process = CrawlerProcess(settings)
    sll = cls.conf['logging']['loggers']['scrapy']['level']
    logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    logger.info('Number of url to parse is: %s', len(url_tuples))
    process.crawl(
        ArticleParserSpider,
        session=session,
        url_tuples=url_tuples,
        node_path=cls.conf['crawl']['article_parser']['node_installation_path'],
        mercury_parser_path=cls.conf['crawl']['article_parser']['parse_with_mercury_js_path'],
    )
    process.start()
Example #15
Source File: crawler.py From Sitadel with GNU General Public License v3.0 | 5 votes |
def crawl(url, user_agent):
    try:
        output = Services.get("output")

        # Settings for the crawler
        settings = get_project_settings()
        settings.set("USER_AGENT", user_agent)
        settings.set("LOG_LEVEL", "CRITICAL")
        settings.set("RETRY_ENABLED", False)
        settings.set("CONCURRENT_REQUESTS", 15)

        # Create the process that will perform the crawl
        output.info("Start crawling the target website")
        process = CrawlerProcess(settings)
        allowed_domains.append(str(urlparse(url).hostname))
        process.crawl(
            SitadelSpider, start_urls=[str(url)], allowed_domains=allowed_domains
        )
        process.start()

        # Clean the results
        clean_urls = []
        for u in urls:
            try:
                new_url = urlparse(u).geturl()
                clean_urls.append(new_url)
            except ValueError:
                continue
        return clean_urls

    except KeyboardInterrupt:
        process.stop()
        raise
Example #16
Source File: run.py From openslack-crawler with Apache License 2.0 | 5 votes |
def run_spider(spider, *args):
    print spider
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl(spider, 0, 0, 0)
    process.start()
Example #17
Source File: collector.py From collectors with MIT License | 5 votes |
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start()
Example #18
Source File: cli.py From PyFeeds with GNU Affero General Public License v3.0 | 5 votes |
def list():
    """List all available spiders."""
    settings = get_project_settings()
    settings["LOG_ENABLED"] = False
    process = CrawlerProcess(settings)
    for s in sorted(process.spider_loader.list()):
        print(s)
Example #19
Source File: cli.py From PyFeeds with GNU Affero General Public License v3.0 | 5 votes |
def crawl(ctx, spiders, stats):
    """
    Crawl one or many or all pages.

    What spider(s) to run is determined in the following order:

      1. Spider(s) given as argument(s)
      2. Spider(s) specified in the configuration file

    Note that if a spider is given as an argument, the spiders in the
    configuration file are ignored.

    All available spiders will be used to crawl if no arguments are given
    and no spiders are configured.
    """
    settings = ctx.obj["settings"]
    if stats:
        settings.set("STATS_CLASS", "scrapy.statscollectors.MemoryStatsCollector")

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error("Please specify what spiders you want to run!")
    else:
        for spider in spiders:
            logger.info("Starting crawl of {} ...".format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool("HTTPCACHE_ENABLED"):
        run_cleanup_cache(settings)
Example #20
Source File: __main__.py From wayback-machine-scraper with ISC License | 5 votes |
def main():
    # configure the settings for the crawler and spider
    args = parse_args()
    config = {
        'domains': args.domains,
        'directory': args.output,
        'allow': args.allow,
        'deny': args.deny,
        'unix': args.unix,
    }
    settings = Settings({
        'USER_AGENT': (
            'Wayback Machine Scraper/{0} '
            '(+https://github.com/sangaline/scrapy-wayback-machine)'
        ).format(get_distribution('wayback-machine-scraper').version),
        'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_DEBUG': args.verbose,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': args.concurrency,
        'WAYBACK_MACHINE_TIME_RANGE': (getattr(args, 'from'), args.to),
    })

    # start the crawler
    process = CrawlerProcess(settings)
    process.crawl(MirrorSpider, **config)
    process.start()
Example #21
Source File: parser.py From dtp-stat with GNU General Public License v2.0 | 5 votes |
def download_dtp():
    if os.path.exists("data/dtp.json"):
        os.remove("data/dtp.json")

    settings = Settings()
    os.environ['SCRAPY_SETTINGS_MODULE'] = 'parser.dtpparser.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')

    process = CrawlerProcess(settings)
    process.crawl(DtpSpider)
    process.start()
Example #22
Source File: parser.py From dtp-stat with GNU General Public License v2.0 | 5 votes |
def download_regions():
    if os.path.exists("data/regions.json"):
        os.remove("data/regions.json")

    settings = Settings()
    os.environ['SCRAPY_SETTINGS_MODULE'] = 'parser.dtpparser.settings'
    settings_module_path = os.environ['SCRAPY_SETTINGS_MODULE']
    settings.setmodule(settings_module_path, priority='project')

    process = CrawlerProcess(settings)
    process.crawl(RegionSpider)
    process.start()
Example #23
Source File: cli.py From StrepHit with GNU General Public License v3.0 | 5 votes |
def crawl(spider_name, results_dir):
    """ Run one or more spiders """
    settings = get_project_settings()
    # prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)

    process = CrawlerProcess(settings)
    for s in spider_name:
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')
        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start()
Example #24
Source File: main.py From imagebot with MIT License | 5 votes |
def start_spider(args):
    settings.LOG_LEVEL = args.log_level
    project_settings = Settings()
    project_settings.setmodule(settings)

    process = CrawlerProcess(project_settings)
    process.crawl(ImageSpider,
                  domains=args.domains,
                  start_urls=args.start_urls,
                  jobname=args.jobname,
                  stay_under=args.stay_under,
                  monitor=args.monitor,
                  user_agent=args.user_agent,
                  minsize=args.min_size,
                  no_cache=args.no_cache,
                  images_store=args.images_store,
                  depth_limit=args.depth_limit,
                  url_regex=args.url_regex,
                  no_cdns=args.no_cdns,
                  auto_throttle=args.auto_throttle,
                  log_level=args.log_level)
    process.start()
Example #25
Source File: crawler.py From ws-backend-community with GNU General Public License v3.0 | 5 votes |
def __crawl(self, spider_kwargs=None, settings=None):
    """
    Perform a crawl based on the contents of self._crawling_config.
    :param spider_kwargs: Keyword arguments to use to create a spider class.
    :param settings: Scrapy settings to use to crawl the remote endpoint.
    :return: None
    """
    print("SPIDER KWARGS ARE %s." % (spider_kwargs,))
    config.globals["%s-hostname" % (os.getpid(),)] = spider_kwargs["input_hostname"]
    spider = self.get_spider_class_for_domain(**spider_kwargs)
    process = CrawlerProcess(settings)
    process.crawl(spider)
    process.start()
Example #26
Source File: collector.py From collectors with MIT License | 5 votes |
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
Example #27
Source File: cmdline.py From learn_python3_spider with MIT License | 5 votes |
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Example #28
Source File: collector.py From collectors with MIT License | 4 votes |
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
Example #29
Source File: collector.py From collectors with MIT License | 4 votes |
def collect(conf, conn, page_from=None, page_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, page_from=page_from, page_to=page_to)
    process.start()
Example #30
Source File: collector.py From collectors with MIT License | 4 votes |
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start()