Python scrapy.settings() Examples
The following are 20 code examples of scrapy.settings(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy, or try the search function.
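Most of the examples below revolve around the scrapy.settings.Settings class. As a quick orientation, here is a minimal sketch of the parts of that API the examples rely on; the setting names are standard Scrapy settings, while the concrete values are illustrative assumptions:

from scrapy.settings import Settings

# Create a Settings object, optionally seeded with a dict of values.
settings = Settings(values={'BOT_NAME': 'examplebot'})  # 'examplebot' is an assumed value

# Read values with plain and type-aware getters.
bot_name = settings.get('BOT_NAME')
log_enabled = settings.getbool('LOG_ENABLED', True)
concurrency = settings.getint('CONCURRENT_REQUESTS', 16)

# Write values; the priority string decides which source wins on conflicts.
settings.set('DOWNLOAD_DELAY', 1.0, priority='cmdline')
settings.update({'LOG_LEVEL': 'INFO'}, priority='project')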
Example #1
Source File: spiders.py From autologin with Apache License 2.0 | 7 votes |
def crawl_runner(extra_settings=None):
    settings = base_settings.copy()
    if extra_settings is not None:
        settings.update(extra_settings, priority='cmdline')
    if settings.get('SPLASH_URL'):
        settings['DUPEFILTER_CLASS'] = 'scrapy_splash.SplashAwareDupeFilter'
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        })
    else:
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'autologin.middleware.ExposeCookiesMiddleware': 700,
        })
    return CrawlerRunner(settings)
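A hedged usage sketch for crawl_runner() above: MySpider is a hypothetical spider class, and the caller is assumed to manage the Twisted reactor, following the standard CrawlerRunner pattern:

from twisted.internet import reactor

runner = crawl_runner(extra_settings={'DOWNLOAD_DELAY': 0.5})
deferred = runner.crawl(MySpider)  # MySpider is a hypothetical spider class
deferred.addBoth(lambda _: reactor.stop())
reactor.run()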
Example #2
Source File: spiders.py From autologin with Apache License 2.0 | 6 votes |
def start_requests(self):
    self._finish_init()
    settings = self.crawler.settings
    self.solver = None
    try:
        import decaptcha
    except ImportError:
        self.logger.warning('Decaptcha not installed')
    else:
        from decaptcha.solvers.deathbycaptcha import DeathbycaptchaSolver
        if (settings.get('DECAPTCHA_DEATHBYCAPTCHA_USERNAME') and
                settings.get('DECAPTCHA_DEATHBYCAPTCHA_PASSWORD')):
            self.solver = DeathbycaptchaSolver(self.crawler)
        else:
            self.logger.warning('DeathByCaptcha account not provided')
    self.retries_left = settings.getint('LOGIN_MAX_RETRIES')
    request_kwargs = {}
    if self.using_splash:
        request_kwargs['args'] = {'full_render': True}
    yield self.request(self.start_url, **request_kwargs)
Example #3
Source File: log.py From learn_python3_spider with MIT License | 6 votes |
def _get_handler(settings):
    """Return a log handler object according to settings"""
    filename = settings.get('LOG_FILE')
    if filename:
        encoding = settings.get('LOG_ENCODING')
        handler = logging.FileHandler(filename, encoding=encoding)
    elif settings.getbool('LOG_ENABLED'):
        handler = logging.StreamHandler()
    else:
        handler = logging.NullHandler()

    formatter = logging.Formatter(
        fmt=settings.get('LOG_FORMAT'),
        datefmt=settings.get('LOG_DATEFORMAT')
    )
    handler.setFormatter(formatter)
    handler.setLevel(settings.get('LOG_LEVEL'))
    if settings.getbool('LOG_SHORT_NAMES'):
        handler.addFilter(TopLevelFormatter(['scrapy']))
    return handler
Example #4
Source File: shell.py From learn_python3_spider with MIT License | 6 votes |
def populate_vars(self, response=None, request=None, spider=None):
    import scrapy
    self.vars['scrapy'] = scrapy
    self.vars['crawler'] = self.crawler
    self.vars['item'] = self.item_class()
    self.vars['settings'] = self.crawler.settings
    self.vars['spider'] = spider
    self.vars['request'] = request
    self.vars['response'] = response
    self.vars['sel'] = _SelectorProxy(response)
    if self.inthread:
        self.vars['fetch'] = self.fetch
    self.vars['view'] = open_in_browser
    self.vars['shelp'] = self.print_help
    self.update_vars(self.vars)
    if not self.code:
        self.vars['banner'] = self.get_help()
Example #5
Source File: log.py From learn_python3_spider with MIT License | 6 votes |
def _get_handler(settings):
    """Return a log handler object according to settings"""
    filename = settings.get('LOG_FILE')
    if filename:
        encoding = settings.get('LOG_ENCODING')
        handler = logging.FileHandler(filename, encoding=encoding)
    elif settings.getbool('LOG_ENABLED'):
        handler = logging.StreamHandler()
    else:
        handler = logging.NullHandler()

    formatter = logging.Formatter(
        fmt=settings.get('LOG_FORMAT'),
        datefmt=settings.get('LOG_DATEFORMAT')
    )
    handler.setFormatter(formatter)
    handler.setLevel(settings.get('LOG_LEVEL'))
    if settings.getbool('LOG_SHORT_NAMES'):
        handler.addFilter(TopLevelFormatter(['scrapy']))
    return handler
Example #6
Source File: EuropythonSpyder.py From Learning-Python-Networking-Second-Edition with MIT License | 6 votes |
def main():
    """Main routine for the execution of the Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # setup crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder())

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the spider defined above
    print("ENGINE STOPPED")
Example #7
Source File: shell.py From learn_python3_spider with MIT License | 6 votes |
def populate_vars(self, response=None, request=None, spider=None):
    import scrapy
    self.vars['scrapy'] = scrapy
    self.vars['crawler'] = self.crawler
    self.vars['item'] = self.item_class()
    self.vars['settings'] = self.crawler.settings
    self.vars['spider'] = spider
    self.vars['request'] = request
    self.vars['response'] = response
    self.vars['sel'] = _SelectorProxy(response)
    if self.inthread:
        self.vars['fetch'] = self.fetch
    self.vars['view'] = open_in_browser
    self.vars['shelp'] = self.print_help
    self.update_vars(self.vars)
    if not self.code:
        self.vars['banner'] = self.get_help()
Example #8
Source File: shell.py From learn_python3_spider with MIT License | 5 votes |
def start(self, url=None, request=None, response=None, spider=None, redirect=True):
    # disable accidental Ctrl-C key press from shutting down the engine
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    if url:
        self.fetch(url, spider, redirect=redirect)
    elif request:
        self.fetch(request, spider)
    elif response:
        request = response.request
        self.populate_vars(response, request, spider)
    else:
        self.populate_vars()
    if self.code:
        print(eval(self.code, globals(), self.vars))
    else:
        """
        Detect interactive shell setting in scrapy.cfg
        e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg
        [settings]
        # shell can be one of ipython, bpython or python;
        # to be used as the interactive python console, if available.
        # (default is ipython, fallbacks in the order listed above)
        shell = python
        """
        cfg = get_config()
        section, option = 'settings', 'shell'
        env = os.environ.get('SCRAPY_PYTHON_SHELL')
        shells = []
        if env:
            shells += env.strip().lower().split(',')
        elif cfg.has_option(section, option):
            shells += [cfg.get(section, option).strip().lower()]
        else:  # try all by default
            shells += DEFAULT_PYTHON_SHELLS.keys()
        # always add standard shell as fallback
        shells += ['python']
        start_python_console(self.vars, shells=shells,
                             banner=self.vars.pop('banner', ''))
Example #9
Source File: spiders.py From autologin with Apache License 2.0 | 5 votes |
def _finish_init(self):
    self.using_splash = bool(self.settings.get('SPLASH_URL'))
    if self.using_splash:
        with open(os.path.join(
                os.path.dirname(__file__), 'directives', self.lua_source),
                'rb') as f:
            lua_source = f.read().decode('utf-8')
        self.request = partial(
            splash_request, lua_source, extra_js=self.extra_js)
    else:
        if self.extra_js:
            raise ValueError(
                '"extra_js" not supported without "SPLASH_URL"')
        self.request = scrapy.Request
Example #10
Source File: crawler.py From ws-backend-community with GNU General Public License v3.0 | 5 votes |
def crawling_config(self):
    """
    Get a dictionary containing the spider and Scrapy settings to use to crawl an endpoint.
    :return: A dictionary containing the spider and Scrapy settings to use to crawl an endpoint.
    """
    return self._crawling_config

# Representation and Comparison
Example #11
Source File: crawler.py From ws-backend-community with GNU General Public License v3.0 | 5 votes |
def __crawl(self, spider_kwargs=None, settings=None):
    """
    Perform a crawl based on the contents of self._crawling_config.
    :param spider_kwargs: Keyword arguments to use to create a spider class.
    :param settings: Scrapy settings to use to crawl the remote endpoint.
    :return: None
    """
    print("SPIDER KWARGS ARE %s." % (spider_kwargs,))
    config.globals["%s-hostname" % (os.getpid(),)] = spider_kwargs["input_hostname"]
    spider = self.get_spider_class_for_domain(**spider_kwargs)
    process = CrawlerProcess(settings)
    process.crawl(spider)
    process.start()
Example #12
Source File: crawler.py From ws-backend-community with GNU General Public License v3.0 | 5 votes |
def get_scrapy_settings(self, item_pipeline=None, hostname=None):
    """
    Get a scrapy settings dictionary to use for crawling web applications.
    :param item_pipeline: The item pipeline configuration to configure in the settings.
    :param hostname: The hostname to request by default in all Scrapy requests.
    :return: A scrapy settings dictionary to use for crawling web applications.
    """
    item_pipeline = item_pipeline if item_pipeline is not None else self.__get_default_item_pipeline()
    return scrapy.settings.Settings(values={
        "CONCURRENT_ITEMS": self.concurrent_items,
        "CONCURRENT_REQUESTS": self.concurrent_requests,
        "DEFAULT_REQUEST_HEADERS": {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en",
            "Host": hostname,
        },
        "DEPTH_LIMIT": self.depth_limit,
        "DEPTH_PRIORITY": self.depth_priority,
        "DOWNLOADER_CLIENTCONTEXTFACTORY": "lib.inspection.web.crawling.WebSightClientContextFactory",
        "EXTENSIONS": {
            "scrapy.extensions.telnet.TelnetConsole": None,
        },
        "DOWNLOADER_MIDDLEWARES": {
            "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": None,
            "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": None,
        },
        "SPIDER_MIDDLEWARES": {
            "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": None,
        },
        "DOWNLOAD_MAXSIZE": self.max_size,
        "HTTPERROR_ALLOW_ALL": self.allow_all_errors,
        "ITEM_PIPELINES": item_pipeline,
        "LOG_LEVEL": config.log_crawling_level,
        "TELNETCONSOLE_ENABLED": self.enable_telnet,
        "USER_AGENT": self.user_agent,
    })
Example #13
Source File: log.py From learn_python3_spider with MIT License | 5 votes |
def configure_logging(settings=None, install_root_handler=True):
    """
    Initialize logging defaults for Scrapy.

    :param settings: settings used to create and configure a handler for the
        root logger (default: None).
    :type settings: dict, :class:`~scrapy.settings.Settings` object or ``None``

    :param install_root_handler: whether to install root logging handler
        (default: True)
    :type install_root_handler: bool

    This function does:

    - Route warnings and twisted logging through Python standard logging
    - Assign DEBUG and ERROR level to Scrapy and Twisted loggers respectively
    - Route stdout to log if LOG_STDOUT setting is True

    When ``install_root_handler`` is True (default), this function also
    creates a handler for the root logger according to given settings
    (see :ref:`topics-logging-settings`). You can override default options
    using ``settings`` argument. When ``settings`` is empty or None, defaults
    are used.
    """
    if not sys.warnoptions:
        # Route warnings through python logging
        logging.captureWarnings(True)

    observer = twisted_log.PythonLoggingObserver('twisted')
    observer.start()

    dictConfig(DEFAULT_LOGGING)

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    if settings.getbool('LOG_STDOUT'):
        sys.stdout = StreamLogger(logging.getLogger('stdout'))

    if install_root_handler:
        install_scrapy_root_handler(settings)
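A minimal sketch of calling configure_logging() above with a dict of overrides; as the docstring notes, a plain dict is accepted and converted to a Settings object internally, and the specific values here are illustrative assumptions:

from scrapy.utils.log import configure_logging

configure_logging({'LOG_FILE': 'crawl.log', 'LOG_LEVEL': 'INFO'})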
Example #14
Source File: log.py From learn_python3_spider with MIT License | 5 votes |
def log_scrapy_info(settings):
    logger.info("Scrapy %(version)s started (bot: %(bot)s)",
                {'version': scrapy.__version__, 'bot': settings['BOT_NAME']})
    logger.info("Versions: %(versions)s",
                {'versions': ", ".join("%s %s" % (name, version)
                    for name, version in scrapy_components_versions()
                    if name != "Scrapy")})
Example #15
Source File: log.py From learn_python3_spider with MIT License | 5 votes |
def install_scrapy_root_handler(settings):
    global _scrapy_root_handler

    if (_scrapy_root_handler is not None
            and _scrapy_root_handler in logging.root.handlers):
        logging.root.removeHandler(_scrapy_root_handler)
    logging.root.setLevel(logging.NOTSET)
    _scrapy_root_handler = _get_handler(settings)
    logging.root.addHandler(_scrapy_root_handler)
Example #16
Source File: log.py From learn_python3_spider with MIT License | 5 votes |
def configure_logging(settings=None, install_root_handler=True):
    """
    Initialize logging defaults for Scrapy.

    :param settings: settings used to create and configure a handler for the
        root logger (default: None).
    :type settings: dict, :class:`~scrapy.settings.Settings` object or ``None``

    :param install_root_handler: whether to install root logging handler
        (default: True)
    :type install_root_handler: bool

    This function does:

    - Route warnings and twisted logging through Python standard logging
    - Assign DEBUG and ERROR level to Scrapy and Twisted loggers respectively
    - Route stdout to log if LOG_STDOUT setting is True

    When ``install_root_handler`` is True (default), this function also
    creates a handler for the root logger according to given settings
    (see :ref:`topics-logging-settings`). You can override default options
    using ``settings`` argument. When ``settings`` is empty or None, defaults
    are used.
    """
    if not sys.warnoptions:
        # Route warnings through python logging
        logging.captureWarnings(True)

    observer = twisted_log.PythonLoggingObserver('twisted')
    observer.start()

    dictConfig(DEFAULT_LOGGING)

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    if settings.getbool('LOG_STDOUT'):
        sys.stdout = StreamLogger(logging.getLogger('stdout'))

    if install_root_handler:
        install_scrapy_root_handler(settings)
Example #17
Source File: shell.py From learn_python3_spider with MIT License | 5 votes |
def __init__(self, crawler, update_vars=None, code=None):
    self.crawler = crawler
    self.update_vars = update_vars or (lambda x: None)
    self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS'])
    self.spider = None
    self.inthread = not threadable.isInIOThread()
    self.code = code
    self.vars = {}
Example #18
Source File: log.py From learn_python3_spider with MIT License | 5 votes |
def log_scrapy_info(settings):
    logger.info("Scrapy %(version)s started (bot: %(bot)s)",
                {'version': scrapy.__version__, 'bot': settings['BOT_NAME']})
    logger.info("Versions: %(versions)s",
                {'versions': ", ".join("%s %s" % (name, version)
                    for name, version in scrapy_components_versions()
                    if name != "Scrapy")})
Example #19
Source File: log.py From learn_python3_spider with MIT License | 5 votes |
def install_scrapy_root_handler(settings):
    global _scrapy_root_handler

    if (_scrapy_root_handler is not None
            and _scrapy_root_handler in logging.root.handlers):
        logging.root.removeHandler(_scrapy_root_handler)
    logging.root.setLevel(logging.NOTSET)
    _scrapy_root_handler = _get_handler(settings)
    logging.root.addHandler(_scrapy_root_handler)
Example #20
Source File: crawler.py From ws-backend-community with GNU General Public License v3.0 | 4 votes |
def crawl_endpoint_to_file(
        self,
        ip_address=None,
        port=None,
        hostname=None,
        use_ssl=False,
        use_sni=False,
        start_urls=[],
        in_separate_process=True,
):
    """
    Start crawling the given endpoint using the given list of URLs and write the results
    to a local file.
    :param ip_address: The IP address to crawl.
    :param port: The port where the application resides.
    :param hostname: The hostname to submit alongside all requests to the remote endpoint.
    :param use_ssl: Whether or not to use SSL to connect to the remote web service.
    :param use_sni: Whether or not to use SNI to connect to the remote web service.
    :param start_urls: A list of URLs to start crawling from.
    :param in_separate_process: Whether or not to spawn off a separate process for the crawl.
        This enables us to call this method multiple times in the same process, as a Twisted
        reactor can only be started and stopped once per process.
    :return: A tuple containing (1) the string containing the local file path where crawling
        results are stored and (2) a ScrapyResultWrapper configured to process the contents
        of the file.
    """
    temp_file_path = FilesystemHelper.get_temporary_file_path()
    local_file_path = "%s-%s-%s:%s" % (temp_file_path, self.bot_name, ip_address, port)
    spider_kwargs = {
        "input_ip_address": ip_address,
        "input_start_urls": start_urls,
        "input_file_path": local_file_path,
        "input_hostname": hostname,
        "input_use_ssl": use_ssl,
        "input_use_sni": use_sni,
        "input_port": port,
    }
    pipeline_settings = self.__get_local_storage_item_pipeline()
    requested_hostname = hostname if hostname is not None else ip_address
    settings = self.get_scrapy_settings(item_pipeline=pipeline_settings, hostname=requested_hostname)
    crawling_config = {
        "spider_kwargs": spider_kwargs,
        "settings": settings,
    }
    if in_separate_process:
        process = Process(target=self.__crawl, kwargs=crawling_config)
        process.start()
        process.join()
        process.terminate()
    else:
        self.__crawl(**crawling_config)
    return local_file_path, ScrapyResultWrapper.from_file(local_file_path)