Python scrapy.settings Examples

The following are 15 code examples of the scrapy.settings module. Each snippet comes from an open-source project; the source file, project name, and license are listed above each example. You may also want to check out the other available functions and classes of the scrapy module.
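Before the project examples, here is a minimal, self-contained sketch of how a scrapy.settings.Settings object is created and queried; the setting names are standard Scrapy settings and the priority strings ('project', 'cmdline') are the ones Scrapy defines.

from scrapy.settings import Settings

# Settings() starts from Scrapy's built-in defaults; a dict seeds extra values.
settings = Settings({"BOT_NAME": "demo", "DOWNLOAD_DELAY": 0.5})

# Typed getters convert the stored values.
print(settings.get("BOT_NAME"))             # 'demo'
print(settings.getfloat("DOWNLOAD_DELAY"))  # 0.5
print(settings.getbool("LOG_ENABLED"))      # True (Scrapy default)

# set() and update() take a priority; the higher priority wins on conflict.
settings.set("LOG_LEVEL", "INFO", priority="project")
settings.update({"LOG_LEVEL": "DEBUG"}, priority="cmdline")
print(settings.get("LOG_LEVEL"))            # 'DEBUG' -- 'cmdline' outranks 'project'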
Example #1
Source File: spiders.py    From autologin with Apache License 2.0
def crawl_runner(extra_settings=None):
    settings = base_settings.copy()
    if extra_settings is not None:
        settings.update(extra_settings, priority='cmdline')
    if settings.get('SPLASH_URL'):
        settings['DUPEFILTER_CLASS'] = 'scrapy_splash.SplashAwareDupeFilter'
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression'
                '.HttpCompressionMiddleware': 810,
        })
    else:
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'autologin.middleware.ExposeCookiesMiddleware': 700,
        })
    return CrawlerRunner(settings) 
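For context, the CrawlerRunner returned above is normally driven from a Twisted reactor. A minimal usage sketch, assuming a hypothetical MySpider class (the spider and its keyword argument are placeholders, not part of the original project):

from twisted.internet import reactor

runner = crawl_runner(extra_settings={"SPLASH_URL": "http://127.0.0.1:8050"})
deferred = runner.crawl(MySpider, start_url="http://example.com")  # MySpider is hypothetical
deferred.addBoth(lambda _: reactor.stop())
reactor.run()  # blocks until the crawl finishes, then stops the reactor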
Example #2
Source File: spiders.py    From autologin with Apache License 2.0
def start_requests(self):
        self._finish_init()
        settings = self.crawler.settings
        self.solver = None
        try:
            import decaptcha
        except ImportError:
            self.logger.warning('Decaptcha not installed')
        else:
            from decaptcha.solvers.deathbycaptcha import DeathbycaptchaSolver
            if (settings.get('DECAPTCHA_DEATHBYCAPTCHA_USERNAME') and
                    settings.get('DECAPTCHA_DEATHBYCAPTCHA_PASSWORD')):
                self.solver = DeathbycaptchaSolver(self.crawler)
            else:
                self.logger.warning('DeathByCaptcha account not provided')
        self.retries_left = settings.getint('LOGIN_MAX_RETRIES')
        request_kwargs = {}
        if self.using_splash:
            request_kwargs['args'] = {'full_render': True}
        yield self.request(self.start_url, **request_kwargs) 
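The solver is only created when DeathByCaptcha credentials are present in the settings. With crawl_runner() from Example #1 they could be supplied as extra settings; the keys below are the ones read in the code above, while the values are placeholders:

extra = {
    "SPLASH_URL": "http://127.0.0.1:8050",            # enables the Splash code path
    "DECAPTCHA_DEATHBYCAPTCHA_USERNAME": "username",  # placeholder credential
    "DECAPTCHA_DEATHBYCAPTCHA_PASSWORD": "password",  # placeholder credential
    "LOGIN_MAX_RETRIES": 3,
}
runner = crawl_runner(extra_settings=extra)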
Example #3
Source File: log.py    From learn_python3_spider with MIT License
def _get_handler(settings):
    """ Return a log handler object according to settings """
    filename = settings.get('LOG_FILE')
    if filename:
        encoding = settings.get('LOG_ENCODING')
        handler = logging.FileHandler(filename, encoding=encoding)
    elif settings.getbool('LOG_ENABLED'):
        handler = logging.StreamHandler()
    else:
        handler = logging.NullHandler()

    formatter = logging.Formatter(
        fmt=settings.get('LOG_FORMAT'),
        datefmt=settings.get('LOG_DATEFORMAT')
    )
    handler.setFormatter(formatter)
    handler.setLevel(settings.get('LOG_LEVEL'))
    if settings.getbool('LOG_SHORT_NAMES'):
        handler.addFilter(TopLevelFormatter(['scrapy']))
    return handler 
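Which handler comes back depends only on LOG_FILE and LOG_ENABLED. The sketch below illustrates the three branches with Scrapy's Settings object; note that _get_handler() is a private helper in scrapy.utils.log, so this is illustrative rather than a supported API.

from scrapy.settings import Settings

file_settings = Settings()                   # built-in defaults supply LOG_FORMAT, LOG_LEVEL, ...
file_settings.set("LOG_FILE", "crawl.log")
print(type(_get_handler(file_settings)))     # logging.FileHandler

console_settings = Settings()                # LOG_ENABLED defaults to True
print(type(_get_handler(console_settings)))  # logging.StreamHandler

silent_settings = Settings()
silent_settings.set("LOG_ENABLED", False)
print(type(_get_handler(silent_settings)))   # logging.NullHandler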
Example #4
Source File: shell.py    From learn_python3_spider with MIT License
def populate_vars(self, response=None, request=None, spider=None):
        import scrapy

        self.vars['scrapy'] = scrapy
        self.vars['crawler'] = self.crawler
        self.vars['item'] = self.item_class()
        self.vars['settings'] = self.crawler.settings
        self.vars['spider'] = spider
        self.vars['request'] = request
        self.vars['response'] = response
        self.vars['sel'] = _SelectorProxy(response)
        if self.inthread:
            self.vars['fetch'] = self.fetch
        self.vars['view'] = open_in_browser
        self.vars['shelp'] = self.print_help
        self.update_vars(self.vars)
        if not self.code:
            self.vars['banner'] = self.get_help() 
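These are exactly the names that the scrapy shell command exposes. A typical interactive session might look like this (the URL and output are illustrative):

$ scrapy shell "http://example.com"
>>> settings is crawler.settings
True
>>> response.css("title::text").get()
'Example Domain'
>>> fetch("http://example.com/")   # re-populates request and response
>>> view(response)                 # opens the fetched page in a web browser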
Example #5
Source File: EuropythonSpyder.py    From Learning-Python-Networking-Second-Edition with MIT License
def main():
    """Main routine for the execution of the Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # set up the crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder())

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the spider defined above
    print("ENGINE STOPPED") 
Example #6
Source File: shell.py    From learn_python3_spider with MIT License
def start(self, url=None, request=None, response=None, spider=None, redirect=True):
        # disable accidental Ctrl-C key press from shutting down the engine
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        if url:
            self.fetch(url, spider, redirect=redirect)
        elif request:
            self.fetch(request, spider)
        elif response:
            request = response.request
            self.populate_vars(response, request, spider)
        else:
            self.populate_vars()
        if self.code:
            print(eval(self.code, globals(), self.vars))
        else:
            """
            Detect interactive shell setting in scrapy.cfg
            e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg
            [settings]
            # shell can be one of ipython, bpython or python;
            # to be used as the interactive python console, if available.
            # (default is ipython, fallbacks in the order listed above)
            shell = python
            """
            cfg = get_config()
            section, option = 'settings', 'shell'
            env = os.environ.get('SCRAPY_PYTHON_SHELL')
            shells = []
            if env:
                shells += env.strip().lower().split(',')
            elif cfg.has_option(section, option):
                shells += [cfg.get(section, option).strip().lower()]
            else:  # try all by default
                shells += DEFAULT_PYTHON_SHELLS.keys()
            # always add standard shell as fallback
            shells += ['python']
            start_python_console(self.vars, shells=shells,
                                 banner=self.vars.pop('banner', '')) 
Example #7
Source File: spiders.py    From autologin with Apache License 2.0
def _finish_init(self):
        self.using_splash = bool(self.settings.get('SPLASH_URL'))
        if self.using_splash:
            with open(os.path.join(
                    os.path.dirname(__file__), 'directives', self.lua_source),
                    'rb') as f:
                lua_source = f.read().decode('utf-8')
            self.request = partial(
                splash_request, lua_source,
                extra_js=self.extra_js)
        else:
            if self.extra_js:
                raise ValueError(
                    '"extra_js" not supported without "SPLASH_URL"')
            self.request = scrapy.Request 
Example #8
Source File: crawler.py    From ws-backend-community with GNU General Public License v3.0
def crawling_config(self):
        """
        Get a dictionary containing the spider and Scrapy settings to use to crawl an endpoint.
        :return: A dictionary containing the spider and Scrapy settings to use to crawl an endpoint.
        """
        return self._crawling_config

    # Representation and Comparison 
Example #9
Source File: crawler.py    From ws-backend-community with GNU General Public License v3.0
def __crawl(self, spider_kwargs=None, settings=None):
        """
        Perform a crawl based on the contents of self._crawling_config.
        :param spider_kwargs: Keyword arguments to use to create a spider class.
        :param settings: Scrapy settings to use to crawl the remote endpoint.
        :return: None
        """
        print("SPIDER KWARGS ARE %s." % (spider_kwargs,))
        config.globals["%s-hostname" % (os.getpid(),)] = spider_kwargs["input_hostname"]
        spider = self.get_spider_class_for_domain(**spider_kwargs)
        process = CrawlerProcess(settings)
        process.crawl(spider)
        process.start() 
Example #10
Source File: crawler.py    From ws-backend-community with GNU General Public License v3.0
def get_scrapy_settings(self, item_pipeline=None, hostname=None):
        """
        Get a scrapy settings dictionary to use for crawling web applications.
        :param item_pipeline: The item pipeline configuration to configure in the settings.
        :param hostname: The hostname to request by default in all Scrapy requests.
        :return: A scrapy settings dictionary to use for crawling web applications.
        """
        item_pipeline = item_pipeline if item_pipeline is not None else self.__get_default_item_pipeline()
        return scrapy.settings.Settings(values={
            "CONCURRENT_ITEMS": self.concurrent_items,
            "CONCURRENT_REQUESTS": self.concurrent_requests,
            "DEFAULT_REQUEST_HEADERS": {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en",
                "Host": hostname,
            },
            "DEPTH_LIMIT": self.depth_limit,
            "DEPTH_PRIORITY": self.depth_priority,
            "DOWNLOADER_CLIENTCONTEXTFACTORY": "lib.inspection.web.crawling.WebSightClientContextFactory",
            "EXTENSIONS": {
                "scrapy.extensions.telnet.TelnetConsole": None,
            },
            "DOWNLOADER_MIDDLEWARES": {
                "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": None,
                "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": None,
            },
            "SPIDER_MIDDLEWARES": {
                "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": None,
            },
            "DOWNLOAD_MAXSIZE": self.max_size,
            "HTTPERROR_ALLOW_ALL": self.allow_all_errors,
            "ITEM_PIPELINES": item_pipeline,
            "LOG_LEVEL": config.log_crawling_level,
            "TELNETCONSOLE_ENABLED": self.enable_telnet,
            "USER_AGENT": self.user_agent,
        }) 
Example #11
Source File: log.py    From learn_python3_spider with MIT License
def configure_logging(settings=None, install_root_handler=True):
    """
    Initialize logging defaults for Scrapy.

    :param settings: settings used to create and configure a handler for the
        root logger (default: None).
    :type settings: dict, :class:`~scrapy.settings.Settings` object or ``None``

    :param install_root_handler: whether to install root logging handler
        (default: True)
    :type install_root_handler: bool

    This function does:

    - Route warnings and twisted logging through Python standard logging
    - Assign DEBUG and ERROR level to Scrapy and Twisted loggers respectively
    - Route stdout to log if LOG_STDOUT setting is True

    When ``install_root_handler`` is True (default), this function also
    creates a handler for the root logger according to given settings
    (see :ref:`topics-logging-settings`). You can override default options
    using ``settings`` argument. When ``settings`` is empty or None, defaults
    are used.
    """
    if not sys.warnoptions:
        # Route warnings through python logging
        logging.captureWarnings(True)

    observer = twisted_log.PythonLoggingObserver('twisted')
    observer.start()

    dictConfig(DEFAULT_LOGGING)

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    if settings.getbool('LOG_STDOUT'):
        sys.stdout = StreamLogger(logging.getLogger('stdout'))

    if install_root_handler:
        install_scrapy_root_handler(settings) 
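configure_logging() is part of Scrapy's public logging API (scrapy.utils.log) and is what you call when running spiders from a script rather than through the scrapy command, since CrawlerRunner does not configure logging by itself. A minimal sketch:

from scrapy.utils.log import configure_logging
from scrapy.crawler import CrawlerRunner

# A plain dict is accepted; it is wrapped in a Settings object as shown above.
configure_logging({"LOG_LEVEL": "INFO", "LOG_FILE": "crawl.log"})
runner = CrawlerRunner()  # configure logging yourself before using CrawlerRunner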
Example #12
Source File: log.py    From learn_python3_spider with MIT License
def log_scrapy_info(settings):
    logger.info("Scrapy %(version)s started (bot: %(bot)s)",
                {'version': scrapy.__version__, 'bot': settings['BOT_NAME']})
    logger.info("Versions: %(versions)s",
                {'versions': ", ".join("%s %s" % (name, version)
                    for name, version in scrapy_components_versions()
                    if name != "Scrapy")}) 
Example #13
Source File: log.py    From learn_python3_spider with MIT License
def install_scrapy_root_handler(settings):
    global _scrapy_root_handler

    if (_scrapy_root_handler is not None
            and _scrapy_root_handler in logging.root.handlers):
        logging.root.removeHandler(_scrapy_root_handler)
    logging.root.setLevel(logging.NOTSET)
    _scrapy_root_handler = _get_handler(settings)
    logging.root.addHandler(_scrapy_root_handler) 
Example #14
Source File: shell.py    From learn_python3_spider with MIT License
def __init__(self, crawler, update_vars=None, code=None):
        self.crawler = crawler
        self.update_vars = update_vars or (lambda x: None)
        self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS'])
        self.spider = None
        self.inthread = not threadable.isInIOThread()
        self.code = code
        self.vars = {} 
Example #15
Source File: crawler.py    From ws-backend-community with GNU General Public License v3.0
def crawl_endpoint_to_file(
            self,
            ip_address=None,
            port=None,
            hostname=None,
            use_ssl=False,
            use_sni=False,
            start_urls=[],
            in_separate_process=True,
    ):
        """
        Start crawling the given endpoint using the given list of URLs and write the results to
        a local file.
        :param ip_address: The IP address to crawl.
        :param port: The port where the application resides.
        :param hostname: The hostname to submit alongside all requests to the remote endpoint.
        :param use_ssl: Whether or not to use SSL to connect to the remote web service.
        :param use_sni: Whether or not to use SNI to connect to the remote web service.
        :param start_urls: A list of URLs to start crawling from.
        :param in_separate_process: Whether or not to spawn off a separate process for the crawl. This
        enables us to call this method multiple times in the same process, as a Twisted reactor can only
        be started and stopped once per process.
        :return: A tuple containing (1) the string containing the local file path where crawling
        results are stored and (2) a ScrapyResultWrapper configured to process the contents of the file.
        """
        temp_file_path = FilesystemHelper.get_temporary_file_path()
        local_file_path = "%s-%s-%s:%s" % (temp_file_path, self.bot_name, ip_address, port)
        spider_kwargs = {
            "input_ip_address": ip_address,
            "input_start_urls": start_urls,
            "input_file_path": local_file_path,
            "input_hostname": hostname,
            "input_use_ssl": use_ssl,
            "input_use_sni": use_sni,
            "input_port": port,
        }
        pipeline_settings = self.__get_local_storage_item_pipeline()
        requested_hostname = hostname if hostname is not None else ip_address
        settings = self.get_scrapy_settings(item_pipeline=pipeline_settings, hostname=requested_hostname)
        crawling_config = {
            "spider_kwargs": spider_kwargs,
            "settings": settings,
        }
        if in_separate_process:
            process = Process(target=self.__crawl, kwargs=crawling_config)
            process.start()
            process.join()
            process.terminate()
        else:
            self.__crawl(**crawling_config)
        return local_file_path, ScrapyResultWrapper.from_file(local_file_path)