Python scrapy.settings() Examples
The following are 20 code examples of scrapy.settings(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy, or try the search function.
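Most of the examples below revolve around the scrapy.settings.Settings class. As a quick orientation, here is a minimal sketch of the parts of that API the examples rely on; the setting names are standard Scrapy settings, while the concrete values are illustrative assumptions:

from scrapy.settings import Settings

# Create a Settings object, optionally seeded with a dict of values.
settings = Settings(values={'BOT_NAME': 'examplebot'})  # 'examplebot' is an assumed value

# Read values with plain and type-aware getters.
bot_name = settings.get('BOT_NAME')
log_enabled = settings.getbool('LOG_ENABLED', True)
concurrency = settings.getint('CONCURRENT_REQUESTS', 16)

# Write values; the priority string decides which source wins on conflicts.
settings.set('DOWNLOAD_DELAY', 1.0, priority='cmdline')
settings.update({'LOG_LEVEL': 'INFO'}, priority='project')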
Example #1
Source File: spiders.py From autologin with Apache License 2.0 | 7 votes |
def crawl_runner(extra_settings=None):
    settings = base_settings.copy()
    if extra_settings is not None:
        settings.update(extra_settings, priority='cmdline')
    if settings.get('SPLASH_URL'):
        settings['DUPEFILTER_CLASS'] = 'scrapy_splash.SplashAwareDupeFilter'
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        })
    else:
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'autologin.middleware.ExposeCookiesMiddleware': 700,
        })
    return CrawlerRunner(settings)
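A hedged usage sketch for crawl_runner() above: MySpider is a hypothetical spider class, and the caller is assumed to manage the Twisted reactor, following the standard CrawlerRunner pattern:

from twisted.internet import reactor

runner = crawl_runner(extra_settings={'DOWNLOAD_DELAY': 0.5})
deferred = runner.crawl(MySpider)  # MySpider is a hypothetical spider class
deferred.addBoth(lambda _: reactor.stop())
reactor.run()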
Example #2
Source File: spiders.py From autologin with Apache License 2.0 | 6 votes |
def start_requests(self):
    self._finish_init()
    settings = self.crawler.settings
    self.solver = None
    try:
        import decaptcha
    except ImportError:
        self.logger.warning('Decaptcha not installed')
    else:
        from decaptcha.solvers.deathbycaptcha import DeathbycaptchaSolver
        if (settings.get('DECAPTCHA_DEATHBYCAPTCHA_USERNAME') and
                settings.get('DECAPTCHA_DEATHBYCAPTCHA_PASSWORD')):
            self.solver = DeathbycaptchaSolver(self.crawler)
        else:
            self.logger.warning('DeathByCaptcha account not provided')
    self.retries_left = settings.getint('LOGIN_MAX_RETRIES')
    request_kwargs = {}
    if self.using_splash:
        request_kwargs['args'] = {'full_render': True}
    yield self.request(self.start_url, **request_kwargs)
Example #3
Source File: log.py From learn_python3_spider with MIT License | 6 votes |
def _get_handler(settings):
    """Return a log handler object according to settings"""
    filename = settings.get('LOG_FILE')
    if filename:
        encoding = settings.get('LOG_ENCODING')
        handler = logging.FileHandler(filename, encoding=encoding)
    elif settings.getbool('LOG_ENABLED'):
        handler = logging.StreamHandler()
    else:
        handler = logging.NullHandler()

    formatter = logging.Formatter(
        fmt=settings.get('LOG_FORMAT'),
        datefmt=settings.get('LOG_DATEFORMAT')
    )
    handler.setFormatter(formatter)
    handler.setLevel(settings.get('LOG_LEVEL'))
    if settings.getbool('LOG_SHORT_NAMES'):
        handler.addFilter(TopLevelFormatter(['scrapy']))
    return handler
Example #4
Source File: shell.py From learn_python3_spider with MIT License | 6 votes |
def populate_vars(self, response=None, request=None, spider=None):
    import scrapy
    self.vars['scrapy'] = scrapy
    self.vars['crawler'] = self.crawler
    self.vars['item'] = self.item_class()
    self.vars['settings'] = self.crawler.settings
    self.vars['spider'] = spider
    self.vars['request'] = request
    self.vars['response'] = response
    self.vars['sel'] = _SelectorProxy(response)
    if self.inthread:
        self.vars['fetch'] = self.fetch
    self.vars['view'] = open_in_browser
    self.vars['shelp'] = self.print_help
    self.update_vars(self.vars)
    if not self.code:
        self.vars['banner'] = self.get_help()
Example #5
Source File: log.py From learn_python3_spider with MIT License | 6 votes |
def _get_handler(settings):
    """Return a log handler object according to settings"""
    filename = settings.get('LOG_FILE')
    if filename:
        encoding = settings.get('LOG_ENCODING')
        handler = logging.FileHandler(filename, encoding=encoding)
    elif settings.getbool('LOG_ENABLED'):
        handler = logging.StreamHandler()
    else:
        handler = logging.NullHandler()

    formatter = logging.Formatter(
        fmt=settings.get('LOG_FORMAT'),
        datefmt=settings.get('LOG_DATEFORMAT')
    )
    handler.setFormatter(formatter)
    handler.setLevel(settings.get('LOG_LEVEL'))
    if settings.getbool('LOG_SHORT_NAMES'):
        handler.addFilter(TopLevelFormatter(['scrapy']))
    return handler
Example #6
Source File: EuropythonSpyder.py From Learning-Python-Networking-Second-Edition with MIT License | 6 votes |
def main():
    """Main routine for the execution of the Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # setup crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder())

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the spider defined above
    print("ENGINE STOPPED")
Example #7
Source File: shell.py From learn_python3_spider with MIT License | 6 votes |
def populate_vars(self, response=None, request=None, spider=None):
    import scrapy
    self.vars['scrapy'] = scrapy
    self.vars['crawler'] = self.crawler
    self.vars['item'] = self.item_class()
    self.vars['settings'] = self.crawler.settings
    self.vars['spider'] = spider
    self.vars['request'] = request
    self.vars['response'] = response
    self.vars['sel'] = _SelectorProxy(response)
    if self.inthread:
        self.vars['fetch'] = self.fetch
    self.vars['view'] = open_in_browser
    self.vars['shelp'] = self.print_help
    self.update_vars(self.vars)
    if not self.code:
        self.vars['banner'] = self.get_help()
Example #8
Source File: shell.py From learn_python3_spider with MIT License | 5 votes |
def start(self, url=None, request=None, response=None, spider=None, redirect=True):
    # disable accidental Ctrl-C key press from shutting down the engine
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    if url:
        self.fetch(url, spider, redirect=redirect)
    elif request:
        self.fetch(request, spider)
    elif response:
        request = response.request
        self.populate_vars(response, request, spider)
    else:
        self.populate_vars()
    if self.code:
        print(eval(self.code, globals(), self.vars))
    else:
        """
        Detect interactive shell setting in scrapy.cfg
        e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg
        [settings]
        # shell can be one of ipython, bpython or python;
        # to be used as the interactive python console, if available.
        # (default is ipython, fallbacks in the order listed above)
        shell = python
        """
        cfg = get_config()
        section, option = 'settings', 'shell'
        env = os.environ.get('SCRAPY_PYTHON_SHELL')
        shells = []
        if env:
            shells += env.strip().lower().split(',')
        elif cfg.has_option(section, option):
            shells += [cfg.get(section, option).strip().lower()]
        else:  # try all by default
            shells += DEFAULT_PYTHON_SHELLS.keys()
        # always add standard shell as fallback
        shells += ['python']
        start_python_console(self.vars, shells=shells,
                             banner=self.vars.pop('banner', ''))
Example #9
Source File: spiders.py From autologin with Apache License 2.0 | 5 votes |
def _finish_init(self):
    self.using_splash = bool(self.settings.get('SPLASH_URL'))
    if self.using_splash:
        with open(os.path.join(
                os.path.dirname(__file__), 'directives', self.lua_source),
                'rb') as f:
            lua_source = f.read().decode('utf-8')
        self.request = partial(
            splash_request, lua_source, extra_js=self.extra_js)
    else:
        if self.extra_js:
            raise ValueError(
                '"extra_js" not supported without "SPLASH_URL"')
        self.request = scrapy.Request
Example #10
Source File: crawler.py From ws-backend-community with GNU General Public License v3.0 | 5 votes |
def crawling_config(self):
    """
    Get a dictionary containing the spider and Scrapy settings to use to crawl an endpoint.
    :return: A dictionary containing the spider and Scrapy settings to use to crawl an endpoint.
    """
    return self._crawling_config

# Representation and Comparison
Example #11
Source File: crawler.py From ws-backend-community with GNU General Public License v3.0 | 5 votes |
def __crawl(self, spider_kwargs=None, settings=None):
    """
    Perform a crawl based on the contents of self._crawling_config.
    :param spider_kwargs: Keyword arguments to use to create a spider class.
    :param settings: Scrapy settings to use to crawl the remote endpoint.
    :return: None
    """
    print("SPIDER KWARGS ARE %s." % (spider_kwargs,))
    config.globals["%s-hostname" % (os.getpid(),)] = spider_kwargs["input_hostname"]
    spider = self.get_spider_class_for_domain(**spider_kwargs)
    process = CrawlerProcess(settings)
    process.crawl(spider)
    process.start()
Example #12
Source File: crawler.py From ws-backend-community with GNU General Public License v3.0 | 5 votes |
def get_scrapy_settings(self, item_pipeline=None, hostname=None):
    """
    Get a scrapy settings dictionary to use for crawling web applications.
    :param item_pipeline: The item pipeline configuration to configure in the settings.
    :param hostname: The hostname to request by default in all Scrapy requests.
    :return: A scrapy settings dictionary to use for crawling web applications.
    """
    item_pipeline = item_pipeline if item_pipeline is not None else self.__get_default_item_pipeline()
    return scrapy.settings.Settings(values={
        "CONCURRENT_ITEMS": self.concurrent_items,
        "CONCURRENT_REQUESTS": self.concurrent_requests,
        "DEFAULT_REQUEST_HEADERS": {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en",
            "Host": hostname,
        },
        "DEPTH_LIMIT": self.depth_limit,
        "DEPTH_PRIORITY": self.depth_priority,
        "DOWNLOADER_CLIENTCONTEXTFACTORY": "lib.inspection.web.crawling.WebSightClientContextFactory",
        "EXTENSIONS": {
            "scrapy.extensions.telnet.TelnetConsole": None,
        },
        "DOWNLOADER_MIDDLEWARES": {
            "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": None,
            "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": None,
        },
        "SPIDER_MIDDLEWARES": {
            "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": None,
        },
        "DOWNLOAD_MAXSIZE": self.max_size,
        "HTTPERROR_ALLOW_ALL": self.allow_all_errors,
        "ITEM_PIPELINES": item_pipeline,
        "LOG_LEVEL": config.log_crawling_level,
        "TELNETCONSOLE_ENABLED": self.enable_telnet,
        "USER_AGENT": self.user_agent,
    })
Example #13
Source File: log.py From learn_python3_spider with MIT License | 5 votes |
def configure_logging(settings=None, install_root_handler=True):
    """
    Initialize logging defaults for Scrapy.

    :param settings: settings used to create and configure a handler for the
        root logger (default: None).
    :type settings: dict, :class:`~scrapy.settings.Settings` object or ``None``

    :param install_root_handler: whether to install root logging handler
        (default: True)
    :type install_root_handler: bool

    This function does:

    - Route warnings and twisted logging through Python standard logging
    - Assign DEBUG and ERROR level to Scrapy and Twisted loggers respectively
    - Route stdout to log if LOG_STDOUT setting is True

    When ``install_root_handler`` is True (default), this function also
    creates a handler for the root logger according to given settings
    (see :ref:`topics-logging-settings`). You can override default options
    using ``settings`` argument. When ``settings`` is empty or None, defaults
    are used.
    """
    if not sys.warnoptions:
        # Route warnings through python logging
        logging.captureWarnings(True)

    observer = twisted_log.PythonLoggingObserver('twisted')
    observer.start()

    dictConfig(DEFAULT_LOGGING)

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    if settings.getbool('LOG_STDOUT'):
        sys.stdout = StreamLogger(logging.getLogger('stdout'))

    if install_root_handler:
        install_scrapy_root_handler(settings)
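A minimal sketch of calling configure_logging() above with a dict of overrides; as the docstring notes, a plain dict is accepted and converted to a Settings object internally, and the specific values here are illustrative assumptions:

from scrapy.utils.log import configure_logging

configure_logging({'LOG_FILE': 'crawl.log', 'LOG_LEVEL': 'INFO'})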
Example #14
Source File: log.py From learn_python3_spider with MIT License | 5 votes |
def log_scrapy_info(settings):
    logger.info("Scrapy %(version)s started (bot: %(bot)s)",
                {'version': scrapy.__version__, 'bot': settings['BOT_NAME']})
    logger.info("Versions: %(versions)s",
                {'versions': ", ".join("%s %s" % (name, version)
                    for name, version in scrapy_components_versions()
                    if name != "Scrapy")})
Example #15
Source File: log.py From learn_python3_spider with MIT License | 5 votes |
def install_scrapy_root_handler(settings):
    global _scrapy_root_handler

    if (_scrapy_root_handler is not None
            and _scrapy_root_handler in logging.root.handlers):
        logging.root.removeHandler(_scrapy_root_handler)
    logging.root.setLevel(logging.NOTSET)
    _scrapy_root_handler = _get_handler(settings)
    logging.root.addHandler(_scrapy_root_handler)
Example #16
Source File: log.py From learn_python3_spider with MIT License | 5 votes |
def configure_logging(settings=None, install_root_handler=True):
    """
    Initialize logging defaults for Scrapy.

    :param settings: settings used to create and configure a handler for the
        root logger (default: None).
    :type settings: dict, :class:`~scrapy.settings.Settings` object or ``None``

    :param install_root_handler: whether to install root logging handler
        (default: True)
    :type install_root_handler: bool

    This function does:

    - Route warnings and twisted logging through Python standard logging
    - Assign DEBUG and ERROR level to Scrapy and Twisted loggers respectively
    - Route stdout to log if LOG_STDOUT setting is True

    When ``install_root_handler`` is True (default), this function also
    creates a handler for the root logger according to given settings
    (see :ref:`topics-logging-settings`). You can override default options
    using ``settings`` argument. When ``settings`` is empty or None, defaults
    are used.
    """
    if not sys.warnoptions:
        # Route warnings through python logging
        logging.captureWarnings(True)

    observer = twisted_log.PythonLoggingObserver('twisted')
    observer.start()

    dictConfig(DEFAULT_LOGGING)

    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)

    if settings.getbool('LOG_STDOUT'):
        sys.stdout = StreamLogger(logging.getLogger('stdout'))

    if install_root_handler:
        install_scrapy_root_handler(settings)
Example #17
Source File: shell.py From learn_python3_spider with MIT License | 5 votes |
def __init__(self, crawler, update_vars=None, code=None):
    self.crawler = crawler
    self.update_vars = update_vars or (lambda x: None)
    self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS'])
    self.spider = None
    self.inthread = not threadable.isInIOThread()
    self.code = code
    self.vars = {}
Example #18
Source File: log.py From learn_python3_spider with MIT License | 5 votes |
def log_scrapy_info(settings):
    logger.info("Scrapy %(version)s started (bot: %(bot)s)",
                {'version': scrapy.__version__, 'bot': settings['BOT_NAME']})
    logger.info("Versions: %(versions)s",
                {'versions': ", ".join("%s %s" % (name, version)
                    for name, version in scrapy_components_versions()
                    if name != "Scrapy")})
Example #19
Source File: log.py From learn_python3_spider with MIT License | 5 votes |
def install_scrapy_root_handler(settings):
    global _scrapy_root_handler

    if (_scrapy_root_handler is not None
            and _scrapy_root_handler in logging.root.handlers):
        logging.root.removeHandler(_scrapy_root_handler)
    logging.root.setLevel(logging.NOTSET)
    _scrapy_root_handler = _get_handler(settings)
    logging.root.addHandler(_scrapy_root_handler)
Example #20
Source File: crawler.py From ws-backend-community with GNU General Public License v3.0 | 4 votes |
def crawl_endpoint_to_file(
        self,
        ip_address=None,
        port=None,
        hostname=None,
        use_ssl=False,
        use_sni=False,
        start_urls=[],
        in_separate_process=True,
):
    """
    Start crawling the given endpoint using the given list of URLs and write the results
    to a local file.
    :param ip_address: The IP address to crawl.
    :param port: The port where the application resides.
    :param hostname: The hostname to submit alongside all requests to the remote endpoint.
    :param use_ssl: Whether or not to use SSL to connect to the remote web service.
    :param use_sni: Whether or not to use SNI to connect to the remote web service.
    :param start_urls: A list of URLs to start crawling from.
    :param in_separate_process: Whether or not to spawn off a separate process for the crawl.
        This enables us to call this method multiple times in the same process, as a Twisted
        reactor can only be started and stopped once per process.
    :return: A tuple containing (1) the string containing the local file path where crawling
        results are stored and (2) a ScrapyResultWrapper configured to process the contents
        of the file.
    """
    temp_file_path = FilesystemHelper.get_temporary_file_path()
    local_file_path = "%s-%s-%s:%s" % (temp_file_path, self.bot_name, ip_address, port)
    spider_kwargs = {
        "input_ip_address": ip_address,
        "input_start_urls": start_urls,
        "input_file_path": local_file_path,
        "input_hostname": hostname,
        "input_use_ssl": use_ssl,
        "input_use_sni": use_sni,
        "input_port": port,
    }
    pipeline_settings = self.__get_local_storage_item_pipeline()
    requested_hostname = hostname if hostname is not None else ip_address
    settings = self.get_scrapy_settings(item_pipeline=pipeline_settings, hostname=requested_hostname)
    crawling_config = {
        "spider_kwargs": spider_kwargs,
        "settings": settings,
    }
    if in_separate_process:
        process = Process(target=self.__crawl, kwargs=crawling_config)
        process.start()
        process.join()
        process.terminate()
    else:
        self.__crawl(**crawling_config)
    return local_file_path, ScrapyResultWrapper.from_file(local_file_path)