Python scrapy.crawler() Examples
The following are 15 code examples of the module scrapy.crawler. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy, or try the search function.
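Before the examples, here is a minimal, self-contained sketch of the most common use of the module: running a spider through scrapy.crawler.CrawlerProcess. The QuotesSpider class and the quotes.toscrape.com URL are illustrative placeholders, not taken from any of the projects below.

import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        # yield one item per quote block on the page
        for quote in response.css("div.quote"):
            yield {"text": quote.css("span.text::text").get()}

process = CrawlerProcess(settings={"LOG_ENABLED": False})
process.crawl(QuotesSpider)  # pass the spider class; the process builds the Crawler for it
process.start()              # blocks until the crawl finishes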
Example #1
Source File: spiders.py From autologin with Apache License 2.0 | 6 votes |
def start_requests(self):
    self._finish_init()
    settings = self.crawler.settings
    self.solver = None
    try:
        import decaptcha
    except ImportError:
        self.logger.warning('Decaptcha not installed')
    else:
        from decaptcha.solvers.deathbycaptcha import DeathbycaptchaSolver
        if (settings.get('DECAPTCHA_DEATHBYCAPTCHA_USERNAME') and
                settings.get('DECAPTCHA_DEATHBYCAPTCHA_PASSWORD')):
            self.solver = DeathbycaptchaSolver(self.crawler)
        else:
            self.logger.warning('DeathByCaptcha account not provided')
    self.retries_left = settings.getint('LOGIN_MAX_RETRIES')
    request_kwargs = {}
    if self.using_splash:
        request_kwargs['args'] = {'full_render': True}
    yield self.request(self.start_url, **request_kwargs)
Example #2
Source File: EuropythonSpyder.py From Learning-Python-Networking-Second-Edition with MIT License | 6 votes |
def main():
    """Main routine for the execution of the Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)

    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder())

    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by calling the defined spider
    print("ENGINE STOPPED")
Example #3
Source File: shell.py From learn_python3_spider with MIT License | 6 votes |
def populate_vars(self, response=None, request=None, spider=None):
    import scrapy
    self.vars['scrapy'] = scrapy
    self.vars['crawler'] = self.crawler
    self.vars['item'] = self.item_class()
    self.vars['settings'] = self.crawler.settings
    self.vars['spider'] = spider
    self.vars['request'] = request
    self.vars['response'] = response
    self.vars['sel'] = _SelectorProxy(response)
    if self.inthread:
        self.vars['fetch'] = self.fetch
    self.vars['view'] = open_in_browser
    self.vars['shelp'] = self.print_help
    self.update_vars(self.vars)
    if not self.code:
        self.vars['banner'] = self.get_help()
Example #4
Source File: shell.py From learn_python3_spider with MIT License | 5 votes |
def _schedule(self, request, spider):
    spider = self._open_spider(request, spider)
    d = _request_deferred(request)
    d.addCallback(lambda x: (x, spider))
    self.crawler.engine.crawl(request, spider)
    return d
Example #5
Source File: main.py From python-examples with MIT License | 5 votes |
def from_crawler(cls, crawler):
    print('from_crawler stats:', crawler.stats)
    return cls(crawler.stats)
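For context, a minimal sketch of the pattern the snippet above relies on: Scrapy calls from_crawler(crawler) when it builds a pipeline or extension, and that hook is where crawler-level objects such as the stats collector are handed to the constructor. The StatsPipeline class and its stat key are hypothetical, not from the original project.

class StatsPipeline:
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        # invoked by Scrapy with the running Crawler; pass its stats collector to __init__
        return cls(crawler.stats)

    def process_item(self, item, spider):
        self.stats.inc_value('statspipeline/items_seen')  # count items passing through
        return item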
Example #6
Source File: shell.py From learn_python3_spider with MIT License | 5 votes |
def inspect_response(response, spider):
    """Open a shell to inspect the given response"""
    Shell(spider.crawler).start(response=response, spider=spider)
Example #7
Source File: shell.py From learn_python3_spider with MIT License | 5 votes |
def _open_spider(self, request, spider):
    if self.spider:
        return self.spider

    if spider is None:
        spider = self.crawler.spider or self.crawler._create_spider()

    self.crawler.spider = spider
    self.crawler.engine.open_spider(spider, close_if_idle=False)
    self.spider = spider

    return spider
Example #8
Source File: online.py From scrapy-cluster with MIT License | 5 votes |
def tearDown(self):
    keys = self.redis_conn.keys('stats:crawler:*:test-spider:*')
    keys = keys + self.redis_conn.keys('test-spider:*')
    for key in keys:
        self.redis_conn.delete(key)

    # if for some reason the tests fail, we end up falling behind on
    # the consumer
    for m in self.consumer:
        pass
    self.consumer.close()
Example #9
Source File: shell.py From learn_python3_spider with MIT License | 5 votes |
def __init__(self, crawler, update_vars=None, code=None):
    self.crawler = crawler
    self.update_vars = update_vars or (lambda x: None)
    self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS'])
    self.spider = None
    self.inthread = not threadable.isInIOThread()
    self.code = code
    self.vars = {}
Example #10
Source File: py_my_scrapy_redis_server.py From vrequest with MIT License | 5 votes |
def from_crawler(cls, crawler):
    return cls.from_settings(crawler.settings)
Example #11
Source File: py_my_scrapy_redis_server.py From vrequest with MIT License | 5 votes |
def from_crawler(cls, crawler):
    s = cls()
    crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
    return s
Example #12
Source File: py_my_scrapy_redis_server.py From vrequest with MIT License | 5 votes |
def schedule_next_requests(self):
    for req in self.next_requests():
        self.crawler.engine.crawl(req, spider=self)

# The section below mainly handles start_url; that handling stays active permanently, until the program shuts down.
# Originally scrapy-redis used this to receive a start-url string, but it has been changed to receive a JSON
# payload carrying script data. This spot has been reworked into the place where the incoming parameters are
# initialized: the sender generates an id and passes it over here for processing. A simple JSON payload can be
# used to hold the script's code, which makes the script easy to transfer and instantiate.
Example #13
Source File: py_my_scrapy_redis_server.py From vrequest with MIT License | 5 votes |
def setup_redis(self, crawler=None):
    if self.server is not None:
        return

    if crawler is None:
        crawler = getattr(self, 'crawler', None)

    if crawler is None:
        raise ValueError("crawler is required")

    settings = crawler.settings

    if self.redis_key is None:
        self.redis_key = settings.get(
            'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
        )

    self.redis_key = self.redis_key % {'name': self.name}

    if not self.redis_key.strip():
        raise ValueError("redis_key must not be empty")

    if self.redis_batch_size is None:
        self.redis_batch_size = settings.getint(
            'REDIS_START_URLS_BATCH_SIZE',
            settings.getint('CONCURRENT_REQUESTS'),
        )

    try:
        self.redis_batch_size = int(self.redis_batch_size)
    except (TypeError, ValueError):
        raise ValueError("redis_batch_size must be an integer")

    if self.redis_encoding is None:
        self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

    self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                     "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s",
                     self.__dict__)

    self.server = connection.from_settings(crawler.settings)

    # In the processing that follows, tasks are no longer assigned only when the spider is idle; assignment runs
    # continuously (to support multiple tasks), so no task is left unstarted. spider_idle is therefore no longer
    # responsible for running schedule_next_requests and only raises the DontCloseSpider exception; instead, a new
    # schedule_next_requests polling loop is started to pick up tasks to launch, and a new _stop_clear polling loop
    # is started to detect tasks that should be stopped.
    crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    # Bring the log/key templates into this object; later functions need them.
    self._clear_debug_pc = crawler.settings.getbool('CLEAR_DEBUG_PC')
    self._clear_dupefilter = crawler.settings.getbool('CLEAR_DUPEFILTER')
    self._spider_id_debg_format = crawler.settings.get('DEBUG_PC_FORMAT')
    self._spider_id_task_format = crawler.settings.get('TASK_ID_FORMAT')
    self._spider_id_dupk_format = crawler.settings.get('SCHEDULER_DUPEFILTER_KEY')

    # When a task is opened here it is bound to two periodically executed, never-stopping functions:
    # 1) check for tasks that have already stopped and clean up their task space;
    # 2) pick up new start_url data, launch a new task script, initialize the task and manage its task space.
    self.limit_check = 0  # staggers the check timing of different tasks a little so they do not all pile into one _stop_clear iteration
    self.limit_same = 2   # number of consecutive identical log snapshots
    self.interval = 5     # how many seconds between runs of the stop-task check
    # (in theory the average time to detect a stopped task is roughly (limit_check+1) * (limit_same+1) * interval;
    # during testing this can be lowered a little to make framework issues easier to observe)
    self.interval_s = 2   # how many seconds between runs of the start-task check
    self.limit_log = 8    # extra setting: limits how many tasks the "check stopping" log prints, so hundreds of tasks are not printed every time
    crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
Example #14
Source File: py_my_scrapy_redis_server.py From vrequest with MIT License | 5 votes |
def __init__(self, crawler):
    self._spider_id_debg_format = crawler.settings.get('DEBUG_PC_FORMAT')
    self._spider_id_task_format = crawler.settings.get('TASK_ID_FORMAT')
    self._pc_mac = crawler.settings.get('PCMAC')
    self._dump = crawler.settings.getbool('STATS_DUMP')
    self._debug_pc = crawler.settings.getbool('DEBUG_PC')
    self._local_max = crawler.settings.get('DEPTH_MAX_FORMAT')
    self._stats = {}
    self.server = connection.from_settings(crawler.settings)
    self.encoding = self.server.connection_pool.connection_kwargs.get('encoding')
Example #15
Source File: py_my_scrapy_redis_server.py From vrequest with MIT License | 5 votes |
def from_crawler(cls, crawler):
    instance = cls.from_settings(crawler.settings)
    instance.stats = crawler.stats
    return instance