Python scrapy.Spider() Examples
The following are 30 code examples of scrapy.Spider().
The original project and source file are noted above each example. You may also want to check out the other available functions and classes of the scrapy module.
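Before diving into the examples, here is a minimal sketch of how scrapy.Spider is typically subclassed; the spider name, URL, and selectors are illustrative and not taken from any of the projects below.

import scrapy


class QuotesSpider(scrapy.Spider):
    # unique name used by `scrapy crawl quotes`
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        # default callback: extract items and follow pagination
        for quote in response.css("div.quote"):
            yield {"text": quote.css("span.text::text").get()}
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)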
Example #1
Source File: assessment_spider.py From assessor-scraper with MIT License

def parse(self, response):
    """
    Default callback function with response for the crawled url
    https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.parse
    """
    response = response.replace(
        body=re.sub(r"<br\s*[\/]?>", "\n", response.body.decode('utf-8')))
    property_key = response.url.split('=')[1].replace('&', '')
    # logging.debug("Parsing property_key: %s", property_key)
    if 'No Data at this time' in response.text:
        msg = "No data for " + response.url
        logging.warning(msg)
        raise DropItem(msg)
    else:
        property_info = self.parse_property_info(response)
        property_values = self.parse_property_values(response)
        property_sales = self.parse_property_sales(response)
        property_info['sales'] = property_sales
        property_info['values'] = property_values
        property_info['property_key'] = property_key
        yield Property(property_info)
Example #2
Source File: test_pagestorage.py From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License

def test_save_response_with_trim(self):
    self.instance._writer.maxitemsize = 26
    self.instance.hsref.job.key = '123/45/67'
    resp = TextResponse(
        'http://resp', request=Request('http://req'), encoding='cp1251',
        body='\r\n\r\n<html><body></body></html>\r\n \0\0\0\0\0')
    with mock.patch.object(Spider, 'logger') as log:
        spider = Spider('default')
        self.instance.save_response(resp, self.spider)
        log.warning.assert_called_with(
            "Page not saved, body too large: <http://resp>")
        self.instance.trim_html = True
        self.instance.save_response(resp, spider)
        self.instance._writer.write.assert_called_with(
            {u'body': u'<html><body></body></html>',
             u'_encoding': u'cp1251',
             u'_type': u'_pageitem',
             u'_key': u'9b4bed7e56103ddf63455ed39145f61f53b3c702',
             u'url': u'http://resp',
             '_jobid': '123/45/67'})
Example #3
Source File: test_localstoragestats.py From spidermon with BSD 3-Clause "New" or "Revised" License

def test_spider_has_two_last_stats_history_when_opened_third_time(
    test_settings, stats_temporary_location
):
    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    crawler.stats.set_value("first_execution", "value")
    crawler.stop()

    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    crawler.stats.set_value("second_execution", "value")
    crawler.stop()

    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")

    assert len(crawler.spider.stats_history) == 2
    assert "second_execution" in crawler.spider.stats_history[0].keys()
    assert "first_execution" in crawler.spider.stats_history[1].keys()
    crawler.stop()
Example #4
Source File: test_localstoragestats.py From spidermon with BSD 3-Clause "New" or "Revised" License

def test_spider_limit_number_of_stored_stats(test_settings, stats_temporary_location):
    test_settings["SPIDERMON_MAX_STORED_STATS"] = 2

    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    crawler.stats.set_value("first_execution", "value")
    crawler.stop()

    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    crawler.stats.set_value("second_execution", "value")
    crawler.stop()

    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    crawler.stats.set_value("third_execution", "value")
    crawler.stop()

    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")

    assert len(crawler.spider.stats_history) == 2
    assert "third_execution" in crawler.spider.stats_history[0].keys()
    assert "second_execution" in crawler.spider.stats_history[1].keys()
    crawler.stop()
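Both tests above exercise the stats_history deque that Spidermon's local-storage stats collector attaches to the spider. The settings involved look roughly like the sketch below; the collector's import path may differ between Spidermon versions, so verify it against the version you use.

# settings.py (sketch; confirm the import path for your Spidermon version)
STATS_CLASS = (
    "spidermon.contrib.stats.statscollectors.local_storage"
    ".LocalStorageStatsHistoryCollector"
)
SPIDERMON_MAX_STORED_STATS = 2  # keep only the two most recent runs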
Example #5
Source File: crawler.py From learn_python3_spider with MIT License

def create_crawler(self, crawler_or_spidercls):
    """
    Return a :class:`~scrapy.crawler.Crawler` object.

    * If ``crawler_or_spidercls`` is a Crawler, it is returned as-is.
    * If ``crawler_or_spidercls`` is a Spider subclass, a new Crawler
      is constructed for it.
    * If ``crawler_or_spidercls`` is a string, this function finds
      a spider with this name in a Scrapy project (using spider loader),
      then creates a Crawler instance for it.
    """
    if isinstance(crawler_or_spidercls, Spider):
        raise ValueError(
            'The crawler_or_spidercls argument cannot be a spider object, '
            'it must be a spider class (or a Crawler object)')
    if isinstance(crawler_or_spidercls, Crawler):
        return crawler_or_spidercls
    return self._create_crawler(crawler_or_spidercls)
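As the docstring above spells out, create_crawler() accepts a Crawler, a Spider subclass, or a spider name, but never a Spider instance. A minimal sketch of the calling side, using the public CrawlerProcess API (the spider and URL are illustrative):

from scrapy.crawler import CrawlerProcess
from scrapy.spiders import Spider


class DemoSpider(Spider):
    name = "demo"
    start_urls = ["https://example.com"]

    def parse(self, response):
        self.logger.info("visited %s", response.url)


process = CrawlerProcess(settings={"LOG_LEVEL": "WARNING"})
process.crawl(DemoSpider)      # a Spider subclass is accepted
# process.crawl(DemoSpider())  # a Spider *instance* raises the ValueError above
process.start()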
Example #6
Source File: middleware.py From scrapy-poet with BSD 3-Clause "New" or "Revised" License

def process_request(self, request: Request, spider: Spider):
    """This method checks if the request is really needed and if its
    download could be skipped by trying to infer if a ``Response``
    is going to be used by the callback or a Page Input.

    If the ``Response`` can be ignored, a ``utils.DummyResponse`` object is
    returned on its place. This ``DummyResponse`` is linked to the original
    ``Request`` instance.

    With this behavior, we're able to optimize spider executions avoiding
    unnecessary downloads. That could be the case when the callback is
    actually using another source like external APIs such as Scrapinghub's
    Auto Extract.
    """
    if utils.is_response_going_to_be_used(request, spider):
        return

    spider.logger.debug(f'Skipping download of {request}')
    return utils.DummyResponse(url=request.url, request=request)
Example #7
Source File: sina_category_spider.py From fooltrader with MIT License

def spider_closed(self, spider, reason):
    if self.sh_df[self.category_type].any():
        self.sh_df.to_csv(get_security_list_path('stock', 'sh'), index=False)
    if self.sz_df[self.category_type].any():
        self.sz_df.to_csv(get_security_list_path('stock', 'sz'), index=False)
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
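The spider_closed handlers in this and the next several examples only run if they are connected to Scrapy's spider_closed signal; how fooltrader wires that up is not shown in these snippets. A common pattern, sketched here rather than taken from the fooltrader source, is to connect the handler in from_crawler:

from scrapy import Spider, signals


class MySpider(Spider):
    name = "my_spider"

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # call spider_closed() when the crawl finishes
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider, reason):
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)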
Example #8
Source File: future_shfe_spider.py From fooltrader with MIT License

def spider_closed(self, spider, reason):
    if self.trading_dates:
        parse_shfe_day_data()
    else:
        parse_shfe_data()
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
Example #9
Source File: stock_summary_spider.py From fooltrader with MIT License

def spider_closed(self, spider, reason):
    self.current_df = self.current_df.loc[:, KDATA_INDEX_COL]
    print(self.current_df)
    self.current_df.to_csv(get_kdata_path(item=self.security_item), index=False)
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
Example #10
Source File: stock_finance_report_event_spider.py From fooltrader with MIT License

def spider_closed(self, spider, reason):
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
Example #11
Source File: china_stock_list_spider.py From fooltrader with MIT License

def spider_closed(self, spider, reason):
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
Example #12
Source File: stock_forecast_spider.py From fooltrader with MIT License

def spider_closed(self, spider, reason):
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
Example #13
Source File: stock_tick_spider.py From fooltrader with MIT License

def spider_closed(self, spider, reason):
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
Example #14
Source File: stock_kdata_163_spider.py From fooltrader with MIT License

def spider_closed(self, spider, reason):
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
Example #15
Source File: stock_trading_date_spider.py From fooltrader with MIT License

def spider_closed(self, spider, reason):
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
Example #16
Source File: america_list_spider.py From fooltrader with MIT License

def spider_closed(self, spider, reason):
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
Example #17
Source File: america_stock_kdata_spider_163.py From fooltrader with MIT License

def spider_closed(self, spider, reason):
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
Example #18
Source File: sp500_spider.py From fooltrader with MIT License

def spider_closed(self, spider, reason):
    self.df_pe['close'] = self.df_close['close']
    self.df_pe['code'] = self.security_item['code']
    self.df_pe['securityId'] = self.security_item['id']
    self.df_pe['name'] = self.security_item['name']
    self.df_pe.to_csv(get_kdata_path(self.security_item), index=False)
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
Example #19
Source File: america_stock_finance_spider.py From fooltrader with MIT License

def spider_closed(self, spider, reason):
    spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
Example #20
Source File: test_middleware.py From MaybeDont with MIT License

def test_skip():
    mw = AvoidDupContentMiddleware(
        initial_queue_limit=300, threshold=0.98, exploration=0.05)
    spider = Spider()
    mw.process_request(Request('http://example.com'), spider)
    assert len(mw.initial_queue) == 0

    req = Request('http://example.com', meta={'avoid_dup_content': True})
    mw.process_request(req, spider)
    mw.process_response(
        req, HtmlResponse(req.url, body=b'a', request=req), spider)
    assert len(mw.initial_queue) == 1
Example #21
Source File: bnb.py From airbnb-scraper with GNU General Public License v3.0

def start_requests(self):
    """Spider entry point. Generate the first search request(s)."""
    self.logger.info(f"starting survey for: {self._place}")

    # get params from injected constructor values
    params = {}
    if self._price_max:
        params['price_max'] = self._price_max
    if self._price_min:
        params['price_min'] = self._price_min
    if self._ne_lat:
        params['ne_lat'] = self._ne_lat
    if self._ne_lng:
        params['ne_lng'] = self._ne_lng
    if self._sw_lat:
        params['sw_lat'] = self._sw_lat
    if self._sw_lng:
        params['sw_lng'] = self._sw_lng

    if not self._checkin:  # assume not self._checkout also
        yield self._api_request(params, callback=self.parse_landing_page)

    checkin_range_spec, checkout_range_spec = self._process_checkin_vars()

    # perform request(s)
    yield from self._perform_checkin_start_requests(
        checkin_range_spec, checkout_range_spec, params)
Example #22
Source File: test_pagestorage.py From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License

def setUp(self):
    self.spider = Spider('default')
    self.mocked_hsref = mock.Mock()
    self.patch = mock.patch('sh_scrapy.hsref.hsref', self.mocked_hsref)
    self.crawler_mock = mock.Mock()
    self.crawler_mock.settings = Settings(
        {'PAGE_STORAGE_ENABLED': True,
         'PAGE_STORAGE_MODE': 'VERSIONED_CACHE',
         'PAGE_STORAGE_LIMIT': 10,
         'PAGE_STORAGE_ON_ERROR_LIMIT': 5})
    self.mocked_hsref.project.collections.url = '/test/url'
    self.patch.start()
    self.instance = PageStorageMiddleware.from_crawler(self.crawler_mock)
Example #23
Source File: mixins.py From OpenScraper with MIT License

def spider_closed(self, spider):
    """Send some status to logging"""
    self.logger.info("**** Spider closed: {} ****".format(spider.name))
    self.logger.info("--- {} Items retrieved (main search page count)".format(self.item_count))
    self.logger.info("--- {} Items retrieved (detailed page count)".format(self.item_count_depth_1))
    self.logger.info("--- {} Pages crawled".format(self.page_count))
    self.print_error()
    self.logger.info("***End scraping***\n")
Example #24
Source File: aiqiyi_spider.py From video_url_crawler_demo with GNU General Public License v3.0

def __init__(self):
    scrapy.spiders.Spider.__init__(self)
    self.global_settings = get_project_settings()
    if self.global_settings['PLATFORM'] in ['win', 'mac']:
        self.driver = webdriver.PhantomJS(
            executable_path=self.global_settings['PHANTOMJS_PATH'])
    elif self.global_settings['PLATFORM'] in ['linux']:
        self.driver = webdriver.PhantomJS()
    self.driver.set_page_load_timeout(30)
    self.driver.implicitly_wait(10)
    self.type_id_list = self.global_settings['CRAWLER']['type_id_list']
    self.re_type_id = re.compile(self.global_settings['CRAWLER']['re_type_id'])
    self.url_template = self.global_settings['CRAWLER']['url_template']
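Note that PhantomJS support has been removed from recent Selenium releases. If you adapt this example today, a headless Chrome driver is the usual substitute; the snippet below is a sketch and not part of the original project.

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
driver.set_page_load_timeout(30)
driver.implicitly_wait(10)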
Example #25
Source File: aiqiyi_spider.py From video_url_crawler_demo with GNU General Public License v3.0

def __del__(self):
    self.driver.quit()
    scrapy.spiders.Spider.__del__(self)
Example #26
Source File: conftest.py From spidermon with BSD 3-Clause "New" or "Revised" License

def get_crawler():
    def _crawler(extended_settings={}):
        settings = {
            "SPIDERMON_ENABLED": True,
            "EXTENSIONS": {"spidermon.contrib.scrapy.extensions.Spidermon": 500},
        }
        settings.update(extended_settings)
        crawler = Crawler(Spider, settings=settings)
        crawler.spider = Spider("dummy")
        return crawler

    return _crawler
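A hypothetical way this fixture could be used in a test (illustrative only; the test name and settings below are not taken from the spidermon test suite):

def test_spidermon_extension_enabled(get_crawler):
    crawler = get_crawler({"SPIDERMON_MAX_STORED_STATS": 5})
    assert crawler.settings.getbool("SPIDERMON_ENABLED")
    assert crawler.settings.getint("SPIDERMON_MAX_STORED_STATS") == 5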
Example #27
Source File: test_monitors.py From spidermon with BSD 3-Clause "New" or "Revised" License

def make_data(request):
    def _make_data(settings=None):
        crawler = Crawler(Spider, settings=settings)
        spider = Spider("dummy")
        return {
            "stats": crawler.stats.get_stats(),
            "crawler": crawler,
            "spider": spider,
            "runner": SpiderMonitorRunner(spider=spider),
            "job": None,
        }

    return _make_data
Example #28
Source File: test_extensions.py From spidermon with BSD 3-Clause "New" or "Revised" License

def run_test(self, **kwargs):
    dt = TestData(**kwargs)
    settings = {
        "SPIDERMON_ENABLED": True,
        "SPIDERMON_SPIDER_OPEN_EXPRESSION_MONITORS": [
            {"tests": [{"expression": dt.expression}]}
        ],
    }
    settings.update(dt.settings)
    crawler = get_crawler(settings_dict=settings)
    crawler.stats.get_stats = lambda _: dt.stats
    spidermon = Spidermon.from_crawler(crawler)
    spider = Spider(name=self.spider_name)

    # mocking, to see test results via raising AssertionError exception
    # with failures and errors as results
    spidermon._run_suites = partial(_test_run_suites, spidermon)

    try:
        spidermon.spider_opened(spider)
    except AssertionError as e:
        failures, errors = e.args[0]
        for f in failures:
            _, trace = f
            raise AssertionError(trace)
        for e in errors:
            _, trace = e
            if dt.expected_error and dt.expected_error in trace:
                dt.expected_error = None
            else:
                raise AssertionError(trace)

    if dt.expected_error:
        raise AssertionError(
            "Expected error <{}> was not raised".format(dt.expected_error)
        )
Example #29
Source File: test_localstoragestats.py From spidermon with BSD 3-Clause "New" or "Revised" License

def test_spider_has_stats_history_attribute_when_opened_with_collector(
    test_settings, stats_temporary_location
):
    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    crawler.stats.set_value("garbage", "value")

    assert hasattr(crawler.spider, "stats_history")
    assert crawler.spider.stats_history == deque()
    crawler.stop()
Example #30
Source File: test_localstoragestats.py From spidermon with BSD 3-Clause "New" or "Revised" License

def test_spider_has_stats_history_queue_with_specified_max_size(
    test_settings, stats_temporary_location
):
    max_stored_stats = 2
    test_settings["SPIDERMON_MAX_STORED_STATS"] = max_stored_stats

    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")

    assert crawler.spider.stats_history == deque()
    assert crawler.spider.stats_history.maxlen == max_stored_stats
    crawler.stop()