Python scrapy.Spider() Examples

The following are 30 code examples of scrapy.Spider(). Each example links to its original project and source file, listed above the code. You may also want to check out all available functions and classes of the scrapy module.
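Before the project examples, here is a minimal sketch of how scrapy.Spider is typically subclassed; the spider name, URL, and selectors are illustrative placeholders rather than code from any project below.

import scrapy


class QuotesSpider(scrapy.Spider):
    # Every spider needs a unique name; Scrapy uses it to locate and run the spider.
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        # Default callback: extract data from the response and yield items.
        for quote in response.css("div.quote"):
            yield {"text": quote.css("span.text::text").get()}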
Example #1
Source File: assessment_spider.py    From assessor-scraper with MIT License
def parse(self, response):
        """
        Default callback function with response for the crawled url
        https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.parse
        """
        response = response.replace(body=re.sub(r"<br\s*[\/]?>", "\n", response.body.decode('utf-8')))
        property_key = response.url.split('=')[1].replace('&', '')
        # logging.debug("Parsing property_key: %s", property_key)
        if 'No Data at this time' in response.text:
            msg = "No data for " + response.url
            logging.warning(msg)
            raise DropItem(msg)
        else:
            property_info = self.parse_property_info(response)
            property_values = self.parse_property_values(response)
            property_sales = self.parse_property_sales(response)
            property_info['sales'] = property_sales
            property_info['values'] = property_values
            property_info['property_key'] = property_key
            yield Property(property_info) 
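The snippet above relies on names defined elsewhere in assessor-scraper (re, logging, DropItem, and the Property item). A rough sketch of what those definitions might look like follows; the Property fields shown are assumptions for illustration, not the project's actual item.

import re
import logging

from scrapy import Item, Field
from scrapy.exceptions import DropItem


class Property(Item):
    # Hypothetical fields matching the keys set in parse() above;
    # the real item in assessor-scraper likely defines more.
    property_key = Field()
    sales = Field()
    values = Field()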
Example #2
Source File: test_pagestorage.py    From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License
def test_save_response_with_trim(self):
        self.instance._writer.maxitemsize = 26
        self.instance.hsref.job.key = '123/45/67'
        resp = TextResponse(
            'http://resp', request=Request('http://req'), encoding='cp1251',
            body='\r\n\r\n<html><body></body></html>\r\n \0\0\0\0\0')
        with mock.patch.object(Spider, 'logger') as log:
            spider = Spider('default')
            self.instance.save_response(resp, self.spider)
        log.warning.assert_called_with(
            "Page not saved, body too large: <http://resp>")
        self.instance.trim_html = True
        self.instance.save_response(resp, spider)
        self.instance._writer.write.assert_called_with(
            {u'body': u'<html><body></body></html>', u'_encoding': u'cp1251',
             u'_type': u'_pageitem',
             u'_key': u'9b4bed7e56103ddf63455ed39145f61f53b3c702',
             u'url': u'http://resp', '_jobid': '123/45/67'}) 
Example #3
Source File: test_localstoragestats.py    From spidermon with BSD 3-Clause "New" or "Revised" License
def test_spider_has_two_last_stats_history_when_opened_third_time(
    test_settings, stats_temporary_location
):
    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    crawler.stats.set_value("first_execution", "value")
    crawler.stop()

    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    crawler.stats.set_value("second_execution", "value")
    crawler.stop()

    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    assert len(crawler.spider.stats_history) == 2
    assert "second_execution" in crawler.spider.stats_history[0].keys()
    assert "first_execution" in crawler.spider.stats_history[1].keys()
    crawler.stop() 
Example #4
Source File: test_localstoragestats.py    From spidermon with BSD 3-Clause "New" or "Revised" License
def test_spider_limit_number_of_stored_stats(test_settings, stats_temporary_location):
    test_settings["SPIDERMON_MAX_STORED_STATS"] = 2
    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    crawler.stats.set_value("first_execution", "value")
    crawler.stop()

    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    crawler.stats.set_value("second_execution", "value")
    crawler.stop()

    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    crawler.stats.set_value("third_execution", "value")
    crawler.stop()

    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    assert len(crawler.spider.stats_history) == 2
    assert "third_execution" in crawler.spider.stats_history[0].keys()
    assert "second_execution" in crawler.spider.stats_history[1].keys()
    crawler.stop() 
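The two tests above exercise spidermon's local-storage stats collector, which exposes stats from previous runs on the spider as a stats_history deque (newest first, capped by SPIDERMON_MAX_STORED_STATS). A minimal sketch of reading it from inside a spider, assuming that collector is enabled in the project settings:

from scrapy import Spider


class FooSpider(Spider):
    name = "foo_spider"
    start_urls = ["http://example.com"]

    def parse(self, response):
        # stats_history is a deque of stats dicts from previous runs,
        # most recent first, populated by spidermon's stats collector.
        if self.stats_history:
            previous = self.stats_history[0]
            self.logger.info("Items scraped in the previous run: %s",
                             previous.get("item_scraped_count"))
        yield {"url": response.url}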
Example #5
Source File: crawler.py    From learn_python3_spider with MIT License
def create_crawler(self, crawler_or_spidercls):
        """
        Return a :class:`~scrapy.crawler.Crawler` object.

        * If ``crawler_or_spidercls`` is a Crawler, it is returned as-is.
        * If ``crawler_or_spidercls`` is a Spider subclass, a new Crawler
          is constructed for it.
        * If ``crawler_or_spidercls`` is a string, this function finds
          a spider with this name in a Scrapy project (using spider loader),
          then creates a Crawler instance for it.
        """
        if isinstance(crawler_or_spidercls, Spider):
            raise ValueError(
                'The crawler_or_spidercls argument cannot be a spider object, '
                'it must be a spider class (or a Crawler object)')
        if isinstance(crawler_or_spidercls, Crawler):
            return crawler_or_spidercls
        return self._create_crawler(crawler_or_spidercls) 
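The method above lives on Scrapy's crawler runner; a short usage sketch with a placeholder spider class (not from the project above):

from scrapy import Spider
from scrapy.crawler import CrawlerProcess


class ExampleSpider(Spider):
    name = "example"
    start_urls = ["http://example.com"]


process = CrawlerProcess()

# Passing a Spider subclass constructs a new Crawler for it.
crawler = process.create_crawler(ExampleSpider)

# Passing an existing Crawler returns it as-is.
assert process.create_crawler(crawler) is crawler

# Passing a spider *instance* (e.g. ExampleSpider(name="example")) would
# raise the ValueError shown above.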
Example #6
Source File: middleware.py    From scrapy-poet with BSD 3-Clause "New" or "Revised" License
def process_request(self, request: Request, spider: Spider):
        """This method checks if the request is really needed and if its
        download could be skipped by trying to infer if a ``Response``
        is going to be used by the callback or a Page Input.

        If the ``Response`` can be ignored, a ``utils.DummyResponse`` object is
        returned in its place. This ``DummyResponse`` is linked to the original
        ``Request`` instance.

        With this behavior, we're able to optimize spider executions by avoiding
        unnecessary downloads. That could be the case when the callback actually
        uses another source instead, such as an external API like Scrapinghub's
        AutoExtract.
        """
        if utils.is_response_going_to_be_used(request, spider):
            return

        spider.logger.debug(f'Skipping download of {request}')
        return utils.DummyResponse(url=request.url, request=request) 
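As a usage sketch, under the assumption that scrapy-poet exposes DummyResponse at the package level (the snippet above references it as utils.DummyResponse, so the import path may differ by version), a callback can declare that it does not need the downloaded body by annotating its response argument:

from scrapy import Spider
from scrapy_poet import DummyResponse  # import path is an assumption; may live in scrapy_poet.utils


class NoDownloadSpider(Spider):
    name = "no_download"
    start_urls = ["http://example.com"]

    def parse(self, response: DummyResponse):
        # Annotating the response as DummyResponse signals that this callback
        # never reads the body, so the middleware above may skip the download
        # and inject a stub linked to the original request instead.
        self.logger.info("Reached callback for %s without downloading it", response.url)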
Example #7
Source File: sina_category_spider.py    From fooltrader with MIT License
def spider_closed(self, spider, reason):
        if self.sh_df[self.category_type].any():
            self.sh_df.to_csv(get_security_list_path('stock', 'sh'), index=False)
        if self.sz_df[self.category_type].any():
            self.sz_df.to_csv(get_security_list_path('stock', 'sz'), index=False)
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason) 
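This and the following fooltrader examples all implement a spider_closed handler; in Scrapy, such a handler is normally connected to the spider_closed signal. A sketch of the standard wiring (the class name is illustrative, and the connection code is the usual Scrapy pattern rather than a copy of fooltrader's):

from scrapy import Spider, signals


class CategorySpider(Spider):
    name = "category_spider"

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # Run spider_closed when the crawl finishes, receiving the close reason.
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider, reason):
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)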
Example #8
Source File: future_shfe_spider.py    From fooltrader with MIT License
def spider_closed(self, spider, reason):
        if self.trading_dates:
            parse_shfe_day_data()
        else:
            parse_shfe_data()
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason) 
Example #9
Source File: stock_summary_spider.py    From fooltrader with MIT License
def spider_closed(self, spider, reason):
        self.current_df = self.current_df.loc[:, KDATA_INDEX_COL]
        print(self.current_df)
        self.current_df.to_csv(get_kdata_path(item=self.security_item), index=False)
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason) 
Example #10
Source File: stock_finance_report_event_spider.py    From fooltrader with MIT License
def spider_closed(self, spider, reason):
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason) 
Example #11
Source File: china_stock_list_spider.py    From fooltrader with MIT License
def spider_closed(self, spider, reason):
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason) 
Example #12
Source File: stock_forecast_spider.py    From fooltrader with MIT License
def spider_closed(self, spider, reason):
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason) 
Example #13
Source File: stock_tick_spider.py    From fooltrader with MIT License
def spider_closed(self, spider, reason):
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason) 
Example #14
Source File: stock_kdata_163_spider.py    From fooltrader with MIT License
def spider_closed(self, spider, reason):
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason) 
Example #15
Source File: stock_trading_date_spider.py    From fooltrader with MIT License
def spider_closed(self, spider, reason):
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason) 
Example #16
Source File: america_list_spider.py    From fooltrader with MIT License
def spider_closed(self, spider, reason):
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason) 
Example #17
Source File: america_stock_kdata_spider_163.py    From fooltrader with MIT License
def spider_closed(self, spider, reason):
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason) 
Example #18
Source File: sp500_spider.py    From fooltrader with MIT License
def spider_closed(self, spider, reason):
        self.df_pe['close'] = self.df_close['close']
        self.df_pe['code'] = self.security_item['code']
        self.df_pe['securityId'] = self.security_item['id']
        self.df_pe['name'] = self.security_item['name']
        self.df_pe.to_csv(get_kdata_path(self.security_item), index=False)
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason) 
Example #19
Source File: america_stock_finance_spider.py    From fooltrader with MIT License
def spider_closed(self, spider, reason):
        spider.logger.info('Spider closed: %s,%s\n', spider.name, reason) 
Example #20
Source File: test_middleware.py    From MaybeDont with MIT License
def test_skip():
    mw = AvoidDupContentMiddleware(
        initial_queue_limit=300, threshold=0.98, exploration=0.05)
    spider = Spider(name='test')  # Spider requires a name
    mw.process_request(Request('http://example.com'), spider)
    assert len(mw.initial_queue) == 0
    req = Request('http://example.com', meta={'avoid_dup_content': True})
    mw.process_request(req, spider)
    mw.process_response(
        req, HtmlResponse(req.url, body=b'a', request=req), spider)
    assert len(mw.initial_queue) == 1 
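As the test shows, AvoidDupContentMiddleware only tracks requests that opt in via their meta; a sketch of how a spider might flag such requests (URLs and names are placeholders):

from scrapy import Request, Spider


class DupAwareSpider(Spider):
    name = "dup_aware"

    def start_requests(self):
        for url in ["http://example.com/page/1", "http://example.com/page/2"]:
            # Only requests carrying avoid_dup_content=True are considered by
            # the middleware, as exercised in the test above.
            yield Request(url, meta={'avoid_dup_content': True}, callback=self.parse)

    def parse(self, response):
        yield {"url": response.url}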
Example #21
Source File: bnb.py    From airbnb-scraper with GNU General Public License v3.0
def start_requests(self):
        """Spider entry point. Generate the first search request(s)."""
        self.logger.info(f"starting survey for: {self._place}")

        # get params from injected constructor values
        params = {}
        if self._price_max:
            params['price_max'] = self._price_max

        if self._price_min:
            params['price_min'] = self._price_min

        if self._ne_lat:
            params['ne_lat'] = self._ne_lat

        if self._ne_lng:
            params['ne_lng'] = self._ne_lng

        if self._sw_lat:
            params['sw_lat'] = self._sw_lat

        if self._sw_lng:
            params['sw_lng'] = self._sw_lng

        if not self._checkin:  # assume not self._checkout also
            yield self._api_request(params, callback=self.parse_landing_page)

        checkin_range_spec, checkout_range_spec = self._process_checkin_vars()

        # perform request(s)
        yield from self._perform_checkin_start_requests(checkin_range_spec, checkout_range_spec, params) 
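The injected constructor values used above (self._place, self._price_max, self._checkin, and so on) typically arrive as spider arguments; a hedged sketch of how they might be passed and stored (the constructor below is illustrative, not airbnb-scraper's actual __init__):

# Passed on the command line, for example:
#   scrapy crawl bnb -a query="Lisbon, Portugal" -a price_max=150

from scrapy import Spider


class BnbSpider(Spider):
    name = "bnb"

    def __init__(self, query=None, price_max=None, price_min=None, **kwargs):
        super().__init__(**kwargs)
        # Keep the injected values on private attributes, as read in
        # start_requests() above.
        self._place = query
        self._price_max = price_max
        self._price_min = price_min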
Example #22
Source File: test_pagestorage.py    From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License
def setUp(self):
        self.spider = Spider('default')
        self.mocked_hsref = mock.Mock()
        self.patch = mock.patch('sh_scrapy.hsref.hsref', self.mocked_hsref)
        self.crawler_mock = mock.Mock()
        self.crawler_mock.settings = Settings(
            {'PAGE_STORAGE_ENABLED': True,
             'PAGE_STORAGE_MODE': 'VERSIONED_CACHE',
             'PAGE_STORAGE_LIMIT': 10,
             'PAGE_STORAGE_ON_ERROR_LIMIT': 5})
        self.mocked_hsref.project.collections.url = '/test/url'
        self.patch.start()
        self.instance = PageStorageMiddleware.from_crawler(self.crawler_mock) 
Example #23
Source File: mixins.py    From OpenScraper with MIT License
def spider_closed(self, spider):
		"""Send some status to logging"""
		self.logger.info("**** Spider closed: {} ****".format(spider.name))
		self.logger.info("--- {} Items retrieved (main search page count)".format(self.item_count))
		self.logger.info("--- {} Items retrieved (detailed page count)".format(self.item_count_depth_1))
		self.logger.info("--- {} Pages crawled".format(self.page_count))
		self.print_error()
		self.logger.info("***End scraping***\n") 
Example #24
Source File: aiqiyi_spider.py    From video_url_crawler_demo with GNU General Public License v3.0
def __init__(self):
		scrapy.spiders.Spider.__init__(self)

		self.global_settings = get_project_settings()
		if self.global_settings['PLATFORM'] in ['win', 'mac']:
			self.driver = webdriver.PhantomJS(executable_path= self.global_settings['PHANTOMJS_PATH'])
		elif self.global_settings['PLATFORM'] in ['linux']:
			self.driver = webdriver.PhantomJS()
		self.driver.set_page_load_timeout(30)
		self.driver.implicitly_wait(10)

		self.type_id_list = self.global_settings['CRAWLER']['type_id_list']
		self.re_type_id = re.compile(self.global_settings['CRAWLER']['re_type_id'])
		self.url_template = self.global_settings['CRAWLER']['url_template'] 
Example #25
Source File: aiqiyi_spider.py    From video_url_crawler_demo with GNU General Public License v3.0
def __del__(self):
		self.driver.quit()
		scrapy.spiders.Spider.__del__(self) 
Example #26
Source File: conftest.py    From spidermon with BSD 3-Clause "New" or "Revised" License
def get_crawler():
    def _crawler(extended_settings={}):
        settings = {
            "SPIDERMON_ENABLED": True,
            "EXTENSIONS": {"spidermon.contrib.scrapy.extensions.Spidermon": 500},
        }
        settings.update(extended_settings)
        crawler = Crawler(Spider, settings=settings)
        crawler.spider = Spider("dummy")
        return crawler

    return _crawler 
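A sketch of how a test might consume this fixture, assuming it is registered with @pytest.fixture in conftest.py (the test body and extra setting are illustrative):

def test_spidermon_is_enabled_on_the_fixture_crawler(get_crawler):
    # Build a crawler with Spidermon enabled plus one extra setting.
    crawler = get_crawler({"SPIDERMON_MAX_STORED_STATS": 2})

    assert crawler.settings.getbool("SPIDERMON_ENABLED") is True
    assert crawler.settings.getint("SPIDERMON_MAX_STORED_STATS") == 2
    assert crawler.spider.name == "dummy"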
Example #27
Source File: test_monitors.py    From spidermon with BSD 3-Clause "New" or "Revised" License
def make_data(request):
    def _make_data(settings=None):
        crawler = Crawler(Spider, settings=settings)
        spider = Spider("dummy")
        return {
            "stats": crawler.stats.get_stats(),
            "crawler": crawler,
            "spider": spider,
            "runner": SpiderMonitorRunner(spider=spider),
            "job": None,
        }

    return _make_data 
Example #28
Source File: test_extensions.py    From spidermon with BSD 3-Clause "New" or "Revised" License
def run_test(self, **kwargs):
        dt = TestData(**kwargs)
        settings = {
            "SPIDERMON_ENABLED": True,
            "SPIDERMON_SPIDER_OPEN_EXPRESSION_MONITORS": [
                {"tests": [{"expression": dt.expression}]}
            ],
        }
        settings.update(dt.settings)
        crawler = get_crawler(settings_dict=settings)
        crawler.stats.get_stats = lambda _: dt.stats
        spidermon = Spidermon.from_crawler(crawler)
        spider = Spider(name=self.spider_name)

        # mocking, to see test results via raising AssertionError exception
        # with failures and errors as results
        spidermon._run_suites = partial(_test_run_suites, spidermon)

        try:
            spidermon.spider_opened(spider)
        except AssertionError as e:
            failures, errors = e.args[0]
            for f in failures:
                _, trace = f
                raise AssertionError(trace)
            for e in errors:
                _, trace = e
                if dt.expected_error and dt.expected_error in trace:
                    dt.expected_error = None
                else:
                    raise AssertionError(trace)
            if dt.expected_error:
                raise AssertionError(
                    "Expected error <{}> was not raised".format(dt.expected_error)
                ) 
Example #29
Source File: test_localstoragestats.py    From spidermon with BSD 3-Clause "New" or "Revised" License
def test_spider_has_stats_history_attribute_when_opened_with_collector(
    test_settings, stats_temporary_location
):
    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    crawler.stats.set_value("garbage", "value")
    assert hasattr(crawler.spider, "stats_history")
    assert crawler.spider.stats_history == deque()
    crawler.stop() 
Example #30
Source File: test_localstoragestats.py    From spidermon with BSD 3-Clause "New" or "Revised" License
def test_spider_has_stats_history_queue_with_specified_max_size(
    test_settings, stats_temporary_location
):
    max_stored_stats = 2
    test_settings["SPIDERMON_MAX_STORED_STATS"] = max_stored_stats

    crawler = get_crawler(Spider, test_settings)
    crawler.crawl("foo_spider")
    assert crawler.spider.stats_history == deque()
    assert crawler.spider.stats_history.maxlen == max_stored_stats
    crawler.stop()