Python scrapy.exceptions.CloseSpider() Examples
The following are 24 code examples of scrapy.exceptions.CloseSpider(), collected from open source projects. The project and source file for each example are noted above its code. You may also want to check out the other available functions and classes of the scrapy.exceptions module.
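CloseSpider is raised from a spider callback (and, as several of the examples below do, from pipeline and middleware code) to ask Scrapy to stop the crawl; the optional reason string is recorded in the crawl stats and log. Before the project examples, here is a minimal, hypothetical sketch of that pattern; the spider name, start URL, and item threshold are illustrative only and not taken from any of the projects below.

import scrapy
from scrapy.exceptions import CloseSpider


class LimitedSpider(scrapy.Spider):
    # Hypothetical spider used only to illustrate the CloseSpider pattern.
    name = "limited_example"
    start_urls = ["https://example.com/"]
    max_items = 100  # illustrative stopping threshold

    def parse(self, response):
        # Stop the whole crawl once enough items have been scraped.
        if self.crawler.stats.get_value("item_scraped_count", 0) >= self.max_items:
            raise CloseSpider(reason="item_limit_reached")
        yield {"url": response.url, "title": response.css("title::text").get()}
        for href in response.css("a::attr(href)").getall():
            yield response.follow(href, callback=self.parse)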
Example #1
Source File: pipelines.py, from scrape (MIT License)
def spider_closed(self, spider, reason):  # Calls After Spider is closed
    # Check Connection
    if not spider.postgres.checkConnection():
        raise CloseSpider("Unable to Establish a Database Connection")

    # Collect all Stats
    url_stats = {
        "urls_dropped": spider.urls_dropped,
        "urls_scraped": spider.urls_scraped,
        "urls_parsed": spider.urls_parsed,
        "urls_stored": spider.urls_stored
    }

    # End The Log
    if not spider.postgres.end_log(spider.log_id, url_stats, reason):
        logger.error(__name__ + " Unable to End Log for Spider " + spider.name +
                     " with stats: " + str(url_stats))

    # Close the database connection
    spider.postgres.connection.close()
    logger.info(__name__ + " [" + spider.name + "] SPIDER CLOSED")
Example #2
Source File: pipelines.py, from scrape (MIT License)
def checkSite(self, spider):
    """ Verifies if site exist in database, add otherwise """
    # Verify Database Connection
    if not spider.postgres.checkConnection():
        logger.error(__name__ + " No Database Connection Found!")
        raise CloseSpider(" No Database Connection Found!")
    try:
        # Check if site Exists in Database using it's site_id
        if not spider.postgres.siteExists(spider.custom_settings['site_id']):
            # Add it to Database if not
            spider.postgres.cursor.execute(spider.postgres.insert_site_str, (
                spider.custom_settings['site_id'],
                spider.custom_settings['site_name'],
                spider.custom_settings['site_url'],
                spider.name,
                )
            )
    except Exception as e:
        logger.error(__name__ + " Unable to add site to Database! Msg: " + str(e))
        raise CloseSpider("Unable to add site to Database")

# Special Methods Below, Read about them before altering
Example #3
Source File: pipelines.py, from scrape (MIT License)
def open_spider(self, spider):  # Called when a spider starts
    # Create a dedicated Database Connection for the spider
    spider.postgres = postgresSQL()
    # Verify the Connection
    if spider.postgres.connect() == False:
        raise CloseSpider(" Database Connection cannot be established!")
    # Initialize the Stats
    spider.urls_dropped = 0
    spider.urls_scraped = 0
    spider.urls_parsed = 0
    spider.urls_stored = 0
    # Add/Verify Site in Database
    self.checkSite(spider)
    # Start Spider's Log
    spider.log_id = spider.postgres.start_log(spider.custom_settings['site_id'], os.getpid())
    if not spider.log_id:
        raise CloseSpider(" Unable to Start Log!")
Example #4
Source File: scraper.py, from learn_python3_spider (MIT License)
def handle_spider_error(self, _failure, request, response, spider):
    exc = _failure.value
    if isinstance(exc, CloseSpider):
        self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
        return
    logger.error(
        "Spider error processing %(request)s (referer: %(referer)s)",
        {'request': request, 'referer': referer_str(request)},
        exc_info=failure_to_exc_info(_failure),
        extra={'spider': spider}
    )
    self.signals.send_catch_log(
        signal=signals.spider_error,
        failure=_failure, response=response,
        spider=spider
    )
    self.crawler.stats.inc_value(
        "spider_exceptions/%s" % _failure.value.__class__.__name__,
        spider=spider
    )
Example #5
Source File: spiders.py, from autologin (Apache License 2.0)
def parse(self, response):
    self.logger.info(response.url)
    if response.text:
        for _, meta in formasaurus.extract_forms(response.text):
            form_type = meta['form']
            if form_type == 'login' and not self.found_login:
                self.found_login = True
                self.handle_login_form(response.url)
            elif form_type == 'registration' \
                    and not self.found_registration:
                self.found_registration = True
                self.handle_registration_form(response.url)
    if self.found_registration and self.found_login:
        raise CloseSpider('done')
    for link in self.link_extractor.extract_links(response):
        priority = 0
        text = ' '.join([relative_url(link.url), link.text]).lower()
        if any(pattern in text for pattern in self.priority_patterns):
            priority = 100
        yield self.request(link.url, self.parse, priority=priority)
Example #6
Source File: luxe_spider.py, from NewsScrapy (GNU Lesser General Public License v3.0)
def parse_news(self, response):
    item = response.meta.get("item", None)
    # # Moved the end condition into the content parsing, to avoid transaction errors
    # news_date = item.get("news_date",None)
    # if news_date:
    #     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
    #     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
    #
    #     delta = self.end_now-struct_date
    #     if delta.days == self.end_day:
    #         # pass
    #         raise CloseSpider('today scrapy end')
    soup = BeautifulSoup(response.body)
    news_content_group = soup.find("div", class_="entry-content group")
    # Remove the "related reading" block
    news_content_group.find("div", class_="related_posts").replace_with("")
    content = news_content_group.text.strip()
    item["content"] = content
    item["catalogue"] = u"最新内容"
    yield item
Example #7
Source File: inventus.py, from Inventus (MIT License)
def parse_item(self, response):
    item = InventusSpiderItem()
    for url in Selector(text=response.body).xpath('//a/@href').extract():
        if not (url.startswith('http://') or url.startswith('https://')):
            url = self.base_url + url
        try:
            parsed_uri = urlparse(url)
        except ValueError:
            # If the URL is invalid we can ignore it.
            continue
        if parsed_uri.netloc.endswith('.' + self.domain) and 'mailto:' not in url:
            if not parsed_uri.netloc in self.subdomains:
                self.subdomains.append(parsed_uri.netloc)
                item['subdomain'] = parsed_uri.netloc
                yield item
                if len(self.subdomains) > int(self.subdomain_limit):
                    break
            yield Request(url, callback=self.parse)
    if len(self.subdomains) >= int(self.subdomain_limit):
        raise CloseSpider('subdomain limit reached')
Example #8
Source File: middlewares.py, from scrapy-rotating-proxies (MIT License)
def process_request(self, request, spider):
    if 'proxy' in request.meta and not request.meta.get('_rotating_proxy'):
        return
    proxy = self.proxies.get_random()
    if not proxy:
        if self.stop_if_no_proxies:
            raise CloseSpider("no_proxies")
        else:
            logger.warn("No proxies available; marking all proxies "
                        "as unchecked")
            self.proxies.reset()
            proxy = self.proxies.get_random()
            if proxy is None:
                logger.error("No proxies available even after a reset.")
                raise CloseSpider("no_proxies_after_reset")
    request.meta['proxy'] = proxy
    request.meta['download_slot'] = self.get_proxy_slot(proxy)
    request.meta['_rotating_proxy'] = True
Example #9
Source File: qdaily_spider.py, from NewsScrapy (GNU Lesser General Public License v3.0)
def parse_article(self, response):  # content,news_no,crawl_date
    item = response.meta.get("item", NewsItem())
    # news_date = item.get("news_date",None)
    # if news_date:
    #     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
    #     delta = self.end_now-struct_date
    #     print delta.days
    #     if delta.days == self.end_day:
    #         raise CloseSpider('today scrapy end')
    soup = BeautifulSoup(response.body)
    author = soup.find("span", class_="name").text if soup.find("span", class_="name") else None
    abstract = soup.find("p", class_="excerpt").text if soup.find("p", class_="excerpt") else None
    content = soup.find("div", class_="detail").text if soup.find("div", class_="detail") else None
    news_no = response.url.split("/")[-1][:-5]
    item["author"] = author
    item["abstract"] = abstract
    item["content"] = content
    item["crawl_date"] = NOW
    item["news_no"] = news_no
    yield item
Example #10
Source File: pipelines.py, from scrape (MIT License)
def process_item(self, item, spider):
    if not spider.postgres.checkConnection():
        raise CloseSpider("Unable to Establish a Database Connection")
    if spider.postgres.checkUrlExists(item['link']):
        raise DropItem("Url " + item['link'] + " Exists in Database")
    return item
Example #11
Source File: eksisozluk.py, from sozlukcrawler (GNU General Public License v2.0)
def parse(self, response):
    self.log("PARSING: %s" % response.request.url, level=log.INFO)
    items_to_scrape = response.xpath('//*[@id="topic"]/ul[@id="entry-list"]/li')
    if len(items_to_scrape) == 0:
        self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                 level=log.ERROR)
        raise CloseSpider('no_item_found')
    for sel in items_to_scrape:
        girdi_id = sel.xpath('./@data-id').extract()[0]
        baslik_id = response.xpath('//*[@id="title"]/a/@href').re(r'--(\d*)')[0]
        baslik = response.xpath('//*[@id="title"]/a/span/text()').extract()[0]
        date = sel.xpath('./footer/div[@class="info"]/a[@class="entry-date permalink"]/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
        text = sel.xpath('string(./div)').extract()[0]
        nick = sel.xpath('./footer/div[@class="info"]/a[@class="entry-author"]/text()').extract()[0]

        item = Girdi()
        item['source'] = self.name
        item['baslik'] = baslik
        item['girdi_id'] = girdi_id
        item['baslik_id'] = baslik_id
        item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
        item['text'] = text
        item['nick'] = nick
        yield item

    # The site does pagination with JavaScript, so we cannot get the page link via XPath,
    # but the current page number and the last page are present in the HTML. Use this
    # information to determine the address of the next page to crawl. I hope SSG does not
    # change it :(
    current_page = int(response.xpath('//*[@id="topic"]/div[2]/@data-currentpage').extract()[0])
    page_count = int(response.xpath('//*[@id="topic"]/div[2]/@data-pagecount').extract()[0])
    current_url = response.request.url.split('?p')[0]
    next_page = current_page + 1
    if page_count >= next_page:  # if current_page < 1:
        yield Request('%s?p=%s' % (current_url, next_page))
Example #12
Source File: __init__.py, from fp-server (MIT License)
def build_check_recipient(self, ip, port, scheme, user=None, password=None):
    """
    1. build a request for availability checking
    2. drop it if already existed

    :return: Request
    """
    if self.complete_condition():
        raise exceptions.CloseSpider('Enough items')

    spec = dict(ip=ip, port=port, scheme=scheme)

    if not valid_format(spec):
        self.logger.debug('Got wrong format (%s, %s). Clear it.' % (ip, port))
        return {}

    if self.already_exists(spec):
        self.logger.debug('Got duplicated %s. Clear it.' % spec.values())
        return {}  # drop it

    proxy_url = utils.build_proxy_url(ip, port, scheme, user, password)
    need_auth = int(bool(user and password))
    item = Proxy(
        ip=ip,
        scheme=scheme,
        port=port,
        need_auth=need_auth,
        url=proxy_url,
    )

    if need_auth:
        item['user'], item['password'] = user, password

    self.logger.debug('Got unchecked %s' % item)
    return self.build_check_request(item)
Example #13
Source File: scraping.py, from ws-backend-community (GNU General Public License v3.0)
def __check_for_close(self):
    """
    Check to see if this spider has been running for longer than the maximum amount of
    allowed time, and stop the spider if it has.
    :return: None
    """
    if self._start_time is None:
        self._start_time = DatetimeHelper.now()
    elapsed_time = (DatetimeHelper.now() - self.start_time).total_seconds()
    if elapsed_time > self.max_run_time:
        raise CloseSpider(
            "Spider run time exceeded maximum time of %s seconds. Closing."
            % (self.max_run_time,)
        )
Example #14
Source File: meadin_spider.py, from NewsScrapy (GNU Lesser General Public License v3.0)
def parse_news(self, response):  # content,news_date,news_no,crawl_date,referer_web
    item = response.meta.get("item", NewsItem())
    pageindex = response.meta.get("pageindex", 1)
    soup = BeautifulSoup(response.body)
    # news_date = item.get("news_date",None)
    # We need the exact publication time
    news_date = soup.find("span", class_="arial").text if soup.find("span", class_="arial") else None
    # http://info.meadin.com/PictureNews/2938_1.shtml Exception
    if news_date:
        # struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
        # delta = self.end_now-struct_date
        # if delta.days == self.end_day:
        #     raise CloseSpider('today scrapy end')
        referer_web = list(soup.find("p", class_="source").strings)[-1] if soup.find("p", class_="source") else None
        # Crawl the article body
        art, content = None, None
        art = soup.find("div", class_="article js-article")
        if art:
            # Strip the abstract!
            art.find("div", class_="intro").replace_with("")
            content = art.text.strip()
        news_no = response.url.split("/")[-1].split("_")[0]
        item["news_date"] = news_date
        item["content"] = content
        item["referer_web"] = referer_web
        item["crawl_date"] = NOW
        item["news_no"] = news_no
        item = judge_news_crawl(item)
        if item:
            yield item
        else:
            self.flag = pageindex
    else:
        logger.warning("can't find news_date.the url is %s" % response.url)
Example #15
Source File: thepaper_spider.py, from NewsScrapy (GNU Lesser General Public License v3.0)
def parse(self, response):  # Front-page content
    html = response.body
    soup = BeautifulSoup(html, "lxml")
    # Crawl the news list on the front page
    for i in self.fetch_newslist(soup):
        # raise CloseSpider(str(i['time'] == u"一天前"))
        #
        if i['time'] == "一天前":
            raise CloseSpider("today news end")
        request = scrapy.Request(i['news_url'], callback=self.parse_news)
        request.meta['item'] = i
        request.meta['pageindex'] = 1
        yield request
    # Crawl the link of the next page
    lasttime = "nothing"
    for i in soup.select('div[class="news_li"]'):
        if i.attrs.has_key("lasttime"):
            lasttime = i["lasttime"]
            break
    # Get the link of the next URL.
    # Format: load_chosen.jsp?nodeids=25949&topCids=1495258,1494171,1495064,1495130,1495285,&pageidx=
    load_chosen = re.search(r'data.:."(.*)".+.masonry', html)
    page = 2
    if load_chosen:
        tp_url = "http://www.thepaper.cn/load_chosen.jsp?%s%s&lastTime=%s" % (load_chosen.group(1), page, lasttime)
        yield scrapy.Request(tp_url, callback=self.next_page_parse)
Example #16
Source File: database.py, from livetv_mining (Apache License 2.0)
def open_spider(self, spider):
    site_setting = spider.settings.get('SITE')
    if not site_setting:
        error_msg = 'Can not find the website configuration from settings.'
        spider.logger.error(error_msg)
        raise CloseSpider(error_msg)
    self.session = self.session_maker()
    site = self.session.query(LiveTVSite).filter(LiveTVSite.code == site_setting['code']).one_or_none()
    if not site:
        site = LiveTVSite(code=site_setting['code'], name=site_setting['name'],
                          description=site_setting['description'], url=site_setting['url'],
                          image=site_setting['image'], show_seq=site_setting['show_seq'])
        self.session.add(site)
        self.session.commit()
    self.site[site.code] = {'id': site.id, 'starttime': datetime.utcnow(), 'channels': {}}
Example #17
Source File: middlewares.py, from fooltrader (MIT License)
def process_spider_exception(self, response, exception, spider):
    if isinstance(exception, HttpError):
        if response.status == 456:
            # response.meta['fool_blocked'] = True
            # return None
            raise CloseSpider('catch forbidden,close for a while')

# downloader middleware
Example #18
Source File: pipelines.py, from scrapy-picture-spider (Apache License 2.0)
def process_item(self, item, spider):
    if item is None:
        raise DropItem('Item is null')
    dir_path = self.make_dir()
    image_final_name = item['image_name'] + '-' + item['image_id'] + '-by@' + item['author'] + '.jpg'
    dest_path = os.path.join(dir_path, image_final_name)
    self.download_image(item['image_src'], dest_path)
    self.image_max_counter += 1
    if self.image_max_counter >= self.MAXIMUM_IMAGE_NUMBER:
        raise CloseSpider('Current downloaded image already equal maximum number')
    return item
Example #19
Source File: pipelines.py, from scrapy-picture-spider (Apache License 2.0)
def __init__(self, IMAGE_STORE, MAXIMUM_IMAGE_NUMBER):
    if IMAGE_STORE is None or MAXIMUM_IMAGE_NUMBER is None:
        raise CloseSpider('Pipeline load settings failed')
    self.IMAGE_STORE = IMAGE_STORE
    self.MAXIMUM_IMAGE_NUMBER = MAXIMUM_IMAGE_NUMBER
    # records the number of downloaded images
    self.image_max_counter = 0
    # records the directory-name number; it increases by 1 for every one thousand images
    self.dir_counter = 0
Example #20
Source File: utils.py, from scrapy-poet (BSD 3-Clause "New" or "Revised" License)
def capture_exceptions(callback):
    """ Wrapper for Scrapy callbacks that captures exceptions within
    the provided callback and yields it under `exception` property. Also
    spider is closed on the first exception. """
    def parse(*args, **kwargs):
        try:
            yield from callback(*args, **kwargs)
        except Exception as e:
            yield {'exception': e}
            raise CloseSpider("Exception in callback detected")

    # Mimic type annotations
    parse.__annotations__ = callback.__annotations__
    return parse
Example #21
Source File: uludagsozluk.py, from sozlukcrawler (GNU General Public License v2.0)
def parse(self, response):
    self.log("PARSING: %s" % response.request.url, level=log.INFO)
    items_to_scrape = response.xpath('//*[@id="entry-list"]/li/article')
    if len(items_to_scrape) == 0:
        self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                 level=log.ERROR)
        raise CloseSpider('no_item_found')
    for sel in items_to_scrape:
        girdi_id = sel.css('span.voting').css('a.entryid_a').xpath('./span/text()').re(r'#(\d*)')[0]
        baslik_id = response.xpath('//*[@id="main"]/div/div[1]/div[1]/div/ul/li[1]/ul/li/a/@onclick').re("'(\d*)'")[0]
        baslik = response.css('h1.title').xpath('./a/text()').extract()[0]
        date = sel.xpath('.//a[@class="entry_tarih"]/small/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
        text = sel.css('div.entry-p').xpath('string(.)').extract()[0]
        nick = sel.css('span.entry-author').xpath('./a/text()').extract()[0].lower()

        item = Girdi()
        item['source'] = self.name
        item['baslik'] = baslik
        item['girdi_id'] = girdi_id
        item['baslik_id'] = baslik_id
        item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
        item['text'] = text
        item['nick'] = nick
        yield item

    current_page = int(response.css('div.pagination').css('li.active').xpath('./a/text()').extract()[0])
    page_count = int(response.xpath('//*[@id="main"]/div/div[3]/ul/li/a')[-2].xpath('text()').extract()[0])
    next_page = current_page + 1

    # To yield the pagination link in the next step, we need to extract the topic address
    # that comes before the pagination part. The address goes like uludagsozluk.com/k/BASLIK/10,
    # so take the part of the path before the page number.
    url_split = urlsplit(response.request.url)
    current_baslik_url = '%s://%s%s' % (url_split.scheme, url_split.netloc,
                                        '/'.join(url_split.path.split('/')[:3]))
    if page_count >= next_page:  # if current_page < 1:
        yield Request('%s/%s' % (current_baslik_url, next_page))
Example #22
Source File: itusozluk.py, from sozlukcrawler (GNU General Public License v2.0)
def parse(self, response):
    self.log("PARSING: %s" % response.request.url, level=log.INFO)
    items_to_scrape = response.xpath('//*[@id="entry-list"]/li/article')
    if len(items_to_scrape) == 0:
        self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                 level=log.ERROR)
        raise CloseSpider('no_item_found')
    for sel in items_to_scrape:
        girdi_id = sel.xpath('./footer/div[@class="entrymenu"]/@data-info').extract()[0].split(',')[0]
        baslik_id = response.xpath('//*[@id="canonical_url"]/@value').re(r'--(\d*)')[0]
        baslik = response.xpath('//*[@id="title"]/a/text()').extract()[0]
        date = sel.xpath('./footer/div[2]/time/a/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
        text = sel.xpath('string(./div)').extract()[0]
        nick = sel.css('a.yazarlink').xpath('text()').extract()[0]

        item = Girdi()
        item['source'] = self.name
        item['baslik'] = baslik
        item['girdi_id'] = girdi_id
        item['baslik_id'] = baslik_id
        item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
        item['text'] = text
        item['nick'] = nick
        yield item

    current_url = response.request.url.split('/sayfa')[0]
    title_re = response.xpath('//title').re(r'sayfa (\d*)')
    current_page = int(title_re[0]) if title_re else 1
    page_count = int(response.xpath('//a[@rel="last"]')[0].xpath('text()').extract()[0])
    next_page = current_page + 1
    if page_count >= next_page:  # if current_page < 2:
        yield Request('%s/sayfa/%s' % (current_url, next_page))
Example #23
Source File: __init__.py, from sozlukcrawler (GNU General Public License v2.0)
def __init__(self, **kwargs):
    super(GenericSozlukSpider, self).__init__(**kwargs)
    if 'baslik' not in kwargs:
        raise CloseSpider('Baslik should be given to scrape')
    self.urls = kwargs['baslik'].split(',')
    self.allowed_domains = []
Example #24
Source File: travelweeklychina_spider.py, from NewsScrapy (GNU Lesser General Public License v3.0)
def parse_news(self, response):
    # print response.url,"response"
    PageKey = response.meta.get("topic_id")
    PageNumber = response.meta.get("PageNumber")
    flag_id = str(int(PageKey) - 40037910)
    soup = BeautifulSoup(response.body, "lxml")
    # 2016-07-13
    news_date = soup.find("time").text if soup.find("time") else None
    # print self.flag[flag_id],int(PageNumber)
    """
    Keep crawling while the flag for this category (self.flag[flag_id]) is 0, which means
    no expired article has been reached yet. Pages whose number equals the flag are also
    still crawled, because the requests for a page are handled asynchronously.
    self.flag[flag_id] = number of the expired page
    """
    if not self.flag[flag_id] or int(PageNumber) == self.flag[flag_id]:  # not out of range
        struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d")
        # print self.end_now,struct_date,"time"
        delta = self.end_now - struct_date
        # print delta.days,"delta day ~~~~~~~~~~~~~~~~"
        if delta.days > self.end_day:
            self.flag[str(flag_id)] = int(PageNumber)
            # print flag_id,"stop ~~~~~~"
            # raise CloseSpider('today scrapy end')
        else:
            head = soup.find("div", class_="post-head")
            topic, title, abstract = None, None, None
            if head:
                topic = head.find("span", class_="category").text if head.find("span", class_="category") else None
                title = head.find("h1", class_="h1").text if head.find("h1", class_="h1") else None
                abstract = head.find("span", class_="kicker").text if head.find("span", class_="kicker") else None
            content = soup.find("div", class_="post-body clearfix").text if soup.find("div", class_="post-body clearfix") else None
            news_no = response.url.split("/")[-1].split("?")[0]
            # TODO: the comment count is rendered by JavaScript and is not handled yet
            item = NewsItem(title=title, topic=topic,
                            abstract=abstract, news_date=news_date,
                            content=content, news_no=news_no,
                            crawl_date=NOW, news_url=response.url, catalogue='新闻板块')
            yield item