Python scrapy.exceptions.CloseSpider() Examples

The following are 24 code examples of scrapy.exceptions.CloseSpider(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the module scrapy.exceptions, or try the search function.
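Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects listed below) of the typical usage pattern: raise CloseSpider from a spider callback, optionally passing a reason string that Scrapy records as the finish reason and forwards to the spider_closed signal. The spider name, start URL, and item limit used here are hypothetical and purely illustrative.

import scrapy
from scrapy.exceptions import CloseSpider


class LimitedSpider(scrapy.Spider):
    # Hypothetical spider: scrapes the public quotes.toscrape.com demo site
    # and stops itself after a fixed number of items.
    name = 'limited_quotes'
    start_urls = ['http://quotes.toscrape.com/']
    max_items = 50

    def __init__(self, *args, **kwargs):
        super(LimitedSpider, self).__init__(*args, **kwargs)
        self.item_count = 0

    def parse(self, response):
        for quote in response.css('div.quote'):
            self.item_count += 1
            if self.item_count > self.max_items:
                # The reason string becomes the 'finish_reason' stat and is
                # passed to spider_closed signal handlers.
                raise CloseSpider('item_limit_reached')
            yield {'text': quote.css('span.text::text').get()}
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

Note that raising CloseSpider stops scheduling new requests, but requests already scheduled or in flight may still be processed before the spider actually closes.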
Example #1
Source File: pipelines.py    From scrape with MIT License
def spider_closed(self, spider, reason):
        # Called after the spider is closed

        # Check Connection
        if not spider.postgres.checkConnection():
            raise CloseSpider("Unable to Establish a Database Connection")

        # Collect all Stats
        url_stats = {
            "urls_dropped": spider.urls_dropped,
            "urls_scraped": spider.urls_scraped,
            "urls_parsed": spider.urls_parsed,
            "urls_stored": spider.urls_stored
        }

        # End The Log
        if not spider.postgres.end_log(spider.log_id, url_stats, reason):
            logger.error(__name__ + " Unable to End Log for Spider " + spider.name + " with stats: " + str(url_stats))
        
        # Close the database connection
        spider.postgres.connection.close()
        logger.info(__name__ + " [" + spider.name + "] SPIDER CLOSED") 
Example #2
Source File: pipelines.py    From scrape with MIT License
def checkSite(self, spider):
        """ Verifies if site exist in database, add otherwise """
        # Verify Database Connection
        if not spider.postgres.checkConnection():
            logger.error(__name__ + " No Database Connection Found!")
            raise CloseSpider(" No Database Connection Found!")
        
        try:
            # Check if the site exists in the database using its site_id
            if not spider.postgres.siteExists(spider.custom_settings['site_id']):
                # Add it to Database if not
                spider.postgres.cursor.execute(spider.postgres.insert_site_str, (
                    spider.custom_settings['site_id'],
                    spider.custom_settings['site_name'],
                    spider.custom_settings['site_url'],
                    spider.name,
                    )
                )
        except Exception as e:
            logger.error(__name__ + " Unable to add site to Database! Msg: " + str(e))
            raise CloseSpider("Unable to add site to Database")

    # Special Methods Below, Read about them before altering 
Example #3
Source File: pipelines.py    From scrape with MIT License
def open_spider(self, spider):
        # Called when a spider starts

        #Create a dedicated Database Connection for the spider
        spider.postgres = postgresSQL()

        #Verify the Connection
        if spider.postgres.connect() == False:
            raise CloseSpider(" Database Connection cannot be established!")

        #Initialize the Stats
        spider.urls_dropped = 0
        spider.urls_scraped = 0
        spider.urls_parsed = 0
        spider.urls_stored = 0

        #Add/Verify Site in Database
        self.checkSite(spider)

        #Start Spider's Log
        spider.log_id = spider.postgres.start_log(spider.custom_settings['site_id'], os.getpid())
        if not spider.log_id:
            raise CloseSpider(" Unable to Start Log!") 
Example #4
Source File: scraper.py    From learn_python3_spider with MIT License
def handle_spider_error(self, _failure, request, response, spider):
        exc = _failure.value
        if isinstance(exc, CloseSpider):
            self.crawler.engine.close_spider(spider, exc.reason or 'cancelled')
            return
        logger.error(
            "Spider error processing %(request)s (referer: %(referer)s)",
            {'request': request, 'referer': referer_str(request)},
            exc_info=failure_to_exc_info(_failure),
            extra={'spider': spider}
        )
        self.signals.send_catch_log(
            signal=signals.spider_error,
            failure=_failure, response=response,
            spider=spider
        )
        self.crawler.stats.inc_value(
            "spider_exceptions/%s" % _failure.value.__class__.__name__,
            spider=spider
        ) 
Example #5
Source File: spiders.py    From autologin with Apache License 2.0
def parse(self, response):
        self.logger.info(response.url)
        if response.text:
            for _, meta in formasaurus.extract_forms(response.text):
                form_type = meta['form']
                if form_type == 'login' and not self.found_login:
                    self.found_login = True
                    self.handle_login_form(response.url)
                elif form_type == 'registration' \
                        and not self.found_registration:
                    self.found_registration = True
                    self.handle_registration_form(response.url)
        if self.found_registration and self.found_login:
            raise CloseSpider('done')
        for link in self.link_extractor.extract_links(response):
            priority = 0
            text = ' '.join([relative_url(link.url), link.text]).lower()
            if any(pattern in text for pattern in self.priority_patterns):
                priority = 100
            yield self.request(link.url, self.parse, priority=priority) 
Example #6
Source File: luxe_spider.py    From NewsScrapy with GNU Lesser General Public License v3.0
def parse_news(self,response):
        item = response.meta.get("item",None)
        # # Moved the end condition into the content parsing, to avoid transaction errors
        # news_date = item.get("news_date",None)
        # if news_date:
        #     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
        #     news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
        #
        #     delta = self.end_now-struct_date
        #     if delta.days == self.end_day:
        #         # pass
        #         raise CloseSpider('today scrapy end')
        soup = BeautifulSoup(response.body)
        news_content_group = soup.find("div",class_="entry-content group")
        # Remove the "related reading" section
        news_content_group.find("div",class_="related_posts").replace_with("")
        content = news_content_group.text.strip()
        item["content"] = content
        item["catalogue"] = u"最新内容"
        yield item 
Example #7
Source File: inventus.py    From Inventus with MIT License
def parse_item(self, response):
        item = InventusSpiderItem()
        for url in Selector(text=response.body).xpath('//a/@href').extract():
            if not url.startswith(('http://', 'https://')):
                url = self.base_url + url
            try:
                parsed_uri = urlparse(url)
            except ValueError:
                # If the URL is invalid we can ignore it.
                continue
            if parsed_uri.netloc.endswith('.' + self.domain) and 'mailto:' not in url:
                if not parsed_uri.netloc in self.subdomains:
                    self.subdomains.append(parsed_uri.netloc)
                    item['subdomain'] = parsed_uri.netloc
                    yield item

                    if len(self.subdomains) > int(self.subdomain_limit):
                        break

                yield Request(url, callback=self.parse)

        if len(self.subdomains) >= int(self.subdomain_limit):
            raise CloseSpider('subdomain limit reached') 
Example #8
Source File: middlewares.py    From scrapy-rotating-proxies with MIT License
def process_request(self, request, spider):
        if 'proxy' in request.meta and not request.meta.get('_rotating_proxy'):
            return
        proxy = self.proxies.get_random()
        if not proxy:
            if self.stop_if_no_proxies:
                raise CloseSpider("no_proxies")
            else:
                logger.warn("No proxies available; marking all proxies "
                            "as unchecked")
                self.proxies.reset()
                proxy = self.proxies.get_random()
                if proxy is None:
                    logger.error("No proxies available even after a reset.")
                    raise CloseSpider("no_proxies_after_reset")

        request.meta['proxy'] = proxy
        request.meta['download_slot'] = self.get_proxy_slot(proxy)
        request.meta['_rotating_proxy'] = True 
Example #9
Source File: qdaily_spider.py    From NewsScrapy with GNU Lesser General Public License v3.0
def parse_article(self,response):
        #content,news_no,crawl_date
        item = response.meta.get("item",NewsItem())
        # news_date = item.get("news_date",None)
        # if news_date:
        #     struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
        #     delta = self.end_now-struct_date
        #     print delta.days
        #     if delta.days == self.end_day:
        #         raise CloseSpider('today scrapy end')
        soup =BeautifulSoup(response.body)
        author = soup.find("span",class_="name").text if soup.find("span",class_="name") else None
        abstract =  soup.find("p",class_="excerpt").text if soup.find("p",class_="excerpt") else None
        content = soup.find("div",class_="detail").text if soup.find("div",class_="detail") else None
        news_no = response.url.split("/")[-1][:-5]
        item["author"] = author
        item["abstract"] = abstract
        item["content"] = content
        item["crawl_date"] = NOW
        item["news_no"] = news_no
        yield item 
Example #10
Source File: pipelines.py    From scrape with MIT License
def process_item(self, item, spider):

        if not spider.postgres.checkConnection():
            raise CloseSpider("Unable to Establish a Database Connection")
        
        if spider.postgres.checkUrlExists(item['link']):
            raise DropItem("Url " + item['link'] + " Exists in Database")
        
        return item 
Example #11
Source File: eksisozluk.py    From sozlukcrawler with GNU General Public License v2.0
def parse(self, response):
        self.log("PARSING: %s" % response.request.url, level=log.INFO)

        items_to_scrape = response.xpath('//*[@id="topic"]/ul[@id="entry-list"]/li')
        if len(items_to_scrape) == 0:
            self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                     level=log.ERROR)
            raise CloseSpider('no_item_found')

        for sel in items_to_scrape:
            girdi_id = sel.xpath('./@data-id').extract()[0]
            baslik_id = response.xpath('//*[@id="title"]/a/@href').re(r'--(\d*)')[0]
            baslik = response.xpath('//*[@id="title"]/a/span/text()').extract()[0]
            date = sel.xpath('./footer/div[@class="info"]/a[@class="entry-date permalink"]/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
            text = sel.xpath('string(./div)').extract()[0]
            nick = sel.xpath('./footer/div[@class="info"]/a[@class="entry-author"]/text()').extract()[0]

            item = Girdi()
            item['source'] = self.name
            item['baslik'] = baslik
            item['girdi_id'] = girdi_id
            item['baslik_id'] = baslik_id
            item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
            item['text'] = text
            item['nick'] = nick

            yield item

        # The sozluk does its pagination with JavaScript, so we cannot get the page link via XPath, but the current
        # page number and the last page count are present in the HTML. Use this information to determine the address
        # of the next page to crawl. Hopefully SSG does not change this :(
        current_page = int(response.xpath('//*[@id="topic"]/div[2]/@data-currentpage').extract()[0])
        page_count = int(response.xpath('//*[@id="topic"]/div[2]/@data-pagecount').extract()[0])

        current_url = response.request.url.split('?p')[0]

        next_page = current_page + 1
        if page_count >= next_page:
        # if current_page < 1:
            yield Request('%s?p=%s' % (current_url, next_page)) 
Example #12
Source File: __init__.py    From fp-server with MIT License
def build_check_recipient(self, ip, port, scheme,
                              user=None, password=None):
        """
        1. build a request for availability checking
        2. drop it if already existed

        :return: Request
        """

        if self.complete_condition():
            raise exceptions.CloseSpider('Enough items')

        spec = dict(ip=ip, port=port, scheme=scheme)

        if not valid_format(spec):
            self.logger.debug('Got wrong format (%s, %s). Clear it.' % (ip, port))

            return {}

        if self.already_exists(spec):
            self.logger.debug('Got duplicated %s. Clear it.' % spec.values())

            return {}  # drop it

        proxy_url = utils.build_proxy_url(ip, port, scheme, user, password)
        need_auth = int(bool(user and password))
        item = Proxy(
            ip=ip,
            scheme=scheme,
            port=port,
            need_auth=need_auth,
            url=proxy_url,
        )

        if need_auth:
            item['user'], item['password'] = user, password

        self.logger.debug('Got unchecked %s' % item)

        return self.build_check_request(item) 
Example #13
Source File: scraping.py    From ws-backend-community with GNU General Public License v3.0
def __check_for_close(self):
        """
        Check to see if this spider has been running for longer than the maximum amount
        of allowed time, and stop the spider if it has.
        :return: None
        """
        if self._start_time is None:
            self._start_time = DatetimeHelper.now()
        elapsed_time = (DatetimeHelper.now() - self.start_time).total_seconds()
        if elapsed_time > self.max_run_time:
            raise CloseSpider(
                "Spider run time exceeded maximum time of %s seconds. Closing."
                % (self.max_run_time,)
            ) 
Example #14
Source File: meadin_spider.py    From NewsScrapy with GNU Lesser General Public License v3.0
def parse_news(self,response):
        #content,news_date,news_no,crawl_date,referer_web
        item = response.meta.get("item",NewsItem())
        pageindex = response.meta.get("pageindex",1)
        soup = BeautifulSoup(response.body)
        # news_date = item.get("news_date",None)
        # Need to crawl the exact publication time
        news_date = soup.find("span",class_="arial").text if soup.find("span",class_="arial") else None
        #http://info.meadin.com/PictureNews/2938_1.shtml Exception
        if news_date:

            # struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M:%S")
            # delta = self.end_now-struct_date
            # if delta.days == self.end_day:
            #     raise CloseSpider('today scrapy end')
            referer_web = list(soup.find("p",class_="source").strings)[-1] if soup.find("p",class_="source") else None
            # Crawl the article body
            art,content = None,None
            art = soup.find("div",class_="article js-article")
            if art:
                # Strip out the abstract!
                art.find("div",class_="intro").replace_with("")
                content =art.text.strip()
            news_no =response.url.split("/")[-1].split("_")[0]
            item["news_date"]=news_date
            item["content"]=content
            item["referer_web"]=referer_web
            item["crawl_date"]=NOW
            item["news_no"]=news_no
            item = judge_news_crawl(item)
            if item:
                yield item
            else:
                self.flag = pageindex
        else:
            logger.warning("can't find news_date.the url is %s" % response.url) 
Example #15
Source File: thepaper_spider.py    From NewsScrapy with GNU Lesser General Public License v3.0
def parse(self, response):
        # Homepage content
        html = response.body
        soup = BeautifulSoup(html,"lxml")
        # Crawl the news list on the homepage
        for i in self.fetch_newslist(soup):
            # raise CloseSpider(str(i['time'] == u"一天前"))
            # if i['time'] == "一天前": raise CloseSpider("today news end")
            request = scrapy.Request(i['news_url'],callback=self.parse_news)
            request.meta['item'] = i
            request.meta['pageindex'] = 1
            yield request

        # Crawl the link to the next page
        lasttime = "nothing"
        for i in  soup.select('div[class="news_li"]'):
            if i.attrs.has_key("lasttime"):
                lasttime =  i["lasttime"]
                break
        # Build the URL of the next page.
        # Format: load_chosen.jsp?nodeids=25949&topCids=1495258,1494171,1495064,1495130,1495285,&pageidx=
        load_chosen = re.search(r'data.:."(.*)".+.masonry',html)
        page = 2
        if load_chosen :
            tp_url = "http://www.thepaper.cn/load_chosen.jsp?%s%s&lastTime=%s" % (load_chosen.group(1),page,lasttime)
            yield scrapy.Request(tp_url, callback=self.next_page_parse) 
Example #16
Source File: database.py    From livetv_mining with Apache License 2.0
def open_spider(self, spider):
        site_setting = spider.settings.get('SITE')
        if not site_setting:
            error_msg = 'Can not find the website configuration from settings.'
            spider.logger.error(error_msg)
            raise CloseSpider(error_msg)
        self.session = self.session_maker()
        site = self.session.query(LiveTVSite).filter(LiveTVSite.code == site_setting['code']).one_or_none()
        if not site:
            site = LiveTVSite(code=site_setting['code'], name=site_setting['name'],
                              description=site_setting['description'], url=site_setting['url'],
                              image=site_setting['image'], show_seq=site_setting['show_seq'])
            self.session.add(site)
            self.session.commit()
        self.site[site.code] = {'id': site.id, 'starttime': datetime.utcnow(), 'channels': {}} 
Example #17
Source File: middlewares.py    From fooltrader with MIT License
def process_spider_exception(self, response, exception, spider):
        if isinstance(exception, HttpError):
            if response.status == 456:
                # response.meta['fool_blocked'] = True
                # return None
                raise CloseSpider('catch forbidden,close for a while')


# downloader middleware 
Example #18
Source File: pipelines.py    From scrapy-picture-spider with Apache License 2.0
def process_item(self, item, spider):
        if item is None:
            raise DropItem('Item is null')
        dir_path = self.make_dir()
        image_final_name = item['image_name'] + '-' + item['image_id'] + '-by@' + item['author'] + '.jpg'
        dest_path = os.path.join(dir_path, image_final_name)
        self.download_image(item['image_src'], dest_path)
        self.image_max_counter += 1
        if self.image_max_counter >= self.MAXIMUM_IMAGE_NUMBER:
            raise CloseSpider('Current downloaded image already equal maximum number')
        return item 
Example #19
Source File: pipelines.py    From scrapy-picture-spider with Apache License 2.0
def __init__(self, IMAGE_STORE, MAXIMUM_IMAGE_NUMBER):
        if IMAGE_STORE is None or MAXIMUM_IMAGE_NUMBER is None:
            raise CloseSpider('Pipeline load settings failed')
        self.IMAGE_STORE = IMAGE_STORE
        self.MAXIMUM_IMAGE_NUMBER = MAXIMUM_IMAGE_NUMBER
        # records the number of downloaded images
        self.image_max_counter = 0
        # records the directory name counter; it increases by 1 for every one thousand images
        self.dir_counter = 0 
Example #20
Source File: utils.py    From scrapy-poet with BSD 3-Clause "New" or "Revised" License
def capture_exceptions(callback):
    """ Wrapper for Scrapy callbacks that captures exceptions within
    the provided callback and yields it under `exception` property. Also
    spider is closed on the first exception. """
    def parse(*args, **kwargs):
        try:
            yield from callback(*args, **kwargs)
        except Exception as e:
            yield {'exception': e}
            raise CloseSpider("Exception in callback detected")
    # Mimic type annotations
    parse.__annotations__ = callback.__annotations__
    return parse 
Example #21
Source File: uludagsozluk.py    From sozlukcrawler with GNU General Public License v2.0
def parse(self, response):
        self.log("PARSING: %s" % response.request.url, level=log.INFO)

        items_to_scrape = response.xpath('//*[@id="entry-list"]/li/article')
        if len(items_to_scrape) == 0:
            self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                     level=log.ERROR)
            raise CloseSpider('no_item_found')

        for sel in items_to_scrape:
            girdi_id = sel.css('span.voting').css('a.entryid_a').xpath('./span/text()').re(r'#(\d*)')[0]
            baslik_id = response.xpath('//*[@id="main"]/div/div[1]/div[1]/div/ul/li[1]/ul/li/a/@onclick').re("'(\d*)'")[0]
            baslik = response.css('h1.title').xpath('./a/text()').extract()[0]
            date = sel.xpath('.//a[@class="entry_tarih"]/small/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
            text = sel.css('div.entry-p').xpath('string(.)').extract()[0]
            nick = sel.css('span.entry-author').xpath('./a/text()').extract()[0].lower()

            item = Girdi()
            item['source'] = self.name
            item['baslik'] = baslik
            item['girdi_id'] = girdi_id
            item['baslik_id'] = baslik_id
            item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
            item['text'] = text
            item['nick'] = nick

            yield item

        current_page = int(response.css('div.pagination').css('li.active').xpath('./a/text()').extract()[0])
        page_count = int(response.xpath('//*[@id="main"]/div/div[3]/ul/li/a')[-2].xpath('text()').extract()[0])
        next_page = current_page + 1

        # To build the pagination link in the next step, we need to extract the topic address before the pagination part.
        # The address goes like uludagsozluk.com/k/BASLIK/10. Take the part of the path before the pagination.
        url_split = urlsplit(response.request.url)
        current_baslik_url = '%s://%s%s' % (url_split.scheme, url_split.netloc, '/'.join(url_split.path.split('/')[:3]))

        if page_count >= next_page:
        # if current_page < 1:
            yield Request('%s/%s' % (current_baslik_url, next_page)) 
Example #22
Source File: itusozluk.py    From sozlukcrawler with GNU General Public License v2.0
def parse(self, response):
        self.log("PARSING: %s" % response.request.url, level=log.INFO)

        items_to_scrape = response.xpath('//*[@id="entry-list"]/li/article')
        if len(items_to_scrape) == 0:
            self.log("!!! No item to parse found. It may indicate a problem with HTML !!!",
                     level=log.ERROR)
            raise CloseSpider('no_item_found')

        for sel in items_to_scrape:
            girdi_id = sel.xpath('./footer/div[@class="entrymenu"]/@data-info').extract()[0].split(',')[0]
            baslik_id = response.xpath('//*[@id="canonical_url"]/@value').re(r'--(\d*)')[0]
            baslik = response.xpath('//*[@id="title"]/a/text()').extract()[0]
            date = sel.xpath('./footer/div[2]/time/a/text()').re(r'\d{2}[.]\d{2}[.]\d{4} \d{2}[:]\d{2}')[0]
            text = sel.xpath('string(./div)').extract()[0]
            nick = sel.css('a.yazarlink').xpath('text()').extract()[0]

            item = Girdi()
            item['source'] = self.name
            item['baslik'] = baslik
            item['girdi_id'] = girdi_id
            item['baslik_id'] = baslik_id
            item['datetime'] = datetime.strptime(date, '%d.%m.%Y %H:%M')
            item['text'] = text
            item['nick'] = nick

            yield item

        current_url = response.request.url.split('/sayfa')[0]

        title_re = response.xpath('//title').re(r'sayfa (\d*)')
        current_page = int(title_re[0]) if title_re else 1

        page_count = int(response.xpath('//a[@rel="last"]')[0].xpath('text()').extract()[0])

        next_page = current_page + 1
        if page_count >= next_page:
        # if current_page < 2:
            yield Request('%s/sayfa/%s' % (current_url, next_page)) 
Example #23
Source File: __init__.py    From sozlukcrawler with GNU General Public License v2.0
def __init__(self, **kwargs):
        super(GenericSozlukSpider, self).__init__(**kwargs)

        if 'baslik' not in kwargs:
            raise CloseSpider('Baslik should be given to scrape')

        self.urls = kwargs['baslik'].split(',')
        self.allowed_domains = [] 
Example #24
Source File: travelweeklychina_spider.py    From NewsScrapy with GNU Lesser General Public License v3.0
def parse_news(self,response):
        # print response.url,"response"
        PageKey = response.meta.get("topic_id")
        PageNumber =response.meta.get("PageNumber")
        flag_id =str(int(PageKey)-40037910)
        soup =BeautifulSoup(response.body,"lxml")
        #2016-07-13
        news_date = soup.find("time").text if soup.find("time") else None
        # print self.flag[flag_id],int(PageNumber)
        """
        条件是该类别标记(self.flag[flag_id])是0爬取,说明还没有爬到过期的。
        爬取页面是该页的也继续爬取。因为一个页面的爬取顺序是异步的。
        self.flag[flag_id]=过期页数
        """
        if not self.flag[flag_id] or int(PageNumber)==self.flag[flag_id]:
            # Still within range


            struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d")
            # print self.end_now,struct_date,"time"
            delta = self.end_now-struct_date
            # print delta.days,"delta day ~~~~~~~~~~~~~~~~"
            if delta.days > self.end_day:
                self.flag[str(flag_id)]=int(PageNumber)
                # print flag_id,"stop ~~~~~~"
                # raise CloseSpider('today scrapy end')
            else:

                head = soup.find("div",class_="post-head")
                topic,title,abstract=None,None,None
                if head:
                    topic = head.find("span",class_="category").text if head.find("span",class_="category") else None
                    title =head.find("h1",class_="h1").text if head.find("h1",class_="h1") else None
                    abstract = head.find("span",class_="kicker").text if head.find("span",class_="kicker") else None
                content = soup.find("div",class_="post-body clearfix").text if soup.find("div",class_="post-body clearfix") else None
                news_no = response.url.split("/")[-1].split("?")[0]
                # TODO: comment count is rendered by JS, not handled yet
                item = NewsItem(title=title,topic=topic,
                                abstract=abstract,news_date=news_date,
                                content=content,news_no=news_no
                                ,crawl_date=NOW,news_url=response.url,catalogue='新闻板块')
                yield item