Python scrapy.FormRequest() Examples

The following are 30 code examples of scrapy.FormRequest(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy, or try the search function.
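As a quick orientation before the examples: scrapy.FormRequest() extends scrapy.Request with a formdata argument (a dict or iterable of tuples whose values must be strings) that is URL-encoded into a POST body, and the FormRequest.from_response() classmethod pre-populates form fields, including hidden ones, from a form found in an HTML response. A minimal sketch of both patterns (the URLs, field names, and callbacks here are placeholders, not taken from any project below):

import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'formrequest_example'
    start_urls = ['https://example.com/login']  # placeholder URL

    def parse(self, response):
        # build a POST request by hand; formdata values must be strings
        yield scrapy.FormRequest(
            'https://example.com/api/search',
            formdata={'query': 'scrapy', 'page': '1'},
            callback=self.parse_results,
        )
        # or let Scrapy pre-fill fields (including hidden ones) from the
        # page's <form> element and merge in our overrides
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'user', 'password': 'pass'},
            callback=self.after_login,
        )

    def parse_results(self, response):
        self.logger.info('POST returned %d bytes', len(response.body))

    def after_login(self, response):
        self.logger.info('landed on %s after login', response.url)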
Example #1
Source File: pts_spider.py    From Taiwan-news-crawlers with MIT License
def get_news(self, response):
        response.meta['iter_time'] += 1
        news_items = json.loads(response.text)

        if news_items:
            for n in news_items:
                yield {
                    'website': '公視',
                    'url': ARTICLE_PREFIX + n['news_id'],
                    'title': n['subject'],
                    'date': n['news_date'],
                    'content': n['content'],
                    'category': n['program_name']
                }
            yield scrapy.FormRequest(
                url="https://news.pts.org.tw/list/getmore.php",
                callback=self.get_news,
                meta=response.meta,
                formdata={
                    'page': str(response.meta['iter_time'])
                }) 
Example #2
Source File: inshorts_scraper.py    From scrape with MIT License
def parse(self, response):

        for news in response.css('div.news-card'):
            item = {
                'headline': news.css('a.clickable>span::text').extract_first(),
                'author': news.css('span.author::text').extract_first(),
                'time': news.css('span.time::text').extract_first(),
                'date': news.css('span[clas*=date]::text').extract_first(),
                'body': news.css('div[itemprop*=articleBody]::text').extract_first(),
            }

            yield item

        if self.pages > 1:
            pattern = re.compile(r'var min_news_id\s+=\s+"(.*?)"')
            script = response.css('script[type*="text/javascript"]').extract()[-1]
            news_offset = pattern.search(script).group(1)
            # Scrapy is asynchronous: yield the request so the engine can
            # schedule it (there is no scrapy.fetch for inline fetching)
            yield scrapy.FormRequest('https://www.inshorts.com/en/ajax/more_news', callback=self.parse, formdata={'news_offset': news_offset})
            self.pages -= 1 
Example #3
Source File: mangaspider.py    From kmanga with GNU General Public License v3.0
def parse_login(self, response):
        self._check_login_params()
        self._login = False
        form_data = {
            self.username_field: self.username,
            self.password_field: self.password
        }
        if hasattr(self, 'form_xpath'):
            return scrapy.FormRequest.from_response(
                response,
                formxpath=self.form_xpath,
                formdata=form_data,
                callback=self.parse_after_login
            )
        elif hasattr(self, 'form_url'):
            return scrapy.FormRequest(
                self.form_url,
                formdata=form_data,
                callback=self.parse_after_login
            ) 
Example #4
Source File: post_pass_item.py    From scrapy-templates with MIT License
def parse(self, response):
        item_loader = ItemLoader(item=MyItem(), response=response)
        item_loader.default_input_processor = MapCompose(remove_tags)
        #item_loader.add_css("", "")
        #item_loader.add_css("", "")
        #item_loader.add_css("", "")
        yield FormRequest("POST_URL", formdata={'parameter': 'p'},
                                        meta={'item': item_loader.load_item()}, callback=self.populate_field) 
Example #5
Source File: patent.py    From PatentCrawler with Apache License 2.0
def gen_detail(self, **kwargs):
        """
        生成查询详情的请求
        :param patent_id, sipo, data_item, nrdAn, nrdPn:
        :return:
        """
        patent_id = str(kwargs.pop('patent_id'))
        formdata = url_detail.get('form_data')
        formdata.__setitem__('nrdAn', patent_id.split('.')[0])
        formdata.__setitem__('cid', patent_id)
        formdata.__setitem__('sid', patent_id)

        return FormRequest(
            url=url_detail.get('url'),
            formdata=formdata,
            headers=url_detail.get('headers'),
            callback=self.parse_patent_detail,
            meta={'sipo': kwargs.pop('sipo'), 'data_item': kwargs.pop('data_item'), 'patent_id': patent_id,
                  'law_info': {'nrdAn': kwargs.pop('nrdAn'), 'nrdPn': kwargs.pop('nrdPn')}}
        ) 
Example #6
Source File: patent.py    From PatentCrawler with Apache License 2.0
def gen_related_info(self, **kwargs):
        """
        生成相关信息的请求,包含法律信息和同族信息
        :param sipo:
        :param data_item:
        :param nrdAn:
        :param nrdPn:
        :return:
        """
        form_data = url_related_info.get('form_data')
        form_data.__setitem__('literaInfo.nrdAn', kwargs.pop('nrdAn'))
        form_data.__setitem__('literaInfo.nrdPn', kwargs.pop('nrdPn'))
        return FormRequest(
            url=url_related_info.get('url'),
            method='POST',
            dont_filter=True,  # duplicates may be collected here, but we still want them, so filtering is disabled
            formdata=form_data,
            callback=self.parse_related_info,
            meta={'sipo': kwargs.pop('sipo'), 'data_item': kwargs.pop('data_item'), 'patent_id': kwargs.pop('patent_id')}
        ) 
Example #7
Source File: patent.py    From PatentCrawler with Apache License 2.0
def gen_full_text(self, **kwargs):
        """
        生成全文文本的请求
        :param patent_id:
        :param sipo:
        :param data_item:
        :return:
        """
        patent_id = str(kwargs.pop('patent_id'))
        form_data = url_full_text.get('form_data')
        form_data.__setitem__('nrdAn', patent_id.split('.')[0])
        form_data.__setitem__('cid', patent_id)
        form_data.__setitem__('sid', patent_id)
        return FormRequest(
            url=url_full_text.get('url'),
            method='POST',
            dont_filter=True,  # duplicates may be collected here, but we still want them, so filtering is disabled
            formdata=form_data,
            callback=self.parse_full_text,
            meta={'sipo': kwargs.pop('sipo'), 'data_item': kwargs.pop('data_item')}
        ) 
Example #8
Source File: future_dce_spider.py    From fooltrader with MIT License
def request_currentyear_kdata(self):
        today = pd.Timestamp.today()
        requests = []
        start = today.date() - pd.Timedelta(days=today.dayofyear - 1)
        for date in pd.date_range(start=start, end=today):
            the_dir = get_exchange_cache_path(
                security_type='future', exchange='dce',
                the_date=to_timestamp(date), data_type="day_kdata") + '.xls'
            # skip weekends and files that were already downloaded
            if date.dayofweek < 5 and not os.path.exists(the_dir):
                requests.append(FormRequest(
                    url="http://www.dce.com.cn/publicweb/quotesdata/exportDayQuotesChData.html",
                    formdata={
                        'year': str(date.year),
                        'month': str(date.month - 1),  # the endpoint takes a zero-based month
                        'day': str(date.day),
                        'dayQuotes.trade_type': '0',
                        'dayQuotes.variety': 'all',
                        'exportType': 'excel'
                    },
                    callback=self.download_dce_kline_data,
                    meta={'filename': the_dir}))
        return requests 
Example #9
Source File: future_dce_spider.py    From fooltrader with MIT License
def request_inventory_data(self):
        today = pd.Timestamp.today()
        requests = []
        for date in pd.date_range(start=today.date() - pd.Timedelta(weeks=520), end=today):
            the_dir = get_exchange_cache_path(
                security_type='future', exchange='dce',
                the_date=to_timestamp(date), data_type="day_inventory") + '.zip'
            # skip weekends and files that were already downloaded
            if date.dayofweek < 5 and not os.path.exists(the_dir):
                requests.append(FormRequest(
                    url="http://www.dce.com.cn/publicweb/quotesdata/exportMemberDealPosiQuotesBatchData.html",
                    formdata={
                        'batchExportFlag': 'batch',
                        'contract.contract_id': 'all',
                        'contract.variety_id': 'a',
                        'year': str(date.year),
                        'month': str(date.month - 1),  # the endpoint takes a zero-based month
                        'day': str(date.day),
                        'memberDealPosiQuotes.trade_type': '0',
                        'memberDealPosiQuotes.variety': 'all'
                    },
                    callback=self.download_dce_kline_data,
                    meta={'filename': the_dir}))
        return requests 
Example #10
Source File: patent.py    From PatentCrawler with Apache License 2.0
def start_requests(self):
        """
        初始请求
        :return:
        """
        for sipo in self.query_list:
            headers = url_search.get('headers')
            search_exp_cn = sipo.search_exp_cn
            logger.info('检索表达式--- %s' % search_exp_cn)
            form_data = url_search.get('form_data')
            form_data.__setitem__('searchCondition.searchExp', search_exp_cn)
            yield FormRequest(
                url=url_search.get('url'),
                callback=self.parse,
                method="POST",
                headers=headers,
                formdata=form_data,
                meta={'sipo': sipo}
            ) 
Example #11
Source File: douban.py    From spider_python with Apache License 2.0
def parse_person_center(self, response):
        """
        解析个人中心页面
        :param response:
        :return:
        """
        if response.url == self.person_center_url:
            print('进入到个人中心页面了')
            ck = response.xpath('//input[@name="ck"]/@value').get()
            print('获取的ck是:%s' % ck)
            formdata = {
                'ck': ck,
                'signature': '时光如水,岁月如斯'
            }
            # 发送post请求来更改签名
            yield scrapy.FormRequest(self.edit_signature, formdata=formdata)
        else:
            print('进入个人中心页面失败') 
Example #12
Source File: nachrichten_at.py    From PyFeeds with GNU Affero General Public License v3.0
def _login(self, response):
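        # Note: receiving a response back from a bare ``yield`` only works
        # because this project presumably wraps its callbacks in an
        # inline-requests style decorator; a plain Scrapy callback cannot
        # await responses this way.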
        response = yield scrapy.Request(
            "https://www.{}/login/".format(self.name),
            meta={"cache_expires": timedelta(days=14)},
        )
        response = yield scrapy.FormRequest(
            "https://www.{}/login/".format(self.name),
            formdata=OrderedDict(
                [
                    ("user[control][login]", "true"),
                    ("permanent", "checked"),
                    ("username", self._username),
                    ("password", self._password),
                ]
            ),
            meta={"cache_expires": timedelta(days=14)},
        )
        if response and response.css(".notloggedin"):
            # We tried to login but we failed.
            self.logger.error("Login failed: Username or password wrong") 
Example #13
Source File: test-scrapy.py    From python-examples with MIT License
def parse_item(self,response):
        #print('parse_item] url:', response.url)
        #print('parse_item] text:', response.text)

        #for quote in response.xpath('//div[contains(@style,"overflow-x:auto")]'):
        #    for row in quote.xpath('./table[contains(@class,"table-striped")]/tbody/tr'):
        #        link = row.xpath('td[1]/a/@href').extract_first()
        #        yield scrapy.Request(link, callback=self.parse_product)

        for row in response.xpath('//table[@name="MVCGridTable_advancesearchawardedprojectsp"]/tbody/tr'):
            link = row.xpath('.//a/@href').get()
            #title = row.xpath('.//a/text()').get()
            yield scrapy.Request(link, callback=self.parse_product)

        # create request for next page
        onclick = response.xpath('//a[@aria-label="Next page"]/@onclick').get()
        
        if onclick:
            # next page 
            self.args['page'] += 1
            args = urllib.parse.urlencode(self.args)
            url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args
            yield scrapy.FormRequest(url, callback=self.parse_item, method='POST', formdata=self.params, headers={'X-Requested-With': 'XMLHttpRequest'}) 
Example #14
Source File: scrapper.py    From scrape with MIT License
def extract_more_news(self, response):
        pattern = re.compile(r'var min_news_id\s+=\s+"(.*?)"')
        script = response.css('script[type*="text/javascript"]').extract()[-1]
        try:
            news_offset = pattern.search(script).group(1)
        except AttributeError:
            news_offset = response.meta['min_news_id']
        # Scrapy is asynchronous: yield the request and parse its JSON
        # response in a callback (there is no scrapy.fetch)
        yield scrapy.FormRequest('https://www.inshorts.com/en/ajax/more_news',
                                 formdata={'news_offset': news_offset},
                                 callback=self.parse_more_news)

def parse_more_news(self, response):
        contents = json.loads(response.text)
        # the JSON payload carries an HTML fragment; wrap it in a Selector
        # so the usual CSS extraction API works on it
        selector = scrapy.Selector(text=contents['html'])
        for news in selector.css('div.news-card'):
            item = {
                'title': news.css('a.clickable>span::text').extract_first(),
                'author': news.css('span.author::text').extract_first(),
                'time': news.css('span.time::text').extract_first(),
                'date': news.css('span[clas*=date]::text').extract_first(),
                'content': news.css('div[itemprop*=articleBody]::text').extract_first(),
                'link': news.css('div.read-more>a::attr(href)').extract_first()
            }
            yield item 
Example #15
Source File: cursos_prouni.py    From cursos-prouni with GNU Lesser General Public License v3.0
def parse(self, response):
        'Extract the names of the available courses'

        html = response.body_as_unicode()
        codigo_cursos = [line for line in html.splitlines()
                         if 'var listaProcurar' in line][0]
        json_cursos = codigo_cursos.replace('var listaProcurar =', '').strip()[:-1]
        for curso_busca in json.loads(json_cursos):
            curso_busca = curso_busca['id']
            yield FormRequest(
                callback=self.parse_cidades,
                formdata={'opcao': '1', 'tipo': '3', 'valor': curso_busca},
                meta={'curso_busca': curso_busca},
                method='POST',
                url='http://prounialuno.mec.gov.br/consulta/resultado-procurar/',
            ) 
Example #16
Source File: anncSpider.py    From China_stock_announcement with MIT License
def parse(self, response, datei):
        j = json.loads(response.body_as_unicode())
        totalRecordNum = j['totalRecordNum']
        # integer ceiling division: the endpoint returns 30 records per page
        pageNum = totalRecordNum // 30 + 1 if totalRecordNum % 30 > 0 else totalRecordNum // 30
        for i in range(1, pageNum + 1):
            yield scrapy.FormRequest(
                url=req,  # `req` is defined elsewhere in the source file
                method="POST",
                formdata={'column': self.type, 'seDate': str(datei)[0:10],
                          'pageNum': str(i), 'tabName': 'fulltext'},
                callback=lambda response, datei=datei: self.main(response, datei),
                dont_filter=True) 
Example #17
Source File: lwn_net.py    From PyFeeds with GNU Affero General Public License v3.0
def start_requests(self):
        if not self.settings.get("HTTPCACHE_ENABLED"):
            self.logger.error("LWN.net spider requires caching to be enabled.")
            return

        username = self.settings.get("FEEDS_SPIDER_LWN_NET_USERNAME")
        password = self.settings.get("FEEDS_SPIDER_LWN_NET_PASSWORD")
        if username and password:
            yield scrapy.FormRequest(
                url="https://{}/login".format(self.name),
                formdata=OrderedDict(
                    [
                        ("Username", username),
                        ("Password", password),
                        ("target", "/MyAccount/"),
                        ("submit", "Log+in"),
                    ]
                ),
                callback=self._after_login,
                # Session cookie is valid for a month. 14 days is a good compromise.
                meta={"cache_expires": timedelta(days=14)},
            )
        else:
            # Username, password or section not found in feeds.cfg.
            self.logger.info(
                "Login failed: No username or password given. "
                "Only free articles are available in full text."
            )
            yield self._start_requests() 
Example #18
Source File: main.py    From python-examples with MIT License
def start_requests(self):

        # create request for first page
        args = urllib.parse.urlencode(self.args)

        url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args

        yield scrapy.FormRequest(url, callback=self.parse_item, method='POST', formdata=self.params, headers={'X-Requested-With': 'XMLHttpRequest'}) 
Example #19
Source File: main.py    From python-examples with MIT License
def parse_item(self,response):
        #print('parse_item] url:', response.url)
        #print('parse_item] text:', response.text)

        #for quote in response.xpath('//div[contains(@style,"overflow-x:auto")]'):
        #    for row in quote.xpath('./table[contains(@class,"table-striped")]/tbody/tr'):
        #        link = row.xpath('td[1]/a/@href').extract_first()
        #        yield scrapy.Request(link, callback=self.parse_product)

        for row in response.xpath('//table[@name="MVCGridTable_advancesearchawardedprojectsp"]/tbody/tr'):
            cols = row.xpath('.//td')
            link = cols[0].xpath('.//a/@href').get().strip()
            title = cols[0].xpath('.//a/text()').get().strip()
            status = cols[1].xpath('.//text()').get().strip()
            pi = cols[2].xpath('.//text()').get().strip()
            hi = cols[3].xpath('.//text()').get().strip()
            date = cols[4].xpath('.//text()').get().strip()

            item = {
                #'id': project_id,
                'status': status,
                'title': title,
                'link': link,
                'pi': pi,
                'hi': hi,
                'date': date,
            }
        
            # a few links redirect to the main page and would be dropped by the dupe filter, so `dont_filter=True` is needed
            yield scrapy.Request(link, meta={'item': item}, callback=self.parse_product, dont_filter=True)

        # create request for next page
        onclick = response.xpath('//a[@aria-label="Next page"]/@onclick').get()

        if onclick:
            # next page 
            self.args['page'] += 1
            args = urllib.parse.urlencode(self.args)
            url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args
            yield scrapy.FormRequest(url, callback=self.parse_item, method='POST', formdata=self.params, headers={'X-Requested-With': 'XMLHttpRequest'}) 
Example #20
Source File: test-scrapy.py    From python-examples with MIT License
def start_requests(self):
        
        # create request for first page
        args = urllib.parse.urlencode(self.args)

        url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args

        yield scrapy.FormRequest(url, callback=self.parse_item, method='POST', formdata=self.params, headers={'X-Requested-With': 'XMLHttpRequest'}) 
Example #21
Source File: munksroll.py    From StrepHit with GNU General Public License v3.0
def start_requests(self):
        yield FormRequest('http://munksroll.rcplondon.ac.uk/Biography/Search',
                          self.parse, formdata={'Forename': '', 'Surname': ''}) 
Example #22
Source File: wshang_spider.py    From NewsScrapy with GNU Lesser General Public License v3.0
def parse(self, response):
        """

        :param response:
        :return:抛出每个类别的post请求

                post参数:
                    inslider
                    page
                    pagesize
                Content-Type:application/x-www-form-urlencoded
        """
        soup = BeautifulSoup(response.body)
        menu = soup.find_all("a",class_="ui-more")  #所有的类别的链接
        if menu:
            for topic in menu:
                topic_name = topic.text.replace(u"查看","")
                topic_url = topic.get("href")
                self.flag.setdefault(topic_url,0)
                page="1"
                #post_data需要字符串
                post_data = {
                    "inslider":"0",
                    "page":page,
                    "pagesize":"10"
                }
                # yield scrapy.Request(topic_url,
                #                      callback=self.parse_topic,
                #                      method="POST",
                #                      headers={"Content-Type":"application/x-www-form-urlencoded"},
                #                      body=json.dumps(post_data)
                #                      )
                yield scrapy.FormRequest(
                    url=topic_url,
                    formdata=post_data,
                    callback=self.parse_topic,
                    meta={"page":page,"topic_name":topic_name}
                ) 
Example #23
Source File: ctcnn_spider.py    From NewsScrapy with GNU Lesser General Public License v3.0
def parse_newest(self, response):
        soup = BeautifulSoup(response.body,"lxml")
        page =response.request.body.split('=')[-1]
        li = soup.find_all('li')
        if li:
            for news in li :
                news_date = news.find(class_="time").string[2:] if news.find(class_="time") else None
                struct_date = datetime.datetime.strptime(news_date,"%Y-%m-%d %H:%M")
                news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
                title = news.find(class_="title").string if news.find(class_="title") else None
                news_url = self.domain+news.find(class_="title").a.get("href",None) if news.find(class_="title") else None
                abstract = news.find(class_="info").string if news.find(class_="info") else None
                pic = self.domain+news.find('img').get('src',None) if news.find('img') else None
                topic = news.find(class_="type").string if news.find(class_="type") else None
                item = NewsItem(catalogue=u"最新内容",
                                title=title,
                                news_url=news_url,
                                abstract=abstract,
                                pic=pic,
                                topic=topic,
                                news_date=news_date)
                item = judge_news_crawl(item)
                if item:
                    request = scrapy.Request(news_url,callback=self.parse_news,dont_filter=True)
                    request.meta["item"] = item
                    yield request
                else:
                    self.flag=page
        else:
            logger.info("can't find news list")

        # next page
        if not self.flag:
            new_request = scrapy.FormRequest(self.start_url,formdata={'page':str(int(page)+1)},callback=self.parse_newest)
            yield new_request 
Example #24
Source File: ctcnn_spider.py    From NewsScrapy with GNU Lesser General Public License v3.0
def parse(self,response):
        yield scrapy.FormRequest(self.start_url,formdata={'page':'1'},callback=self.parse_newest)
        soup = BeautifulSoup(response.body,"lxml")

        index_list = soup.find(class_="index-first-list")("li") if soup.find(class_="index-first-list") else []  # empty list keeps the loop below safe
        for news in index_list:
            title = news.h2.a.string if news.h2.a else None
            abstract = news.p.string if news.p else None
            news_url = self.domain+news.a.get("href",None) if news.a else None
            item = NewsItem(title=title,abstract=abstract,news_url=news_url,catalogue=u"原创内容")
            request = scrapy.Request(news_url,self.parse_news,dont_filter=True)
            request.meta["item"] = item
            yield request

    # list of the latest content 
Example #25
Source File: cursos_prouni.py    From cursos-prouni with GNU Lesser General Public License v3.0 5 votes vote down vote up
def parse_cidades(self, response):
        'For each course name returned, extract the cities that offer it'

        meta = {
            'curso_busca': response.request.meta['curso_busca'],
        }

        uf = None
        for child in response.xpath('//div[@id="conteudo_esquerda"]/*'):
            if child.re('<div class="uf_lista"'):
                uf = child.xpath('./text()').extract()[0]

            else:
                cidade = child.xpath('./li/a')
                cidade_meta = meta.copy()
                cidade_meta['cidade_busca'] = cidade.xpath('./text()').extract()[0]
                cidade_meta['cidade_filtro'] = \
                        cidade.xpath('./@onclick').extract()[0]\
                        .replace("mostrarResultadoFinal('", '')\
                        .replace("')", '')
                cidade_meta['uf_busca'] = uf

                yield FormRequest(
                        callback=self.parse_cursos,
                        formdata={
                            'opcao': '',
                            'tipo': '3',
                            'valor': meta['curso_busca'],
                            'filtro': cidade_meta['cidade_filtro'],
                        },
                        meta=cidade_meta,
                        method='POST',
                        url='http://prounialuno.mec.gov.br/consulta/resultado-final-procurar/',
                ) 
Example #26
Source File: renren.py    From Python24 with MIT License
def start_requests(self):
        url = self.start_urls[0]
        post_data = {
            'email': '18949599846',
            'password':'shengjun'
        }

        # send the POST request
        yield scrapy.FormRequest(url=url, formdata=post_data, callback=self.parse) 
Example #27
Source File: pixiv-beta.py    From Pixiv-Crawler with GNU General Public License v3.0
def login(self, response):
        index_request = requests.get('http://www.pixiv.net', headers=self.header)
        index_cookie = index_request.cookies
        index_html = index_request.text
        pixiv_token = re.search(r'pixiv.context.token = (")(.*?)(")', index_html).group()
        start = pixiv_token.find('"')
        token = pixiv_token[start + 1:-1]
        # post_key = re.match('.*"pixivAccount.postKey":"(\w+?)"', response.text, re.S).group(1)
        print("please login")
        account = self.account if self.account else input("account >")
        password = self.password if self.password else input("password >")
        post_data = {
            "pixiv_id": account,
            "password": password,
            "captcha": "",
            "g_recaptcha_response": "",
            "post_key": token,
            "source": "pc",
            "ref": "wwwtop_accounts_index",
            "return_to": "http://www.pixiv.net/",
        }
        return [scrapy.FormRequest("https://accounts.pixiv.net/api/login?lang=zh",
                                   headers=self.header, formdata=post_data,
                                   callback=self.center, cookies=dict(index_cookie))]

    # feature branch 
Example #28
Source File: ettoday_tag_spider.py    From Taiwan-news-crawlers with MIT License
def parse_news_list(self, response):
        has_next_page = True
        response.meta['iter_time'] += 1
        current_date_str = response.meta['date_str']
        is_first_iter = response.meta['iter_time'] == 1
        prefix = '.part_list_2' if is_first_iter else ''
        for news_item in response.css(prefix + ' h3'):
            url = news_item.css('a::attr(href)').extract_first()
            if ROOT_URL not in url:
                url = ROOT_URL + url
            category = news_item.css('em::text').extract_first()
            date_time = news_item.css('span::text').extract_first()

            if current_date_str not in date_time:
                has_next_page = False
                continue

            response.meta['category'] = category
            yield scrapy.Request(
                url, callback=self.parse_tag_of_news, meta=response.meta)
        if has_next_page:
            tFile = datetime.date.today().strftime('%Y%m%d') + '.xml'
            yield scrapy.FormRequest(
                url="http://www.ettoday.net/show_roll.php",
                callback=self.parse_news_list,
                meta=response.meta,
                formdata={
                    'offset': str(response.meta['iter_time']),
                    'tPage': '3',
                    'tFile': tFile,
                    'tOt': '0',
                    'tSi': '100'
                }) 
Example #29
Source File: pts_spider.py    From Taiwan-news-crawlers with MIT License
def parse_news_list(self, response):
        response.meta['iter_time'] = 1
        for news_item in response.css('ul.list-news li'):
            url = news_item.css('h2 a::attr(href)').extract_first()
            date_time = news_item.css('.list-news-time::text').extract_first()
            title = news_item.css('h2 a::text').extract_first()
            content = news_item.css(
                '.list-news-description::text').extract_first()
            category = news_item.css(
                '.list-news-program::text').extract_first()

            if TODAY in date_time:
                yield {
                    'website': '公視',
                    'url': url,
                    'title': title,
                    'date': date_time,
                    'content': content,
                    'category': category
                }

        yield scrapy.FormRequest(
            url='https://news.pts.org.tw/list/getmore.php',
            callback=self.get_news,
            meta=response.meta,
            formdata={
                'page': '1'
            }) 
Example #30
Source File: ettoday_spider.py    From Taiwan-news-crawlers with MIT License
def parse_news_list(self, response):
        has_next_page = True
        response.meta['iter_time'] += 1
        is_first_iter = response.meta['iter_time'] == 1
        prefix = '.part_list_2' if is_first_iter else ''
        for news_item in response.css(prefix + ' h3'):
            url = news_item.css('a::attr(href)').extract_first()
            url = ROOT_URL + url
            category = news_item.css('em::text').extract_first()
            date_time = news_item.css('span::text').extract_first()

            if TODAY not in date_time:
                has_next_page = False
                continue

            response.meta['category'] = category
            yield scrapy.Request(
                url, callback=self.parse_news, meta=response.meta)
        if has_next_page:
            tFile = time.strftime('%Y%m%d') + '.xml'
            yield scrapy.FormRequest(
                url="https://www.ettoday.net/show_roll.php",
                callback=self.parse_news_list,
                meta=response.meta,
                formdata={
                    'offset': str(response.meta['iter_time']),
                    'tPage': '3',
                    'tFile': tFile,
                    'tOt': '0',
                    'tSi': '100'
                })