Python scrapy.FormRequest() Examples
The following are 30 code examples of scrapy.FormRequest(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy, or try the search function.
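Before the project examples, a minimal sketch of the two usual ways to build a FormRequest may help: constructing the request directly with formdata, or letting FormRequest.from_response pre-fill fields from a form found in the page. The spider name, URL, form field names and callbacks below are illustrative placeholders only, not taken from any of the projects listed.

import scrapy


class FormRequestDemoSpider(scrapy.Spider):
    # Hypothetical spider for illustration only; the URL, field names and
    # callbacks are placeholders, not a real site or project.
    name = 'formrequest_demo'
    start_urls = ['https://example.com/login']

    def parse(self, response):
        # Option 1: build the POST request explicitly. Every formdata value
        # must be a string; Scrapy encodes the body as
        # application/x-www-form-urlencoded.
        yield scrapy.FormRequest(
            url='https://example.com/login',
            formdata={'username': 'user', 'password': 'secret'},
            callback=self.after_login,
        )

        # Option 2: let Scrapy locate the <form> in the page (assuming one
        # exists), merge in our values and keep hidden fields such as CSRF
        # tokens intact.
        yield scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'user', 'password': 'secret'},
            callback=self.after_login,
        )

    def after_login(self, response):
        self.logger.info('Landed on %s after submitting the form', response.url)

from_response is usually the better fit for logins because it carries over hidden inputs such as CSRF tokens, while the plain constructor suits AJAX-style endpoints like the pagination calls in several of the examples below.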
Example #1
Source File: pts_spider.py From Taiwan-news-crawlers with MIT License | 7 votes |
def get_news(self, response):
    response.meta['iter_time'] += 1
    news_items = json.loads(response.text)
    if news_items:
        for n in news_items:
            yield {
                'website': '公視',
                'url': ARTICLE_PREFIX + n['news_id'],
                'title': n['subject'],
                'date': n['news_date'],
                'content': n['content'],
                'category': n['program_name']
            }
        yield scrapy.FormRequest(
            url="https://news.pts.org.tw/list/getmore.php",
            callback=self.get_news,
            meta=response.meta,
            formdata={'page': str(response.meta['iter_time'])})
Example #2
Source File: inshorts_scraper.py From scrape with MIT License | 6 votes |
def parse(self, response):
    for news in response.css('div.news-card'):
        item = {
            'headline': news.css('a.clickable>span::text').extract_first(),
            'author': news.css('span.author::text').extract_first(),
            'time': news.css('span.time::text').extract_first(),
            'date': news.css('span[clas*=date]::text').extract_first(),
            'body': news.css('div[itemprop*=articleBody]::text').extract_first(),
        }
        yield item
    while self.pages > 1:
        pattern = re.compile('var min_news_id\s+=\s+"(.*?)"')
        script = response.css('script[type*="text/javascript"]').extract()[-1]
        id = pattern.search(script).group(1)
        r = scrapy.FormRequest('https://www.inshorts.com/en/ajax/more_news',
                               callback=self.parse,
                               formdata={'news_offset': id})
        yield r  # yield the request itself so Scrapy schedules it
        self.pages -= 1
Example #3
Source File: mangaspider.py From kmanga with GNU General Public License v3.0 | 6 votes |
def parse_login(self, response):
    self._check_login_params()
    self._login = False
    form_data = {
        self.username_field: self.username,
        self.password_field: self.password
    }
    if hasattr(self, 'form_xpath'):
        return scrapy.FormRequest.from_response(
            response,
            formxpath=self.form_xpath,
            formdata=form_data,
            callback=self.parse_after_login
        )
    elif hasattr(self, 'form_url'):
        return scrapy.FormRequest(
            self.form_url,
            formdata=form_data,
            callback=self.parse_after_login
        )
Example #4
Source File: post_pass_item.py From scrapy-templates with MIT License | 6 votes |
def parse(self, response):
    item_loader = ItemLoader(item=MyItem(), response=response)
    item_loader.default_input_processor = MapCompose(remove_tags)
    #item_loader.add_css("", "")
    #item_loader.add_css("", "")
    #item_loader.add_css("", "")
    yield FormRequest("POST_URL",
                      formdata={'parameter': 'p'},
                      meta={'item': item_loader.load_item()},
                      callback=self.populate_field)
Example #5
Source File: patent.py From PatentCrawler with Apache License 2.0 | 6 votes |
def gen_detail(self, **kwargs):
    """
    Build the request for the patent detail query.
    :param patent_id, sipo, data_item, nrdAn, nrdPn:
    :return:
    """
    patent_id = str(kwargs.pop('patent_id'))
    formdata = url_detail.get('form_data')
    formdata.__setitem__('nrdAn', patent_id.split('.')[0])
    formdata.__setitem__('cid', patent_id)
    formdata.__setitem__('sid', patent_id)
    return FormRequest(
        url=url_detail.get('url'),
        formdata=formdata,
        headers=url_detail.get('headers'),
        callback=self.parse_patent_detail,
        meta={'sipo': kwargs.pop('sipo'),
              'data_item': kwargs.pop('data_item'),
              'patent_id': patent_id,
              'law_info': {'nrdAn': kwargs.pop('nrdAn'),
                           'nrdPn': kwargs.pop('nrdPn')}}
    )
Example #6
Source File: patent.py From PatentCrawler with Apache License 2.0 | 6 votes |
def gen_related_info(self, **kwargs):
    """
    Build the request for related information, covering legal-status and patent-family data.
    :param sipo:
    :param data_item:
    :param nrdAn:
    :param nrdPn:
    :return:
    """
    form_data = url_related_info.get('form_data')
    form_data.__setitem__('literaInfo.nrdAn', kwargs.pop('nrdAn'))
    form_data.__setitem__('literaInfo.nrdPn', kwargs.pop('nrdPn'))
    return FormRequest(
        url=url_related_info.get('url'),
        method='POST',
        dont_filter=True,  # duplicate requests may occur here, but we still want them, so the dupe filter is disabled
        formdata=form_data,
        callback=self.parse_related_info,
        meta={'sipo': kwargs.pop('sipo'),
              'data_item': kwargs.pop('data_item'),
              'patent_id': kwargs.pop('patent_id')}
    )
Example #7
Source File: patent.py From PatentCrawler with Apache License 2.0 | 6 votes |
def gen_full_text(self, **kwargs):
    """
    Build the request for the full patent text.
    :param patent_id:
    :param sipo:
    :param data_item:
    :return:
    """
    patent_id = str(kwargs.pop('patent_id'))
    form_data = url_full_text.get('form_data')
    form_data.__setitem__('nrdAn', patent_id.split('.')[0])
    form_data.__setitem__('cid', patent_id)
    form_data.__setitem__('sid', patent_id)
    return FormRequest(
        url=url_full_text.get('url'),
        method='POST',
        dont_filter=True,  # duplicate requests may occur here, but we still want them, so the dupe filter is disabled
        formdata=form_data,
        callback=self.parse_full_text,
        meta={'sipo': kwargs.pop('sipo'), 'data_item': kwargs.pop('data_item')}
    )
Example #8
Source File: future_dce_spider.py From fooltrader with MIT License | 6 votes |
def request_currentyear_kdata(self):
    today = pd.Timestamp.today()
    requests = []
    for date in pd.date_range(start=today.date() - pd.Timedelta(days=today.dayofyear - 1), end=today):
        the_dir = get_exchange_cache_path(security_type='future', exchange='dce',
                                          the_date=to_timestamp(date), data_type="day_kdata") + '.xls'
        if (date.dayofweek < 5 and not os.path.exists(the_dir)):
            requests.append(
                FormRequest(url="http://www.dce.com.cn/publicweb/quotesdata/exportDayQuotesChData.html",
                            formdata={
                                'year': str(date.year),
                                'month': str(date.month - 1),
                                'day': str(date.day),
                                'dayQuotes.trade_type': '0',
                                'dayQuotes.variety': 'all',
                                'exportType': 'excel'
                            },
                            callback=self.download_dce_kline_data,
                            meta={'filename': the_dir}))
    return requests
Example #9
Source File: future_dce_spider.py From fooltrader with MIT License | 6 votes |
def request_inventory_data(self):
    today = pd.Timestamp.today()
    requests = []
    for date in pd.date_range(start=today.date() - pd.Timedelta(weeks=520), end=today):
        the_dir = get_exchange_cache_path(security_type='future', exchange='dce',
                                          the_date=to_timestamp(date), data_type="day_inventory") + '.zip'
        if (date.dayofweek < 5 and not os.path.exists(the_dir)):
            requests.append(
                FormRequest(url="http://www.dce.com.cn/publicweb/quotesdata/exportMemberDealPosiQuotesBatchData.html",
                            formdata={
                                'batchExportFlag': 'batch',
                                'contract.contract_id': 'all',
                                'contract.variety_id': 'a',
                                'year': str(date.year),
                                'month': str(date.month - 1),
                                'day': str(date.day),
                                'memberDealPosiQuotes.trade_type': '0',
                                'memberDealPosiQuotes.variety': 'all'
                            },
                            callback=self.download_dce_kline_data,
                            meta={'filename': the_dir}))
    return requests
Example #10
Source File: patent.py From PatentCrawler with Apache License 2.0 | 6 votes |
def start_requests(self):
    """
    Initial requests.
    :return:
    """
    for sipo in self.query_list:
        headers = url_search.get('headers')
        search_exp_cn = sipo.search_exp_cn
        logger.info('Search expression --- %s' % search_exp_cn)
        form_data = url_search.get('form_data')
        form_data.__setitem__('searchCondition.searchExp', search_exp_cn)
        yield FormRequest(
            url=url_search.get('url'),
            callback=self.parse,
            method="POST",
            headers=headers,
            formdata=form_data,
            meta={'sipo': sipo}
        )
Example #11
Source File: douban.py From spider_python with Apache License 2.0 | 6 votes |
def parse_person_center(self, response):
    """
    Parse the personal-center page.
    :param response:
    :return:
    """
    if response.url == self.person_center_url:
        print('Reached the personal center page')
        ck = response.xpath('//input[@name="ck"]/@value').get()
        print('The ck value obtained is: %s' % ck)
        formdata = {
            'ck': ck,
            'signature': '时光如水,岁月如斯'
        }
        # send a POST request to update the signature
        yield scrapy.FormRequest(self.edit_signature, formdata=formdata)
    else:
        print('Failed to reach the personal center page')
Example #12
Source File: nachrichten_at.py From PyFeeds with GNU Affero General Public License v3.0 | 6 votes |
def _login(self, response):
    response = yield scrapy.Request(
        "https://www.{}/login/".format(self.name),
        meta={"cache_expires": timedelta(days=14)},
    )
    response = yield scrapy.FormRequest(
        "https://www.{}/login/".format(self.name),
        formdata=OrderedDict(
            [
                ("user[control][login]", "true"),
                ("permanent", "checked"),
                ("username", self._username),
                ("password", self._password),
            ]
        ),
        meta={"cache_expires": timedelta(days=14)},
    )
    if response and response.css(".notloggedin"):
        # We tried to login but we failed.
        self.logger.error("Login failed: Username or password wrong")
Example #13
Source File: test-scrapy.py From python-examples with MIT License | 6 votes |
def parse_item(self, response):
    #print('parse_item] url:', response.url)
    #print('parse_item] text:', response.text)

    #for quote in response.xpath('//div[contains(@style,"overflow-x:auto")]'):
    #    for row in quote.xpath('./table[contains(@class,"table-striped")]/tbody/tr'):
    #        link = row.xpath('td[1]/a/@href').extract_first()
    #        yield scrapy.Request(link, callback=self.parse_product)

    for row in response.xpath('//table[@name="MVCGridTable_advancesearchawardedprojectsp"]/tbody/tr'):
        link = row.xpath('.//a/@href').get()
        #title = row.xpath('.//a/text()').get()
        yield scrapy.Request(link, callback=self.parse_product)

    # create request for next page
    onclick = response.xpath('//a[@aria-label="Next page"]/@onclick').get()
    if onclick:
        # next page
        self.args['page'] += 1
        args = urllib.parse.urlencode(self.args)
        url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args
        yield scrapy.FormRequest(url, callback=self.parse_item, method='POST',
                                 formdata=self.params,
                                 headers={'X-Requested-With': 'XMLHttpRequest'})
Example #14
Source File: scrapper.py From scrape with MIT License | 6 votes |
def extract_more_news(self):
    # NOTE: this helper is incomplete in the original project -- it references
    # `response` without receiving it, `scrapy.fetch()` is not part of the
    # Scrapy API (the FormRequest would normally be yielded from a callback),
    # and the lxml calls mix Scrapy-style CSS pseudo-selectors with lxml elements.
    pattern = re.compile('var min_news_id\s+=\s+"(.*?)"')
    script = response.css('script[type*="text/javascript"]').extract()[-1]
    try:
        id = pattern.search(script).group(1)
    except:
        id = response['min_news_id']
    r = scrapy.FormRequest('https://www.inshorts.com/en/ajax/more_news',
                           formdata={'news_offset': id})
    scrapy.fetch(r)
    contents = json.loads(response.text)
    html = lxml.html.fromstring(contents['html'])
    for news in lxml.cssselect('div.news-card'):
        item = {
            'title': news.cssselect('a.clickable>span::text').extract_first(),
            'author': news.css('span.author::text').extract_first(),
            'time': news.css('span.time::text').extract_first(),
            'date': news.css('span[clas*=date]::text').extract_first(),
            'content': news.css('div[itemprop*=articleBody]::text').extract_first(),
            'link': news.css('div.read-more>a::attr(href)')
        }
        yield item
Example #15
Source File: cursos_prouni.py From cursos-prouni with GNU Lesser General Public License v3.0 | 6 votes |
def parse(self, response):
    'Extracts the names of the available courses'
    html = response.body_as_unicode()
    codigo_cursos = [line for line in html.splitlines()
                     if 'var listaProcurar' in line][0]
    json_cursos = codigo_cursos.replace('var listaProcurar =', '').strip()[:-1]
    for curso_busca in json.loads(json_cursos):
        curso_busca = curso_busca['id']
        yield FormRequest(
            callback=self.parse_cidades,
            formdata={'opcao': '1', 'tipo': '3', 'valor': curso_busca},
            meta={'curso_busca': curso_busca},
            method='POST',
            url='http://prounialuno.mec.gov.br/consulta/resultado-procurar/',
        )
Example #16
Source File: anncSpider.py From China_stock_announcement with MIT License | 5 votes |
def parse(self, response, datei):
    j = json.loads(response.body_as_unicode())
    totalRecordNum = j['totalRecordNum']
    pageNum = totalRecordNum / 30 + 1 if totalRecordNum % 30 > 0 else totalRecordNum / 30
    for i in range(1, pageNum + 1):
        yield scrapy.FormRequest(
            url=req,
            method="POST",
            formdata={'column': self.type,
                      'seDate': str(datei)[0:10],
                      'pageNum': str(i),
                      'tabName': 'fulltext'},
            callback=lambda response, datei=datei: self.main(response, datei),
            dont_filter=True)
Example #17
Source File: lwn_net.py From PyFeeds with GNU Affero General Public License v3.0 | 5 votes |
def start_requests(self):
    if not self.settings.get("HTTPCACHE_ENABLED"):
        self.logger.error("LWN.net spider requires caching to be enabled.")
        return

    username = self.settings.get("FEEDS_SPIDER_LWN_NET_USERNAME")
    password = self.settings.get("FEEDS_SPIDER_LWN_NET_PASSWORD")
    if username and password:
        yield scrapy.FormRequest(
            url="https://{}/login".format(self.name),
            formdata=OrderedDict(
                [
                    ("Username", username),
                    ("Password", password),
                    ("target", "/MyAccount/"),
                    ("submit", "Log+in"),
                ]
            ),
            callback=self._after_login,
            # Session cookie is valid for a month. 14 days is a good compromise.
            meta={"cache_expires": timedelta(days=14)},
        )
    else:
        # Username, password or section not found in feeds.cfg.
        self.logger.info(
            "Login failed: No username or password given. "
            "Only free articles are available in full text."
        )
        yield self._start_requests()
Example #18
Source File: main.py From python-examples with MIT License | 5 votes |
def start_requests(self):
    # create request for first page
    args = urllib.parse.urlencode(self.args)
    url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args
    yield scrapy.FormRequest(url, callback=self.parse_item, method='POST',
                             formdata=self.params,
                             headers={'X-Requested-With': 'XMLHttpRequest'})
Example #19
Source File: main.py From python-examples with MIT License | 5 votes |
def parse_item(self, response):
    #print('parse_item] url:', response.url)
    #print('parse_item] text:', response.text)

    #for quote in response.xpath('//div[contains(@style,"overflow-x:auto")]'):
    #    for row in quote.xpath('./table[contains(@class,"table-striped")]/tbody/tr'):
    #        link = row.xpath('td[1]/a/@href').extract_first()
    #        yield scrapy.Request(link, callback=self.parse_product)

    for row in response.xpath('//table[@name="MVCGridTable_advancesearchawardedprojectsp"]/tbody/tr'):
        cols = row.xpath('.//td')

        link = cols[0].xpath('.//a/@href').get().strip()
        title = cols[0].xpath('.//a/text()').get().strip()
        status = cols[1].xpath('.//text()').get().strip()
        pi = cols[2].xpath('.//text()').get().strip()
        hi = cols[3].xpath('.//text()').get().strip()
        date = cols[4].xpath('.//text()').get().strip()

        item = {
            #'id': project_id,
            'status': status,
            'title': title,
            'link': link,
            'pi': pi,
            'hi': hi,
            'date': date,
        }

        # a few links redirect to the main page and would be filtered out, so `dont_filter=True` is needed
        yield scrapy.Request(link, meta={'item': item}, callback=self.parse_product, dont_filter=True)

    # create request for next page
    onclick = response.xpath('//a[@aria-label="Next page"]/@onclick').get()
    if onclick:
        # next page
        self.args['page'] += 1
        args = urllib.parse.urlencode(self.args)
        url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args
        yield scrapy.FormRequest(url, callback=self.parse_item, method='POST',
                                 formdata=self.params,
                                 headers={'X-Requested-With': 'XMLHttpRequest'})
Example #20
Source File: test-scrapy.py From python-examples with MIT License | 5 votes |
def start_requests(self):
    # create request for first page
    args = urllib.parse.urlencode(self.args)
    url = 'https://researchgrant.gov.sg/eservices/mvcgrid?' + args
    yield scrapy.FormRequest(url, callback=self.parse_item, method='POST',
                             formdata=self.params,
                             headers={'X-Requested-With': 'XMLHttpRequest'})
Example #21
Source File: munksroll.py From StrepHit with GNU General Public License v3.0 | 5 votes |
def start_requests(self):
    yield FormRequest('http://munksroll.rcplondon.ac.uk/Biography/Search',
                      self.parse,
                      formdata={'Forename': '', 'Surname': ''})
Example #22
Source File: wshang_spider.py From NewsScrapy with GNU Lesser General Public License v3.0 | 5 votes |
def parse(self, response):
    """
    :param response:
    :return: yields a POST request for every category
             POST parameters: inslider, page, pagesize
             Content-Type: application/x-www-form-urlencoded
    """
    soup = BeautifulSoup(response.body)
    menu = soup.find_all("a", class_="ui-more")  # links to all the categories
    if menu:
        for topic in menu:
            topic_name = topic.text.replace(u"查看", "")
            topic_url = topic.get("href")
            self.flag.setdefault(topic_url, 0)
            page = "1"
            # post_data values must be strings
            post_data = {
                "inslider": "0",
                "page": page,
                "pagesize": "10"
            }
            # yield scrapy.Request(topic_url,
            #                      callback=self.parse_topic,
            #                      method="POST",
            #                      headers={"Content-Type": "application/x-www-form-urlencoded"},
            #                      body=json.dumps(post_data)
            #                      )
            yield scrapy.FormRequest(
                url=topic_url,
                formdata=post_data,
                callback=self.parse_topic,
                meta={"page": page, "topic_name": topic_name}
            )
Example #23
Source File: ctcnn_spider.py From NewsScrapy with GNU Lesser General Public License v3.0 | 5 votes |
def parse_newest(self, response):
    soup = BeautifulSoup(response.body, "lxml")
    page = response.request.body.split('=')[-1]
    li = soup.find_all('li')
    if li:
        for news in li:
            news_date = news.find(class_="time").string[2:] if news.find(class_="time") else None
            struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d %H:%M")
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")
            title = news.find(class_="title").string if news.find(class_="title") else None
            news_url = self.domain + news.find(class_="title").a.get("href", None) if news.find(class_="title") else None
            abstract = news.find(class_="info").string if news.find(class_="info") else None
            pic = self.domain + news.find('img').get('src', None) if news.find('img') else None
            topic = news.find(class_="type").string if news.find(class_="type") else None
            item = NewsItem(catalogue=u"最新内容",
                            title=title,
                            news_url=news_url,
                            abstract=abstract,
                            pic=pic,
                            topic=topic,
                            news_date=news_date)
            item = judge_news_crawl(item)
            if item:
                request = scrapy.Request(news_url, callback=self.parse_news, dont_filter=True)
                request.meta["item"] = item
                yield request
            else:
                self.flag = page
    else:
        logger.info("can't find news list")
    # next page
    if not self.flag:
        new_request = scrapy.FormRequest(self.start_url,
                                         formdata={'page': str(int(page) + 1)},
                                         callback=self.parse_newest)
        yield new_request
Example #24
Source File: ctcnn_spider.py From NewsScrapy with GNU Lesser General Public License v3.0 | 5 votes |
def parse(self, response):
    yield scrapy.FormRequest(self.start_url, formdata={'page': '1'}, callback=self.parse_newest)
    soup = BeautifulSoup(response.body, "lxml")
    index_list = soup.find(class_="index-first-list")("li") if soup.find(class_="index-first-list") else None
    for news in index_list:
        title = news.h2.a.string if news.h2.a else None
        abstract = news.p.string if news.p else None
        news_url = self.domain + news.a.get("href", None) if news.a else None
        item = NewsItem(title=title, abstract=abstract, news_url=news_url, catalogue=u"原创内容")
        request = scrapy.Request(news_url, self.parse_news, dont_filter=True)
        request.meta["item"] = item
        yield request

# list of the latest content
Example #25
Source File: cursos_prouni.py From cursos-prouni with GNU Lesser General Public License v3.0 | 5 votes |
def parse_cidades(self, response):
    'For each course name returned, extracts the cities that offer it'
    meta = {
        'curso_busca': response.request.meta['curso_busca'],
    }
    uf = None
    for child in response.xpath('//div[@id="conteudo_esquerda"]/*'):
        if child.re('<div class="uf_lista"'):
            uf = child.xpath('./text()').extract()[0]
        else:
            cidade = child.xpath('./li/a')
            cidade_meta = meta.copy()
            cidade_meta['cidade_busca'] = cidade.xpath('./text()').extract()[0]
            cidade_meta['cidade_filtro'] = \
                cidade.xpath('./@onclick').extract()[0]\
                .replace("mostrarResultadoFinal('", '')\
                .replace("')", '')
            cidade_meta['uf_busca'] = uf
            yield FormRequest(
                callback=self.parse_cursos,
                formdata={
                    'opcao': '',
                    'tipo': '3',
                    'valor': meta['curso_busca'],
                    'filtro': cidade_meta['cidade_filtro'],
                },
                meta=cidade_meta,
                method='POST',
                url='http://prounialuno.mec.gov.br/consulta/resultado-final-procurar/',
            )
Example #26
Source File: renren.py From Python24 with MIT License | 5 votes |
def start_requests(self):
    url = self.start_urls[0]
    post_data = {
        'email': '18949599846',
        'password': 'shengjun'
    }
    # send the POST login request
    yield scrapy.FormRequest(url=url, formdata=post_data, callback=self.parse)
Example #27
Source File: pixiv-beta.py From Pixiv-Crawler with GNU General Public License v3.0 | 5 votes |
def login(self, response):
    index_request = requests.get('http://www.pixiv.net', headers=self.header)
    index_cookie = index_request.cookies
    index_html = index_request.text
    pixiv_token = re.search(r'pixiv.context.token = (")(.*?)(")', index_html).group()
    start = pixiv_token.find('"')
    token = pixiv_token[start + 1:-1]
    # post_key = re.match('.*"pixivAccount.postKey":"(\w+?)"', response.text, re.S).group(1)
    print("please login")
    account = self.account if self.account else input("account >")
    password = self.password if self.password else input("password >")
    post_data = {
        "pixiv_id": account,
        "password": password,
        "captcha": "",
        "g_recaptcha_response": "",
        "post_key": token,
        "source": "pc",
        "ref": "wwwtop_accounts_index",
        "return_to": "http://www.pixiv.net/",
    }
    return [scrapy.FormRequest("https://accounts.pixiv.net/api/login?lang=zh",
                               headers=self.header,
                               formdata=post_data,
                               callback=self.center,
                               cookies=dict(index_cookie))]

# feature branching
Example #28
Source File: ettoday_tag_spider.py From Taiwan-news-crawlers with MIT License | 5 votes |
def parse_news_list(self, response):
    has_next_page = True
    response.meta['iter_time'] += 1
    current_date_str = response.meta['date_str']
    is_first_iter = response.meta['iter_time'] == 1
    prefix = '.part_list_2' if is_first_iter else ''
    for news_item in response.css(prefix + ' h3'):
        url = news_item.css('a::attr(href)').extract_first()
        if ROOT_URL not in url:
            url = ROOT_URL + url
        category = news_item.css('em::text').extract_first()
        date_time = news_item.css('span::text').extract_first()
        if current_date_str not in date_time:
            has_next_page = False
            continue
        response.meta['category'] = category
        yield scrapy.Request(
            url, callback=self.parse_tag_of_news, meta=response.meta)
    if has_next_page:
        tFile = datetime.date.today().strftime('%Y%m%d') + '.xml'
        yield scrapy.FormRequest(
            url="http://www.ettoday.net/show_roll.php",
            callback=self.parse_news_list,
            meta=response.meta,
            formdata={
                'offset': str(response.meta['iter_time']),
                'tPage': '3',
                'tFile': tFile,
                'tOt': '0',
                'tSi': '100'
            })
Example #29
Source File: pts_spider.py From Taiwan-news-crawlers with MIT License | 5 votes |
def parse_news_list(self, response):
    response.meta['iter_time'] = 1
    for news_item in response.css('ul.list-news li'):
        url = news_item.css('h2 a::attr(href)').extract_first()
        date_time = news_item.css('.list-news-time::text').extract_first()
        title = news_item.css('h2 a::text').extract_first()
        content = news_item.css(
            '.list-news-description::text').extract_first()
        category = news_item.css(
            '.list-news-program::text').extract_first()
        if TODAY in date_time:
            yield {
                'website': '公視',
                'url': url,
                'title': title,
                'date': date_time,
                'content': content,
                'category': category
            }
    yield scrapy.FormRequest(
        url='https://news.pts.org.tw/list/getmore.php',
        callback=self.get_news,
        meta=response.meta,
        formdata={'page': '1'})
Example #30
Source File: ettoday_spider.py From Taiwan-news-crawlers with MIT License | 5 votes |
def parse_news_list(self, response):
    has_next_page = True
    response.meta['iter_time'] += 1
    is_first_iter = response.meta['iter_time'] == 1
    prefix = '.part_list_2' if is_first_iter else ''
    for news_item in response.css(prefix + ' h3'):
        url = news_item.css('a::attr(href)').extract_first()
        url = ROOT_URL + url
        category = news_item.css('em::text').extract_first()
        date_time = news_item.css('span::text').extract_first()
        if TODAY not in date_time:
            has_next_page = False
            continue
        response.meta['category'] = category
        yield scrapy.Request(
            url, callback=self.parse_news, meta=response.meta)
    if has_next_page:
        tFile = time.strftime('%Y%m%d') + '.xml'
        yield scrapy.FormRequest(
            url="https://www.ettoday.net/show_roll.php",
            callback=self.parse_news_list,
            meta=response.meta,
            formdata={
                'offset': str(response.meta['iter_time']),
                'tPage': '3',
                'tFile': tFile,
                'tOt': '0',
                'tSi': '100'
            })