Python scrapy.http.FormRequest() Examples
The following are 18 code examples of scrapy.http.FormRequest(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.http, or try the search function.
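Before diving into the examples, here is a minimal, hedged sketch of the typical FormRequest pattern: a spider submits form data with a POST request and handles the result in a callback. The URL, field names, and spider class below are illustrative placeholders, not taken from any of the projects listed here.

import scrapy
from scrapy.http import FormRequest


class ExampleLoginSpider(scrapy.Spider):
    # Hypothetical spider, for illustration only.
    name = "example_login"
    start_urls = ["https://example.com/login"]

    def parse(self, response):
        # FormRequest issues a POST with the given form fields and routes
        # the server's response to the callback.
        yield FormRequest(
            "https://example.com/login",
            formdata={"username": "user", "password": "secret"},
            callback=self.after_login,
        )

    def after_login(self, response):
        # Inspect the response returned after the form submission.
        self.logger.info("Login response status: %s", response.status)

Most of the examples below follow this same shape, differing mainly in the form fields, headers, and cookies they attach.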
Example #1
Source File: zhihu_user_spider.py From openslack-crawler with Apache License 2.0 | 7 votes |
def start_requests(self):
    return [FormRequest(
        "http://www.zhihu.com/login",
        formdata={'email': 'june.chan@foxmail.com',
                  'password': 'czj0617_zhihu'},
        callback=self.after_login
    )]
Example #2
Source File: coursera_spider.py From scrapy_example with MIT License | 6 votes |
def start_requests(self):
    print('Preparing login')
    return [FormRequest("https://accounts.coursera.org/api/v1/login",
                        # NOTE: `response` is not defined inside start_requests in the
                        # original source; the headers come from the spider's own
                        # make_header() helper.
                        headers=self.make_header(response),
                        formdata={
                            "email": "1095511864@qq.com",
                            "password": "HUAZANG.55789260",
                            "webrequest": "true"
                        },
                        callback=self.parse_page)]
Example #3
Source File: zhizhu_user_topic_spider.py From Zhihu_Spider with Apache License 2.0 | 6 votes |
def start_requests(self):
    return [FormRequest(
        "http://www.zhihu.com/login",
        formdata={'email': 'xxx@gmail.com',
                  'password': '123456'},
        callback=self.after_login
    )]
Example #4
Source File: zhizhu_user_topic_spider.py From Zhihu_Spider with Apache License 2.0 | 6 votes |
def gen_topic_form(self, response):
    # yield the beginning topics
    sel = Selector(response)
    for topic_sel in sel.xpath('//div[@id="zh-profile-topic-list"]/div[contains(@class, "zm-profile-section-item")]'):
        # new user-topic relationship
        yield self.get_UT_item(topic_sel, response.url)

    # get the number of topics of one user
    num_topic = sel.xpath('//div[contains(@class, "zm-profile-section-wrap")]/div[contains(@class, "zm-profile-section-head")]//span[contains(@class, "zm-profile-section-name")]/text()')
    number_str = num_topic.extract()[0]
    # print number_str
    p = re.compile(r'\d+')
    m = p.findall(number_str)
    if m:
        num_topic = int(m[0])

    # crawl the remaining topics of a user
    base_line = 20
    if num_topic > 20:
        while num_topic > 0:
            yield FormRequest(
                url=response.url,
                formdata={'start': '0',
                          'offset': str(base_line),
                          '_xsrf': self.xsrf},
                callback=self.parse
            )
            num_topic = num_topic - 20
            base_line += 20
Example #5
Source File: zhihu_spider.py From Zhihu_Spider with Apache License 2.0 | 6 votes |
def start_requests(self):
    return [FormRequest(
        "http://www.zhihu.com/login",
        formdata={'email': 'example.com',
                  'password': '123456'},
        callback=self.after_login
    )]
Example #6
Source File: login1_spider.py From openslack-crawler with Apache License 2.0 | 6 votes |
def start_requests(self):
    for i, url in enumerate(self.start_urls):
        yield FormRequest(url, meta={'cookiejar': i},
                          headers=self.headers,
                          cookies=self.cookies,
                          callback=self.parse_item)  # jump to login page
Example #7
Source File: zhihu_answer_spider.py From openslack-crawler with Apache License 2.0 | 6 votes |
def start_requests(self):
    return [FormRequest(
        "http://www.zhihu.com/login",
        formdata={'email': 'june.chan@foxmail.com',
                  'password': 'czj0617_zhihu'},
        callback=self.after_login
    )]
Example #8
Source File: main.py From python-examples with MIT License | 6 votes |
def parse(self, response):
    # small images 200x200
    #urls = response.xpath('//div[@id="thumbsContainer"]//img/@data-original').extract()
    #urls = response.xpath('//img[@class="res-photo-thumbnail thumb-load lazy-photo-inner"]/@data-original').extract()
    #yield {'image_urls': urls}

    # big images 800x600
    #urls = [url.replace('200%3A200', '800%3A600') for url in urls]
    #yield {'image_urls': urls}

    # big images 1900x1200
    #urls = [url.replace('200%3A200', '1900%3A1200') for url in urls]
    #yield {'image_urls': urls}

    data = {
        'res_id': '16761868',  #, '16780723',  # place ID
        'offset': '30',        # change it
        'category': 'all',     # 'food'
        'action': 'fetch_photos',
        'index': '30',
        'limit': '10',         # change it
    }
    url = 'https://www.zomato.com/php/load_more_res_pics.php'
    yield FormRequest(url, callback=self.parse_post, formdata=data)
Example #9
Source File: msi_spider.py From uefi-spider with MIT License | 5 votes |
def parse_search(self, response):
    sel = Selector(response)

    ### Parse each sub-product type.
    searches = []
    product_selector = sel.css(".mr20").xpath("@no")
    if product_selector:
        pno = product_selector.extract()[0]
        products = sel.css(".ProdSel-item")
        for product in products:
            no = product.xpath("@no").extract()[0]
            searches.append((no, pno))
    #print searches

    ### Parse the actual products/boards.
    boards = []
    items = sel.css(".Prod-item")
    for item in items:
        title = item.xpath("@title").extract()[0]
        no = item.xpath("@no").extract()[0]
        boards.append((title, no))
    #print boards

    for sub_search in searches:
        search_vars = self._get_vars(sub_search[0], sub_search[1])
        yield FormRequest(url=self.start_urls[0], method="POST",
                          headers=json_headers, formdata=search_vars,
                          callback=self.parse_search)

    for board in boards:
        url = "http://us.msi.com/product/mb/%s.html" % board[0]
        item = MsiUpdateLinkItem()
        item["id"] = board[1]
        item["title"] = board[0]
        item["url"] = url
        yield Request(url="%s#/?div=BIOS" % url, callback=self.parse_board,
                      meta={"attrs": item})
    pass
Example #10
Source File: inshorts.py From scrape with MIT License | 5 votes |
def parse(self, response):
    try:
        for news in response.css('div.news-card'):
            self.urls_parsed += 1
            try:
                item = ScrapenewsItem()
                item['image'] = news.css('div.news-card-image::attr(style)').extract_first()[23:-3]
                item['title'] = news.css('a.clickable>span::text').extract_first()
                item['content'] = news.css('div[itemprop*=articleBody]::text').extract_first()
                item['newsDate'] = news.css('span.time::attr(content)').extract_first()[:-5]
                item['link'] = news.css('div.read-more>a::attr(href)').extract_first()
                item['source'] = 105
                yield item
                self.urls_scraped += 1
            except Exception as e:
                logger.error(__name__ + " [UNHANDLED] Unable to Extract Data : " + str(e))
                self.urls_dropped += 1

        # news_id extraction
        pattern = re.compile('var min_news_id\s+=\s+"(.*?)"')
        js = response.xpath('//script[@type="text/javascript"]/text()').extract()[-1]
        self.news_id = pattern.search(js).group(1)

        while (self.pages > 1 and not self.infinite):
            yield FormRequest('https://www.inshorts.com/en/ajax/more_news',
                              formdata={'news-offset': self.news_id},
                              callback=self.parse_more_news,
                              errback=self.errorRequestHandler,
                              dont_filter=True)
            self.pages -= 1

        while (self.infinite):
            yield FormRequest('https://www.inshorts.com/en/ajax/more_news',
                              formdata={'news-offset': self.news_id},
                              callback=self.parse_more_news,
                              errback=self.errorRequestHandler,
                              dont_filter=True)
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] " + str(e) + " for response url " + response.url)
Example #11
Source File: zhihu_ask_spider.py From openslack-crawler with Apache License 2.0 | 5 votes |
def start_requests(self):
    return [FormRequest(
        "http://www.zhihu.com/login",
        formdata={'email': 'june.chan@foxmail.com',
                  'password': 'czj0617_zhihu'},
        callback=self.after_login
    )]
Example #12
Source File: zapimoveis.py From realestate-scraper with MIT License | 5 votes |
def parse(self, response):
    hidden = lambda id: response.xpath(
        '/html/body/input[@id="{}"]/@data-value'.format(id)).extract_first()

    total_pages = int(hidden('quantidadeTotalPaginas').replace('.', ''))

    hashfragment = OrderedDict([
        ('pagina', None),
        ('semente', self.seed or hidden('semente')),
    ])
    formdata = OrderedDict([
        ('tipoOferta', '1'),
        ('paginaAtual', None),
        ('pathName', parse_url(response.url).path),
        ('hashFragment', ''),
    ])
    headers = {'X-Requested-With': 'XMLHttpRequest'}
    url = 'https://www.zapimoveis.com.br/Busca/RetornarBuscaAssincrona/'

    from_page = self.start
    if self.count:
        to_page = min(self.start + self.count - 1, total_pages)
    else:
        to_page = total_pages

    self.crawler.stats.set_value('total_pages', total_pages)
    self.crawler.stats.set_value('selected_pages', max(0, to_page - from_page + 1))

    for page in range(from_page, to_page + 1):
        hashfragment['pagina'] = formdata['paginaAtual'] = str(page)
        formdata['hashFragment'] = json.dumps(hashfragment, separators=(',', ':'))
        yield FormRequest(url, headers=headers, formdata=formdata,
                          callback=self.parse_busca)
Example #13
Source File: music.py From Python_Master_Courses with GNU General Public License v3.0 | 5 votes |
def parse(self, response):
    for songid in response.xpath('//a/@href').re('/song/(\d+)'):
        print('songIds:', songid)
        data = {'songIds': songid}  # 257524668
        yield FormRequest(url=self.songlink_url, formdata=data, callback=self.parse_song)
        # break
Example #14
Source File: msi_spider.py From uefi-spider with MIT License | 5 votes |
def parse(self, response):
    ### Generate a search for AMD and Intel chips
    intel_search = self._get_vars(170, 1)
    amd_search = self._get_vars(171, 1)

    yield FormRequest(url=self.start_urls[0], method="POST", headers=json_headers,
                      formdata=intel_search, callback=self.parse_search)
    yield FormRequest(url=self.start_urls[0], method="POST", headers=json_headers,
                      formdata=amd_search, callback=self.parse_search)
Example #15
Source File: asus_spider.py From uefi-spider with MIT License | 5 votes |
def parse_again(self, response):
    sel = Selector(response)
    hidden_fields = {}
    inputs = sel.xpath("//input")
    for ele in inputs:
        input_type = ele.xpath(".//@type").extract()[0]
        value = ele.xpath(".//@value").extract()[0]
        name = ele.xpath(".//@name").extract()[0]
        if input_type not in ["hidden"]:
            continue
        hidden_fields[name] = value

    for product_type in self.product_types:
        ### Create a POST form and apply a generated ScriptManager
        form_data = _select_form(1, product_type)
        for field in hidden_fields:
            ### Replace static fields with page-generated inputs.
            form_data[field] = hidden_fields[field]
        #print form_data
        yield FormRequest(formdata=form_data, method="POST",
                          headers={
                              "Content-Type": "application/x-www-form-urlencoded",
                              #"X-MicrosoftAjax": "Delta=true",
                              "X-Requested-With": "XMLHttpRequest",
                              "User-Agent": self._get_uas()
                          },
                          url=self.select_urls[0],
                          #meta={"cookiejar": "GLOBAL"},
                          callback=self.parse_series)
    return
Example #16
Source File: intel_spider.py From uefi-spider with MIT License | 5 votes |
def parse(self, response):
    url = "https://downloadcenter.intel.com/SearchResult.aspx?lang=eng"
    search_form = {
        "search_downloads": ".BIO",
        "ctl00$body$submit_search_downloads": "Search downloads",
        "ctl00$body$searchKeyword": "BIO"
    }
    return [FormRequest(url=url, method="POST", formdata=search_form,
                        callback=self.parse_form)]
Example #17
Source File: hp_spider.py From uefi-spider with MIT License | 4 votes |
def parse_accept(self, response):
    ### At the search form, begin to generate monthly searches, alert if >100 results.
    sel = Selector(response)

    ### This will select the REAL url (with appended query string "tokens").
    url_path = ""
    forms = sel.xpath("//form")
    for form in forms:
        form_ids = form.xpath("@id").extract()
        if len(form_ids) == 0:
            continue
        if form_ids[0] == "refineSearchForm":
            url_path = form.xpath("@action").extract()[0]

    ### The search load-balances
    domain = response.url[len("http://"):response.url.find(".")]
    url = "http://%s.www2.hp.com/%s"

    form_data = {
        "didYouMean": "",
        "searchCrit": "allwords",
        "docType": "Drivers",
        #"docType": "Patch",
        "dateRange": "all",
        "dateSearchType": "dateRange",
        "startDateYear": None,
        "startDateMonth": None,
        "startDateDay": "1",
        "endDateYear": None,
        "endDateMonth": None,
        "endDateDay": "1",
        "resPerPage": "100",
        "sortCrit": "date",
        "showSummary": "yesX",
        "calledBy": "Search_Main",
        "mode": "text",
        "searchString": "BIOS Update",
        "searchRes": "Search",
        "advSearchFlag": "true",
    }

    ### Pull off the remaining searches, and fill in vars for the 'next' search.
    remaining_searches = response.meta["searches"]
    form_data["startDateYear"] = str(remaining_searches[0][0])
    form_data["startDateMonth"] = str(remaining_searches[0][1])
    form_data["endDateYear"] = str(remaining_searches[0][2])
    form_data["endDateMonth"] = str(remaining_searches[0][3])

    return FormRequest(
        url=url % (domain, url_path) + "&month=%d&year=%d" % (remaining_searches[0][1], remaining_searches[0][0]),
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        formdata=form_data,
        method="POST",
        cookies=self.cookies,
        meta={
            "searches": remaining_searches[1:],
            "this": (form_data["startDateYear"], form_data["startDateMonth"],
                     form_data["endDateYear"], form_data["endDateMonth"])
        },
        dont_filter=True,
        callback=self.parse_search)
Example #18
Source File: lagou.py From IPProxyTool with MIT License | 4 votes |
def start_requests(self):
    count = self.sql.get_proxy_count(self.name)
    count_httpbin = self.sql.get_proxy_count(config.httpbin_table)
    ids = self.sql.get_proxy_ids(self.name)
    ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)

    for i in range(0, count + count_httpbin):
        table = self.name if (i < count) else config.httpbin_table
        id = ids[i] if i < count else ids_httpbin[i - len(ids)]
        proxy = self.sql.get_proxy_with_id(table, id)
        if proxy == None:
            continue

        for url in self.urls:
            cur_time = time.time()
            yield FormRequest(
                url=url,
                headers=self.headers,
                method='POST',
                meta={
                    'cur_time': cur_time,
                    'download_timeout': self.timeout,
                    'proxy_info': proxy,
                    'table': table,
                    'id': proxy.id,
                    'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
                    'vali_count': proxy.vali_count,
                },
                cookies={
                    'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488937030',
                    '_ga': 'GA1.2.40497390.1488937014',
                    'TG-TRACK-CODE': 'search_code',
                    'index_location_city': '%E5%8C%97%E4%BA%AC',
                    'LGRID': '20170308093710-bf6755eb-039f-11e7-8025-525400f775ce',
                    'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488881288,1488936799,1488936947,1488937014',
                    'JSESSIONID': 'BDCBB6167F960CE43AF54B75A651F586',
                    'LGSID': '20170308093653-b59316f0-039f-11e7-9229-5254005c3644',
                    'LGUID': '20170308093653-b593185f-039f-11e7-9229-5254005c3644',
                    'user_trace_token': '20170308093654-723efcfac8fb4c28a670d073d5113e02',
                    'SEARCH_ID': '4db4dc3dea1c46b49018ae5421b53ffa'
                },
                formdata={
                    'first': 'true',
                    'kd': 'ios',
                    'pn': '1',
                },
                dont_filter=True,
                callback=self.success_parse,
                errback=self.error_parse,
            )