Python scrapy.http.FormRequest.from_response() Examples
The following are 11 code examples of scrapy.http.FormRequest.from_response(), drawn from open-source projects. Each example notes its original project, source file, and license. You may also want to check out the other functions and classes of the scrapy.http module.
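Before the examples, a minimal sketch of the common pattern may help: FormRequest.from_response() builds a form-submission request from a <form> found in the response, keeping its action URL, method, and pre-filled hidden fields (such as CSRF tokens), and merging in the formdata overrides you supply. The spider below is a hypothetical illustration; the URL, field names, and success check are assumptions, not taken from any of the projects listed here.

import scrapy
from scrapy.http import FormRequest


class ExampleLoginSpider(scrapy.Spider):
    # Hypothetical spider: example.com and the field names are placeholders.
    name = 'example_login'
    start_urls = ['https://example.com/login']

    def parse(self, response):
        # from_response() picks the first <form> on the page unless you
        # narrow the choice with formname=, formnumber=, or formxpath=;
        # hidden inputs are kept and formdata entries override the rest.
        return FormRequest.from_response(
            response,
            formdata={'username': 'user', 'password': 'secret'},
            callback=self.after_login,
        )

    def after_login(self, response):
        # Assumed success check; adapt it to the target site.
        if b'Logout' in response.body:
            self.logger.info('Login succeeded')

The examples below vary this pattern: selecting a specific form via formname, formnumber, or formxpath, carrying a cookiejar through meta, and bypassing the duplicate filter with dont_filter=True when re-submitting forms.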
Example #1
Source File: middleware.py From captcha-middleware with GNU General Public License v3.0
def process_response(self, request, response, spider):
    captchaUrl = self.findCaptchaUrl(response.text)
    if captchaUrl is None:
        return response  # No CAPTCHA is present
    elif request.meta.get(RETRY_KEY, self.MAX_CAPTCHA_ATTEMPTS) == self.MAX_CAPTCHA_ATTEMPTS:
        logger.warning("Too many CAPTCHA attempts; surrendering.")
        raise IgnoreRequest
    captchaSolution = solveCaptcha(imgUrl=captchaUrl, brazen=True)
    if captchaSolution is None:
        logger.error("CAPTCHA page detected, but no solution was proposed.")
        raise IgnoreRequest
    # Return a request to submit the captcha
    logger.info("Submitting solution %s for CAPTCHA at %s", captchaSolution, captchaUrl)
    formRequest = FormRequest.from_response(
        response, formnumber=0,
        formdata={self.findCaptchaField(response.text): captchaSolution})
    formRequest.meta[RETRY_KEY] = request.meta.get('captcha_retries', 0) + 1
    return formRequest
Example #2
Source File: crawlpy_spider.py From crawlpy with MIT License
def login(self, response):
    """Generate a login request."""
    # Add CSRF data to login.
    # Note: scrapy already does this automatically, if it finds
    # pre-filled input fields. If everything works without having
    # to use this custom csrf feature, it could be removed in the future.
    if self.config['login']['csrf']['enabled']:
        field = self.config['login']['csrf']['field']
        csrf = response.xpath('//input[@name="' + field + '"]/@value')[0].extract()
        self.config['login']['fields'][field] = csrf
        logging.info('Adding CSRF data to login. Field: "' + field + '" | value: "' + csrf + '"')
    return FormRequest.from_response(
        response,
        formdata=self.config['login']['fields'],
        method=self.config['login']['method'],
        dont_filter=True,
        callback=self.post_login
    )
Example #3
Source File: belkin.py From scraper with MIT License
def parse(self, response):
    if not response.xpath(
            "//form[@id='productSearchForm']//input[@name='category']/@value").extract()[0]:
        for category in response.xpath(
                "//form[@id='productSearchForm']/div[1]//ul[@class='select-options']//a/@data-id").extract():
            yield FormRequest.from_response(
                response, formname="productSearchForm",
                formdata={"category": category}, callback=self.parse)
    elif not response.xpath(
            "//form[@id='productSearchForm']//input[@name='subCategory']/@value").extract()[0]:
        for subcategory in response.xpath(
                "//form[@id='productSearchForm']/div[2]//ul[@class='select-options']//a/@data-id").extract():
            yield FormRequest.from_response(
                response, formname="productSearchForm",
                formdata={"subCategory": subcategory}, callback=self.parse)
    else:
        for product in response.xpath(
                "//form[@id='productSearchForm']/div[3]//ul[@class='select-options']//a/@data-id").extract():
            yield Request(
                url=urlparse.urljoin(
                    response.url, "/us/support-product?pid=%s" % (product)),
                headers={"Referer": response.url},
                callback=self.parse_product)
Example #4
Source File: zhihu_spider.py From scrapy_example with MIT License
def post_login(self, response):
    print('Preparing login')
    # Extract the _xsrf value from the returned page; it is needed
    # for the form submission to succeed
    xsrf = Selector(response).xpath('//input[@name="_xsrf"]/@value').extract()[0]
    print(xsrf)
    # FormRequest.from_response is a Scrapy helper for posting forms.
    # After a successful login, the after_login callback is invoked.
    return [FormRequest.from_response(
        response,  # "http://www.zhihu.com/login",
        meta={'cookiejar': response.meta['cookiejar']},
        headers=self.headers,
        formdata={
            '_xsrf': xsrf,
            'email': '123456',
            'password': '123456'
        },
        callback=self.after_login,
        dont_filter=True
    )]
Example #5
Source File: zhihu_spider.py From openslack-crawler with Apache License 2.0
def post_login(self, response):
    print('Preparing login')
    # Extract the _xsrf value from the returned page; it is needed
    # for the form submission to succeed
    xsrf = Selector(response).xpath('//input[@name="_xsrf"]/@value').extract()[0]
    print(xsrf)
    # FormRequest.from_response is a Scrapy helper for posting forms.
    # After a successful login, the after_login callback is invoked.
    return [FormRequest.from_response(
        response,  # "http://www.zhihu.com/login",
        meta={'cookiejar': response.meta['cookiejar']},
        headers=self.headers,  # note the headers here
        formdata={
            '_xsrf': xsrf,
            'email': '1009137312@qq.com',
            'password': '6yhn6yhn'
        },
        callback=self.after_login,
        dont_filter=True
    )]
Example #6
Source File: pixiv_spider.py From scrapy-picture-spider with Apache License 2.0
def post_login(self, response):
    # username and password from settings.py
    self.set_username_and_password()
    username, password = PixivSpider.username, PixivSpider.password
    # check username and password
    if username is None or password is None:
        raise CloseSpider('username or password is null!')
    self.logger.debug('Preparing login, username = %s password = %s' % (username, password))
    post_key = response.css('#old-login input[name=post_key]::attr(value)').extract_first()
    # FormRequest for dealing with HTML forms
    # function from_response for simulate a user login
    self.headers['Referer'] = response.url
    return FormRequest.from_response(
        response,
        meta={'cookiejar': response.meta['cookiejar']},
        headers=self.headers,
        formdata={
            'pixiv_id': username,
            'password': password,
            'post_key': post_key,
            'mode': 'login'
        },
        callback=self.after_login,
        dont_filter=True
    )
Example #7
Source File: fbcrawl.py From fbcrawl with Apache License 2.0
def parse(self, response):
    '''
    Handle login with provided credentials
    '''
    return FormRequest.from_response(
        response,
        formxpath='//form[contains(@action, "login")]',
        formdata={'email': self.email, 'pass': self.password},
        callback=self.parse_home
    )
Example #8
Source File: uebermedien_de.py From PyFeeds with GNU Affero General Public License v3.0
def _steady_login(self, response):
    response = yield scrapy.Request(
        "https://steadyhq.com/oauth/authorize?"
        + "client_id=0c29f006-1a98-48f1-8a63-2c0652c59f28&"
        + "redirect_uri=https://uebermedien.de&scope=read&"
        + "response_type=code&refresh_only=false",
        meta={"cache_expires": timedelta(days=1)},
    )
    response = yield FormRequest.from_response(
        response,
        formdata=OrderedDict(
            [("user[email]", self._username), ("user[password]", self._password)]
        ),
        dont_filter=True,
        meta={"handle_httpstatus_list": [301], "cache_expires": timedelta(days=1)},
    )
    try:
        code = parse_qs(urlparse(response.url).query)["code"][0]
    except KeyError:
        self.logger.error("Login failed: Wrong username and password")
        return
    body = OrderedDict(
        [
            ("client_id", "0c29f006-1a98-48f1-8a63-2c0652c59f28"),
            ("grant_type", "authorization_code"),
            ("code", code),
            ("redirect_uri", "https://uebermedien.de"),
        ]
    )
    response = yield scrapy.Request(
        "https://steadyhq.com/api/v1/oauth/token",
        method="POST",
        body=json.dumps(body),
        headers={"Accept": "application/json", "Content-Type": "application/json"},
        meta={"cache_expires": timedelta(days=1)},
    )
    self._steady_token = json.loads(response.text)["access_token"]
Example #9
Source File: a51newren.py From openslack-crawler with Apache License 2.0
def parse(self, response):
    sel = scrapy.Selector(response)
    item = DmozItem()
    href = str(response.url)
    hidden = sel.xpath("//input[@name='__RequestVerificationToken']/@value").extract()
    return [FormRequest.from_response(
        response,
        formdata=self.formdata,
        headers=self.headers,
        meta={'__RequestVerificationToken': 'BSDY33UtJXv0XqMkIvAJXAdMXC-jqACBsiZb6-mx4uW8Hr89aArTh9DfLtQFDh6NwQsqHXiZMTzheuim3ETI78PhOzQf263wliXL8ArkTrA1'},
        callback=self.parse_item)]
Example #10
Source File: AmazonSpider.py From openslack-crawler with Apache License 2.0
def login(self, response):
    self._log_page(response, 'amazon_login.html')
    return [FormRequest.from_response(
        response,
        formdata=self.formdata,
        headers=self.headers,
        meta={'cookiejar': response.meta['cookiejar']},
        callback=self.parse_item)]  # success login
Example #11
Source File: fbcrawl.py From fbcrawl with Apache License 2.0
def parse_home(self, response):
    '''
    This method has multiple purposes:
    1) Handle failed logins due to facebook 'save-device' redirection
    2) Set language interface, if not already provided
    3) Navigate to given page
    '''
    # handle 'save-device' redirection
    if response.xpath("//div/a[contains(@href,'save-device')]"):
        self.logger.info('Going through the "save-device" checkpoint')
        return FormRequest.from_response(
            response,
            formdata={'name_action_selected': 'dont_save'},
            callback=self.parse_home
        )

    # set language interface
    if self.lang == '_':
        if response.xpath("//input[@placeholder='Search Facebook']"):
            self.logger.info('Language recognized: lang="en"')
            self.lang = 'en'
        elif response.xpath("//input[@placeholder='Buscar en Facebook']"):
            self.logger.info('Language recognized: lang="es"')
            self.lang = 'es'
        elif response.xpath("//input[@placeholder='Rechercher sur Facebook']"):
            self.logger.info('Language recognized: lang="fr"')
            self.lang = 'fr'
        elif response.xpath("//input[@placeholder='Cerca su Facebook']"):
            self.logger.info('Language recognized: lang="it"')
            self.lang = 'it'
        elif response.xpath("//input[@placeholder='Pesquisa no Facebook']"):
            self.logger.info('Language recognized: lang="pt"')
            self.lang = 'pt'
        else:
            raise AttributeError('Language not recognized\n'
                                 'Change your interface lang from facebook '
                                 'and try again')

    # navigate to provided page
    href = response.urljoin(self.page)
    self.logger.info('Scraping facebook page {}'.format(href))
    return scrapy.Request(url=href, callback=self.parse_page, meta={'index': 1})