Python scrapy.http.FormRequest.from_response() Examples

The following are 11 code examples of scrapy.http.FormRequest.from_response(), collected from open source projects. The source file and project each example comes from are noted above it. You may also want to check out all available functions and classes of the module scrapy.http, or try the search function.
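Before the project examples, here is a minimal sketch of the typical usage pattern, assuming a hypothetical login page (the URL, field names, and credentials below are placeholders, not taken from any example):

import scrapy
from scrapy.http import FormRequest

class LoginSpider(scrapy.Spider):
    name = 'login_example'
    start_urls = ['https://example.com/login']  # hypothetical login page

    def parse(self, response):
        # from_response parses the (first, by default) <form> in the response,
        # keeps its pre-filled and hidden fields, and overrides the ones
        # given in formdata
        return FormRequest.from_response(
            response,
            formdata={'username': 'user', 'password': 'secret'},  # placeholder credentials
            callback=self.after_login,
        )

    def after_login(self, response):
        self.logger.info('Logged in, landed on %s', response.url)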
Example #1
Source File: middleware.py    From captcha-middleware with GNU General Public License v3.0
def process_response(self, request, response, spider):
        captchaUrl = self.findCaptchaUrl(response.text)
        if captchaUrl is None:
            return response  # No CAPTCHA is present
        elif request.meta.get(RETRY_KEY, 0) >= self.MAX_CAPTCHA_ATTEMPTS:
            logger.warning("Too many CAPTCHA attempts; surrendering.")
            raise IgnoreRequest
        captchaSolution = solveCaptcha(imgUrl=captchaUrl, brazen=True)
        if captchaSolution is None:
            logger.error("CAPTCHA page detected, but no solution was proposed.")
            raise IgnoreRequest
        # Return a request to submit the captcha
        logger.info("Submitting solution %s for CAPTCHA at %s", captchaSolution, captchaUrl)
        formRequest = FormRequest.from_response(
            response, formnumber=0, formdata={self.findCaptchaField(response.text): captchaSolution})
        formRequest.meta[RETRY_KEY] = request.meta.get(RETRY_KEY, 0) + 1
        return formRequest 
Example #2
Source File: crawlpy_spider.py    From crawlpy with MIT License
def login(self, response):
        """Generate a login request."""

        # Add CSRF data to login.
        # Note: scrapy already does this automatically, if it finds
        # pre-filled input fields. If everything works without having
        # to use this custom csrf feature, it could be removed in the future.
        if self.config['login']['csrf']['enabled']:
            field = self.config['login']['csrf']['field']
            csrf = response.xpath('//input[@name="' + field + '"]/@value')[0].extract()
            self.config['login']['fields'][field] = csrf
            logging.info('Adding CSRF data to login. Field: "%s" | value: "%s"', field, csrf)

        return FormRequest.from_response(
            response,
            formdata=self.config['login']['fields'],
            method=self.config['login']['method'],
            dont_filter=True,
            callback=self.post_login
        )


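As the comment in Example #2 notes, from_response already copies pre-filled and hidden <input> values from the matched form into the request, so a hidden CSRF token is usually submitted without any manual extraction. A minimal sketch of a login callback relying on this behavior (the form markup, field names, and credentials are hypothetical):

    def login(self, response):
        # Given a login form such as:
        #   <form action="/login" method="post">
        #     <input type="hidden" name="csrf_token" value="abc123">
        #     <input type="text" name="username">
        #     <input type="password" name="password">
        #   </form>
        # the hidden csrf_token is picked up from the form automatically;
        # only the visible fields need to appear in formdata.
        return FormRequest.from_response(
            response,
            formdata={'username': 'user', 'password': 'secret'},  # placeholders
            callback=self.after_login,
        )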
Example #3
Source File: belkin.py    From scraper with MIT License
def parse(self, response):
        if not response.xpath(
                "//form[@id='productSearchForm']//input[@name='category']/@value").extract()[0]:
            for category in response.xpath("//form[@id='productSearchForm']/div[1]//ul[@class='select-options']//a/@data-id").extract():
                yield FormRequest.from_response(response,
                                                formname="productSearchForm",
                                                formdata={
                                                    "category": category},
                                                callback=self.parse)
        elif not response.xpath("//form[@id='productSearchForm']//input[@name='subCategory']/@value").extract()[0]:
            for subcategory in response.xpath("//form[@id='productSearchForm']/div[2]//ul[@class='select-options']//a/@data-id").extract():
                yield FormRequest.from_response(response,
                                                formname="productSearchForm",
                                                formdata={
                                                    "subCategory": subcategory},
                                                callback=self.parse)
        else:
            for product in response.xpath("//form[@id='productSearchForm']/div[3]//ul[@class='select-options']//a/@data-id").extract():
                yield Request(
                    url=urlparse.urljoin(
                        response.url, "/us/support-product?pid=%s" % (product)),
                    headers={"Referer": response.url},
                    callback=self.parse_product) 
Example #4
Source File: zhihu_spider.py    From scrapy_example with MIT License
def post_login(self, response):
        print('Preparing login')
        # Extract the _xsrf field from the returned page; it is needed for a successful form submission
        xsrf = Selector(response).xpath('//input[@name="_xsrf"]/@value').extract()[0]
        print(xsrf)
        # FormRequest.from_response is a helper provided by Scrapy for POSTing a form
        # After a successful login, the after_login callback is invoked
        return [FormRequest.from_response(response,   #"http://www.zhihu.com/login",
                            meta = {'cookiejar' : response.meta['cookiejar']},
                            headers = self.headers,
                            formdata = {
                            '_xsrf': xsrf,
                            'email': '123456',
                            'password': '123456'
                            },
                            callback = self.after_login,
                            dont_filter = True
                            )] 
Example #5
Source File: zhihu_spider.py    From openslack-crawler with Apache License 2.0
def post_login(self, response):
        print('Preparing login')
        # Extract the _xsrf field from the returned page; it is needed for a successful form submission
        xsrf = Selector(response).xpath('//input[@name="_xsrf"]/@value').extract()[0]
        print(xsrf)
        # FormRequest.from_response is a helper provided by Scrapy for POSTing a form
        # After a successful login, the after_login callback is invoked
        return [FormRequest.from_response(response,  # "http://www.zhihu.com/login",
                                          meta={'cookiejar': response.meta['cookiejar']},
                                          headers=self.headers,  # note the headers here
                                          formdata={
                                              '_xsrf': xsrf,
                                              'email': '1009137312@qq.com',
                                              'password': '6yhn6yhn'
                                          },
                                          callback=self.after_login,
                                          dont_filter=True
                                          )] 
Example #6
Source File: pixiv_spider.py    From scrapy-picture-spider with Apache License 2.0
def post_login(self, response):
        # username and password from settings.py
        self.set_username_and_password()
        username, password = PixivSpider.username, PixivSpider.password
        # check username and password
        if username is None or password is None:
            raise CloseSpider('username or password is null!')

        self.logger.debug('Preparing login, username = %s password = %s', username, password)
        post_key = response.css('#old-login input[name=post_key]::attr(value)').extract_first()
        # FormRequest deals with HTML forms;
        # from_response is used here to simulate a user login
        self.headers['Referer'] = response.url
        return FormRequest.from_response(
            response,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            formdata={
                'pixiv_id': username,
                'password': password,
                'post_key': post_key,
                'mode': 'login'
            },
            callback=self.after_login,
            dont_filter=True
        )

Example #7
Source File: fbcrawl.py    From fbcrawl with Apache License 2.0
def parse(self, response):
        '''
        Handle login with provided credentials
        '''
        return FormRequest.from_response(
                response,
                formxpath='//form[contains(@action, "login")]',
                formdata={'email': self.email, 'pass': self.password},
                callback=self.parse_home
                ) 
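In this example, formxpath tells from_response which form on the page to use: the first one whose action attribute contains "login". Alternatives shown elsewhere on this page are formname (Example #3) and formnumber (Example #1).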
Example #8
Source File: uebermedien_de.py    From PyFeeds with GNU Affero General Public License v3.0
def _steady_login(self, response):
        response = yield scrapy.Request(
            "https://steadyhq.com/oauth/authorize?"
            + "client_id=0c29f006-1a98-48f1-8a63-2c0652c59f28&"
            + "redirect_uri=https://uebermedien.de&scope=read&"
            + "response_type=code&refresh_only=false",
            meta={"cache_expires": timedelta(days=1)},
        )

        response = yield FormRequest.from_response(
            response,
            formdata=OrderedDict(
                [("user[email]", self._username), ("user[password]", self._password)]
            ),
            dont_filter=True,
            meta={"handle_httpstatus_list": [301], "cache_expires": timedelta(days=1)},
        )

        try:
            code = parse_qs(urlparse(response.url).query)["code"][0]
        except KeyError:
            self.logger.error("Login failed: wrong username or password")
            return

        body = OrderedDict(
            [
                ("client_id", "0c29f006-1a98-48f1-8a63-2c0652c59f28"),
                ("grant_type", "authorization_code"),
                ("code", code),
                ("redirect_uri", "https://uebermedien.de"),
            ]
        )
        response = yield scrapy.Request(
            "https://steadyhq.com/api/v1/oauth/token",
            method="POST",
            body=json.dumps(body),
            headers={"Accept": "application/json", "Content-Type": "application/json"},
            meta={"cache_expires": timedelta(days=1)},
        )
        self._steady_token = json.loads(response.text)["access_token"] 
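Note: the control flow above, where the response of a yielded Request is assigned back to a variable, is not stock Scrapy callback behavior; it appears to rely on the inline-requests pattern used in PyFeeds (e.g. the scrapy-inline-requests decorator), which lets a generator callback yield a Request and resume with its Response.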
Example #9
Source File: a51newren.py    From openslack-crawler with Apache License 2.0
def parse(self, response):
        sel = scrapy.Selector(response)
        item = DmozItem()
        href = str(response.url)
        hidden = sel.xpath("//input[@name='__RequestVerificationToken']/@value").extract()
        return [FormRequest.from_response(response,
                                          formdata=self.formdata,
                                          headers=self.headers,
                                          meta={
                                              '__RequestVerificationToken': 'BSDY33UtJXv0XqMkIvAJXAdMXC-jqACBsiZb6-mx4uW8Hr89aArTh9DfLtQFDh6NwQsqHXiZMTzheuim3ETI78PhOzQf263wliXL8ArkTrA1'},
                                          callback=self.parse_item)]
Example #10
Source File: AmazonSpider.py    From openslack-crawler with Apache License 2.0
def login(self, response):
        self._log_page(response, 'amazon_login.html')
        return [FormRequest.from_response(response,
                                          formdata=self.formdata,
                                          headers=self.headers,
                                          meta={'cookiejar': response.meta['cookiejar']},
                                          callback=self.parse_item)]  # successful login
Example #11
Source File: fbcrawl.py    From fbcrawl with Apache License 2.0
def parse_home(self, response):
        '''
        This method has multiple purposes:
        1) Handle failed logins due to facebook 'save-device' redirection
        2) Set language interface, if not already provided
        3) Navigate to given page 
        '''
        #handle 'save-device' redirection
        if response.xpath("//div/a[contains(@href,'save-device')]"):
            self.logger.info('Going through the "save-device" checkpoint')
            return FormRequest.from_response(
                response,
                formdata={'name_action_selected': 'dont_save'},
                callback=self.parse_home
                )
            
        #set language interface
        if self.lang == '_':
            if response.xpath("//input[@placeholder='Search Facebook']"):
                self.logger.info('Language recognized: lang="en"')
                self.lang = 'en'
            elif response.xpath("//input[@placeholder='Buscar en Facebook']"):
                self.logger.info('Language recognized: lang="es"')
                self.lang = 'es'
            elif response.xpath("//input[@placeholder='Rechercher sur Facebook']"):
                self.logger.info('Language recognized: lang="fr"')
                self.lang = 'fr'
            elif response.xpath("//input[@placeholder='Cerca su Facebook']"):
                self.logger.info('Language recognized: lang="it"')
                self.lang = 'it'
            elif response.xpath("//input[@placeholder='Pesquisa no Facebook']"):
                self.logger.info('Language recognized: lang="pt"')
                self.lang = 'pt'
            else:
                raise AttributeError('Language not recognized\n'
                                     'Change your Facebook interface language '
                                     'and try again')
                                                                 
        #navigate to provided page
        href = response.urljoin(self.page)
        self.logger.info('Scraping facebook page {}'.format(href))
        return scrapy.Request(url=href, callback=self.parse_page, meta={'index': 1})