Python scrapy.http.FormRequest() Examples
The following are 18 code examples of scrapy.http.FormRequest(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.http, or try the search function.
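Before diving into the examples, here is a minimal, hedged sketch of the typical FormRequest pattern: a spider submits form data with a POST request and handles the result in a callback. The URL, field names, and spider class below are illustrative placeholders, not taken from any of the projects listed here.

import scrapy
from scrapy.http import FormRequest


class ExampleLoginSpider(scrapy.Spider):
    # Hypothetical spider, for illustration only.
    name = "example_login"
    start_urls = ["https://example.com/login"]

    def parse(self, response):
        # FormRequest issues a POST with the given form fields and routes
        # the server's response to the callback.
        yield FormRequest(
            "https://example.com/login",
            formdata={"username": "user", "password": "secret"},
            callback=self.after_login,
        )

    def after_login(self, response):
        # Inspect the response returned after the form submission.
        self.logger.info("Login response status: %s", response.status)

Most of the examples below follow this same shape, differing mainly in the form fields, headers, and cookies they attach.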
Example #1
Source File: zhihu_user_spider.py From openslack-crawler with Apache License 2.0 | 7 votes |
def start_requests(self):
    return [FormRequest(
        "http://www.zhihu.com/login",
        formdata={'email': 'june.chan@foxmail.com',
                  'password': 'czj0617_zhihu'},
        callback=self.after_login
    )]
Example #2
Source File: coursera_spider.py From scrapy_example with MIT License | 6 votes |
def start_requests(self):
    print('Preparing login')
    return [FormRequest("https://accounts.coursera.org/api/v1/login",
                        # NOTE: `response` is not defined inside start_requests in the
                        # original source; the headers come from the spider's own
                        # make_header() helper.
                        headers=self.make_header(response),
                        formdata={
                            "email": "1095511864@qq.com",
                            "password": "HUAZANG.55789260",
                            "webrequest": "true"
                        },
                        callback=self.parse_page)]
Example #3
Source File: zhizhu_user_topic_spider.py From Zhihu_Spider with Apache License 2.0 | 6 votes |
def start_requests(self):
    return [FormRequest(
        "http://www.zhihu.com/login",
        formdata={'email': 'xxx@gmail.com',
                  'password': '123456'},
        callback=self.after_login
    )]
Example #4
Source File: zhizhu_user_topic_spider.py From Zhihu_Spider with Apache License 2.0 | 6 votes |
def gen_topic_form(self, response):
    # yield the beginning topics
    sel = Selector(response)
    for topic_sel in sel.xpath('//div[@id="zh-profile-topic-list"]/div[contains(@class, "zm-profile-section-item")]'):
        # new user-topic relationship
        yield self.get_UT_item(topic_sel, response.url)

    # get the number of topics of one user
    num_topic = sel.xpath('//div[contains(@class, "zm-profile-section-wrap")]/div[contains(@class, "zm-profile-section-head")]//span[contains(@class, "zm-profile-section-name")]/text()')
    number_str = num_topic.extract()[0]
    # print number_str
    p = re.compile(r'\d+')
    m = p.findall(number_str)
    if m:
        num_topic = int(m[0])

    # crawl the remaining topics of a user
    base_line = 20
    if num_topic > 20:
        while num_topic > 0:
            yield FormRequest(
                url=response.url,
                formdata={'start': '0',
                          'offset': str(base_line),
                          '_xsrf': self.xsrf},
                callback=self.parse
            )
            num_topic = num_topic - 20
            base_line += 20
Example #5
Source File: zhihu_spider.py From Zhihu_Spider with Apache License 2.0 | 6 votes |
def start_requests(self):
    return [FormRequest(
        "http://www.zhihu.com/login",
        formdata={'email': 'example.com',
                  'password': '123456'},
        callback=self.after_login
    )]
Example #6
Source File: login1_spider.py From openslack-crawler with Apache License 2.0 | 6 votes |
def start_requests(self):
    for i, url in enumerate(self.start_urls):
        yield FormRequest(url, meta={'cookiejar': i},
                          headers=self.headers,
                          cookies=self.cookies,
                          callback=self.parse_item)  # jump to login page
Example #7
Source File: zhihu_answer_spider.py From openslack-crawler with Apache License 2.0 | 6 votes |
def start_requests(self):
    return [FormRequest(
        "http://www.zhihu.com/login",
        formdata={'email': 'june.chan@foxmail.com',
                  'password': 'czj0617_zhihu'},
        callback=self.after_login
    )]
Example #8
Source File: main.py From python-examples with MIT License | 6 votes |
def parse(self, response):
    # small images 200x200
    #urls = response.xpath('//div[@id="thumbsContainer"]//img/@data-original').extract()
    #urls = response.xpath('//img[@class="res-photo-thumbnail thumb-load lazy-photo-inner"]/@data-original').extract()
    #yield {'image_urls': urls}

    # big images 800x600
    #urls = [url.replace('200%3A200', '800%3A600') for url in urls]
    #yield {'image_urls': urls}

    # big images 1900x1200
    #urls = [url.replace('200%3A200', '1900%3A1200') for url in urls]
    #yield {'image_urls': urls}

    data = {
        'res_id': '16761868',  #, '16780723',  # place ID
        'offset': '30',        # change it
        'category': 'all',     # 'food'
        'action': 'fetch_photos',
        'index': '30',
        'limit': '10',         # change it
    }
    url = 'https://www.zomato.com/php/load_more_res_pics.php'
    yield FormRequest(url, callback=self.parse_post, formdata=data)
Example #9
Source File: msi_spider.py From uefi-spider with MIT License | 5 votes |
def parse_search(self, response):
    sel = Selector(response)

    ### Parse each sub-product type.
    searches = []
    product_selector = sel.css(".mr20").xpath("@no")
    if product_selector:
        pno = product_selector.extract()[0]
        products = sel.css(".ProdSel-item")
        for product in products:
            no = product.xpath("@no").extract()[0]
            searches.append((no, pno))
    #print searches

    ### Parse the actual products/boards.
    boards = []
    items = sel.css(".Prod-item")
    for item in items:
        title = item.xpath("@title").extract()[0]
        no = item.xpath("@no").extract()[0]
        boards.append((title, no))
    #print boards

    for sub_search in searches:
        search_vars = self._get_vars(sub_search[0], sub_search[1])
        yield FormRequest(url=self.start_urls[0], method="POST",
                          headers=json_headers, formdata=search_vars,
                          callback=self.parse_search)

    for board in boards:
        url = "http://us.msi.com/product/mb/%s.html" % board[0]
        item = MsiUpdateLinkItem()
        item["id"] = board[1]
        item["title"] = board[0]
        item["url"] = url
        yield Request(url="%s#/?div=BIOS" % url, callback=self.parse_board,
                      meta={"attrs": item})
    pass
Example #10
Source File: inshorts.py From scrape with MIT License | 5 votes |
def parse(self, response):
    try:
        for news in response.css('div.news-card'):
            self.urls_parsed += 1
            try:
                item = ScrapenewsItem()
                item['image'] = news.css('div.news-card-image::attr(style)').extract_first()[23:-3]
                item['title'] = news.css('a.clickable>span::text').extract_first()
                item['content'] = news.css('div[itemprop*=articleBody]::text').extract_first()
                item['newsDate'] = news.css('span.time::attr(content)').extract_first()[:-5]
                item['link'] = news.css('div.read-more>a::attr(href)').extract_first()
                item['source'] = 105
                yield item
                self.urls_scraped += 1
            except Exception as e:
                logger.error(__name__ + " [UNHANDLED] Unable to Extract Data : " + str(e))
                self.urls_dropped += 1

        # news_id extraction
        pattern = re.compile('var min_news_id\s+=\s+"(.*?)"')
        js = response.xpath('//script[@type="text/javascript"]/text()').extract()[-1]
        self.news_id = pattern.search(js).group(1)

        while (self.pages > 1 and not self.infinite):
            yield FormRequest('https://www.inshorts.com/en/ajax/more_news',
                              formdata={'news-offset': self.news_id},
                              callback=self.parse_more_news,
                              errback=self.errorRequestHandler,
                              dont_filter=True)
            self.pages -= 1

        while (self.infinite):
            yield FormRequest('https://www.inshorts.com/en/ajax/more_news',
                              formdata={'news-offset': self.news_id},
                              callback=self.parse_more_news,
                              errback=self.errorRequestHandler,
                              dont_filter=True)
    except Exception as e:
        logger.error(__name__ + " [UNHANDLED] " + str(e) + " for response url " + response.url)
Example #11
Source File: zhihu_ask_spider.py From openslack-crawler with Apache License 2.0 | 5 votes |
def start_requests(self):
    return [FormRequest(
        "http://www.zhihu.com/login",
        formdata={'email': 'june.chan@foxmail.com',
                  'password': 'czj0617_zhihu'},
        callback=self.after_login
    )]
Example #12
Source File: zapimoveis.py From realestate-scraper with MIT License | 5 votes |
def parse(self, response):
    hidden = lambda id: response.xpath(
        '/html/body/input[@id="{}"]/@data-value'.format(id)).extract_first()

    total_pages = int(hidden('quantidadeTotalPaginas').replace('.', ''))

    hashfragment = OrderedDict([
        ('pagina', None),
        ('semente', self.seed or hidden('semente')),
    ])
    formdata = OrderedDict([
        ('tipoOferta', '1'),
        ('paginaAtual', None),
        ('pathName', parse_url(response.url).path),
        ('hashFragment', ''),
    ])
    headers = {'X-Requested-With': 'XMLHttpRequest'}
    url = 'https://www.zapimoveis.com.br/Busca/RetornarBuscaAssincrona/'

    from_page = self.start
    if self.count:
        to_page = min(self.start + self.count - 1, total_pages)
    else:
        to_page = total_pages

    self.crawler.stats.set_value('total_pages', total_pages)
    self.crawler.stats.set_value('selected_pages', max(0, to_page - from_page + 1))

    for page in range(from_page, to_page + 1):
        hashfragment['pagina'] = formdata['paginaAtual'] = str(page)
        formdata['hashFragment'] = json.dumps(hashfragment, separators=(',', ':'))
        yield FormRequest(url, headers=headers, formdata=formdata,
                          callback=self.parse_busca)
Example #13
Source File: music.py From Python_Master_Courses with GNU General Public License v3.0 | 5 votes |
def parse(self, response):
    for songid in response.xpath('//a/@href').re('/song/(\d+)'):
        print('songIds:', songid)
        data = {'songIds': songid}  # 257524668
        yield FormRequest(url=self.songlink_url, formdata=data, callback=self.parse_song)
        # break
Example #14
Source File: msi_spider.py From uefi-spider with MIT License | 5 votes |
def parse(self, response):
    ### Generate a search for AMD and Intel chips
    intel_search = self._get_vars(170, 1)
    amd_search = self._get_vars(171, 1)

    yield FormRequest(url=self.start_urls[0], method="POST", headers=json_headers,
                      formdata=intel_search, callback=self.parse_search)
    yield FormRequest(url=self.start_urls[0], method="POST", headers=json_headers,
                      formdata=amd_search, callback=self.parse_search)
Example #15
Source File: asus_spider.py From uefi-spider with MIT License | 5 votes |
def parse_again(self, response):
    sel = Selector(response)
    hidden_fields = {}
    inputs = sel.xpath("//input")
    for ele in inputs:
        input_type = ele.xpath(".//@type").extract()[0]
        value = ele.xpath(".//@value").extract()[0]
        name = ele.xpath(".//@name").extract()[0]
        if input_type not in ["hidden"]:
            continue
        hidden_fields[name] = value

    for product_type in self.product_types:
        ### Create a POST form and apply a generated ScriptManager
        form_data = _select_form(1, product_type)
        for field in hidden_fields:
            ### Replace static fields with page-generated inputs.
            form_data[field] = hidden_fields[field]
        #print form_data
        yield FormRequest(formdata=form_data, method="POST",
                          headers={
                              "Content-Type": "application/x-www-form-urlencoded",
                              #"X-MicrosoftAjax": "Delta=true",
                              "X-Requested-With": "XMLHttpRequest",
                              "User-Agent": self._get_uas()
                          },
                          url=self.select_urls[0],
                          #meta={"cookiejar": "GLOBAL"},
                          callback=self.parse_series)
    return
Example #16
Source File: intel_spider.py From uefi-spider with MIT License | 5 votes |
def parse(self, response):
    url = "https://downloadcenter.intel.com/SearchResult.aspx?lang=eng"
    search_form = {
        "search_downloads": ".BIO",
        "ctl00$body$submit_search_downloads": "Search downloads",
        "ctl00$body$searchKeyword": "BIO"
    }
    return [FormRequest(url=url, method="POST", formdata=search_form,
                        callback=self.parse_form)]
Example #17
Source File: hp_spider.py From uefi-spider with MIT License | 4 votes |
def parse_accept(self, response):
    ### At the search form, begin to generate monthly searches, alert if >100 results.
    sel = Selector(response)

    ### This will select the REAL url (with appended query string "tokens").
    url_path = ""
    forms = sel.xpath("//form")
    for form in forms:
        form_ids = form.xpath("@id").extract()
        if len(form_ids) == 0:
            continue
        if form_ids[0] == "refineSearchForm":
            url_path = form.xpath("@action").extract()[0]

    ### The search load-balances
    domain = response.url[len("http://"):response.url.find(".")]
    url = "http://%s.www2.hp.com/%s"

    form_data = {
        "didYouMean": "",
        "searchCrit": "allwords",
        "docType": "Drivers",
        #"docType": "Patch",
        "dateRange": "all",
        "dateSearchType": "dateRange",
        "startDateYear": None,
        "startDateMonth": None,
        "startDateDay": "1",
        "endDateYear": None,
        "endDateMonth": None,
        "endDateDay": "1",
        "resPerPage": "100",
        "sortCrit": "date",
        "showSummary": "yesX",
        "calledBy": "Search_Main",
        "mode": "text",
        "searchString": "BIOS Update",
        "searchRes": "Search",
        "advSearchFlag": "true",
    }

    ### Pull off the remaining searches, and fill in vars for the 'next' search.
    remaining_searches = response.meta["searches"]
    form_data["startDateYear"] = str(remaining_searches[0][0])
    form_data["startDateMonth"] = str(remaining_searches[0][1])
    form_data["endDateYear"] = str(remaining_searches[0][2])
    form_data["endDateMonth"] = str(remaining_searches[0][3])

    return FormRequest(
        url=url % (domain, url_path) + "&month=%d&year=%d" % (remaining_searches[0][1], remaining_searches[0][0]),
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        formdata=form_data,
        method="POST",
        cookies=self.cookies,
        meta={
            "searches": remaining_searches[1:],
            "this": (form_data["startDateYear"], form_data["startDateMonth"],
                     form_data["endDateYear"], form_data["endDateMonth"])
        },
        dont_filter=True,
        callback=self.parse_search)
Example #18
Source File: lagou.py From IPProxyTool with MIT License | 4 votes |
def start_requests(self):
    count = self.sql.get_proxy_count(self.name)
    count_httpbin = self.sql.get_proxy_count(config.httpbin_table)
    ids = self.sql.get_proxy_ids(self.name)
    ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)

    for i in range(0, count + count_httpbin):
        table = self.name if (i < count) else config.httpbin_table
        id = ids[i] if i < count else ids_httpbin[i - len(ids)]
        proxy = self.sql.get_proxy_with_id(table, id)
        if proxy == None:
            continue

        for url in self.urls:
            cur_time = time.time()
            yield FormRequest(
                url=url,
                headers=self.headers,
                method='POST',
                meta={
                    'cur_time': cur_time,
                    'download_timeout': self.timeout,
                    'proxy_info': proxy,
                    'table': table,
                    'id': proxy.id,
                    'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
                    'vali_count': proxy.vali_count,
                },
                cookies={
                    'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488937030',
                    '_ga': 'GA1.2.40497390.1488937014',
                    'TG-TRACK-CODE': 'search_code',
                    'index_location_city': '%E5%8C%97%E4%BA%AC',
                    'LGRID': '20170308093710-bf6755eb-039f-11e7-8025-525400f775ce',
                    'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488881288,1488936799,1488936947,1488937014',
                    'JSESSIONID': 'BDCBB6167F960CE43AF54B75A651F586',
                    'LGSID': '20170308093653-b59316f0-039f-11e7-9229-5254005c3644',
                    'LGUID': '20170308093653-b593185f-039f-11e7-9229-5254005c3644',
                    'user_trace_token': '20170308093654-723efcfac8fb4c28a670d073d5113e02',
                    'SEARCH_ID': '4db4dc3dea1c46b49018ae5421b53ffa'
                },
                formdata={
                    'first': 'true',
                    'kd': 'ios',
                    'pn': '1',
                },
                dont_filter=True,
                callback=self.success_parse,
                errback=self.error_parse,
            )