Python scrapy.http.Request() Examples
The following are 30 code examples of scrapy.http.Request(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.http, or try the search function.
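Before the examples, here is a minimal, self-contained sketch of the pattern most of them use: build a Request with a url, optional headers and meta, and a callback, then yield it from a parse method. The spider name, URL, and parse_page callback below are illustrative assumptions, not taken from any example on this page.

import scrapy
from scrapy.http import Request


class ExampleSpider(scrapy.Spider):
    # hypothetical spider, used only to illustrate the Request API
    name = "example"
    start_urls = ["http://example.com/"]

    def parse(self, response):
        # follow each link, passing state through meta and setting a
        # Referer header, as many of the examples below do
        for href in response.xpath("//a/@href").extract():
            yield Request(
                url=response.urljoin(href),
                headers={"Referer": response.url},
                meta={"source": response.url},
                callback=self.parse_page)

    def parse_page(self, response):
        # meta set on the Request is available again on the response
        self.logger.info("crawled %s (from %s)",
                         response.url, response.meta["source"])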
Example #1
Source File: spider.py From google-scholar-crawler with Apache License 2.0 | 7 votes

def parse_1(self, response):
    info('Parse '+response.url)
    #sel = Selector(response)
    #v = sel.css('.gs_ggs a::attr(href)').extract()
    #import pdb; pdb.set_trace()
    x = self.parse_with_rules(response, self.list_css_rules, dict)
    items = []
    if len(x) > 0:
        items = x[0]['.gs_r']
    pp.pprint(items)
    import pdb; pdb.set_trace()
    # return self.parse_with_rules(response, self.css_rules, googlescholarItem)
    for item in items:
        if item['related-url'] == '' or item['related-type'] != '[PDF]':
            continue
        url = item['related-url']
        info('pdf-url: ' + url)
        yield Request(url, callback=self.save_pdf)
Example #2
Source File: belkin.py From scraper with MIT License | 6 votes

def parse(self, response):
    if not response.xpath(
            "//form[@id='productSearchForm']//input[@name='category']/@value").extract()[0]:
        for category in response.xpath("//form[@id='productSearchForm']/div[1]//ul[@class='select-options']//a/@data-id").extract():
            yield FormRequest.from_response(response,
                                            formname="productSearchForm",
                                            formdata={"category": category},
                                            callback=self.parse)
    elif not response.xpath("//form[@id='productSearchForm']//input[@name='subCategory']/@value").extract()[0]:
        for subcategory in response.xpath("//form[@id='productSearchForm']/div[2]//ul[@class='select-options']//a/@data-id").extract():
            yield FormRequest.from_response(response,
                                            formname="productSearchForm",
                                            formdata={"subCategory": subcategory},
                                            callback=self.parse)
    else:
        for product in response.xpath("//form[@id='productSearchForm']/div[3]//ul[@class='select-options']//a/@data-id").extract():
            yield Request(
                url=urlparse.urljoin(
                    response.url, "/us/support-product?pid=%s" % (product)),
                headers={"Referer": response.url},
                callback=self.parse_product)
Example #3
Source File: test_wandering_spider.py From scrapy-cluster with MIT License | 6 votes

def evaluate(self, meta_object, text, expected_raw, expected_requests):
    request = Request(url='http://www.drudgereport.com',
                      meta=meta_object)
    response = HtmlResponse('drudge.url', body=text, request=request,
                            encoding='utf8')

    raw_item_count = 0
    request_count = 0

    for x in self.spider.parse(response):
        if isinstance(x, RawResponseItem):
            raw_item_count = raw_item_count + 1
        elif isinstance(x, Request):
            request_count = request_count + 1

    self.assertEqual(raw_item_count, expected_raw)
    self.assertEqual(request_count, expected_requests)
Example #4
Source File: tenda_zh.py From scraper with MIT License | 6 votes

def parse(self, response):
    for a in response.xpath("//dd/a"):
        url = a.xpath("./@href").extract()[0]
        text = a.xpath("./text()").extract()[0]
        items = text.split(u'升级软件')
        version = items[-1].strip()
        product = items[0].strip().split(u'(')[0].split(' ')[0]
        yield Request(
            url=self.base_url.format(url),
            headers={"Referer": response.url},
            meta={
                "product": product,
                "version": version,
            },
            callback=self.parse_product)
Example #5
Source File: test_distributed_scheduler.py From scrapy-cluster with MIT License | 6 votes

def get_request(self):
    req = None

    # required
    req = Request('http://ex.com')
    req.meta['crawlid'] = "abc123"
    req.meta['appid'] = "myapp"
    req.meta['url'] = "http://ex.com"
    req.meta['spiderid'] = "link"
    req.meta["attrs"] = None
    req.meta["allowed_domains"] = None
    req.meta["allow_regex"] = None
    req.meta["deny_regex"] = None
    req.meta["deny_extensions"] = None
    req.meta['curdepth'] = 0
    req.meta["maxdepth"] = 0
    req.meta['priority'] = 0
    req.meta['retry_times'] = 0
    req.meta['expires'] = 0
    req.meta['useragent'] = None
    req.meta['cookie'] = None

    return req
Example #6
Source File: test_link_spider.py From scrapy-cluster with MIT License | 6 votes

def evaluate(self, meta_object, text, expected_raw, expected_requests):
    request = Request(url='http://www.drudgereport.com',
                      meta=meta_object)
    response = HtmlResponse('drudge.url', body=text, request=request,
                            encoding='utf8')

    raw_item_count = 0
    request_count = 0

    for x in self.spider.parse(response):
        if isinstance(x, RawResponseItem):
            raw_item_count = raw_item_count + 1
        elif isinstance(x, Request):
            request_count = request_count + 1

    self.assertEqual(raw_item_count, expected_raw)
    self.assertEqual(request_count, expected_requests)
Example #7
Source File: meta_passthrough_middleware.py From scrapy-cluster with MIT License | 6 votes

def process_spider_output(self, response, result, spider):
    '''
    Ensures the meta data from the response is passed
    through in any Requests generated from the spider
    '''
    self.logger.debug("processing meta passthrough middleware")
    for x in result:
        # only operate on requests
        if isinstance(x, Request):
            self.logger.debug("found request")
            # pass along all known meta fields, only if
            # they were not already set in the spider's new request
            for key in list(response.meta.keys()):
                if key not in x.meta:
                    x.meta[key] = response.meta[key]
        yield x
Example #8
Source File: distributed_scheduler.py From scrapy-cluster with MIT License | 6 votes

def request_to_dict(self, request):
    '''
    Convert Request object to a dict.
    modified from scrapy.utils.reqser
    '''
    req_dict = {
        # urls should be safe (safe_string_url)
        'url': to_unicode(request.url),
        'method': request.method,
        'headers': dict(request.headers),
        'body': request.body,
        'cookies': request.cookies,
        'meta': request.meta,
        '_encoding': request._encoding,
        'priority': request.priority,
        'dont_filter': request.dont_filter,
        # callback/errback are assumed to be a bound instance of the spider
        'callback': None if request.callback is None else request.callback.__name__,
        'errback': None if request.errback is None else request.errback.__name__,
    }
    return req_dict
Example #9
Source File: crawlpy_spider.py From crawlpy with MIT License | 6 votes

def init_request(self):
    """This function is called before crawling starts."""

    # Do not start a request on error,
    # simply return nothing and quit scrapy
    if self.abort:
        return

    logging.info('All set, start crawling with depth: ' + str(self.max_depth))

    # Do a login
    if self.config['login']['enabled']:
        # Start with login first
        logging.info('Login required')
        return Request(url=self.login_url, callback=self.login)
    else:
        # Start with parse function
        logging.info('No login required')
        return Request(url=self.base_url, callback=self.parse)

#----------------------------------------------------------------------
Example #10
Source File: tp-link_en.py From scraper with MIT License | 6 votes

def parse_json(self, response):
    json_response = json.loads(response.body_as_unicode())

    if json_response:
        for entry in json_response:
            yield Request(
                url=urlparse.urljoin(
                    self.base_path,
                    "/getMenuList.html?action=getsubcatlist&catid=%s&appPath=us" % entry["id"]),
                meta={"cid": entry["id"]},
                headers={"Referer": response.url,
                         "X-Requested-With": "XMLHttpRequest"},
                callback=self.parse_json)
    else:
        yield Request(
            url=urlparse.urljoin(
                self.base_path,
                "phppage/down-load-model-list.html?showEndLife=false&catid={}&appPath=us".format(response.meta["cid"])),
            headers={"Referer": response.url,
                     "X-Requested-With": "XMLHttpRequest"},
            callback=self.parse_products)
Example #11
Source File: pornHubSpider.py From PornHubBot with MIT License | 6 votes

def parse_ph_key(self, response):
    selector = Selector(response)
    logging.debug('request url:------>' + response.url)
    # logging.info(selector)
    divs = selector.xpath('//div[@class="phimage"]')
    for div in divs:
        viewkey = re.findall('viewkey=(.*?)"', div.extract())
        # logging.debug(viewkey)
        yield Request(url='https://www.pornhub.com/embed/%s' % viewkey[0],
                      callback=self.parse_ph_info)
    url_next = selector.xpath('//a[@class="orangeButton" and text()="Next"]/@href').extract()
    # logging.debug(url_next)
    if url_next:
        # if self.test:
        logging.debug(' next page:---------->' + self.host + url_next[0])
        yield Request(url=self.host + url_next[0],
                      callback=self.parse_ph_key)
        # self.test = False
Example #12
Source File: tp-link_en.py From scraper with MIT License | 6 votes

def parse_product_version(self, response):
    # <div class="hardware-version">
    if response.xpath("//div[@class=\"hardware-version\"]").extract():
        for i in [1, 2]:
            yield Request(
                url=response.url.replace(".html", "-V{}.html".format(i)),
                meta={"product": response.meta['product'],
                      "version": "V{}".format(int(i) + 1)},
                callback=self.parse_product)
    else:
        # only for v1?
        yield Request(
            url=response.url + "?again=true",
            meta={"product": response.meta['product'],
                  "version": "V1"},
            callback=self.parse_product)
Example #13
Source File: assetstore.py From IPProxyTool with MIT License | 6 votes

def start_requests(self):
    url = 'https://www.assetstore.unity3d.com/login'
    yield Request(
        url=url,
        headers={
            'Accept': 'application/json',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.assetstore.unity3d.com',
            'Referer': 'https://www.assetstore.unity3d.com/en/',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
                          'Firefox/50.0',
            'X-Kharma-Version': '0',
            'X-Requested-With': 'UnityAssetStore',
            'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41',
        },
        meta={},
        dont_filter=True,
        callback=self.get_unity_version,
        errback=self.error_parse,
    )
Example #14
Source File: se.py From scraper with MIT License | 6 votes

def parse_product(self, response):
    # Find the "Software and Firmware" tab link to get to the
    # product-range-download page
    meta = response.meta
    meta['dont_redirect'] = True
    for link in response.css('a.tab-link'):
        href = link.xpath('@href').extract_first()
        if href.endswith(u'software-firmware-tab'):
            logging.debug("Requesting SW+FW page for %s at %s",
                          response.meta['product'],
                          urlparse.urljoin(response.url, href))
            yield Request(
                url=urlparse.urljoin(response.url, href),
                headers={"Referer": response.url},
                meta=meta,
                callback=self.parse_product_sw_fw)
            break
    else:
        logging.debug("Did not find a 'Software and Firmware' tab for %s",
                      response.meta['product'])
Example #15
Source File: se.py From scraper with MIT License | 5 votes

def parse(self, response):
    a_to_z = response.css('ul.product-finder-result')
    for link in a_to_z.xpath('.//a'):
        product = u' '.join(link.xpath('.//text()').extract()).strip()
        href = link.xpath('@href').extract_first()
        yield Request(
            url=urlparse.urljoin(response.url, href),
            headers={"Referer": response.url},
            meta={'product': product},
            callback=self.parse_product)
Example #16
Source File: 360.py From scraper with MIT License | 5 votes

def parse(self, response):
    yield Request(
        url=self.json_url,
        headers={"Referer": response.url},
        callback=self.parse_product)
Example #17
Source File: tomato-shibby.py From scraper with MIT License | 5 votes

def parse(self, response):
    for link in response.xpath("//table//tr"):
        if not link.xpath("./td[2]/a"):
            continue

        text = link.xpath("./td[2]/a/text()").extract()[0]
        href = link.xpath("./td[2]//@href").extract()[0]

        if ".." in href:
            continue
        elif href.endswith('/'):
            build = response.meta.get("build", None)
            product = response.meta.get("product", None)

            if not product:
                product = text
            elif not build:
                build = text.replace("build", "")

            yield Request(
                url=urlparse.urljoin(response.url, href),
                headers={"Referer": response.url},
                meta={"build": build, "product": product},
                callback=self.parse)
        elif any(href.endswith(x) for x in [".bin", ".elf", ".fdt", ".imx", ".chk", ".trx"]):
            item = FirmwareLoader(
                item=FirmwareImage(), response=response, date_fmt=["%Y-%m-%d"])
            item.add_value("build", response.meta["build"])
            item.add_value("url", href)
            item.add_value("version", FirmwareLoader.find_version_period(
                os.path.splitext(text)[0].split("-")))
            item.add_value("date", item.find_date(
                link.xpath("./td[3]/text()").extract()))
            item.add_value("product", response.meta["product"])
            item.add_value("vendor", self.name)
            yield item.load_item()
Example #18
Source File: tp-link_en.py From scraper with MIT License | 5 votes

def parse_products(self, response):
    json_response = json.loads(response.body_as_unicode())
    if json_response:
        # description = json_response[0]['title']
        for row in json_response[0]['row']:
            yield Request(
                url=urlparse.urljoin(self.base_path, row['href']),
                meta={"product": row['model']},
                callback=self.parse_product_version)
Example #19
Source File: phicomm.py From scraper with MIT License | 5 votes

def parse(self, response):
    head = False
    for tr in response.xpath("//table//tr"):
        if not head:
            head = True
            continue
        description = tr.xpath("./td[2]/text()").extract()[0]
        product = description.split(u'(')[0]
        version = tr.xpath("./td[4]/text()").extract()[0]
        # 2017-03-14
        date = tr.xpath("./td[6]/p/text()").extract()[0]
        downloadid = tr.xpath("./td[7]/a/@downloadid").extract()[0]
        # http://www.phicomm.com/cn/support.php/Mobile/Downhit.html?id=437
        firmware_url = "http://www.phicomm.com/cn/support.php/Mobile/Downhit.html?id={}".format(downloadid)
        yield Request(
            url=firmware_url,
            headers={"Referer": response.url},
            meta={
                "product": product,
                "version": version,
                "date": date,
                "description": description,
            },
            callback=self.parse_product)
Example #20
Source File: actiontec.py From scraper with MIT License | 5 votes

def parse(self, response):
    for link in response.xpath("//div[@class='newboxes2']//a"):
        product = link.xpath(".//text()").extract()[0]

        # some product strings are e.g. "(GT701-WRU) - 54 Mbps Wireless
        # Cable/DSL Router"
        actual = re.match(r"\(([\w ,\\/()-]+?)\)", product)
        if actual:
            product = actual.group(1).replace("(", "").replace(")", "")

        yield Request(
            url=urlparse.urljoin(
                response.url, link.xpath(".//@href").extract()[0]),
            headers={"Referer": response.url},
            meta={"product": product},
            callback=self.parse_product)
Example #21
Source File: airlink101.py From scraper with MIT License | 5 votes

def parse(self, response):
    for entry in response.xpath(
            "//div[@class='menu2']//table//table//table[2]//td[1]//td[2]"):
        desc = entry.xpath(".//text()").extract()
        for link in entry.xpath(".//a"):
            href = link.xpath("./@href").extract()[0]
            text = link.xpath(".//text()").extract()[0]

            if "_a=download" not in href:
                yield Request(
                    url=urlparse.urljoin(response.url, href),
                    headers={"Referer": response.url},
                    meta={"product": text.strip().split(' ')},
                    callback=self.parse)
            elif "firmware" in text.lower() or "f/w" in text.lower():
                item = FirmwareLoader(item=FirmwareImage(), response=response,
                                      date_fmt=["%m/%d/%Y", "%m/%d/%y"])
                item.add_value("version", FirmwareLoader.find_version(desc))
                item.add_value("date", item.find_date(desc))
                item.add_value("description", text)
                item.add_value("url", href)
                item.add_value("product", response.meta["product"])
                item.add_value("vendor", self.name)
                yield item.load_item()
Example #22
Source File: camius.py From scraper with MIT License | 5 votes

def parse(self, response):
    for link in response.xpath('//a'):
        text = link.xpath('text()').extract_first()
        href = link.xpath('@href').extract_first()
        if text is None or href in self.start_urls or "firmware" not in text.lower():
            continue
        yield Request(
            url=urlparse.urljoin(response.url, href),
            headers={"Referer": response.url},
            meta={"product": text},
            callback=self.parse_product_firmware)
Example #23
Source File: xerox.py From scraper with MIT License | 5 votes

def parse(self, response):
    for href in response.xpath(
            "//div[@class='productResults a2z']//a/@href").extract():
        if "downloads" in href:
            yield Request(
                url=urlparse.urljoin(response.url, href),
                headers={"Referer": response.url},
                callback=self.parse_download)
Example #24
Source File: huawei_zh.py From scraper with MIT License | 5 votes

def parse_product(self, response):
    json_response = json.loads(response.body_as_unicode())
    for product in json_response:
        yield Request(
            url=urlparse.urljoin(
                response.url,
                "/support/services/service/file/list?productID=%s&siteCode=%s" % (
                    product["productId"], self.region)),
            meta={"product": product["productCode"]},
            headers={"Referer": response.url,
                     "X-Requested-With": "XMLHttpRequest"},
            callback=self.parse_download)
Example #25
Source File: huawei_zh.py From scraper with MIT License | 5 votes

def parse_category(self, response):
    json_response = json.loads(response.body_as_unicode())
    for category in json_response:
        yield Request(
            url=urlparse.urljoin(
                response.url,
                "/support/services/service/product/list?productID=%s&siteCode=%s" % (
                    category["productId"], self.region)),
            headers={"Referer": response.url,
                     "X-Requested-With": "XMLHttpRequest"},
            callback=self.parse_product)
Example #26
Source File: huawei_zh.py From scraper with MIT License | 5 votes

def parse(self, response):
    yield Request(
        url=urlparse.urljoin(
            response.url,
            "/support/services/service/product/category?siteCode=%s" % (self.region)),
        headers={"Referer": response.url,
                 "X-Requested-With": "XMLHttpRequest"},
        callback=self.parse_category)
Example #27
Source File: openwrt.py From scraper with MIT License | 5 votes

def parse(self, response):
    for link in response.xpath("//a"):
        text = link.xpath("text()").extract_first()
        href = link.xpath("@href").extract_first()
        if text is None and href == u"/":
            # <a href="/"><em>(root)</em></a>
            continue
        yield Request(
            url=urlparse.urljoin(response.url, href),
            headers={"Referer": response.url},
            meta={"version": FirmwareLoader.find_version_period(text)},
            callback=self.parse_url)
Example #28
Source File: tenda_en.py From scraper with MIT License | 5 votes

def parse(self, response):
    for page in response.xpath("//div[@class='next-page']/a/text()").extract():
        yield Request(
            url=self.url.format(page),
            headers={"Referer": response.url,
                     "X-Requested-With": "XMLHttpRequest"},
            callback=self.parse_product)
Example #29
Source File: centurylink.py From scraper with MIT License | 5 votes

def parse(self, response):
    product = None
    for section in response.xpath("//div[@class='product-content']/div[@class='product-box2']/div"):
        text = section.xpath(".//text()").extract()
        if not section.xpath(".//a"):
            product = text[0].strip()
        else:
            for link in section.xpath(".//a/@href").extract():
                if link.endswith(".html"):
                    yield Request(
                        url=urlparse.urljoin(response.url, link),
                        meta={"product": product,
                              "version": FirmwareLoader.find_version(text)},
                        headers={"Referer": response.url},
                        callback=self.parse_download)
Example #30
Source File: foscam.py From scraper with MIT License | 5 votes

def start_requests(self):
    for url in self.start_urls:
        yield Request(url, cookies={'loginEmail': "@.com"}, dont_filter=True)