Python scrapy.http.Request() Examples

The following are 30 code examples of scrapy.http.Request(), drawn from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the scrapy.http module.
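For orientation, here is a minimal, self-contained sketch of the constructor in use; the spider name, URLs, and meta key are placeholders rather than code from any project below.

from scrapy import Spider
from scrapy.http import Request


class ExampleSpider(Spider):
    name = "example"
    start_urls = ["http://example.com"]

    def parse(self, response):
        # Follow every link on the page; meta carries state to the callback.
        for href in response.xpath("//a/@href").extract():
            yield Request(
                url=response.urljoin(href),
                meta={"referer": response.url},
                callback=self.parse_page)

    def parse_page(self, response):
        self.logger.info("visited %s (from %s)",
                         response.url, response.meta["referer"])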
Example #1
Source File: spider.py    From google-scholar-crawler with Apache License 2.0
def parse_1(self, response):
        info('Parse ' + response.url)
        #sel = Selector(response)
        #v = sel.css('.gs_ggs a::attr(href)').extract()
        #import pdb; pdb.set_trace()
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        items = []
        if len(x) > 0:
            items = x[0]['.gs_r']
            pp.pprint(items)
        # import pdb; pdb.set_trace()  # leftover debug breakpoint, disabled
        # return self.parse_with_rules(response, self.css_rules, googlescholarItem)

        for item in items:
            if item['related-url'] == '' or item['related-type'] != '[PDF]':
                continue
            url = item['related-url']
            info('pdf-url: ' + url)
            yield Request(url, callback=self.save_pdf) 
Example #2
Source File: belkin.py    From scraper with MIT License
def parse(self, response):
        if not response.xpath(
                "//form[@id='productSearchForm']//input[@name='category']/@value").extract()[0]:
            for category in response.xpath("//form[@id='productSearchForm']/div[1]//ul[@class='select-options']//a/@data-id").extract():
                yield FormRequest.from_response(response,
                                                formname="productSearchForm",
                                                formdata={
                                                    "category": category},
                                                callback=self.parse)
        elif not response.xpath("//form[@id='productSearchForm']//input[@name='subCategory']/@value").extract()[0]:
            for subcategory in response.xpath("//form[@id='productSearchForm']/div[2]//ul[@class='select-options']//a/@data-id").extract():
                yield FormRequest.from_response(response,
                                                formname="productSearchForm",
                                                formdata={
                                                    "subCategory": subcategory},
                                                callback=self.parse)
        else:
            for product in response.xpath("//form[@id='productSearchForm']/div[3]//ul[@class='select-options']//a/@data-id").extract():
                yield Request(
                    url=urlparse.urljoin(
                        response.url, "/us/support-product?pid=%s" % (product)),
                    headers={"Referer": response.url},
                    callback=self.parse_product) 
Example #3
Source File: test_wandering_spider.py    From scrapy-cluster with MIT License
def evaluate(self, meta_object,
                text, expected_raw, expected_requests):
        request = Request(url='http://www.drudgereport.com',
                          meta=meta_object)
        response = HtmlResponse('drudge.url', body=text, request=request,
                                encoding='utf8')

        raw_item_count = 0
        request_count = 0

        for x in self.spider.parse(response):
            if isinstance(x, RawResponseItem):
                raw_item_count = raw_item_count + 1
            elif isinstance(x, Request):
                request_count = request_count + 1

        self.assertEqual(raw_item_count, expected_raw)
        self.assertEqual(request_count, expected_requests) 
Example #4
Source File: tenda_zh.py    From scraper with MIT License
def parse(self, response):
        for a in response.xpath("//dd/a"):
            url = a.xpath("./@href").extract()[0]
            text = a.xpath("./text()").extract()[0]

            items = text.split(u'升级软件')
            version = items[-1].strip()
            product = items[0].strip().split(u'(')[0].split(' ')[0]

            yield Request(
                url=self.base_url.format(url),
                headers={"Referer": response.url},
                meta={
                    "product": product,
                    "version": version,
                },
                callback=self.parse_product) 
Example #5
Source File: test_distributed_scheduler.py    From scrapy-cluster with MIT License
def get_request(self):
        req = None

        # required
        req = Request('http://ex.com')
        req.meta['crawlid'] = "abc123"
        req.meta['appid'] = "myapp"

        req.meta['url'] = "http://ex.com"
        req.meta['spiderid'] = "link"
        req.meta["attrs"] = None
        req.meta["allowed_domains"] = None
        req.meta["allow_regex"] = None
        req.meta["deny_regex"] = None
        req.meta["deny_extensions"] = None
        req.meta['curdepth'] = 0
        req.meta["maxdepth"] = 0
        req.meta['priority'] = 0
        req.meta['retry_times'] = 0
        req.meta['expires'] = 0
        req.meta['useragent'] = None
        req.meta['cookie'] = None

        return req 
Example #6
Source File: test_link_spider.py    From scrapy-cluster with MIT License
def evaluate(self, meta_object,
                text, expected_raw, expected_requests):
        request = Request(url='http://www.drudgereport.com',
                          meta=meta_object)
        response = HtmlResponse('drudge.url', body=text, request=request,
                                encoding='utf8')

        raw_item_count = 0
        request_count = 0

        for x in self.spider.parse(response):
            if isinstance(x, RawResponseItem):
                raw_item_count = raw_item_count + 1
            elif isinstance(x, Request):
                request_count = request_count + 1

        self.assertEqual(raw_item_count, expected_raw)
        self.assertEqual(request_count, expected_requests) 
Example #7
Source File: meta_passthrough_middleware.py    From scrapy-cluster with MIT License
def process_spider_output(self, response, result, spider):
        '''
        Ensures the meta data from the response is passed
        through in any Requests generated by the spider
        '''
        self.logger.debug("processing meta passthrough middleware")
        for x in result:
            # only operate on requests
            if isinstance(x, Request):
                self.logger.debug("found request")
                # pass along all known meta fields, only if
                # they were not already set in the spider's new request
                for key in list(response.meta.keys()):
                    if key not in x.meta:
                        x.meta[key] = response.meta[key]
            yield x 
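For context, a spider middleware like this is enabled via the SPIDER_MIDDLEWARES setting; the dotted path below is illustrative, not the actual scrapy-cluster module layout.

# settings.py -- a minimal sketch; adjust the dotted path to your project
SPIDER_MIDDLEWARES = {
    "myproject.middlewares.MetaPassthroughMiddleware": 543,
}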
Example #8
Source File: distributed_scheduler.py    From scrapy-cluster with MIT License
def request_to_dict(self, request):
        '''
        Convert Request object to a dict.
        modified from scrapy.utils.reqser
        '''
        req_dict = {
            # urls should be safe (safe_string_url)
            'url': to_unicode(request.url),
            'method': request.method,
            'headers': dict(request.headers),
            'body': request.body,
            'cookies': request.cookies,
            'meta': request.meta,
            '_encoding': request._encoding,
            'priority': request.priority,
            'dont_filter': request.dont_filter,
            # callback/errback are assumed to be bound methods of the spider
            'callback': None if request.callback is None else request.callback.__name__,
            'errback': None if request.errback is None else request.errback.__name__,
        }
        return req_dict 
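The reverse conversion is not part of this excerpt; a minimal sketch of it, assuming (as the comment above does) that callback and errback names resolve to bound methods on the spider, might look like:

def request_from_dict(self, req_dict, spider):
    # Rebuild a Request from the dict produced by request_to_dict();
    # callback/errback names are looked up on the spider instance.
    callback = req_dict['callback']
    errback = req_dict['errback']
    return Request(
        url=req_dict['url'],
        method=req_dict['method'],
        headers=req_dict['headers'],
        body=req_dict['body'],
        cookies=req_dict['cookies'],
        meta=req_dict['meta'],
        encoding=req_dict['_encoding'],
        priority=req_dict['priority'],
        dont_filter=req_dict['dont_filter'],
        callback=getattr(spider, callback) if callback else None,
        errback=getattr(spider, errback) if errback else None)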
Example #9
Source File: crawlpy_spider.py    From crawlpy with MIT License
def init_request(self):
        """This function is called before crawling starts."""

        # Do not start a request on error,
        # simply return nothing and quit scrapy
        if self.abort:
            return

        logging.info('All set, start crawling with depth: ' + str(self.max_depth))

        # Do a login
        if self.config['login']['enabled']:
            # Start with login first
            logging.info('Login required')
            return Request(url=self.login_url, callback=self.login)
        else:
            # Start with the parse function
            logging.info('No login required')
            return Request(url=self.base_url, callback=self.parse)



    #---------------------------------------------------------------------- 
Example #10
Source File: tp-link_en.py    From scraper with MIT License
def parse_json(self, response):
        json_response = json.loads(response.body_as_unicode())

        if json_response:
            for entry in json_response:
                yield Request(
                    url=urlparse.urljoin(
                        self.base_path, "/getMenuList.html?action=getsubcatlist&catid=%s&appPath=us" % entry["id"]),
                    meta={"cid": entry["id"]},
                    headers={"Referer": response.url,
                             "X-Requested-With": "XMLHttpRequest"},
                    callback=self.parse_json)
        else:
            yield Request(
                url=urlparse.urljoin(
                    self.base_path, "phppage/down-load-model-list.html?showEndLife=false&catid={}&appPath=us".format(response.meta["cid"])),
                headers={"Referer": response.url,
                         "X-Requested-With": "XMLHttpRequest"},
                callback=self.parse_products) 
Example #11
Source File: pornHubSpider.py    From PornHubBot with MIT License
def parse_ph_key(self, response):
        selector = Selector(response)
        logging.debug('request url:------>' + response.url)
        # logging.info(selector)
        divs = selector.xpath('//div[@class="phimage"]')
        for div in divs:
            viewkey = re.findall('viewkey=(.*?)"',div.extract())
            # logging.debug(viewkey)
            yield Request(url='https://www.pornhub.com/embed/%s' % viewkey[0], callback=self.parse_ph_info)
        url_next = selector.xpath('//a[@class="orangeButton" and text()="Next"]/@href').extract()
        # logging.debug(url_next)
        if url_next:
        # if self.test:
            logging.debug(' next page:---------->' + self.host+url_next[0])
            yield Request(url=self.host + url_next[0], callback=self.parse_ph_key)
            # self.test = False 
Example #12
Source File: tp-link_en.py    From scraper with MIT License
def parse_product_version(self, response):
        # <div class="hardware-version">
        if response.xpath("//div[@class=\"hardware-version\"]").extract():
            for i in [1, 2]:
                yield Request(
                    url=response.url.replace(".html", "-V{}.html".format(i)),
                    meta={"product": response.meta['product'],
                          "version": "V{}".format(i + 1),
                          },
                    callback=self.parse_product)

        else:  # only for v1?
            yield Request(
                url=response.url + "?again=true",
                meta={"product": response.meta['product'],
                      "version": "V1"
                      },
                callback=self.parse_product)
Example #13
Source File: assetstore.py    From IPProxyTool with MIT License
def start_requests(self):
        url = 'https://www.assetstore.unity3d.com/login'
        yield Request(
                url=url,
                headers={
                    'Accept': 'application/json',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                    'Connection': 'keep-alive',
                    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                    'Host': 'www.assetstore.unity3d.com',
                    'Referer': 'https://www.assetstore.unity3d.com/en/',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
                                  'Firefox/50.0',
                    'X-Kharma-Version': '0',
                    'X-Requested-With': 'UnityAssetStore',
                    'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41',
                },
                meta={},
                dont_filter=True,
                callback=self.get_unity_version,
                errback=self.error_parse,
        )
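The errback named here receives a twisted Failure rather than a Response; error_parse itself is not shown in this excerpt, but a minimal handler sketch (the method body is an assumption) would be:

def error_parse(self, failure):
    # failure is a twisted.python.failure.Failure; for network-level
    # errors, failure.request is the Request that could not be fetched.
    self.logger.error('request failed: %s', repr(failure))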
Example #14
Source File: se.py    From scraper with MIT License
def parse_product(self, response):
        # Find the "Software and Firmware" tab link to get to the product-range-download page
        meta = response.meta
        meta['dont_redirect'] = True
        for link in response.css('a.tab-link'):
            href = link.xpath('@href').extract_first()
            if href.endswith(u'software-firmware-tab'):
                logging.debug("Requesting SW+FW page for %s at %s",
                        response.meta['product'], urlparse.urljoin(response.url, href))

                yield Request(
                    url=urlparse.urljoin(response.url, href),
                    headers={"Referer": response.url},
                    meta=meta,
                    callback=self.parse_product_sw_fw)

                break
        else:
            logging.debug("Did not find a 'Software and Firmware' tab for %s",
                    response.meta['product']) 
Example #15
Source File: se.py    From scraper with MIT License
def parse(self, response):
        a_to_z = response.css('ul.product-finder-result')
        for link in a_to_z.xpath('.//a'):
            product = u' '.join(link.xpath('.//text()').extract()).strip()
            href = link.xpath('@href').extract_first()

            yield Request(
                url=urlparse.urljoin(response.url, href),
                headers={"Referer": response.url},
                meta={'product': product},
                callback=self.parse_product) 
Example #16
Source File: 360.py    From scraper with MIT License
def parse(self, response):
        yield Request(
            url=self.json_url,
            headers={"Referer": response.url},
            callback=self.parse_product) 
Example #17
Source File: tomato-shibby.py    From scraper with MIT License
def parse(self, response):
        for link in response.xpath("//table//tr"):
            if not link.xpath("./td[2]/a"):
                continue

            text = link.xpath("./td[2]/a/text()").extract()[0]
            href = link.xpath("./td[2]//@href").extract()[0]

            if ".." in href:
                continue
            elif href.endswith('/'):
                build = response.meta.get("build", None)
                product = response.meta.get("product", None)

                if not product:
                    product = text
                elif not build:
                    build = text.replace("build", "")

                yield Request(
                    url=urlparse.urljoin(response.url, href),
                    headers={"Referer": response.url},
                    meta={"build": build, "product": product},
                    callback=self.parse)
            elif any(href.endswith(x) for x in [".bin", ".elf", ".fdt", ".imx", ".chk", ".trx"]):
                item = FirmwareLoader(
                    item=FirmwareImage(), response=response, date_fmt=["%Y-%m-%d"])
                item.add_value("build", response.meta["build"])
                item.add_value("url", href)
                item.add_value("version", FirmwareLoader.find_version_period(
                    os.path.splitext(text)[0].split("-")))
                item.add_value("date", item.find_date(
                    link.xpath("./td[3]/text()").extract()))
                item.add_value("product", response.meta["product"])
                item.add_value("vendor", self.name)
                yield item.load_item() 
Example #18
Source File: tp-link_en.py    From scraper with MIT License
def parse_products(self, response):
        json_response = json.loads(response.body_as_unicode()) 
        if json_response:
            #description = json_response[0]['title']
            for row in json_response[0]['row']:
                yield Request(
                    url=urlparse.urljoin(self.base_path, row['href']),
                    meta={"product": row['model']},
                    callback=self.parse_product_version)
Example #19
Source File: phicomm.py    From scraper with MIT License
def parse(self, response):
        head = False
        for tr in response.xpath("//table//tr"):
            if not head:
                head = True
                continue

            description = tr.xpath("./td[2]/text()").extract()[0]
            product = description.split(u'(')[0]
            version = tr.xpath("./td[4]/text()").extract()[0]
            # date format: 2017-03-14
            date = tr.xpath("./td[6]/p/text()").extract()[0]
            downloadid = tr.xpath("./td[7]/a/@downloadid").extract()[0]

            #http://www.phicomm.com/cn/support.php/Mobile/Downhit.html?id=437
            firmware_url = "http://www.phicomm.com/cn/support.php/Mobile/Downhit.html?id={}".format(downloadid)
            yield Request(
                url=firmware_url,
                headers={"Referer": response.url},
                meta={
                    "product": product,
                    "version": version,
                    "date": date,
                    "description": description
                },
                callback=self.parse_product)
Example #20
Source File: actiontec.py    From scraper with MIT License
def parse(self, response):
        for link in response.xpath("//div[@class='newboxes2']//a"):
            product = link.xpath(".//text()").extract()[0]
            # some product strings are e.g. "(GT701-WRU) - 54 Mbps Wireless
            # Cable/DSL Router"
            actual = re.match(r"\(([\w ,\\/()-]+?)\)", product)
            if actual:
                product = actual.group(1).replace("(", "").replace(")", "")

            yield Request(
                url=urlparse.urljoin(
                    response.url, link.xpath(".//@href").extract()[0]),
                headers={"Referer": response.url},
                meta={"product": product},
                callback=self.parse_product) 
Example #21
Source File: airlink101.py    From scraper with MIT License
def parse(self, response):
        for entry in response.xpath(
                "//div[@class='menu2']//table//table//table[2]//td[1]//td[2]"):
            desc = entry.xpath(".//text()").extract()

            for link in entry.xpath(".//a"):
                href = link.xpath("./@href").extract()[0]
                text = link.xpath(".//text()").extract()[0]

                if "_a=download" not in href:
                    yield Request(
                        url=urlparse.urljoin(response.url, href),
                        headers={"Referer": response.url},
                        meta={"product": text.strip().split(' ')},
                        callback=self.parse)
                elif "firmware" in text.lower() or "f/w" in text.lower():
                    item = FirmwareLoader(item=FirmwareImage(),
                                          response=response,
                                          date_fmt=["%m/%d/%Y", "%m/%d/%y"])
                    item.add_value("version", FirmwareLoader.find_version(desc))
                    item.add_value("date", item.find_date(desc))
                    item.add_value("description", text)
                    item.add_value("url", href)
                    item.add_value("product", response.meta["product"])
                    item.add_value("vendor", self.name)
                    yield item.load_item() 
Example #22
Source File: camius.py    From scraper with MIT License
def parse(self, response):
        for link in response.xpath('//a'):
            text = link.xpath('text()').extract_first()
            href = link.xpath('@href').extract_first()

            if text is None or href in self.start_urls or "firmware" not in text.lower():
                continue

            yield Request(
                url=urlparse.urljoin(response.url, href),
                headers={"Referer": response.url},
                meta={"product": text},
                callback=self.parse_product_firmware) 
Example #23
Source File: xerox.py    From scraper with MIT License
def parse(self, response):
        for href in response.xpath(
                "//div[@class='productResults a2z']//a/@href").extract():
            if "downloads" in href:
                yield Request(
                    url=urlparse.urljoin(response.url, href),
                    headers={"Referer": response.url},
                    callback=self.parse_download) 
Example #24
Source File: huawei_zh.py    From scraper with MIT License
def parse_product(self, response):
        json_response = json.loads(response.body_as_unicode())

        for product in json_response:
            yield Request(
                url=urlparse.urljoin(
                    response.url, "/support/services/service/file/list?productID=%s&siteCode=%s" % (product["productId"], self.region)),
                meta={"product": product["productCode"]},
                headers={"Referer": response.url,
                         "X-Requested-With": "XMLHttpRequest"},
                callback=self.parse_download) 
Example #25
Source File: huawei_zh.py    From scraper with MIT License
def parse_category(self, response):
        json_response = json.loads(response.body_as_unicode())

        for category in json_response:
            yield Request(
                url=urlparse.urljoin(
                    response.url, "/support/services/service/product/list?productID=%s&siteCode=%s" % (category["productId"], self.region)),
                headers={"Referer": response.url,
                         "X-Requested-With": "XMLHttpRequest"},
                callback=self.parse_product) 
Example #26
Source File: huawei_zh.py    From scraper with MIT License
def parse(self, response):
        yield Request(
            url=urlparse.urljoin(
                response.url, "/support/services/service/product/category?siteCode=%s" % (self.region)),
            headers={"Referer": response.url,
                     "X-Requested-With": "XMLHttpRequest"},
            callback=self.parse_category) 
Example #27
Source File: openwrt.py    From scraper with MIT License
def parse(self, response):
        for link in response.xpath("//a"):
            text = link.xpath("text()").extract_first()
            href = link.xpath("@href").extract_first()

            if text is None and href == u"/":
                # <a href="/"><em>(root)</em></a>
                continue

            yield Request(
                url=urlparse.urljoin(response.url, href),
                headers={"Referer": response.url},
                meta={"version": FirmwareLoader.find_version_period(text)},
                callback=self.parse_url) 
Example #28
Source File: tenda_en.py    From scraper with MIT License
def parse(self, response):
        for page in response.xpath("//div[@class='next-page']/a/text()").extract():
            yield Request(
                url=self.url.format(page),
                headers={"Referer": response.url,
                         "X-Requested-With": "XMLHttpRequest"},
                callback=self.parse_product) 
Example #29
Source File: centurylink.py    From scraper with MIT License
def parse(self, response):
        product = None
        for section in response.xpath("//div[@class='product-content']/div[@class='product-box2']/div"):
            text = section.xpath(".//text()").extract()
            if not section.xpath(".//a"):
                product = text[0].strip()
            else:
                for link in section.xpath(".//a/@href").extract():
                    if link.endswith(".html"):
                        yield Request(
                            url=urlparse.urljoin(response.url, link),
                            meta={"product": product,
                                  "version": FirmwareLoader.find_version(text)},
                            headers={"Referer": response.url},
                            callback=self.parse_download) 
Example #30
Source File: foscam.py    From scraper with MIT License
def start_requests(self):
        for url in self.start_urls:
            yield Request(url, cookies={'loginEmail': "@.com"}, dont_filter=True)