Python scrapy.http.Request() Examples
The following are 30 code examples of scrapy.http.Request(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.http, or try the search function.
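Before the examples, here is a minimal, self-contained sketch of the pattern most of them use: build a Request with a url, optional headers and meta, and a callback, then yield it from a parse method. The spider name, URL, and parse_page callback below are illustrative assumptions, not taken from any example on this page.

import scrapy
from scrapy.http import Request


class ExampleSpider(scrapy.Spider):
    # hypothetical spider, used only to illustrate the Request API
    name = "example"
    start_urls = ["http://example.com/"]

    def parse(self, response):
        # follow each link, passing state through meta and setting a
        # Referer header, as many of the examples below do
        for href in response.xpath("//a/@href").extract():
            yield Request(
                url=response.urljoin(href),
                headers={"Referer": response.url},
                meta={"source": response.url},
                callback=self.parse_page)

    def parse_page(self, response):
        # meta set on the Request is available again on the response
        self.logger.info("crawled %s (from %s)",
                         response.url, response.meta["source"])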
Example #1
Source File: spider.py From google-scholar-crawler with Apache License 2.0 | 7 votes

def parse_1(self, response):
    info('Parse '+response.url)
    #sel = Selector(response)
    #v = sel.css('.gs_ggs a::attr(href)').extract()
    #import pdb; pdb.set_trace()
    x = self.parse_with_rules(response, self.list_css_rules, dict)
    items = []
    if len(x) > 0:
        items = x[0]['.gs_r']
    pp.pprint(items)
    import pdb; pdb.set_trace()
    # return self.parse_with_rules(response, self.css_rules, googlescholarItem)
    for item in items:
        if item['related-url'] == '' or item['related-type'] != '[PDF]':
            continue
        url = item['related-url']
        info('pdf-url: ' + url)
        yield Request(url, callback=self.save_pdf)
Example #2
Source File: belkin.py From scraper with MIT License | 6 votes

def parse(self, response):
    if not response.xpath(
            "//form[@id='productSearchForm']//input[@name='category']/@value").extract()[0]:
        for category in response.xpath("//form[@id='productSearchForm']/div[1]//ul[@class='select-options']//a/@data-id").extract():
            yield FormRequest.from_response(response,
                                            formname="productSearchForm",
                                            formdata={"category": category},
                                            callback=self.parse)
    elif not response.xpath("//form[@id='productSearchForm']//input[@name='subCategory']/@value").extract()[0]:
        for subcategory in response.xpath("//form[@id='productSearchForm']/div[2]//ul[@class='select-options']//a/@data-id").extract():
            yield FormRequest.from_response(response,
                                            formname="productSearchForm",
                                            formdata={"subCategory": subcategory},
                                            callback=self.parse)
    else:
        for product in response.xpath("//form[@id='productSearchForm']/div[3]//ul[@class='select-options']//a/@data-id").extract():
            yield Request(
                url=urlparse.urljoin(
                    response.url, "/us/support-product?pid=%s" % (product)),
                headers={"Referer": response.url},
                callback=self.parse_product)
Example #3
Source File: test_wandering_spider.py From scrapy-cluster with MIT License | 6 votes

def evaluate(self, meta_object, text, expected_raw, expected_requests):
    request = Request(url='http://www.drudgereport.com',
                      meta=meta_object)
    response = HtmlResponse('drudge.url', body=text, request=request,
                            encoding='utf8')

    raw_item_count = 0
    request_count = 0

    for x in self.spider.parse(response):
        if isinstance(x, RawResponseItem):
            raw_item_count = raw_item_count + 1
        elif isinstance(x, Request):
            request_count = request_count + 1

    self.assertEqual(raw_item_count, expected_raw)
    self.assertEqual(request_count, expected_requests)
Example #4
Source File: tenda_zh.py From scraper with MIT License | 6 votes

def parse(self, response):
    for a in response.xpath("//dd/a"):
        url = a.xpath("./@href").extract()[0]
        text = a.xpath("./text()").extract()[0]
        items = text.split(u'升级软件')
        version = items[-1].strip()
        product = items[0].strip().split(u'(')[0].split(' ')[0]
        yield Request(
            url=self.base_url.format(url),
            headers={"Referer": response.url},
            meta={
                "product": product,
                "version": version,
            },
            callback=self.parse_product)
Example #5
Source File: test_distributed_scheduler.py From scrapy-cluster with MIT License | 6 votes

def get_request(self):
    req = None

    # required
    req = Request('http://ex.com')
    req.meta['crawlid'] = "abc123"
    req.meta['appid'] = "myapp"
    req.meta['url'] = "http://ex.com"
    req.meta['spiderid'] = "link"
    req.meta["attrs"] = None
    req.meta["allowed_domains"] = None
    req.meta["allow_regex"] = None
    req.meta["deny_regex"] = None
    req.meta["deny_extensions"] = None
    req.meta['curdepth'] = 0
    req.meta["maxdepth"] = 0
    req.meta['priority'] = 0
    req.meta['retry_times'] = 0
    req.meta['expires'] = 0
    req.meta['useragent'] = None
    req.meta['cookie'] = None

    return req
Example #6
Source File: test_link_spider.py From scrapy-cluster with MIT License | 6 votes

def evaluate(self, meta_object, text, expected_raw, expected_requests):
    request = Request(url='http://www.drudgereport.com',
                      meta=meta_object)
    response = HtmlResponse('drudge.url', body=text, request=request,
                            encoding='utf8')

    raw_item_count = 0
    request_count = 0

    for x in self.spider.parse(response):
        if isinstance(x, RawResponseItem):
            raw_item_count = raw_item_count + 1
        elif isinstance(x, Request):
            request_count = request_count + 1

    self.assertEqual(raw_item_count, expected_raw)
    self.assertEqual(request_count, expected_requests)
Example #7
Source File: meta_passthrough_middleware.py From scrapy-cluster with MIT License | 6 votes

def process_spider_output(self, response, result, spider):
    '''
    Ensures the meta data from the response is passed
    through in any Requests generated from the spider
    '''
    self.logger.debug("processing meta passthrough middleware")
    for x in result:
        # only operate on requests
        if isinstance(x, Request):
            self.logger.debug("found request")
            # pass along all known meta fields, only if
            # they were not already set in the spider's new request
            for key in list(response.meta.keys()):
                if key not in x.meta:
                    x.meta[key] = response.meta[key]
        yield x
Example #8
Source File: distributed_scheduler.py From scrapy-cluster with MIT License | 6 votes

def request_to_dict(self, request):
    '''
    Convert Request object to a dict.
    modified from scrapy.utils.reqser
    '''
    req_dict = {
        # urls should be safe (safe_string_url)
        'url': to_unicode(request.url),
        'method': request.method,
        'headers': dict(request.headers),
        'body': request.body,
        'cookies': request.cookies,
        'meta': request.meta,
        '_encoding': request._encoding,
        'priority': request.priority,
        'dont_filter': request.dont_filter,
        # callback/errback are assumed to be a bound instance of the spider
        'callback': None if request.callback is None else request.callback.__name__,
        'errback': None if request.errback is None else request.errback.__name__,
    }
    return req_dict
Example #9
Source File: crawlpy_spider.py From crawlpy with MIT License | 6 votes

def init_request(self):
    """This function is called before crawling starts."""

    # Do not start a request on error,
    # simply return nothing and quit scrapy
    if self.abort:
        return

    logging.info('All set, start crawling with depth: ' + str(self.max_depth))

    # Do a login
    if self.config['login']['enabled']:
        # Start with login first
        logging.info('Login required')
        return Request(url=self.login_url, callback=self.login)
    else:
        # Start with parse function
        logging.info('No login required')
        return Request(url=self.base_url, callback=self.parse)

#----------------------------------------------------------------------
Example #10
Source File: tp-link_en.py From scraper with MIT License | 6 votes

def parse_json(self, response):
    json_response = json.loads(response.body_as_unicode())

    if json_response:
        for entry in json_response:
            yield Request(
                url=urlparse.urljoin(
                    self.base_path,
                    "/getMenuList.html?action=getsubcatlist&catid=%s&appPath=us" % entry["id"]),
                meta={"cid": entry["id"]},
                headers={"Referer": response.url,
                         "X-Requested-With": "XMLHttpRequest"},
                callback=self.parse_json)
    else:
        yield Request(
            url=urlparse.urljoin(
                self.base_path,
                "phppage/down-load-model-list.html?showEndLife=false&catid={}&appPath=us".format(response.meta["cid"])),
            headers={"Referer": response.url,
                     "X-Requested-With": "XMLHttpRequest"},
            callback=self.parse_products)
Example #11
Source File: pornHubSpider.py From PornHubBot with MIT License | 6 votes

def parse_ph_key(self, response):
    selector = Selector(response)
    logging.debug('request url:------>' + response.url)
    # logging.info(selector)
    divs = selector.xpath('//div[@class="phimage"]')
    for div in divs:
        viewkey = re.findall('viewkey=(.*?)"', div.extract())
        # logging.debug(viewkey)
        yield Request(url='https://www.pornhub.com/embed/%s' % viewkey[0],
                      callback=self.parse_ph_info)
    url_next = selector.xpath('//a[@class="orangeButton" and text()="Next"]/@href').extract()
    # logging.debug(url_next)
    if url_next:
        # if self.test:
        logging.debug(' next page:---------->' + self.host + url_next[0])
        yield Request(url=self.host + url_next[0],
                      callback=self.parse_ph_key)
        # self.test = False
Example #12
Source File: tp-link_en.py From scraper with MIT License | 6 votes

def parse_product_version(self, response):
    # <div class="hardware-version">
    if response.xpath("//div[@class=\"hardware-version\"]").extract():
        for i in [1, 2]:
            yield Request(
                url=response.url.replace(".html", "-V{}.html".format(i)),
                meta={"product": response.meta['product'],
                      "version": "V{}".format(int(i) + 1)},
                callback=self.parse_product)
    else:
        # only for v1?
        yield Request(
            url=response.url + "?again=true",
            meta={"product": response.meta['product'],
                  "version": "V1"},
            callback=self.parse_product)
Example #13
Source File: assetstore.py From IPProxyTool with MIT License | 6 votes

def start_requests(self):
    url = 'https://www.assetstore.unity3d.com/login'
    yield Request(
        url=url,
        headers={
            'Accept': 'application/json',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Host': 'www.assetstore.unity3d.com',
            'Referer': 'https://www.assetstore.unity3d.com/en/',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
                          'Firefox/50.0',
            'X-Kharma-Version': '0',
            'X-Requested-With': 'UnityAssetStore',
            'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41',
        },
        meta={},
        dont_filter=True,
        callback=self.get_unity_version,
        errback=self.error_parse,
    )
Example #14
Source File: se.py From scraper with MIT License | 6 votes

def parse_product(self, response):
    # Find the "Software and Firmware" tab link to get to the
    # product-range-download page
    meta = response.meta
    meta['dont_redirect'] = True
    for link in response.css('a.tab-link'):
        href = link.xpath('@href').extract_first()
        if href.endswith(u'software-firmware-tab'):
            logging.debug("Requesting SW+FW page for %s at %s",
                          response.meta['product'],
                          urlparse.urljoin(response.url, href))
            yield Request(
                url=urlparse.urljoin(response.url, href),
                headers={"Referer": response.url},
                meta=meta,
                callback=self.parse_product_sw_fw)
            break
    else:
        logging.debug("Did not find a 'Software and Firmware' tab for %s",
                      response.meta['product'])
Example #15
Source File: se.py From scraper with MIT License | 5 votes

def parse(self, response):
    a_to_z = response.css('ul.product-finder-result')
    for link in a_to_z.xpath('.//a'):
        product = u' '.join(link.xpath('.//text()').extract()).strip()
        href = link.xpath('@href').extract_first()
        yield Request(
            url=urlparse.urljoin(response.url, href),
            headers={"Referer": response.url},
            meta={'product': product},
            callback=self.parse_product)
Example #16
Source File: 360.py From scraper with MIT License | 5 votes

def parse(self, response):
    yield Request(
        url=self.json_url,
        headers={"Referer": response.url},
        callback=self.parse_product)
Example #17
Source File: tomato-shibby.py From scraper with MIT License | 5 votes

def parse(self, response):
    for link in response.xpath("//table//tr"):
        if not link.xpath("./td[2]/a"):
            continue

        text = link.xpath("./td[2]/a/text()").extract()[0]
        href = link.xpath("./td[2]//@href").extract()[0]

        if ".." in href:
            continue
        elif href.endswith('/'):
            build = response.meta.get("build", None)
            product = response.meta.get("product", None)

            if not product:
                product = text
            elif not build:
                build = text.replace("build", "")

            yield Request(
                url=urlparse.urljoin(response.url, href),
                headers={"Referer": response.url},
                meta={"build": build, "product": product},
                callback=self.parse)
        elif any(href.endswith(x) for x in [".bin", ".elf", ".fdt", ".imx", ".chk", ".trx"]):
            item = FirmwareLoader(
                item=FirmwareImage(), response=response, date_fmt=["%Y-%m-%d"])
            item.add_value("build", response.meta["build"])
            item.add_value("url", href)
            item.add_value("version", FirmwareLoader.find_version_period(
                os.path.splitext(text)[0].split("-")))
            item.add_value("date", item.find_date(
                link.xpath("./td[3]/text()").extract()))
            item.add_value("product", response.meta["product"])
            item.add_value("vendor", self.name)
            yield item.load_item()
Example #18
Source File: tp-link_en.py From scraper with MIT License | 5 votes

def parse_products(self, response):
    json_response = json.loads(response.body_as_unicode())
    if json_response:
        # description = json_response[0]['title']
        for row in json_response[0]['row']:
            yield Request(
                url=urlparse.urljoin(self.base_path, row['href']),
                meta={"product": row['model']},
                callback=self.parse_product_version)
Example #19
Source File: phicomm.py From scraper with MIT License | 5 votes

def parse(self, response):
    head = False
    for tr in response.xpath("//table//tr"):
        if not head:
            head = True
            continue
        description = tr.xpath("./td[2]/text()").extract()[0]
        product = description.split(u'(')[0]
        version = tr.xpath("./td[4]/text()").extract()[0]
        # 2017-03-14
        date = tr.xpath("./td[6]/p/text()").extract()[0]
        downloadid = tr.xpath("./td[7]/a/@downloadid").extract()[0]
        # http://www.phicomm.com/cn/support.php/Mobile/Downhit.html?id=437
        firmware_url = "http://www.phicomm.com/cn/support.php/Mobile/Downhit.html?id={}".format(downloadid)
        yield Request(
            url=firmware_url,
            headers={"Referer": response.url},
            meta={
                "product": product,
                "version": version,
                "date": date,
                "description": description,
            },
            callback=self.parse_product)
Example #20
Source File: actiontec.py From scraper with MIT License | 5 votes

def parse(self, response):
    for link in response.xpath("//div[@class='newboxes2']//a"):
        product = link.xpath(".//text()").extract()[0]

        # some product strings are e.g. "(GT701-WRU) - 54 Mbps Wireless
        # Cable/DSL Router"
        actual = re.match(r"\(([\w ,\\/()-]+?)\)", product)
        if actual:
            product = actual.group(1).replace("(", "").replace(")", "")

        yield Request(
            url=urlparse.urljoin(
                response.url, link.xpath(".//@href").extract()[0]),
            headers={"Referer": response.url},
            meta={"product": product},
            callback=self.parse_product)
Example #21
Source File: airlink101.py From scraper with MIT License | 5 votes

def parse(self, response):
    for entry in response.xpath(
            "//div[@class='menu2']//table//table//table[2]//td[1]//td[2]"):
        desc = entry.xpath(".//text()").extract()
        for link in entry.xpath(".//a"):
            href = link.xpath("./@href").extract()[0]
            text = link.xpath(".//text()").extract()[0]

            if "_a=download" not in href:
                yield Request(
                    url=urlparse.urljoin(response.url, href),
                    headers={"Referer": response.url},
                    meta={"product": text.strip().split(' ')},
                    callback=self.parse)
            elif "firmware" in text.lower() or "f/w" in text.lower():
                item = FirmwareLoader(item=FirmwareImage(), response=response,
                                      date_fmt=["%m/%d/%Y", "%m/%d/%y"])
                item.add_value("version", FirmwareLoader.find_version(desc))
                item.add_value("date", item.find_date(desc))
                item.add_value("description", text)
                item.add_value("url", href)
                item.add_value("product", response.meta["product"])
                item.add_value("vendor", self.name)
                yield item.load_item()
Example #22
Source File: camius.py From scraper with MIT License | 5 votes

def parse(self, response):
    for link in response.xpath('//a'):
        text = link.xpath('text()').extract_first()
        href = link.xpath('@href').extract_first()
        if text is None or href in self.start_urls or "firmware" not in text.lower():
            continue
        yield Request(
            url=urlparse.urljoin(response.url, href),
            headers={"Referer": response.url},
            meta={"product": text},
            callback=self.parse_product_firmware)
Example #23
Source File: xerox.py From scraper with MIT License | 5 votes

def parse(self, response):
    for href in response.xpath(
            "//div[@class='productResults a2z']//a/@href").extract():
        if "downloads" in href:
            yield Request(
                url=urlparse.urljoin(response.url, href),
                headers={"Referer": response.url},
                callback=self.parse_download)
Example #24
Source File: huawei_zh.py From scraper with MIT License | 5 votes

def parse_product(self, response):
    json_response = json.loads(response.body_as_unicode())
    for product in json_response:
        yield Request(
            url=urlparse.urljoin(
                response.url,
                "/support/services/service/file/list?productID=%s&siteCode=%s" % (
                    product["productId"], self.region)),
            meta={"product": product["productCode"]},
            headers={"Referer": response.url,
                     "X-Requested-With": "XMLHttpRequest"},
            callback=self.parse_download)
Example #25
Source File: huawei_zh.py From scraper with MIT License | 5 votes

def parse_category(self, response):
    json_response = json.loads(response.body_as_unicode())
    for category in json_response:
        yield Request(
            url=urlparse.urljoin(
                response.url,
                "/support/services/service/product/list?productID=%s&siteCode=%s" % (
                    category["productId"], self.region)),
            headers={"Referer": response.url,
                     "X-Requested-With": "XMLHttpRequest"},
            callback=self.parse_product)
Example #26
Source File: huawei_zh.py From scraper with MIT License | 5 votes

def parse(self, response):
    yield Request(
        url=urlparse.urljoin(
            response.url,
            "/support/services/service/product/category?siteCode=%s" % (self.region)),
        headers={"Referer": response.url,
                 "X-Requested-With": "XMLHttpRequest"},
        callback=self.parse_category)
Example #27
Source File: openwrt.py From scraper with MIT License | 5 votes

def parse(self, response):
    for link in response.xpath("//a"):
        text = link.xpath("text()").extract_first()
        href = link.xpath("@href").extract_first()
        if text is None and href == u"/":
            # <a href="/"><em>(root)</em></a>
            continue
        yield Request(
            url=urlparse.urljoin(response.url, href),
            headers={"Referer": response.url},
            meta={"version": FirmwareLoader.find_version_period(text)},
            callback=self.parse_url)
Example #28
Source File: tenda_en.py From scraper with MIT License | 5 votes

def parse(self, response):
    for page in response.xpath("//div[@class='next-page']/a/text()").extract():
        yield Request(
            url=self.url.format(page),
            headers={"Referer": response.url,
                     "X-Requested-With": "XMLHttpRequest"},
            callback=self.parse_product)
Example #29
Source File: centurylink.py From scraper with MIT License | 5 votes

def parse(self, response):
    product = None
    for section in response.xpath("//div[@class='product-content']/div[@class='product-box2']/div"):
        text = section.xpath(".//text()").extract()
        if not section.xpath(".//a"):
            product = text[0].strip()
        else:
            for link in section.xpath(".//a/@href").extract():
                if link.endswith(".html"):
                    yield Request(
                        url=urlparse.urljoin(response.url, link),
                        meta={"product": product,
                              "version": FirmwareLoader.find_version(text)},
                        headers={"Referer": response.url},
                        callback=self.parse_download)
Example #30
Source File: foscam.py From scraper with MIT License | 5 votes

def start_requests(self):
    for url in self.start_urls:
        yield Request(url, cookies={'loginEmail': "@.com"}, dont_filter=True)