Python scrapy.Request() Examples
The following are 30 code examples of scrapy.Request(), collected from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the scrapy module.
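Before the project-specific examples below, here is a minimal sketch of the pattern most of them follow: build an absolute URL with response.urljoin(), attach a callback, and pass extra data between callbacks through meta. The spider name and start URL are placeholders chosen for illustration, not taken from any of the listed projects.

import scrapy


class ExampleSpider(scrapy.Spider):
    # Placeholder spider name and start URL, used only for illustration.
    name = 'example'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # Follow every link on the page with an explicit callback,
        # carrying the originating URL along in meta.
        for href in response.css('a::attr(href)').extract():
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_detail,
                meta={'source_url': response.url},
            )

    def parse_detail(self, response):
        yield {
            'url': response.url,
            'from': response.meta['source_url'],
            'title': response.css('title::text').extract_first(),
        }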
Example #1
Source File: item_id.py From TaobaoAnalysis with MIT License | 6 votes |
def parse(self, response):
    data = response.xpath('//div[@tms-data]/@tms-data').extract()
    data = [json.loads(cur_data) for cur_data in data]
    tce_ids = []
    for cur_data in data:
        for key in cur_data:
            if not key.startswith('items'):
                continue
            for item in cur_data[key]:
                if not ('tms_type' in item and item['tms_type'] == 'jsonp'):
                    continue
                tce_ids.append([
                    str(item['data_para']['tce_sid']),
                    item['data_para']['tce_vid']
                ])
    if not tce_ids:
        self.logger.warning('没有tce_id "%s"', response.url)
    else:
        for tce_url in self.get_tce_urls(tce_ids):
            yield Request(tce_url, callback=self.parse_item_id)
Example #2
Source File: douban_spider.py From scrapy-tutorial with Apache License 2.0 | 6 votes |
def parse(self, response):
    item = DoubanMovieItem()
    movies = response.xpath('//ol[@class="grid_view"]/li')
    for movie in movies:
        item['ranking'] = movie.xpath(
            './/div[@class="pic"]/em/text()').extract()[0]
        item['movie_name'] = movie.xpath(
            './/div[@class="hd"]/a/span[1]/text()').extract()[0]
        item['score'] = movie.xpath(
            './/div[@class="star"]/span[@class="rating_num"]/text()'
        ).extract()[0]
        item['score_num'] = movie.xpath(
            './/div[@class="star"]/span/text()').re(ur'(\d+)人评价')[0]
        yield item

    next_url = response.xpath('//span[@class="next"]/a/@href').extract()
    if next_url:
        next_url = 'https://movie.douban.com/top250' + next_url[0]
        yield Request(next_url, headers=self.headers)
Example #3
Source File: ip_proxy.py From Python_Master_Courses with GNU General Public License v3.0 | 6 votes |
def parse(self, response):
    for tr in response.xpath('//tbody/tr'):
        try:
            ip = tr.xpath('td[@data-title="IP"]/text()').extract()[0]
            port = tr.xpath('td[@data-title="PORT"]/text()').extract()[0]
            http_type = tr.xpath('td[@data-title="类型"]/text()').extract()[0].lower()
            # print(http_type,ip,port)
        except Exception as e:
            # print(e)
            continue

        # url = '%s://httpbin.org/ip' % http_type
        proxy = '%s://%s:%s' % (http_type, ip, port)

        meta = {
            'proxy': proxy,
            'dont_retry': True,
            'download_timeout': 10,

            # '_proxy_scheme': http_type,
            '_proxy_ip': ip,
            'port': port
        }

        yield Request(url, callback=self.check_available, meta=meta, dont_filter=True)
Example #4
Source File: test_retry_middleware.py From scrapy-fake-useragent with BSD 3-Clause "New" or "Revised" License | 6 votes |
def retry_middleware_exception(request):
    """
    Fixture to simplify creating a crawler
    with an activated retry middleware and going through
    the request-response cycle.

    Executes process_exception() method of the middleware.
    """
    settings, exception = request.param

    crawler = get_crawler(Spider, settings_dict=settings)
    spider = crawler._create_spider('foo')
    mw = RetryUserAgentMiddleware.from_crawler(crawler)

    req = Request('http://www.scrapytest.org/')

    yield mw.process_exception(req, exception, spider)
Example #5
Source File: douban_ajax_spider.py From scrapy-tutorial with Apache License 2.0 | 6 votes |
def parse(self, response):
    datas = json.loads(response.body)
    item = DoubanMovieItem()
    if datas:
        for data in datas:
            item['ranking'] = data['rank']
            item['movie_name'] = data['title']
            item['score'] = data['score']
            item['score_num'] = data['vote_count']
            yield item

        # If datas contains data, crawl the next page
        page_num = re.search(r'start=(\d+)', response.url).group(1)
        page_num = 'start=' + str(int(page_num) + 20)
        next_url = re.sub(r'start=\d+', page_num, response.url)
        yield Request(next_url, headers=self.headers)
Example #6
Source File: books.py From Python_Master_Courses with GNU General Public License v3.0 | 6 votes |
def parse(self, response):
    # if self.counter > 2:
    #     return
    # else:
    #     self.counter += 1

    for book in response.css('article.product_pod'):
        try:
            bname = book.xpath('./h3/a/@title').extract_first()
            price = book.css('p.price_color::text').extract()[0]
            # yield {'name': bname, 'price': price}
            bookit = BooksItem()
            bookit['name'] = bname
            bookit['price'] = price
            yield bookit
        except Exception as e:
            print(e)

    #
    next_url = response.css('li.next a::attr(href)').extract_first()
    if next_url:
        next_url = response.urljoin(next_url)
        yield scrapy.Request(next_url, callback=self.parse)
Example #7
Source File: update-challenge-list.py From HackerRank with MIT License | 6 votes |
def start_requests(self):
    tracks_list = [
        {
            'title': 'Algorithms',
            'name': 'algorithms'
        },
        {
            'title': 'Data Structures',
            'name': 'data-structures'
        },
        {
            'title': 'Mathematics',
            'name': 'mathematics'
        },
    ]

    for i, track in enumerate(tracks_list):
        tracks.append({
            'title': track['title'],
            'name': track['name'],
            'chapters': [],
        })
        url = 'https://www.hackerrank.com/rest/contests/master/tracks/' + track['name'] + '/chapters'
        yield scrapy.Request(url=url, callback=functools.partial(self.parse_chapters, d={
            'track-id': i,
        }))
Example #8
Source File: update-challenge-list.py From HackerRank with MIT License | 6 votes |
def parse_chapters(self, response, d):
    json_object = json.loads(response.text)
    for i, chapter in enumerate(json_object['models']):
        tracks[d['track-id']]['chapters'].append({
            'title': chapter['name'],
            'name': chapter['slug'],
            'challenges': [None] * chapter['challenges_count'],
        })
        for offset in range(0, chapter['challenges_count'], 10):
            url = 'https://www.hackerrank.com/rest/contests/master/categories/' \
                + tracks[d['track-id']]['name'] + '%7C' + chapter['slug'] \
                + '/challenges?offset=' + str(offset) + '&limit=10'
            yield scrapy.Request(url=url, callback=functools.partial(self.parse_page, d={
                'track-id': d['track-id'],
                'chapter-id': i,
                'offset': offset,
            }))
Example #9
Source File: aiqiyi_spider.py From video_url_crawler_demo with GNU General Public License v3.0 | 6 votes |
def main_list_parse(self, response):
    for sel in response.xpath('//div[@class="wrapper-piclist"]/ul/li'):
        item = AlbumItem()
        item['level'] = 1
        item['title'] = sel.xpath('div[2]/div[1]/p/a/text()').extract_first()
        item['img_url'] = sel.xpath('div[1]/a/img/@src').extract_first()
        item['main_url'] = sel.xpath('div[2]/div[1]/p/a/@href').extract_first()
        item['type_id'] = 0
        update_status = sel.xpath('div[1]/a/div/div/p/span/text()').extract_first().strip()
        item['status'] = 1 if update_status[0] == u'共' else 0
        if item['title'] is not None and item['main_url'] is not None:
            yield item
            yield scrapy.Request(response.urljoin(item['main_url']), callback=self.video_list_parse, errback=self.errback_httpbin)

    no_page = response.xpath('//span[@class="curPage"]/following-sibling::span[@class="noPage"]').extract_first()
    # to crawl next page
    if no_page is None:
        next_page_url = response.xpath('//div[@class="mod-page"]/a[last()]/@href').extract_first()
        print('visit next page url: ', next_page_url)
        yield scrapy.Request(response.urljoin(next_page_url), callback=self.main_list_parse, errback=self.errback_httpbin)
Example #10
Source File: pixiv-beta.py From Pixiv-Crawler with GNU General Public License v3.0 | 6 votes |
def collection(self, response):
    self.update_process(response, ".column-label .count-badge::text", 'Crawling collections...')
    image_items = response.css('._image-items.js-legacy-mark-unmark-list li.image-item')
    all_collection_urls = []
    for image_item in image_items:
        # Deleted works may still show up in image_items but have no bookmark count,
        # which would raise an error when converting to int and stop the crawl here.
        # image_page checks fav_num again anyway.
        item_url = image_item.css('a.work._work::attr(href)').extract_first('')
        pid = item_url.split('illust_id=')[-1]
        if pid in self.collection_set:
            continue
        img_bookmark = image_item.css('ul li a.bookmark-count._ui-tooltip::text').extract_first('')
        if img_bookmark and int(img_bookmark) >= self.MIN_FAV:
            all_collection_urls.append(item_url)
    all_collection_urls = [parse.urljoin(response.url, url) for url in all_collection_urls]

    next_page_url = response.css('.column-order-menu .pager-container .next ._button::attr(href)').extract_first("")  # ???
    if self.tryNextPage(next_page_url):
        next_page_url = parse.urljoin(response.url, next_page_url)
        yield scrapy.Request(next_page_url, headers=self.header, callback=self.collection)
    for url in all_collection_urls:
        yield scrapy.Request(url, headers=self.header, callback=self.image_page)
Example #11
Source File: pixiv-beta.py From Pixiv-Crawler with GNU General Public License v3.0 | 6 votes |
def search(self, response):
    # for debug
    if self.process > self.maxsize:
        return
    js_text = response.css("div.layout-body div._unit input#js-mount-point-search-result-list::attr(data-items)").extract_first('Not Found')
    if js_text == "Not Found":
        print("json接口变动,烦请issue")  # "the JSON endpoint has changed, please open an issue"
    js = json.loads(js_text)
    self.update_process(response, '._unit .column-header span.count-badge::text', 'Searching {0}'.format(cf.get('SRH', 'TAGS')))
    all_works_url = []
    for image_item in js:
        if image_item["bookmarkCount"] >= self.MIN_FAV:
            all_works_url.append(('https://www.pixiv.net/member_illust.php?mode=medium&illust_id={0}'.format(image_item["illustId"]), image_item['bookmarkCount']))

    next_page_url = response.css('.column-order-menu .pager-container .next ._button::attr(href)').extract_first("")
    if self.tryNextPage(next_page_url):
        next_page_url = parse.urljoin(response.url, next_page_url)
        yield scrapy.Request(next_page_url, headers=self.header, callback=self.search)
    for url, bookmarkCount in all_works_url:
        request = scrapy.Request(url, headers=self.header, callback=self.image_page)
        # this is where the bookmark count gets passed along
        request.meta['collection'] = bookmarkCount
        yield request
Example #12
Source File: china_spider.py From Taiwan-news-crawlers with MIT License | 6 votes |
def parse(self, response):
    news_in_page = response.css('.listRight li h2 a')
    if not news_in_page:
        return

    for news in news_in_page:
        url = news.css('a::attr(href)').extract_first()
        if ROOT_URL not in url:
            url = ROOT_URL + url
        url = response.urljoin(url)
        yield scrapy.Request(url, callback=self.parse_news)

    if 'next_page' in response.meta:
        meta = {'next_page': response.meta['next_page'] + 1}
    else:
        meta = {'next_page': 2}

    next_url = PAGE_URL + '?page=' + str(meta['next_page'])
    yield scrapy.Request(next_url, callback=self.parse, meta=meta)
Example #13
Source File: liberty_tag_spider.py From Taiwan-news-crawlers with MIT License | 6 votes |
def start_requests(self):
    urls = [
        'http://news.ltn.com.tw/list/newspaper/focus/',
        'http://news.ltn.com.tw/list/newspaper/politics/',
        'http://news.ltn.com.tw/list/newspaper/society/',
        'http://news.ltn.com.tw/list/newspaper/local/',
        'http://news.ltn.com.tw/list/newspaper/life/',
        'http://news.ltn.com.tw/list/newspaper/opinion/',
        'http://news.ltn.com.tw/list/newspaper/world/',
        'http://news.ltn.com.tw/list/newspaper/business/',
        'http://news.ltn.com.tw/list/newspaper/sports/',
        'http://news.ltn.com.tw/list/newspaper/entertainment/',
        'http://news.ltn.com.tw/list/newspaper/consumer/',
        'http://news.ltn.com.tw/list/newspaper/supplement/'
    ]
    day = datetime.timedelta(days=1)
    current_time = NEWS_DATE_BEGIN
    while current_time <= TODAY:
        date = current_time.strftime('%Y%m%d')
        for url in urls:
            target = url + date
            yield scrapy.Request(target, callback=self.parse_news_list)
        current_time += day
Example #14
Source File: test_retry_middleware.py From scrapy-fake-useragent with BSD 3-Clause "New" or "Revised" License | 6 votes |
def retry_middleware_response(request):
    """
    Fixture to simplify creating a crawler
    with an activated middleware and going through
    the request-response cycle.

    Executes process_response() method of the middleware.
    """
    settings, status = request.param

    crawler = get_crawler(Spider, settings_dict=settings)
    spider = crawler._create_spider('foo')
    mw = RetryUserAgentMiddleware.from_crawler(crawler)

    req = Request('http://www.scrapytest.org/')
    rsp = Response(req.url, body=b'', status=status)

    yield mw.process_response(req, rsp, spider)
Example #15
Source File: jd.py From jd_analysis with GNU Lesser General Public License v3.0 | 6 votes |
def start_requests(self):
    yield Request(
        url = self.url,
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Host': 'item.jd.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 '
                          'Firefox/52.0',
        },
        method = 'GET',
        meta = {
            'dont_merge_cookies': True,
            'cookiejar': CookieJar(),
        },
        dont_filter = True,
        callback = self.get_comment_count
    )
Example #16
Source File: jd_item_info.py From jd_analysis with GNU Lesser General Public License v3.0 | 6 votes |
def start_requests(self):
    yield Request(
        url = self.url,
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Host': 'item.jd.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 '
                          'Firefox/52.0',
        },
        method = 'GET',
        meta = {
            'dont_merge_cookies': True,
            'cookiejar': CookieJar(),
        },
        dont_filter = True,
        callback = self.get_comment_count,
    )
Example #17
Source File: tvbs_spider.py From Taiwan-news-crawlers with MIT License | 6 votes |
def parse(self, response):
    for news in response.css('.realtime_news_content_titel'):
        category = news.css('p::text').extract_first()
        meta = {'category': category}
        url = news.css('div a::attr(href)').extract_first()
        url = response.urljoin(url)
        yield scrapy.Request(url, callback=self.parse_news, meta=meta)

    total_pages = response.css(
        '.realtime_news_underbtn li:last-child::text').extract_first()
    total_pages_num = int(total_pages[1:-1])

    url_arr = response.url.split('/')
    current_page_index = int(url_arr[-1])
    if current_page_index < total_pages_num:
        next_page_url = '/'.join(url_arr[:-1]) + \
            '/' + str(current_page_index + 1)
        yield scrapy.Request(next_page_url, callback=self.parse)
Example #18
Source File: cts_spider.py From Taiwan-news-crawlers with MIT License | 6 votes |
def parse(self, response):
    for news in response.css('.news_right'):
        url = news.css('a::attr(href)').extract_first()
        yield scrapy.Request(url, callback=self.parse_news)

    page_desc = response.css('.page-desc::text').extract_first()
    total_pages = page_desc.split('/')[1]
    total_pages = int(total_pages[2:-2])

    url_arr = response.url.split('/')
    url_suffix = url_arr[-1]
    current_page_index = url_suffix[5:-5]
    if current_page_index == '':
        current_page_index = 1
    else:
        current_page_index = int(current_page_index)

    if current_page_index < total_pages:
        next_page = '/'.join(url_arr[:-1]) + '/index' + str(
            current_page_index + 1) + '.html'
        yield scrapy.Request(next_page, callback=self.parse)
Example #19
Source File: liberty_realtimenews_spider.py From Taiwan-news-crawlers with MIT License | 6 votes |
def parse(self, response):
    regex = r'\/all\/(\d+)'
    current_index = re.search(regex, response.url)
    if current_index:
        next_index = int(current_index.group(1)) + 1
    else:
        next_index = 2

    date_of_news = response.css('a.tit span::text').extract()
    last_page = False
    for d in date_of_news:
        if '-' in d:
            last_page = True
            break

    for news_url in response.css('a.tit::attr(href)').extract():
        yield scrapy.Request(news_url, callback=self.parse_news)

    if not last_page:
        next_target = Realtime_NEWS_URL + str(next_index)
        yield scrapy.Request(next_target, callback=self.parse)
Example #20
Source File: setn_spider.py From Taiwan-news-crawlers with MIT License | 6 votes |
def parse(self, response):
    for news in response.css('.box ul li'):
        category = news.css('.tab_list_type span::text').extract_first()
        meta = {'category': category}
        url = news.css('a::attr(href)').extract_first()
        url = response.urljoin(url)
        yield scrapy.Request(url, callback=self.parse_news, meta=meta)

    last_two_pages = response.css('.pager a::attr(href)').extract()[-2:]
    page1 = last_two_pages[0].split('&p=')[1]
    page2 = last_two_pages[1].split('&p=')[1]
    if page1 == page2:
        self.last_page_flag = self.last_page_flag + 1

    if self.last_page_flag < 2:
        url_arr = response.url.split('&p=')
        current_page = int(url_arr[1])
        next_page_url = '&p='.join(
            url_arr[:-1]) + '&p=' + str(current_page + 1)
        yield scrapy.Request(next_page_url, callback=self.parse)
Example #21
Source File: liberty_spider.py From Taiwan-news-crawlers with MIT License | 6 votes |
def parse_news_list(self, response):
    for news_item in response.css('.list li'):
        relative_url = news_item.css('a.tit::attr(href)').extract_first()
        abs_url = response.urljoin(relative_url)
        yield scrapy.Request(abs_url, callback=self.parse_news)

    page_list = [
        int(p) for p in response.css('.pagination a::text').extract()
        if p.isdigit()
    ]
    current_page_extract = response.css(
        '.pagination a.active::text').extract_first()
    # extract_first() returns a string or None, so test truthiness rather than "is True"
    current_page = int(
        current_page_extract) if current_page_extract else 1

    if (not page_list) or (current_page >= max(page_list)):
        return

    next_page = current_page + 1
    if next_page in page_list:
        prefix = re.search(r'.*\/', response.url).group(0)
        relative_url = prefix + '/' + str(next_page)
        abs_url = response.urljoin(relative_url)
        yield scrapy.Request(abs_url, callback=self.parse_news_list)
Example #22
Source File: liberty_spider.py From Taiwan-news-crawlers with MIT License | 6 votes |
def start_requests(self):
    urls = [
        'http://news.ltn.com.tw/list/newspaper/focus/',
        'http://news.ltn.com.tw/list/newspaper/politics/',
        'http://news.ltn.com.tw/list/newspaper/society/',
        'http://news.ltn.com.tw/list/newspaper/local/',
        'http://news.ltn.com.tw/list/newspaper/life/',
        'http://news.ltn.com.tw/list/newspaper/opinion/',
        'http://news.ltn.com.tw/list/newspaper/world/',
        'http://news.ltn.com.tw/list/newspaper/business/',
        'http://news.ltn.com.tw/list/newspaper/sports/',
        'http://news.ltn.com.tw/list/newspaper/entertainment/',
        'http://news.ltn.com.tw/list/newspaper/consumer/',
        'http://news.ltn.com.tw/list/newspaper/supplement/'
    ]
    date = time.strftime('%Y%m%d')
    for url in urls:
        target = url + date
        yield scrapy.Request(target, callback=self.parse_news_list)
Example #23
Source File: cna_spider.py From Taiwan-news-crawlers with MIT License | 6 votes |
def parse(self, response):
    current_page_index = int(
        response.css('.pagination li.current a::text').extract_first())
    newses_time_str = response.css('.article_list li span::text').extract()
    newses_time = [
        datetime.strptime(i, '%Y/%m/%d %H:%M').date()
        for i in newses_time_str
    ]
    is_over_today = False
    for t in newses_time:
        if t < TODAY:
            is_over_today = True

    if not is_over_today:
        next_url = 'http://www.cna.com.tw/list/aall-' + str(
            current_page_index + 1) + '.aspx'
        yield scrapy.Request(next_url, callback=self.parse)

    for news in response.css('div.article_list li a'):
        url = response.urljoin(news.css('a::attr(href)').extract_first())
        yield scrapy.Request(url, callback=self.parse_news)
Example #24
Source File: apple_spider.py From Taiwan-news-crawlers with MIT License | 6 votes |
def parse(self, response):
    section = response.css('section.nclnbx.slvl.clearmen, article.nclns')
    for part in section:
        if part.css('header.schh h1::text'):
            category = part.css('header.schh h1::text').extract_first()
            category = category.strip()
        else:
            meta = {'category': category}
            for news in part.css('ul.fillup li'):
                if 'eat-travel' in news.css(
                        "a::attr(href)").extract_first():
                    continue
                elif 'entertainment.appledaily' in news.css(
                        "a::attr(href)").extract_first():
                    url = news.css("a::attr(href)").extract_first()
                elif 'http' in news.css("a::attr(href)").extract_first():
                    url = news.css("a::attr(href)").extract_first()
                else:
                    url = "http://www.appledaily.com.tw{}".format(
                        news.css("a::attr(href)").extract_first())
                if url:
                    url = response.urljoin(url)
                    yield scrapy.Request(
                        url, callback=self.parse_news, meta=meta)
Example #25
Source File: udn_spider.py From Taiwan-news-crawlers with MIT License | 5 votes |
def start_requests(self):
    url = 'https://udn.com/news/breaknews/1'
    meta = {'iter_time': 1}
    yield scrapy.Request(url, callback=self.parse, meta=meta)
Example #26
Source File: Example.py From SourceCodeOfBook with MIT License | 5 votes |
def start_requests(self):
    for i in range(20):
        yield scrapy.Request('http://exercise.kingname.info/exercise_middleware_ip/{}'.format(i))
Example #27
Source File: udn_spider.py From Taiwan-news-crawlers with MIT License | 5 votes |
def parse(self, response):
    has_next_page = True
    is_first_iter = response.meta['iter_time'] == 1
    response.meta['iter_time'] += 1
    el_selector = '#breaknews_body dt' if is_first_iter else 'dt'
    target = response.css(el_selector)
    if not target:
        has_next_page = False

    for news in target:
        url = news.css('a::attr(href)').extract_first()
        url = response.urljoin(url)
        date_time = news.css('.info .dt::text').extract_first()
        if TODAY_STR not in date_time:
            has_next_page = False
            break
        yield scrapy.Request(url, callback=self.parse_news)

    if has_next_page:
        iter_time = response.meta['iter_time']
        yield scrapy.FormRequest(
            url='https://udn.com/news/get_breaks_article/%d/1/0' % iter_time,
            callback=self.parse,
            meta=response.meta)
Example #28
Source File: base.py From invana-bot with MIT License | 5 votes |
def _build_request(self, rule, link):
    headers = {}
    user_agent_header = os.environ.get("WCP_REQUEST_HEADERS_USER_AGENT")
    if user_agent_header:
        headers = {"User-Agent": user_agent_header}
    r = Request(url=link.url, headers=headers, callback=self._response_downloaded)
    r.meta.update(rule=rule, link_text=link.text)
    return r
Example #29
Source File: BlogSpider.py From SourceCodeOfBook with MIT License | 5 votes |
def parse(self, response):
    title_tag_list = response.xpath('//a[@class="post-title-link"]')
    for title_tag in title_tag_list:
        article_title = title_tag.xpath('span/text()').extract_first()
        article_url = self.host + title_tag.xpath('@href').extract_first()
        item = BlogItem()
        item['title'] = article_title
        item['url'] = article_url
        yield scrapy.Request(article_url,
                             headers=self.settings['HEADERS'],
                             callback=self.parse_detail,
                             meta={'item': item})
Example #30
Source File: douban_spider.py From scrapy-tutorial with Apache License 2.0 | 5 votes |
def start_requests(self):
    url = 'https://movie.douban.com/top250'
    yield Request(url, headers=self.headers)