Python scrapy.Request() Examples

The following are 30 code examples of scrapy.Request(), drawn from open source projects. The source file and license for each example are listed above it. You may also want to check out all available functions and classes of the scrapy module, or try the search function.
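Before looking at the project examples, here is a minimal, self-contained sketch of the pattern most of them share: a spider yields a scrapy.Request with a callback and passes data to that callback through the request's meta dict. The spider name, start URL, selectors, and field names below are placeholders, not taken from any of the projects listed here.

import scrapy


class ExampleSpider(scrapy.Spider):
    # Illustrative spider; the name, start URL and selectors are placeholders.
    name = 'example'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for href in response.css('a::attr(href)').extract():
            # Schedule a follow-up request; meta carries data to the callback.
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_detail,
                meta={'referer': response.url},
            )

    def parse_detail(self, response):
        # Emit one item per followed page, reading back the meta value.
        yield {
            'url': response.url,
            'referer': response.meta.get('referer'),
            'title': response.css('title::text').extract_first(),
        }

A spider like this can be run without a full project via "scrapy runspider", which is a convenient way to experiment with the Request arguments (callback, meta, headers, dont_filter) used throughout the examples below.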
Example #1
Source File: item_id.py    From TaobaoAnalysis with MIT License
def parse(self, response):
        data = response.xpath('//div[@tms-data]/@tms-data').extract()
        data = [json.loads(cur_data) for cur_data in data]

        tce_ids = []
        for cur_data in data:
            for key in cur_data:
                if not key.startswith('items'):
                    continue
                for item in cur_data[key]:
                    if not ('tms_type' in item
                            and item['tms_type'] == 'jsonp'):
                        continue
                    tce_ids.append([
                        str(item['data_para']['tce_sid']),
                        item['data_para']['tce_vid']
                    ])

        if not tce_ids:
            self.logger.warning('没有tce_id "%s"', response.url)  # warns that no tce_id was found for this URL
        else:
            for tce_url in self.get_tce_urls(tce_ids):
                yield Request(tce_url, callback=self.parse_item_id) 
Example #2
Source File: douban_spider.py    From scrapy-tutorial with Apache License 2.0
def parse(self, response):
        item = DoubanMovieItem()
        movies = response.xpath('//ol[@class="grid_view"]/li')
        for movie in movies:
            item['ranking'] = movie.xpath(
                './/div[@class="pic"]/em/text()').extract()[0]
            item['movie_name'] = movie.xpath(
                './/div[@class="hd"]/a/span[1]/text()').extract()[0]
            item['score'] = movie.xpath(
                './/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()[0]
            item['score_num'] = movie.xpath(
                './/div[@class="star"]/span/text()').re(ur'(\d+)人评价')[0]  # matches "N人评价" ("rated by N people"); the ur'' prefix is Python 2 syntax
            yield item

        next_url = response.xpath('//span[@class="next"]/a/@href').extract()
        if next_url:
            next_url = 'https://movie.douban.com/top250' + next_url[0]
            yield Request(next_url, headers=self.headers) 
Example #3
Source File: ip_proxy.py    From Python_Master_Courses with GNU General Public License v3.0
def parse(self, response):
        for tr in response.xpath('//tbody/tr'):
            try:
                ip = tr.xpath('td[@data-title="IP"]/text()').extract()[0]
                port = tr.xpath('td[@data-title="PORT"]/text()').extract()[0]
                http_type = tr.xpath('td[@data-title="类型"]/text()').extract()[0].lower()
                # print(http_type,ip,port)
            except Exception as e:
                # print(e)
                continue

            # build a probe URL (httpbin.org/ip) and the proxy URI to test this proxy
            url = '%s://httpbin.org/ip' % http_type
            proxy = '%s://%s:%s' % (http_type, ip, port)

            meta = {
                'proxy': proxy,
                'dont_retry': True,
                'download_timeout': 10,
                # proxy details passed along in meta for the check_available callback
                '_proxy_scheme': http_type,
                '_proxy_ip': ip,
                'port': port
            }
            yield Request(url, callback=self.check_available, meta=meta, dont_filter=True) 
Example #4
Source File: test_retry_middleware.py    From scrapy-fake-useragent with BSD 3-Clause "New" or "Revised" License
def retry_middleware_exception(request):
    """
    Fixture to simplify creating a crawler
    with an activated retry middleware and going through
    the request-response cycle.

    Executes process_exception() method of the middleware.
    """
    settings, exception = request.param

    crawler = get_crawler(Spider, settings_dict=settings)
    spider = crawler._create_spider('foo')
    mw = RetryUserAgentMiddleware.from_crawler(crawler)

    req = Request('http://www.scrapytest.org/')

    yield mw.process_exception(req, exception, spider) 
Example #5
Source File: douban_ajax_spider.py    From scrapy-tutorial with Apache License 2.0
def parse(self, response):
        datas = json.loads(response.body)
        item = DoubanMovieItem()
        if datas:
            for data in datas:
                item['ranking'] = data['rank']
                item['movie_name'] = data['title']
                item['score'] = data['score']
                item['score_num'] = data['vote_count']
                yield item

            # if datas returned results, crawl the next page
            page_num = re.search(r'start=(\d+)', response.url).group(1)
            page_num = 'start=' + str(int(page_num)+20)
            next_url = re.sub(r'start=\d+', page_num, response.url)
            yield Request(next_url, headers=self.headers) 
Example #6
Source File: books.py    From Python_Master_Courses with GNU General Public License v3.0
def parse(self, response):
        # if self.counter > 2:
        #     return
        # else:
        #     self.counter += 1

        for book in response.css('article.product_pod'):
            try:
                bname = book.xpath('./h3/a/@title').extract_first()
                price = book.css('p.price_color::text').extract()[0]
                # yield {'name': bname, 'price': price}

                bookit = BooksItem()
                bookit['name'] = bname
                bookit['price'] = price
                yield bookit

            except Exception as e:
                print(e)

        # follow the "next" pagination link, if any
        next_url = response.css('li.next a::attr(href)').extract_first()
        if next_url:
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse) 
Example #7
Source File: update-challenge-list.py    From HackerRank with MIT License
def start_requests(self):
    tracks_list = [
      { 'title': 'Algorithms', 'name': 'algorithms' },
      { 'title': 'Data Structures', 'name': 'data-structures' },
      { 'title': 'Mathematics', 'name': 'mathematics' },
      ]
    for i, track in enumerate(tracks_list):
      tracks.append({
        'title': track['title'],
        'name': track['name'],
        'chapters': [],
        })
      url = 'https://www.hackerrank.com/rest/contests/master/tracks/' + track['name'] + '/chapters'
      yield scrapy.Request(url=url, callback=functools.partial(self.parse_chapters, d={
        'track-id': i,
        })) 
Example #8
Source File: update-challenge-list.py    From HackerRank with MIT License
def parse_chapters(self, response, d):
    json_object = json.loads(response.text)
    for i, chapter in enumerate(json_object['models']):
      tracks[d['track-id']]['chapters'].append({
        'title': chapter['name'],
        'name': chapter['slug'],
        'challenges': [None] * chapter['challenges_count'],
        })
      for offset in range(0, chapter['challenges_count'], 10):
        url = 'https://www.hackerrank.com/rest/contests/master/categories/' \
              + tracks[d['track-id']]['name'] + '%7C' + chapter['slug'] \
              + '/challenges?offset=' + str(offset) + '&limit=10'
        yield scrapy.Request(url=url, callback=functools.partial(self.parse_page, d={
          'track-id': d['track-id'],
          'chapter-id': i,
          'offset': offset,
          })) 
Example #9
Source File: aiqiyi_spider.py    From video_url_crawler_demo with GNU General Public License v3.0
def main_list_parse(self, response):
		for sel in response.xpath('//div[@class="wrapper-piclist"]/ul/li'):
			item = AlbumItem()
			item['level'] = 1
			item['title'] = sel.xpath('div[2]/div[1]/p/a/text()').extract_first()
			item['img_url'] = sel.xpath('div[1]/a/img/@src').extract_first()
			item['main_url'] = sel.xpath('div[2]/div[1]/p/a/@href').extract_first()
			item['type_id'] = 0
			update_status = sel.xpath('div[1]/a/div/div/p/span/text()').extract_first().strip()
			item['status'] = 1 if update_status[0] == u'共' else 0

			if item['title'] is not None and item['main_url'] is not None:
				yield item
				yield scrapy.Request(response.urljoin(item['main_url']), callback=self.video_list_parse, errback=self.errback_httpbin)
		
		no_page = response.xpath('//span[@class="curPage"]/following-sibling::span[@class="noPage"]').extract_first()
		# to crawl next page
		if no_page is None:
			next_page_url = response.xpath('//div[@class="mod-page"]/a[last()]/@href').extract_first()
			print('visit next page url: ', next_page_url)
			yield scrapy.Request(response.urljoin(next_page_url), callback=self.main_list_parse, errback=self.errback_httpbin) 
Example #10
Source File: pixiv-beta.py    From Pixiv-Crawler with GNU General Public License v3.0
def collection(self, response):
        self.update_process(response, ".column-label .count-badge::text", 'Crawling collections...')
        image_items = response.css('._image-items.js-legacy-mark-unmark-list li.image-item')
        all_collection_urls = []

        for image_item in image_items:
            # deleted works may still appear in image_items without a bookmark count; int() would raise and stop the crawl here
            # fav_num is checked again in image_page
            item_url = image_item.css('a.work._work::attr(href)').extract_first('')
            pid = item_url.split('illust_id=')[-1]
            if pid in self.collection_set:
                continue
            img_bookmark = image_item.css('ul li a.bookmark-count._ui-tooltip::text').extract_first('')
            if img_bookmark and int(img_bookmark) >= self.MIN_FAV:
                all_collection_urls.append(item_url)
        all_collection_urls = [parse.urljoin(response.url, url) for url in all_collection_urls]
        next_page_url = response.css('.column-order-menu .pager-container .next ._button::attr(href)').extract_first("")
        # ???
        if self.tryNextPage(next_page_url):
            next_page_url = parse.urljoin(response.url, next_page_url)
            yield scrapy.Request(next_page_url, headers=self.header, callback=self.collection)
        for url in all_collection_urls:
            yield scrapy.Request(url, headers=self.header, callback=self.image_page) 
Example #11
Source File: pixiv-beta.py    From Pixiv-Crawler with GNU General Public License v3.0
def search(self, response):
        # for debug
        if self.process > self.maxsize:
            return
        js_text = response.css("div.layout-body div._unit input#js-mount-point-search-result-list::attr(data-items)").extract_first('Not Found')
        if js_text == "Not Found":
            print("json接口变动,烦请issue")
        js = json.loads(js_text)
        self.update_process(response, '._unit .column-header span.count-badge::text', 'Searching {0}'.format(cf.get('SRH', 'TAGS')))
        all_works_url = []
        for image_item in js:
            if image_item["bookmarkCount"] >= self.MIN_FAV:
                all_works_url.append(('https://www.pixiv.net/member_illust.php?mode=medium&illust_id={0}'.format(image_item["illustId"]),
                                      image_item['bookmarkCount']))
        next_page_url = response.css('.column-order-menu .pager-container .next ._button::attr(href)').extract_first("")
        if self.tryNextPage(next_page_url):
            next_page_url = parse.urljoin(response.url, next_page_url)
            yield scrapy.Request(next_page_url, headers=self.header, callback=self.search)
        for url, bookmarkCount in all_works_url:
            request = scrapy.Request(url, headers=self.header, callback=self.image_page)  # this is the place changed to extract the data
            request.meta['collection'] = bookmarkCount
            yield request 
Example #12
Source File: china_spider.py    From Taiwan-news-crawlers with MIT License
def parse(self, response):
        news_in_page = response.css('.listRight li h2 a')
        if not news_in_page:
            return

        for news in news_in_page:
            url = news.css('a::attr(href)').extract_first()
            if ROOT_URL not in url:
                url = ROOT_URL + url
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_news)
        if 'next_page' in response.meta:
            meta = {'next_page': response.meta['next_page'] + 1}
        else:
            meta = {'next_page': 2}
        next_url = PAGE_URL + '?page=' + str(meta['next_page'])
        yield scrapy.Request(next_url, callback=self.parse, meta=meta) 
Example #13
Source File: liberty_tag_spider.py    From Taiwan-news-crawlers with MIT License
def start_requests(self):
        urls = [
            'http://news.ltn.com.tw/list/newspaper/focus/',
            'http://news.ltn.com.tw/list/newspaper/politics/',
            'http://news.ltn.com.tw/list/newspaper/society/',
            'http://news.ltn.com.tw/list/newspaper/local/',
            'http://news.ltn.com.tw/list/newspaper/life/',
            'http://news.ltn.com.tw/list/newspaper/opinion/',
            'http://news.ltn.com.tw/list/newspaper/world/',
            'http://news.ltn.com.tw/list/newspaper/business/',
            'http://news.ltn.com.tw/list/newspaper/sports/',
            'http://news.ltn.com.tw/list/newspaper/entertainment/',
            'http://news.ltn.com.tw/list/newspaper/consumer/',
            'http://news.ltn.com.tw/list/newspaper/supplement/'
        ]

        day = datetime.timedelta(days=1)
        current_time = NEWS_DATE_BEGIN

        while current_time <= TODAY:
            date = current_time.strftime('%Y%m%d')
            for url in urls:
                target = url + date
                yield scrapy.Request(target, callback=self.parse_news_list)
            current_time += day 
Example #14
Source File: test_retry_middleware.py    From scrapy-fake-useragent with BSD 3-Clause "New" or "Revised" License
def retry_middleware_response(request):
    """
    Fixture to simplify creating a crawler
    with an activated middleware and going through
    the request-response cycle.

    Executes process_response() method of the middleware.
    """
    settings, status = request.param

    crawler = get_crawler(Spider, settings_dict=settings)
    spider = crawler._create_spider('foo')
    mw = RetryUserAgentMiddleware.from_crawler(crawler)

    req = Request('http://www.scrapytest.org/')
    rsp = Response(req.url, body=b'', status=status)

    yield mw.process_response(req, rsp, spider) 
Example #15
Source File: jd.py    From jd_analysis with GNU Lesser General Public License v3.0
def start_requests(self):
        yield Request(
                url = self.url,
                headers = {
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Connection': 'keep-alive',
                    'Host': 'item.jd.com',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 '
                                  'Firefox/52.0',
                },
                method = 'GET',
                meta = {
                    'dont_merge_cookies': True,
                    'cookiejar': CookieJar(),
                },
                dont_filter = True,
                callback = self.get_comment_count
        ) 
Example #16
Source File: jd_item_info.py    From jd_analysis with GNU Lesser General Public License v3.0
def start_requests(self):
        yield Request(
                url = self.url,
                headers = {
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Connection': 'keep-alive',
                    'Host': 'item.jd.com',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 '
                                  'Firefox/52.0',
                },
                method = 'GET',
                meta = {
                    'dont_merge_cookies': True,
                    'cookiejar': CookieJar(),
                },
                dont_filter = True,
                callback = self.get_comment_count,
        ) 
Example #17
Source File: tvbs_spider.py    From Taiwan-news-crawlers with MIT License
def parse(self, response):
        for news in response.css('.realtime_news_content_titel'):
            category = news.css('p::text').extract_first()
            meta = {'category': category}
            url = news.css('div a::attr(href)').extract_first()
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_news, meta=meta)

        total_pages = response.css(
            '.realtime_news_underbtn li:last-child::text').extract_first()
        total_pages_num = int(total_pages[1:-1])
        url_arr = response.url.split('/')
        current_page_index = int(url_arr[-1])

        if current_page_index < total_pages_num:
            next_page_url = '/'.join(url_arr[:-1]) + \
                '/' + str(current_page_index + 1)
            yield scrapy.Request(next_page_url, callback=self.parse) 
Example #18
Source File: cts_spider.py    From Taiwan-news-crawlers with MIT License
def parse(self, response):
        for news in response.css('.news_right'):
            url = news.css('a::attr(href)').extract_first()
            yield scrapy.Request(url, callback=self.parse_news)

        page_desc = response.css('.page-desc::text').extract_first()
        total_pages = page_desc.split('/')[1]
        total_pages = int(total_pages[2:-2])
        url_arr = response.url.split('/')
        url_suffix = url_arr[-1]
        current_page_index = url_suffix[5:-5]
        if current_page_index == '':
            current_page_index = 1
        else:
            current_page_index = int(current_page_index)

        if current_page_index < total_pages:
            next_page = '/'.join(url_arr[:-1]) + '/index' + str(
                current_page_index + 1) + '.html'
            yield scrapy.Request(next_page, callback=self.parse) 
Example #19
Source File: liberty_realtimenews_spider.py    From Taiwan-news-crawlers with MIT License
def parse(self, response):
        regex = r'\/all\/(\d+)'
        current_index = re.search(regex, response.url)
        if current_index:
            next_index = int(current_index.group(1)) + 1
        else:
            next_index = 2
        date_of_news = response.css('a.tit span::text').extract()
        last_page = False
        for d in date_of_news:
            if '-' in d:
                last_page = True
                break

        for news_url in response.css('a.tit::attr(href)').extract():
            yield scrapy.Request(news_url, callback=self.parse_news)

        if not last_page:
            next_target = Realtime_NEWS_URL + str(next_index)
            yield scrapy.Request(next_target, callback=self.parse) 
Example #20
Source File: setn_spider.py    From Taiwan-news-crawlers with MIT License
def parse(self, response):

        for news in response.css('.box ul li'):
            category = news.css('.tab_list_type span::text').extract_first()
            meta = {'category': category}
            url = news.css('a::attr(href)').extract_first()
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_news, meta=meta)

        last_two_pages = response.css('.pager a::attr(href)').extract()[-2:]
        page1 = last_two_pages[0].split('&p=')[1]
        page2 = last_two_pages[1].split('&p=')[1]

        if page1 == page2:
            self.last_page_flag = self.last_page_flag + 1

        if self.last_page_flag < 2:
            url_arr = response.url.split('&p=')
            current_page = int(url_arr[1])
            next_page_url = '&p='.join(
                url_arr[:-1]) + '&p=' + str(current_page + 1)
            yield scrapy.Request(next_page_url, callback=self.parse) 
Example #21
Source File: liberty_spider.py    From Taiwan-news-crawlers with MIT License
def parse_news_list(self, response):
        for news_item in response.css('.list li'):
            relative_url = news_item.css('a.tit::attr(href)').extract_first()
            abs_url = response.urljoin(relative_url)
            yield scrapy.Request(abs_url, callback=self.parse_news)

        page_list = [
            int(p) for p in response.css('.pagination a::text').extract()
            if p.isdigit()
        ]
        current_page_extract = response.css(
            '.pagination a.active::text').extract_first()
        current_page = int(
            current_page_extract) if current_page_extract else 1
        if (not page_list) or (current_page >= max(page_list)):
            return

        next_page = current_page + 1

        if next_page in page_list:
            prefix = re.search(r'.*\/', response.url).group(0)
            relative_url = prefix + '/' + str(next_page)
            abs_url = response.urljoin(relative_url)
            yield scrapy.Request(abs_url, callback=self.parse_news_list) 
Example #22
Source File: liberty_spider.py    From Taiwan-news-crawlers with MIT License
def start_requests(self):
        urls = [
            'http://news.ltn.com.tw/list/newspaper/focus/',
            'http://news.ltn.com.tw/list/newspaper/politics/',
            'http://news.ltn.com.tw/list/newspaper/society/',
            'http://news.ltn.com.tw/list/newspaper/local/',
            'http://news.ltn.com.tw/list/newspaper/life/',
            'http://news.ltn.com.tw/list/newspaper/opinion/',
            'http://news.ltn.com.tw/list/newspaper/world/',
            'http://news.ltn.com.tw/list/newspaper/business/',
            'http://news.ltn.com.tw/list/newspaper/sports/',
            'http://news.ltn.com.tw/list/newspaper/entertainment/',
            'http://news.ltn.com.tw/list/newspaper/consumer/',
            'http://news.ltn.com.tw/list/newspaper/supplement/'
        ]

        date = time.strftime('%Y%m%d')
        for url in urls:
            target = url + date
            yield scrapy.Request(target, callback=self.parse_news_list) 
Example #23
Source File: cna_spider.py    From Taiwan-news-crawlers with MIT License
def parse(self, response):
        current_page_index = int(
            response.css('.pagination li.current a::text').extract_first())

        newses_time_str = response.css('.article_list li span::text').extract()
        newses_time = [
            datetime.strptime(i, '%Y/%m/%d %H:%M').date()
            for i in newses_time_str
        ]
        is_over_today = False

        for t in newses_time:
            if t < TODAY:
                is_over_today = True

        if not is_over_today:
            next_url = 'http://www.cna.com.tw/list/aall-' + str(
                current_page_index + 1) + '.aspx'
            yield scrapy.Request(next_url, callback=self.parse)

        for news in response.css('div.article_list li a'):
            url = response.urljoin(news.css('a::attr(href)').extract_first())
            yield scrapy.Request(url, callback=self.parse_news) 
Example #24
Source File: apple_spider.py    From Taiwan-news-crawlers with MIT License
def parse(self, response):
        section = response.css('section.nclnbx.slvl.clearmen, article.nclns')
        for part in section:
            if part.css('header.schh h1::text'):
                category = part.css('header.schh h1::text').extract_first()
                category = category.strip()
            else:
                meta = {'category': category}
                for news in part.css('ul.fillup li'):
                    if 'eat-travel' in news.css(
                            "a::attr(href)").extract_first():
                        continue
                    elif 'entertainment.appledaily' in news.css(
                            "a::attr(href)").extract_first():
                        url = news.css("a::attr(href)").extract_first()
                    elif 'http' in news.css("a::attr(href)").extract_first():
                        url = news.css("a::attr(href)").extract_first()
                    else:
                        url = "http://www.appledaily.com.tw{}".format(
                            news.css("a::attr(href)").extract_first())
                    if url:
                        url = response.urljoin(url)
                        yield scrapy.Request(
                            url, callback=self.parse_news, meta=meta) 
Example #25
Source File: udn_spider.py    From Taiwan-news-crawlers with MIT License
def start_requests(self):
        url = 'https://udn.com/news/breaknews/1'
        meta = {'iter_time': 1}
        yield scrapy.Request(url, callback=self.parse, meta=meta) 
Example #26
Source File: Example.py    From SourceCodeOfBook with MIT License
def start_requests(self):
        for i in range(20):
            yield scrapy.Request('http://exercise.kingname.info/exercise_middleware_ip/{}'.format(i)) 
Example #27
Source File: udn_spider.py    From Taiwan-news-crawlers with MIT License
def parse(self, response):
        has_next_page = True
        is_first_iter = response.meta['iter_time'] == 1
        response.meta['iter_time'] += 1
        el_selector = '#breaknews_body dt' if is_first_iter else 'dt'
        target = response.css(el_selector)
        if not target:
            has_next_page = False
        for news in target:
            url = news.css('a::attr(href)').extract_first()
            url = response.urljoin(url)
            date_time = news.css('.info .dt::text').extract_first()

            if TODAY_STR not in date_time:
                has_next_page = False
                break

            yield scrapy.Request(url, callback=self.parse_news)

        if has_next_page:
            iter_time = response.meta['iter_time']
            yield scrapy.FormRequest(
                url='https://udn.com/news/get_breaks_article/%d/1/0' %
                iter_time,
                callback=self.parse,
                meta=response.meta) 
Example #28
Source File: base.py    From invana-bot with MIT License
def _build_request(self, rule, link):
        headers = {}
        user_agent_header = os.environ.get("WCP_REQUEST_HEADERS_USER_AGENT")
        if user_agent_header:
            headers = {"User-Agent": user_agent_header}
        r = Request(url=link.url, headers=headers, callback=self._response_downloaded)
        r.meta.update(rule=rule, link_text=link.text)
        return r 
Example #29
Source File: BlogSpider.py    From SourceCodeOfBook with MIT License
def parse(self, response):
        title_tag_list = response.xpath('//a[@class="post-title-link"]')
        for title_tag in title_tag_list:
            article_title = title_tag.xpath('span/text()').extract_first()
            article_url = self.host + title_tag.xpath('@href').extract_first()
            item = BlogItem()
            item['title'] = article_title
            item['url'] = article_url
            yield scrapy.Request(article_url,
                                 headers=self.settings['HEADERS'],
                                 callback=self.parse_detail,
                                 meta={'item': item}) 
Example #30
Source File: douban_spider.py    From scrapy-tutorial with Apache License 2.0
def start_requests(self):
        url = 'https://movie.douban.com/top250'
        yield Request(url, headers=self.headers)