Python scrapy.selector.HtmlXPathSelector() Examples

The following are 13 code examples of scrapy.selector.HtmlXPathSelector(). Each example is credited to the original open-source project and source file it was taken from. You may also want to check out the other functions and classes available in the scrapy.selector module.
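HtmlXPathSelector wraps a Response object and exposes XPath queries through select() (some of the projects below still call it via the older x() alias); each query returns a selector list whose extract() method yields the matched text as strings. A minimal sketch of the pattern, with an illustrative XPath that is not taken from any particular project below:

from scrapy.selector import HtmlXPathSelector

def parse(self, response):
        # Wrap the response so it can be queried with XPath.
        hxs = HtmlXPathSelector(response)
        # select() returns a selector list; extract() turns it into a list of strings.
        titles = hxs.select('//h1/text()').extract()
        # Sub-selections can be made relative to an earlier match.
        for row in hxs.select('//ul/li'):
            link = row.select('./a/@href').extract()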
Example #1
Source File: dmoz.py    From tripadvisor-scraper with GNU General Public License v3.0
def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul[@class="directory-url"]/li')
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.select('a/text()').extract()
            item['url'] = site.select('a/@href').extract()
            item['description'] = site.select('text()').re('-\s([^\n]*?)\\n')
            items.append(item)

        return items 
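Note that HtmlXPathSelector has long been deprecated; in current Scrapy releases the response is queried directly. A rough modern equivalent of the loop above, assuming the same Website item class, would be:

def parse(self, response):
        for site in response.xpath('//ul[@class="directory-url"]/li'):
            item = Website()
            item['name'] = site.xpath('a/text()').getall()
            item['url'] = site.xpath('a/@href').getall()
            item['description'] = site.xpath('text()').re(r'-\s([^\n]*?)\n')
            yield item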
Example #2
Source File: yeeyan.py    From openslack-crawler with Apache License 2.0
def parse(self, response):
        self.log("OK,%s" % response.url)
        hxs = HtmlXPathSelector(response)
        # Follow each article link for further processing
        divs = hxs.x('//div[@class="publicLeftCon mt10"]')
        for div in divs:
            url = div.x('h5/a/@href').extract()[0]
            yield self.make_requests_from_url(url).replace(callback=self.parse_content)
        # Follow the link to the next page
        try:
            next_url = \
            hxs.x('//div[@id="project_left"]/div[@class="publicMiddleLine"]/span/a[b="下一页"]/@href').extract()[0]
        except Exception:
            return
        next_url = 'http://article.yeeyan.org' + next_url
        #  if self.count==10:
        #      return
        #  self.count+=1
        yield self.make_requests_from_url(next_url).replace(callback=self.parse)

    # Extract the article content
Example #3
Source File: yeeyan.py    From openslack-crawler with Apache License 2.0
def parse_content(self, response):
        hxs = HtmlXPathSelector(response)
        item = YeeyanItem()
        if hxs.x('//a[@class="jx_logo"]/text()'):
            item = self.parse_jx(item, response)
        else:
            item['url'] = response.url
            item['title'] = hxs.x('//title/text()').extract()[0].split('|')[1].strip()
            div = hxs.x('//div[@class="user_info"]')
            item['author'] = div.x('.//h2/a/text()').extract()[0]
            item['excerpt'] = hxs.x('//p[@class="excerpt"]/text()').extract()
            if item['excerpt']:
                item['excerpt'] = item['excerpt'][0]
            else:
                item['excerpt'] = ''
            item['content_html'] = hxs.x('//div[@id="conBox"]').extract()[0]
            item['release_time'] = div.x('.//p/text()').extract()[0].strip()[1:-7]
            item['category'] = hxs.x('//div[@class="crumb"]/a/text()').extract()[1]
        return item

    # Extract the featured (jingxuan) articles
Example #4
Source File: LinkedinSpider.py    From openslack-crawler with Apache License 2.0
def parse(self, response):
        """
        default parse method, rule is not useful now
        """
        # import pdb; pdb.set_trace()
        response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
        hxs = HtmlXPathSelector(response)
        index_level = self.determine_level(response)
        log.msg("Parse: index level:" + str(index_level))
        if index_level in [1, 2, 3, 4]:
            self.save_to_file_system(index_level, response)
            relative_urls = self.get_follow_links(index_level, hxs)
            if relative_urls is not None:
                for url in relative_urls:
                    log.msg('yield process, url:' + url)
                    yield Request(url, callback=self.parse)
        elif index_level == 5:
            personProfile = HtmlParser.extract_person_profile(hxs)
            linkedin_id = self.get_linkedin_id(response.url)
            linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
            if linkedin_id:
                personProfile['_id'] = linkedin_id
                personProfile['url'] = UnicodeDammit(response.url).markup
                yield personProfile 
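The urllib.unquote_plus call above is the Python 2 location of that function (UnicodeDammit comes from BeautifulSoup). Under Python 3 the same URL decoding is imported from urllib.parse instead, roughly:

from urllib.parse import unquote_plus  # Python 3 location of unquote_plus

linkedin_id = unquote_plus(linkedin_id)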
Example #5
Source File: LCW_spider.py    From LotteryTicket with MIT License
def parse_page(response):
        hxs = HtmlXPathSelector(response)
        item = LotteryticketItem()
        # Issue (draw) number
        title = hxs.select('//html/body/center/center/table/tr/td/table[1]/tr[2]/td[1]/text()').extract()[0]
        item['title'] = filter(str.isdigit, ("".join(title.split()).encode("utf-8")))
        # Red ball section
        red1 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[1]/font/text()').extract()[0]
        item['red1'] = filter(str.isdigit, ("".join(red1.split()).encode("utf-8")))
        red2 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[2]/font/text()').extract()[0]
        item['red2'] = filter(str.isdigit, ("".join(red2.split()).encode("utf-8")))
        red3 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[3]/font/text()').extract()[0]
        item['red3'] = filter(str.isdigit, ("".join(red3.split()).encode("utf-8")))
        red4 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[4]/font/text()').extract()[0]
        item['red4'] = filter(str.isdigit, ("".join(red4.split()).encode("utf-8")))
        red5 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[5]/font/text()').extract()[0]
        item['red5'] = filter(str.isdigit, ("".join(red5.split()).encode("utf-8")))
        red6 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[6]/font/text()').extract()[0]
        item['red6'] = filter(str.isdigit, ("".join(red6.split()).encode("utf-8")))
        # Blue ball section
        blue = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[7]/font/text()').extract()[0]
        item['blue'] = filter(str.isdigit, ("".join(blue.split()).encode("utf-8")))
        # Draw time
        created_at = hxs.select('//html/body/center/center/table/tr/td/table[1]/tr[2]/td[2]/text()').extract()[0]
        item['created_at'] = ("".join(created_at.split()).encode("utf-8"))[0:10]

        return item 
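The filter(str.isdigit, ...) calls above rely on Python 2, where filter() applied to a str returns a str; they do not carry over to Python 3. A Python 3 rewrite of the digit extraction, applied to an already-extracted text value, might look like this sketch:

def digits_only(text):
        # Keep only the decimal digits from the extracted cell text.
        return ''.join(ch for ch in text if ch.isdigit())

item['title'] = digits_only(title)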
Example #6
Source File: __init__ copy.py    From scrapy-spiders with MIT License
def parse_torrent(self,response):
		x = HtmlXPathSelector(response)
		# NOTE: the original excerpt never creates the item; an Item class
		# (assumed here to be named TorrentItem) must be instantiated first.
		torrent = TorrentItem()
		torrent['url'] = response.url
		torrent['description'] = x.select("//span[@id='lblDescription']/text()").extract()
		torrent['jurisdictiontype'] = x.select("//span[@id='lblJurisdictionType']").extract()
		torrent['agency'] = x.select("//span[@id='lblUmbrellaAgency']/text()").extract()
		torrent['contactinfo'] = x.select("//span[@id='lblContact']/p/text()").extract()
		torrent['links'] = x.select("//span[@id='lblContacts']/p/a/@href").extract()
		return torrent 
Example #7
Source File: scrape.py    From evolve-music2 with MIT License
def parse(self, response):
        x = HtmlXPathSelector(response)
        links = []
        url = response.url
        music_links = x.select('//ul/li/a/@href').extract()
        music_links = [m for m in music_links if m.endswith(".mid")]
        for l in music_links:
            link = MIDIFile()
            link['url'] = url
            link['ltype'] = self.ltype
            link['link'] = l
            link["file_urls"] = [l]
            links.append(link)
        return links 
Example #8
Source File: scrape.py    From evolve-music2 with MIT License
def parse(self, response):
        x = HtmlXPathSelector(response)
        links = []
        url = response.url
        music_links = x.select("//td/a/@href").extract()
        music_links = [m for m in music_links if m.endswith(".mid")]
        for l in music_links:
            link = MIDIFile()
            link['url'] =  url
            link['ltype'] = self.ltype
            link['link'] = "http://midi-archive.com/" + l
            link["file_urls"] = [link['link']]
            links.append(link)
        return links 
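In both MIDI spiders the file_urls field is the one consumed by Scrapy's FilesPipeline, so the downloads only happen if the pipeline is enabled in the project settings. A sketch for current Scrapy versions, with a placeholder storage path:

# settings.py (sketch): enable the files pipeline so 'file_urls' entries are downloaded.
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
FILES_STORE = '/path/to/midi/downloads'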
Example #9
Source File: middlewares.py    From fp-server with MIT License
def process_response(self, request, response, spider):
        url = response.url

        if response.status in [301, 307]:
            log.msg("trying to redirect us: %s" % url, level=log.INFO)
            reason = 'redirect %d' % response.status

            return self._retry(request, reason, spider) or response
        interval, redirect_url = get_meta_refresh(response)
        # handle meta redirect

        if redirect_url:
            log.msg("trying to redirect us: %s" % url, level=log.INFO)
            reason = 'meta'

            return self._retry(request, reason, spider) or response

        hxs = HtmlXPathSelector(response)
        # test for captcha page
        captcha = hxs.select(
            ".//input[contains(@id, 'captchacharacters')]").extract()

        if captcha:
            log.msg("captcha page %s" % url, level=log.INFO)
            reason = 'captcha'

            return self._retry(request, reason, spider) or response

        return response 
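As with any downloader middleware, the process_response() hook above only runs once the class is registered in the project settings. A sketch, where the module path and class name are placeholders rather than the real ones from fp-server:

# settings.py (sketch): register the retry/captcha middleware shown above.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CaptchaRetryMiddleware': 550,  # hypothetical path and name
}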
Example #10
Source File: weibo_spider.py    From openslack-crawler with Apache License 2.0
def parse_item(self, response):
        print response.body
        hxs = HtmlXPathSelector(response)
        i = DmozItem()
        i['id'] = hxs.select('//input[@id="sid"]/@value').extract()
        i['title'] = hxs.select('//div[@id="name"]').extract()
        i['desc'] = hxs.select('//div[@id="description"]').extract()
        return i 
Example #11
Source File: yeeyan.py    From openslack-crawler with Apache License 2.0
def parse_jx(self, item, response):
        hxs = HtmlXPathSelector(response)
        item['url'] = response.url
        item['title'] = hxs.x('//title/text()').extract()[0].split('|')[1].strip()
        div = hxs.x('//div[@class="jxar_author"]')
        item['author'] = div.x('.//a/text()').extract()[0]
        item['release_time'] = hxs.x('//p[@class="jxa_info"]/span[1]/text()').extract()[0]
        try:
            item['excerpt'] = hxs.x('//p[@class="jxa_intro"]/text()').extract()[0]
        except Exception:
            item['excerpt'] = None
        item['category'] = hxs.x('//p[@class="jxa_map"]/text()').extract()[1].split()[1]
        item['content_html'] = hxs.x('//div[@class="jxa_content"]').extract()[0]
        return item 
Example #12
Source File: woaidu_detail.py    From openslack-crawler with Apache License 2.0
def parse_detail(self, response):
        woaidu_item = WoaiduCrawlerItem()

        response_selector = HtmlXPathSelector(response)
        woaidu_item['book_name'] = list_first_item(
            response_selector.select('//div[@class="zizida"][1]/text()').extract())
        woaidu_item['author'] = [
            list_first_item(response_selector.select('//div[@class="xiaoxiao"][1]/text()').extract())[5:].strip(), ]
        woaidu_item['book_description'] = list_first_item(
            response_selector.select('//div[@class="lili"][1]/text()').extract()).strip()
        woaidu_item['book_covor_image_url'] = list_first_item(
            response_selector.select('//div[@class="hong"][1]/img/@src').extract())

        download = []
        for i in response_selector.select('//div[contains(@class,"xiazai_xiao")]')[1:]:
            download_item = {}
            download_item['url'] = strip_null(
                deduplication([
                    list_first_item(i.select('./div')[0].select('./a/@href').extract()),
                    list_first_item(i.select('./div')[1].select('./a/@href').extract()),
                ])
            )

            download_item['progress'] = list_first_item(i.select('./div')[2].select('./text()').extract())
            download_item['update_time'] = list_first_item(i.select('./div')[3].select('./text()').extract())
            download_item['source_site'] = [
                list_first_item(i.select('./div')[4].select('./a/text()').extract()),
                list_first_item(i.select('./div')[4].select('./a/@href').extract()),
            ]

            download.append(download_item)

        woaidu_item['book_download'] = download
        woaidu_item['original_url'] = response.url

        yield woaidu_item 
Example #13
Source File: AmazonSpider.py    From openslack-crawler with Apache License 2.0
def parse_item(self, response):
        self._log_page(response, 'after_login.html')
        hxs = HtmlXPathSelector(response)
        report_urls = hxs.select('//div[@id="menuh"]/ul/li[4]/div//a/@href').extract()
        for report_url in report_urls:
            # print "list:"+report_url
            yield Request(self._ab_path(response, report_url),
                          headers=self.headers,
                          meta={'cookiejar': response.meta['cookiejar']},
                          callback=self.parse_report)