Python scrapy.selector.HtmlXPathSelector() Examples

The following are 13 code examples of scrapy.selector.HtmlXPathSelector(). Each example is credited to the original open-source project and source file it was taken from. You may also want to check out the other functions and classes available in the scrapy.selector module.
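HtmlXPathSelector wraps a Response object and exposes XPath queries through select() (some of the projects below still call it via the older x() alias); each query returns a selector list whose extract() method yields the matched text as strings. A minimal sketch of the pattern, with an illustrative XPath that is not taken from any particular project below:

from scrapy.selector import HtmlXPathSelector

def parse(self, response):
        # Wrap the response so it can be queried with XPath.
        hxs = HtmlXPathSelector(response)
        # select() returns a selector list; extract() turns it into a list of strings.
        titles = hxs.select('//h1/text()').extract()
        # Sub-selections can be made relative to an earlier match.
        for row in hxs.select('//ul/li'):
            link = row.select('./a/@href').extract()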
Example #1
Source File: dmoz.py    From tripadvisor-scraper with GNU General Public License v3.0
def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul[@class="directory-url"]/li')
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.select('a/text()').extract()
            item['url'] = site.select('a/@href').extract()
            item['description'] = site.select('text()').re('-\s([^\n]*?)\\n')
            items.append(item)

        return items 
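Note that HtmlXPathSelector has long been deprecated; in current Scrapy releases the response is queried directly. A rough modern equivalent of the loop above, assuming the same Website item class, would be:

def parse(self, response):
        for site in response.xpath('//ul[@class="directory-url"]/li'):
            item = Website()
            item['name'] = site.xpath('a/text()').getall()
            item['url'] = site.xpath('a/@href').getall()
            item['description'] = site.xpath('text()').re(r'-\s([^\n]*?)\n')
            yield item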
Example #2
Source File: yeeyan.py    From openslack-crawler with Apache License 2.0
def parse(self, response):
        self.log("OK,%s" % response.url)
        hxs = HtmlXPathSelector(response)
        # Follow each article link for further processing
        divs = hxs.x('//div[@class="publicLeftCon mt10"]')
        for div in divs:
            url = div.x('h5/a/@href').extract()[0]
            yield self.make_requests_from_url(url).replace(callback=self.parse_content)
        # Follow the link to the next page
        try:
            next_url = \
            hxs.x('//div[@id="project_left"]/div[@class="publicMiddleLine"]/span/a[b="下一页"]/@href').extract()[0]
        except Exception:
            return
        next_url = 'http://article.yeeyan.org' + next_url
        #  if self.count==10:
        #      return
        #  self.count+=1
        yield self.make_requests_from_url(next_url).replace(callback=self.parse)

    # Extract the article content
Example #3
Source File: yeeyan.py    From openslack-crawler with Apache License 2.0
def parse_content(self, response):
        hxs = HtmlXPathSelector(response)
        item = YeeyanItem()
        if hxs.x('//a[@class="jx_logo"]/text()'):
            item = self.parse_jx(item, response)
        else:
            item['url'] = response.url
            item['title'] = hxs.x('//title/text()').extract()[0].split('|')[1].strip()
            div = hxs.x('//div[@class="user_info"]')
            item['author'] = div.x('.//h2/a/text()').extract()[0]
            item['excerpt'] = hxs.x('//p[@class="excerpt"]/text()').extract()
            if item['excerpt']:
                item['excerpt'] = item['excerpt'][0]
            else:
                item['excerpt'] = ''
            item['content_html'] = hxs.x('//div[@id="conBox"]').extract()[0]
            item['release_time'] = div.x('.//p/text()').extract()[0].strip()[1:-7]
            item['category'] = hxs.x('//div[@class="crumb"]/a/text()').extract()[1]
        return item

    # Extract the featured (jingxuan) articles
Example #4
Source File: LinkedinSpider.py    From openslack-crawler with Apache License 2.0
def parse(self, response):
        """
        default parse method, rule is not useful now
        """
        # import pdb; pdb.set_trace()
        response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
        hxs = HtmlXPathSelector(response)
        index_level = self.determine_level(response)
        log.msg("Parse: index level:" + str(index_level))
        if index_level in [1, 2, 3, 4]:
            self.save_to_file_system(index_level, response)
            relative_urls = self.get_follow_links(index_level, hxs)
            if relative_urls is not None:
                for url in relative_urls:
                    log.msg('yield process, url:' + url)
                    yield Request(url, callback=self.parse)
        elif index_level == 5:
            personProfile = HtmlParser.extract_person_profile(hxs)
            linkedin_id = self.get_linkedin_id(response.url)
            linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
            if linkedin_id:
                personProfile['_id'] = linkedin_id
                personProfile['url'] = UnicodeDammit(response.url).markup
                yield personProfile 
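The urllib.unquote_plus call above is the Python 2 location of that function (UnicodeDammit comes from BeautifulSoup). Under Python 3 the same URL decoding is imported from urllib.parse instead, roughly:

from urllib.parse import unquote_plus  # Python 3 location of unquote_plus

linkedin_id = unquote_plus(linkedin_id)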
Example #5
Source File: LCW_spider.py    From LotteryTicket with MIT License
def parse_page(response):
        hxs = HtmlXPathSelector(response)
        item = LotteryticketItem()
        # Issue (draw) number
        title = hxs.select('//html/body/center/center/table/tr/td/table[1]/tr[2]/td[1]/text()').extract()[0]
        item['title'] = filter(str.isdigit, ("".join(title.split()).encode("utf-8")))
        # Red ball section
        red1 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[1]/font/text()').extract()[0]
        item['red1'] = filter(str.isdigit, ("".join(red1.split()).encode("utf-8")))
        red2 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[2]/font/text()').extract()[0]
        item['red2'] = filter(str.isdigit, ("".join(red2.split()).encode("utf-8")))
        red3 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[3]/font/text()').extract()[0]
        item['red3'] = filter(str.isdigit, ("".join(red3.split()).encode("utf-8")))
        red4 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[4]/font/text()').extract()[0]
        item['red4'] = filter(str.isdigit, ("".join(red4.split()).encode("utf-8")))
        red5 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[5]/font/text()').extract()[0]
        item['red5'] = filter(str.isdigit, ("".join(red5.split()).encode("utf-8")))
        red6 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[6]/font/text()').extract()[0]
        item['red6'] = filter(str.isdigit, ("".join(red6.split()).encode("utf-8")))
        # Blue ball section
        blue = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[7]/font/text()').extract()[0]
        item['blue'] = filter(str.isdigit, ("".join(blue.split()).encode("utf-8")))
        # Draw time
        created_at = hxs.select('//html/body/center/center/table/tr/td/table[1]/tr[2]/td[2]/text()').extract()[0]
        item['created_at'] = ("".join(created_at.split()).encode("utf-8"))[0:10]

        return item 
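The filter(str.isdigit, ...) calls above rely on Python 2, where filter() applied to a str returns a str; they do not carry over to Python 3. A Python 3 rewrite of the digit extraction, applied to an already-extracted text value, might look like this sketch:

def digits_only(text):
        # Keep only the decimal digits from the extracted cell text.
        return ''.join(ch for ch in text if ch.isdigit())

item['title'] = digits_only(title)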
Example #6
Source File: __init__ copy.py    From scrapy-spiders with MIT License
def parse_torrent(self,response):
		x = HtmlXPathSelector(response)
		# NOTE: the original excerpt never creates the item; an Item class
		# (assumed here to be named TorrentItem) must be instantiated first.
		torrent = TorrentItem()
		torrent['url'] = response.url
		torrent['description'] = x.select("//span[@id='lblDescription']/text()").extract()
		torrent['jurisdictiontype'] = x.select("//span[@id='lblJurisdictionType']").extract()
		torrent['agency'] = x.select("//span[@id='lblUmbrellaAgency']/text()").extract()
		torrent['contactinfo'] = x.select("//span[@id='lblContact']/p/text()").extract()
		torrent['links'] = x.select("//span[@id='lblContacts']/p/a/@href").extract()
		return torrent 
Example #7
Source File: scrape.py    From evolve-music2 with MIT License
def parse(self, response):
        x = HtmlXPathSelector(response)
        links = []
        url = response.url
        music_links = x.select('//ul/li/a/@href').extract()
        music_links = [m for m in music_links if m.endswith(".mid")]
        for l in music_links:
            link = MIDIFile()
            link['url'] = url
            link['ltype'] = self.ltype
            link['link'] = l
            link["file_urls"] = [l]
            links.append(link)
        return links 
Example #8
Source File: scrape.py    From evolve-music2 with MIT License
def parse(self, response):
        x = HtmlXPathSelector(response)
        links = []
        url = response.url
        music_links = x.select("//td/a/@href").extract()
        music_links = [m for m in music_links if m.endswith(".mid")]
        for l in music_links:
            link = MIDIFile()
            link['url'] =  url
            link['ltype'] = self.ltype
            link['link'] = "http://midi-archive.com/" + l
            link["file_urls"] = [link['link']]
            links.append(link)
        return links 
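In both MIDI spiders the file_urls field is the one consumed by Scrapy's FilesPipeline, so the downloads only happen if the pipeline is enabled in the project settings. A sketch for current Scrapy versions, with a placeholder storage path:

# settings.py (sketch): enable the files pipeline so 'file_urls' entries are downloaded.
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
FILES_STORE = '/path/to/midi/downloads'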
Example #9
Source File: middlewares.py    From fp-server with MIT License
def process_response(self, request, response, spider):
        url = response.url

        if response.status in [301, 307]:
            log.msg("trying to redirect us: %s" % url, level=log.INFO)
            reason = 'redirect %d' % response.status

            return self._retry(request, reason, spider) or response
        interval, redirect_url = get_meta_refresh(response)
        # handle meta redirect

        if redirect_url:
            log.msg("trying to redirect us: %s" % url, level=log.INFO)
            reason = 'meta'

            return self._retry(request, reason, spider) or response

        hxs = HtmlXPathSelector(response)
        # test for captcha page
        captcha = hxs.select(
            ".//input[contains(@id, 'captchacharacters')]").extract()

        if captcha:
            log.msg("captcha page %s" % url, level=log.INFO)
            reason = 'captcha'

            return self._retry(request, reason, spider) or response

        return response 
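As with any downloader middleware, the process_response() hook above only runs once the class is registered in the project settings. A sketch, where the module path and class name are placeholders rather than the real ones from fp-server:

# settings.py (sketch): register the retry/captcha middleware shown above.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CaptchaRetryMiddleware': 550,  # hypothetical path and name
}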
Example #10
Source File: weibo_spider.py    From openslack-crawler with Apache License 2.0
def parse_item(self, response):
        print response.body
        hxs = HtmlXPathSelector(response)
        i = DmozItem()
        i['id'] = hxs.select('//input[@id="sid"]/@value').extract()
        i['title'] = hxs.select('//div[@id="name"]').extract()
        i['desc'] = hxs.select('//div[@id="description"]').extract()
        return i 
Example #11
Source File: yeeyan.py    From openslack-crawler with Apache License 2.0
def parse_jx(self, item, response):
        hxs = HtmlXPathSelector(response)
        item['url'] = response.url
        item['title'] = hxs.x('//title/text()').extract()[0].split('|')[1].strip()
        div = hxs.x('//div[@class="jxar_author"]')
        item['author'] = div.x('.//a/text()').extract()[0]
        item['release_time'] = hxs.x('//p[@class="jxa_info"]/span[1]/text()').extract()[0]
        try:
            item['excerpt'] = hxs.x('//p[@class="jxa_intro"]/text()').extract()[0]
        except Exception:
            item['excerpt'] = None
        item['category'] = hxs.x('//p[@class="jxa_map"]/text()').extract()[1].split()[1]
        item['content_html'] = hxs.x('//div[@class="jxa_content"]').extract()[0]
        return item 
Example #12
Source File: woaidu_detail.py    From openslack-crawler with Apache License 2.0
def parse_detail(self, response):
        woaidu_item = WoaiduCrawlerItem()

        response_selector = HtmlXPathSelector(response)
        woaidu_item['book_name'] = list_first_item(
            response_selector.select('//div[@class="zizida"][1]/text()').extract())
        woaidu_item['author'] = [
            list_first_item(response_selector.select('//div[@class="xiaoxiao"][1]/text()').extract())[5:].strip(), ]
        woaidu_item['book_description'] = list_first_item(
            response_selector.select('//div[@class="lili"][1]/text()').extract()).strip()
        woaidu_item['book_covor_image_url'] = list_first_item(
            response_selector.select('//div[@class="hong"][1]/img/@src').extract())

        download = []
        for i in response_selector.select('//div[contains(@class,"xiazai_xiao")]')[1:]:
            download_item = {}
            download_item['url'] = strip_null(
                deduplication([
                    list_first_item(i.select('./div')[0].select('./a/@href').extract()),
                    list_first_item(i.select('./div')[1].select('./a/@href').extract()),
                ])
            )

            download_item['progress'] = list_first_item(i.select('./div')[2].select('./text()').extract())
            download_item['update_time'] = list_first_item(i.select('./div')[3].select('./text()').extract())
            download_item['source_site'] = [
                list_first_item(i.select('./div')[4].select('./a/text()').extract()),
                list_first_item(i.select('./div')[4].select('./a/@href').extract()),
            ]

            download.append(download_item)

        woaidu_item['book_download'] = download
        woaidu_item['original_url'] = response.url

        yield woaidu_item 
Example #13
Source File: AmazonSpider.py    From openslack-crawler with Apache License 2.0
def parse_item(self, response):
        self._log_page(response, 'after_login.html')
        hxs = HtmlXPathSelector(response)
        report_urls = hxs.select('//div[@id="menuh"]/ul/li[4]/div//a/@href').extract()
        for report_url in report_urls:
            # print "list:"+report_url
            yield Request(self._ab_path(response, report_url),
                          headers=self.headers,
                          meta={'cookiejar': response.meta['cookiejar']},
                          callback=self.parse_report)