Python scrapy.selector.HtmlXPathSelector() Examples
The following are 13 code examples of scrapy.selector.HtmlXPathSelector(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.selector, or try the search function.
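Before the examples, here is a minimal usage sketch of the pattern they all share: wrap a downloaded response in HtmlXPathSelector, query it with select() (XPath), and turn the matches into strings with extract(). HtmlXPathSelector is the legacy selector API from Scrapy 0.x; in current Scrapy releases the same work is done with Selector or response.xpath(). The spider name, URL, and item fields below are illustrative assumptions, not taken from any of the projects listed.

from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider


class LinkItem(Item):
    # Hypothetical item used only for this sketch
    url = Field()
    text = Field()


class MinimalSpider(BaseSpider):
    name = "minimal_example"              # assumed spider name
    start_urls = ["http://example.com/"]  # placeholder URL

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # select() runs an XPath query and returns a list of selectors;
        # extract() converts the matched nodes to unicode strings.
        for link in hxs.select('//a'):
            item = LinkItem()
            item['url'] = link.select('@href').extract()
            item['text'] = link.select('text()').extract()
            yield item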
Example #1
Source File: dmoz.py From tripadvisor-scraper with GNU General Public License v3.0 | 6 votes |
def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//ul[@class="directory-url"]/li')
    items = []

    for site in sites:
        item = Website()
        item['name'] = site.select('a/text()').extract()
        item['url'] = site.select('a/@href').extract()
        item['description'] = site.select('text()').re('-\s([^\n]*?)\\n')
        items.append(item)

    return items
Example #2
Source File: yeeyan.py From openslack-crawler with Apache License 2.0 | 6 votes |
def parse(self, response):
    self.log("OK,%s" % response.url)
    hxs = HtmlXPathSelector(response)
    # Follow each article link for further processing
    divs = hxs.x('//div[@class="publicLeftCon mt10"]')
    for div in divs:
        url = div.x('h5/a/@href').extract()[0]
        yield self.make_requests_from_url(url).replace(callback=self.parse_content)
    # Follow the next-page link as well
    try:
        next_url = \
            hxs.x('//div[@id="project_left"]/div[@class="publicMiddleLine"]/span/a[b="下一页"]/@href').extract()[0]
    except Exception:
        return
    next_url = 'http://article.yeeyan.org' + next_url
    # if self.count==10:
    #     return
    # self.count+=1
    yield self.make_requests_from_url(next_url).replace(callback=self.parse)

# Filter the article content
Example #3
Source File: yeeyan.py From openslack-crawler with Apache License 2.0 | 6 votes |
def parse_content(self, response):
    hxs = HtmlXPathSelector(response)
    item = YeeyanItem()
    if hxs.x('//a[@class="jx_logo"]/text()'):
        item = self.parse_jx(item, response)
    else:
        item['url'] = response.url
        item['title'] = hxs.x('//title/text()').extract()[0].split('|')[1].strip()
        div = hxs.x('//div[@class="user_info"]')
        item['author'] = div.x('.//h2/a/text()').extract()[0]
        item['excerpt'] = hxs.x('//p[@class="excerpt"]/text()').extract()
        if item['excerpt']:
            item['excerpt'] = item['excerpt'][0]
        else:
            item['excerpt'] = ''
        item['content_html'] = hxs.x('//div[@id="conBox"]').extract()[0]
        item['release_time'] = div.x('.//p/text()').extract()[0].strip()[1:-7]
        item['category'] = hxs.x('//div[@class="crumb"]/a/text()').extract()[1]
    return item

# Filter the featured (精选) articles
Example #4
Source File: LinkedinSpider.py From openslack-crawler with Apache License 2.0 | 6 votes |
def parse(self, response):
    """
    default parse method, rule is not useful now
    """
    # import pdb; pdb.set_trace()
    response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
    hxs = HtmlXPathSelector(response)
    index_level = self.determine_level(response)
    log.msg("Parse: index level:" + str(index_level))

    if index_level in [1, 2, 3, 4]:
        self.save_to_file_system(index_level, response)
        relative_urls = self.get_follow_links(index_level, hxs)
        if relative_urls is not None:
            for url in relative_urls:
                log.msg('yield process, url:' + url)
                yield Request(url, callback=self.parse)
    elif index_level == 5:
        personProfile = HtmlParser.extract_person_profile(hxs)
        linkedin_id = self.get_linkedin_id(response.url)
        linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
        if linkedin_id:
            personProfile['_id'] = linkedin_id
            personProfile['url'] = UnicodeDammit(response.url).markup
            yield personProfile
Example #5
Source File: LCW_spider.py From LotteryTicket with MIT License | 5 votes |
def parse_page(response):
    hxs = HtmlXPathSelector(response)
    item = LotteryticketItem()
    # Issue number
    title = hxs.select('//html/body/center/center/table/tr/td/table[1]/tr[2]/td[1]/text()').extract()[0]
    item['title'] = filter(str.isdigit, ("".join(title.split()).encode("utf-8")))
    # Red ball section
    red1 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[1]/font/text()').extract()[0]
    item['red1'] = filter(str.isdigit, ("".join(red1.split()).encode("utf-8")))
    red2 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[2]/font/text()').extract()[0]
    item['red2'] = filter(str.isdigit, ("".join(red2.split()).encode("utf-8")))
    red3 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[3]/font/text()').extract()[0]
    item['red3'] = filter(str.isdigit, ("".join(red3.split()).encode("utf-8")))
    red4 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[4]/font/text()').extract()[0]
    item['red4'] = filter(str.isdigit, ("".join(red4.split()).encode("utf-8")))
    red5 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[5]/font/text()').extract()[0]
    item['red5'] = filter(str.isdigit, ("".join(red5.split()).encode("utf-8")))
    red6 = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[6]/font/text()').extract()[0]
    item['red6'] = filter(str.isdigit, ("".join(red6.split()).encode("utf-8")))
    # Blue ball section
    blue = hxs.select('//html/body/center/center/table/tr/td/table[2]/tbody/tr[2]/td[7]/font/text()').extract()[0]
    item['blue'] = filter(str.isdigit, ("".join(blue.split()).encode("utf-8")))
    # Draw time
    created_at = hxs.select('//html/body/center/center/table/tr/td/table[1]/tr[2]/td[2]/text()').extract()[0]
    item['created_at'] = ("".join(created_at.split()).encode("utf-8"))[0:10]
    return item
Example #6
Source File: __init__ copy.py From scrapy-spiders with MIT License | 5 votes |
def parse_torrent(self, response):
    x = HtmlXPathSelector(response)
    # NOTE: the original excerpt uses 'torrent' without defining it; it is
    # presumably an item instance created elsewhere in the original spider.
    torrent['url'] = response.url
    torrent['description'] = x.select("//span[@id='lblDescription']/text()").extract()
    torrent['jurisdictiontype'] = x.select("//span[@id='lblJurisdictionType']").extract()
    torrent['agency'] = x.select("//span[@id='lblUmbrellaAgency']/text()").extract()
    torrent['contactinfo'] = x.select("//span[@id='lblContact']/p/text()").extract()
    torrent['links'] = x.select("//span[@id='lblContacts']/p/a/@href").extract()
    return torrent
Example #7
Source File: scrape.py From evolve-music2 with MIT License | 5 votes |
def parse(self, response):
    x = HtmlXPathSelector(response)
    links = []
    url = response.url
    music_links = x.select('//ul/li/a/@href').extract()
    music_links = [m for m in music_links if m.endswith(".mid")]
    for l in music_links:
        link = MIDIFile()
        link['url'] = url
        link['ltype'] = self.ltype
        link['link'] = l
        link["file_urls"] = [l]
        links.append(link)
    return links
Example #8
Source File: scrape.py From evolve-music2 with MIT License | 5 votes |
def parse(self, response): x = HtmlXPathSelector(response) links = [] url = response.url music_links = x.select("//td/a/@href").extract() music_links = [m for m in music_links if m.endswith(".mid")] for l in music_links: link = MIDIFile() link['url'] = url link['ltype'] = self.ltype link['link'] = "http://midi-archive.com/" + l link["file_urls"] = [link['link']] links.append(link) return links
Example #9
Source File: middlewares.py From fp-server with MIT License | 5 votes |
def process_response(self, request, response, spider):
    url = response.url

    if response.status in [301, 307]:
        log.msg("trying to redirect us: %s" % url, level=log.INFO)
        reason = 'redirect %d' % response.status
        return self._retry(request, reason, spider) or response

    # handle meta redirect
    interval, redirect_url = get_meta_refresh(response)
    if redirect_url:
        log.msg("trying to redirect us: %s" % url, level=log.INFO)
        reason = 'meta'
        return self._retry(request, reason, spider) or response

    # test for captcha page
    hxs = HtmlXPathSelector(response)
    captcha = hxs.select(".//input[contains(@id, 'captchacharacters')]").extract()
    if captcha:
        log.msg("captcha page %s" % url, level=log.INFO)
        reason = 'captcha'
        return self._retry(request, reason, spider) or response

    return response
Example #10
Source File: weibo_spider.py From openslack-crawler with Apache License 2.0 | 5 votes |
def parse_item(self, response):
    print response.body
    hxs = HtmlXPathSelector(response)
    i = DmozItem()
    i['id'] = hxs.select('//input[@id="sid"]/@value').extract()
    i['title'] = hxs.select('//div[@id="name"]').extract()
    i['desc'] = hxs.select('//div[@id="description"]').extract()
    return i
Example #11
Source File: yeeyan.py From openslack-crawler with Apache License 2.0 | 5 votes |
def parse_jx(self, item, response):
    hxs = HtmlXPathSelector(response)
    item['url'] = response.url
    item['title'] = hxs.x('//title/text()').extract()[0].split('|')[1].strip()
    div = hxs.x('//div[@class="jxar_author"]')
    item['author'] = div.x('.//a/text()').extract()[0]
    item['release_time'] = hxs.x('//p[@class="jxa_info"]/span[1]/text()').extract()[0]
    try:
        item['excerpt'] = hxs.x('//p[@class="jxa_intro"]/text()').extract()[0]
    except Exception:
        item['excerpt'] = None
    item['category'] = hxs.x('//p[@class="jxa_map"]/text()').extract()[1].split()[1]
    item['content_html'] = hxs.x('//div[@class="jxa_content"]').extract()[0]
    return item
Example #12
Source File: woaidu_detail.py From openslack-crawler with Apache License 2.0 | 5 votes |
def parse_detail(self, response):
    woaidu_item = WoaiduCrawlerItem()
    response_selector = HtmlXPathSelector(response)

    woaidu_item['book_name'] = list_first_item(
        response_selector.select('//div[@class="zizida"][1]/text()').extract())
    woaidu_item['author'] = [
        list_first_item(response_selector.select('//div[@class="xiaoxiao"][1]/text()').extract())[5:].strip(),
    ]
    woaidu_item['book_description'] = list_first_item(
        response_selector.select('//div[@class="lili"][1]/text()').extract()).strip()
    woaidu_item['book_covor_image_url'] = list_first_item(
        response_selector.select('//div[@class="hong"][1]/img/@src').extract())

    download = []
    for i in response_selector.select('//div[contains(@class,"xiazai_xiao")]')[1:]:
        download_item = {}
        download_item['url'] = strip_null(
            deduplication(
                [
                    list_first_item(i.select('./div')[0].select('./a/@href').extract()),
                    list_first_item(i.select('./div')[1].select('./a/@href').extract()),
                ]
            )
        )
        download_item['progress'] = list_first_item(i.select('./div')[2].select('./text()').extract())
        download_item['update_time'] = list_first_item(i.select('./div')[3].select('./text()').extract())
        download_item['source_site'] = [
            list_first_item(i.select('./div')[4].select('./a/text()').extract()),
            list_first_item(i.select('./div')[4].select('./a/@href').extract()),
        ]
        download.append(download_item)

    woaidu_item['book_download'] = download
    woaidu_item['original_url'] = response.url
    yield woaidu_item
Example #13
Source File: AmazonSpider.py From openslack-crawler with Apache License 2.0 | 5 votes |
def parse_item(self, response):
    self._log_page(response, 'after_login.html')
    hxs = HtmlXPathSelector(response)
    report_urls = hxs.select('//div[@id="menuh"]/ul/li[4]/div//a/@href').extract()
    for report_url in report_urls:
        # print "list:" + report_url
        yield Request(self._ab_path(response, report_url),
                      headers=self.headers,
                      meta={'cookiejar': response.meta['cookiejar']},
                      callback=self.parse_report)