Python scrapy.selector.Selector() Examples
The following are 25 code examples of scrapy.selector.Selector(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module scrapy.selector, or try the search function.
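Before the project-specific examples, here is a minimal, self-contained sketch of the Selector API that they all rely on. The HTML string and variable names are made up purely for illustration; inside a spider you would normally pass the downloaded response (or just use response.css()/response.xpath() directly).

from scrapy.selector import Selector

# A hypothetical HTML fragment, used only to demonstrate the calls below.
html = '<html><body><a href="/page/1">First</a> <a href="/page/2">Second</a></body></html>'

# Selector can be built from raw text (Selector(text=...)) or from a response
# object (Selector(response)), as the examples on this page show.
sel = Selector(text=html)

# css() and xpath() return a SelectorList; extract() / extract_first()
# (getall() / get() in newer Scrapy releases) turn the matches into strings.
links = sel.css('a::attr(href)').extract()            # ['/page/1', '/page/2']
first_text = sel.xpath('//a/text()').extract_first()  # 'First'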
Example #1
Source File: spider.py From google-scholar-crawler with Apache License 2.0 | 7 votes |
def parse_1(self, response):
    info('Parse '+response.url)
    #sel = Selector(response)
    #v = sel.css('.gs_ggs a::attr(href)').extract()
    #import pdb; pdb.set_trace()
    x = self.parse_with_rules(response, self.list_css_rules, dict)
    items = []
    if len(x) > 0:
        items = x[0]['.gs_r']
        pp.pprint(items)
    import pdb; pdb.set_trace()
    # return self.parse_with_rules(response, self.css_rules, googlescholarItem)
    for item in items:
        if item['related-url'] == '' or item['related-type'] != '[PDF]':
            continue
        url = item['related-url']
        info('pdf-url: ' + url)
        yield Request(url, callback=self.save_pdf)
Example #2
Source File: linkextractors.py From snippet with MIT License | 6 votes |
def extract_links(self, response):
    hxs = Selector(response)
    list_css = self.get_css("list_css")
    if not list_css:
        return []

    urls = []
    try:
        links = hxs.css(list_css).xpath('@href').extract()
        for url in links:
            urls.append(url)
        next_url = self.extract_next_links(response)
        urls.extend(next_url)
    except Exception as err:
        self.logger.error("%s" % err)

    rtn = []
    for url in urls:
        url = URL.s_get_full_url(URL(url), URL(response.url))
        if url:
            rtn.append(Link(url=url))
    return rtn
Example #3
Source File: movie_spider.py From crawler_examples with Apache License 2.0 | 6 votes |
def parse_item(self, response):
    item = DoubanmovieItem()
    sel = Selector(response)
    title = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0]
    year = sel.xpath('//*[@id="content"]/h1/span[2]/text()').extract()[0]
    commit_num = sel.xpath(
        '//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()').extract()[0]
    star = sel.xpath(
        '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()[0]
    director = sel.xpath(
        '//*[@id="info"]/span[1]/span[2]/a/text()').extract()[0]
    screenwriter = sel.xpath(
        '//*[@id="info"]/span[2]/span[2]/a/text()').extract()[0]
    item['title'] = title
    item['date'] = year
    item['star'] = star
    item['commit_num'] = commit_num
    item['director'] = director
    item['screenwriter'] = screenwriter
    return item
Example #4
Source File: asrock_spider.py From uefi-spider with MIT License | 6 votes |
def parse_machine(self, response):
    sel = Selector(response)
    download_link = None
    list_items = sel.css("#LeftMenu").css("li")
    for item in list_items:
        text = item.xpath(".//text()").extract()[0]
        if text.find("Download") < 0:
            continue
        try:
            download_link = item.css("a").xpath("@href").extract()[0]
        except:
            continue
    if download_link is not None:
        yield Request(url="http://www.asrock.com%s&os=BIOS" % download_link,
                      callback=self.parse_downloads,
                      meta={"item": response.meta["item"]})
    pass
Example #5
Source File: crawler.py From prediction_api with MIT License | 6 votes |
def google_parse(html):
    page = Selector(text=html)
    rs = []
    for ans in page.css('div.g'):
        title = ''.join(ans.css('h3').css('*::text').extract())
        content = ''.join(ans.css('span.st').css('*::text').extract())
        url = ans.css('*.r a::attr(href)').extract()
        try:
            url = re.findall('(http.*)', url[0])
            url = re.sub('&.*', '', url[0])
            rs.append({
                'url': url,
                'content': content,
                'title': title,
            })
        except Exception:
            pass
    return rs

# url = 'https://www.baidu.com/s?wd=jie%20tang&usm=1&tn=baidu&f=13&ie=utf-8&nojc=1&rqlang=en'
# html = getHTMLText(url)
# print(baidu_parse(html))
Example #6
Source File: pornHubSpider.py From PornHubBot with MIT License | 6 votes |
def parse_ph_key(self, response):
    selector = Selector(response)
    logging.debug('request url:------>' + response.url)
    # logging.info(selector)
    divs = selector.xpath('//div[@class="phimage"]')
    for div in divs:
        viewkey = re.findall('viewkey=(.*?)"', div.extract())
        # logging.debug(viewkey)
        yield Request(url='https://www.pornhub.com/embed/%s' % viewkey[0], callback=self.parse_ph_info)
    url_next = selector.xpath('//a[@class="orangeButton" and text()="Next"]/@href').extract()
    # logging.debug(url_next)
    if url_next:
        # if self.test:
        logging.debug(' next page:---------->' + self.host + url_next[0])
        yield Request(url=self.host + url_next[0], callback=self.parse_ph_key)
        # self.test = False
Example #7
Source File: pornHubSpider.py From PornHubBot with MIT License | 6 votes |
def parse_ph_info(self, response):
    phItem = PornVideoItem()
    selector = Selector(response)
    _ph_info = re.findall('flashvars_.*?=(.*?);\n', selector.extract())
    logging.debug('PH信息的JSON:')  # "JSON of the PH info"
    logging.debug(_ph_info)
    _ph_info_json = json.loads(_ph_info[0])
    duration = _ph_info_json.get('video_duration')
    phItem['video_duration'] = duration
    title = _ph_info_json.get('video_title')
    phItem['video_title'] = title
    image_url = _ph_info_json.get('image_url')
    phItem['image_url'] = image_url
    link_url = _ph_info_json.get('link_url')
    phItem['link_url'] = link_url
    quality_480p = _ph_info_json.get('quality_480p')
    phItem['quality_480p'] = quality_480p
    logging.info('duration:' + duration + ' title:' + title + ' image_url:'
                 + image_url + ' link_url:' + link_url)
    yield phItem
Example #8
Source File: intel_spider.py From uefi-spider with MIT License | 6 votes |
def parse_form(self, response):
    '''Walking 'to' a form is not required, but just in case act like a human.'''
    # NOTE: this project targets Python 2, hence the print statements and except syntax below.
    ### The form will respond with HTML, but data is refreshed with an XMLHTTP request.
    url = "https://downloadcenter.intel.com/JSONDataProvider.aspx?DownloadType=BIOS&pg=1&sortDir=descending&Hits=%d&keyword=BIO&lang=eng&refresh=filters&dataType=json&type=GET"

    sel = Selector(response)
    num_results = sel.css("span#num_results")
    if len(num_results) != 1:
        print "Error no results found?"
        return

    ### Example NNNN matching result(s)
    num_results = num_results.css("::text").extract()[0].split(" ")[0]
    try:
        num_results = int(num_results)
    except Exception, e:
        print "Cannot format results count as number? (%s)" % str(e)
        return

    ### Now send an AJAX request for ALL matching items.
Example #9
Source File: weibo_spider.py From weibo-analysis-system with MIT License | 6 votes |
def parse_follow(self, response):
    """Crawl the list of users this user follows."""
    # If this is page 1, request all of the remaining pages at once
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = all_page.group(1)
            all_page = int(all_page)
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_follow, dont_filter=True, meta=response.meta)
    selector = Selector(response)
    # link text: "follow him" / "follow her" / "unfollow"
    urls = selector.xpath('//a[text()="关注他" or text()="关注她" or text()="取消关注"]/@href').extract()
    uids = re.findall('uid=(\d+)', ";".join(urls), re.S)
    ID = re.findall('(\d+)/follow', response.url)[0]
    for uid in uids:
        relationships_item = RelationshipsItem()
        relationships_item['crawl_time'] = datetime.now()
        relationships_item["fan_id"] = ID
        relationships_item["followed_id"] = uid
        relationships_item["_id"] = ID + '-' + uid
        yield relationships_item
Example #10
Source File: weibo_spider.py From weibo-analysis-system with MIT License | 6 votes |
def parse_fans(self, response):
    """Crawl the fan (follower) list."""
    # If this is page 1, request all of the remaining pages at once
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = all_page.group(1)
            all_page = int(all_page)
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_fans, dont_filter=True, meta=response.meta)
    selector = Selector(response)
    # link text: "follow him" / "follow her" / "remove"
    urls = selector.xpath('//a[text()="关注他" or text()="关注她" or text()="移除"]/@href').extract()
    uids = re.findall('uid=(\d+)', ";".join(urls), re.S)
    ID = re.findall('(\d+)/fans', response.url)[0]
    for uid in uids:
        relationships_item = RelationshipsItem()
        relationships_item['crawl_time'] = datetime.now()
        relationships_item["fan_id"] = uid
        relationships_item["followed_id"] = ID
        relationships_item["_id"] = uid + '-' + ID
        yield relationships_item
Example #11
Source File: inventus.py From Inventus with MIT License | 6 votes |
def parse_item(self, response):
    item = InventusSpiderItem()
    for url in Selector(text=response.body).xpath('//a/@href').extract():
        if not url.startswith('http://') or url.startswith('https://'):
            url = self.base_url + url
        try:
            parsed_uri = urlparse(url)
        except ValueError:
            # If the URL is invalid we can ignore it.
            continue
        if parsed_uri.netloc.endswith('.' + self.domain) and 'mailto:' not in url:
            if not parsed_uri.netloc in self.subdomains:
                self.subdomains.append(parsed_uri.netloc)
                item['subdomain'] = parsed_uri.netloc
                yield item
                if len(self.subdomains) > int(self.subdomain_limit):
                    break
            yield Request(url, callback=self.parse)
    if len(self.subdomains) >= int(self.subdomain_limit):
        raise CloseSpider('subdomain limit reached')
Example #12
Source File: dmoz_spider.py From scrapy_example with MIT License | 6 votes |
def parse(self, response):
    """
    The lines below are a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    sel = Selector(response)
    sites = sel.xpath('//ul[@class="directory-url"]/li')
    items = []

    for site in sites:
        item = DmozItem()
        item['name'] = site.xpath('a/text()').extract()
        item['url'] = site.xpath('a/@href').extract()
        item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
        items.append(item)

    return items
Example #13
Source File: zhizhu_user_topic_spider.py From Zhihu_Spider with Apache License 2.0 | 6 votes |
def gen_topic_form(self, response):
    # yield the beginning topics
    sel = Selector(response)
    for topic_sel in sel.xpath('//div[@id="zh-profile-topic-list"]/div[contains(@class, "zm-profile-section-item")]'):
        # new user-topic relationship
        yield self.get_UT_item(topic_sel, response.url)

    # get the number of topics of one user
    num_topic = sel.xpath('//div[contains(@class, "zm-profile-section-wrap")]/div[contains(@class, "zm-profile-section-head")]//span[contains(@class, "zm-profile-section-name")]/text()')
    number_str = num_topic.extract()[0]
    # print number_str
    p = re.compile(r'\d+')
    m = p.findall(number_str)
    if m:
        num_topic = int(m[0])

    # crawl the remaining topics of a user
    base_line = 20
    if num_topic > 20:
        while num_topic > 0:
            yield FormRequest(
                url=response.url,
                formdata={
                    'start': '0',
                    'offset': str(base_line),
                    '_xsrf': self.xsrf
                },
                callback=self.parse
            )
            num_topic = num_topic - 20
            base_line += 20
Example #14
Source File: iterators.py From learn_python3_spider with MIT License | 6 votes |
def xmliter(obj, nodename):
    """Return a iterator of Selector's over all nodes of a XML document,
       given the name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8
    """
    nodename_patt = re.escape(nodename)

    HEADER_START_RE = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename_patt, re.S)
    HEADER_END_RE = re.compile(r'<\s*/%s\s*>' % nodename_patt, re.S)
    text = _body_or_str(obj)

    header_start = re.search(HEADER_START_RE, text)
    header_start = header_start.group(1).strip() if header_start else ''
    header_end = re_rsearch(HEADER_END_RE, text)
    header_end = text[header_end[1]:].strip() if header_end else ''

    r = re.compile(r'<%(np)s[\s>].*?</%(np)s>' % {'np': nodename_patt}, re.DOTALL)
    for match in r.finditer(text):
        nodetext = header_start + match.group() + header_end
        yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]
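As a rough usage sketch for the iterator above: each yielded value is a Selector positioned on one matching node, so relative XPath queries can be run against it. The feed string and the 'product' node name below are invented for illustration.

from scrapy.utils.iterators import xmliter

feed = u'<rss><product><id>1</id></product><product><id>2</id></product></rss>'

for node in xmliter(feed, 'product'):
    # node is a Selector wrapping a single <product> element
    print(node.xpath('./id/text()').extract_first())   # prints '1', then '2'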
Example #15
Source File: feed.py From learn_python3_spider with MIT License | 6 votes |
def parse(self, response):
    if not hasattr(self, 'parse_node'):
        raise NotConfigured('You must define parse_node method in order to scrape this XML feed')

    response = self.adapt_response(response)
    if self.iterator == 'iternodes':
        nodes = self._iternodes(response)
    elif self.iterator == 'xml':
        selector = Selector(response, type='xml')
        self._register_namespaces(selector)
        nodes = selector.xpath('//%s' % self.itertag)
    elif self.iterator == 'html':
        selector = Selector(response, type='html')
        self._register_namespaces(selector)
        nodes = selector.xpath('//%s' % self.itertag)
    else:
        raise NotSupported('Unsupported node iterator')

    return self.parse_nodes(response, nodes)
Example #16
Source File: weibo_spider.py From weibo-analysis-system with MIT License | 6 votes |
def parse_comment(self, response):
    # If this is page 1, request all of the remaining pages at once
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = all_page.group(1)
            all_page = int(all_page)
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_comment, dont_filter=True, meta=response.meta)
    selector = Selector(response)
    comment_nodes = selector.xpath('//div[@class="c" and contains(@id,"C_")]')
    for comment_node in comment_nodes:
        comment_user_url = comment_node.xpath('.//a[contains(@href,"/u/")]/@href').extract_first()
        if not comment_user_url:
            continue
        comment_item = CommentItem()
        comment_item['crawl_time'] = datetime.now()
        comment_item['weibo_url'] = response.meta['weibo_url']
        comment_item['comment_user_id'] = re.search(r'/u/(\d+)', comment_user_url).group(1)
        comment_item['content'] = comment_node.xpath('.//span[@class="ctt"]').xpath('string(.)').extract_first()
        comment_item['_id'] = comment_node.xpath('./@id').extract_first()
        created_at = comment_node.xpath('.//span[@class="ct"]/text()').extract_first()
        comment_item['created_at'] = time_fix(created_at.split('\xa0')[0])
        yield comment_item
Example #17
Source File: textspider.py From snippet with MIT License | 6 votes |
def handle_page(self, response):
    hxs = Selector(response)
    # text_css = self.css_selector["text_css"]
    # title_css = self.css_selector["title_css"]
    text_css = self.get_css("text_css")
    title_css = self.get_css("title_css")
    if not text_css or not title_css:
        return []

    item = TextItem()
    try:
        item["title"] = hxs.css(title_css).xpath('text()').extract()[0]
    except Exception:
        return []
    item["texts"] = hxs.css(text_css).xpath('text()').extract()
    if not item["texts"]:
        return []

    return [item]
Example #18
Source File: post.py From tieba-crawler with MIT License | 6 votes |
def parse(self, response):
    """TODO: Docstring for parse.

    :response: TODO
    :returns: TODO
    """
    for item in self._parse_posts(response):
        if not self.should_stop(item):
            yield item
        else:
            return

    if len(Selector(response).css('#frs_list_pager .next')):
        # Some of Tieba's pagination links are not absolute URLs
        next_page_url = Selector(response).css('#frs_list_pager .next::attr(href)').extract_first()
        logging.debug('next_page_url %s', next_page_url)
        if -1 != next_page_url.find('http://tieba.baidu.com'):
            yield Request(next_page_url, callback=self.parse)
        else:
            yield Request('http://tieba.baidu.com' + next_page_url, callback=self.parse)
Example #19
Source File: xicidaili.py From IPProxyTool with MIT License | 6 votes |
def parse_page(self, response):
    sel = Selector(text=response.body)
    infos = sel.xpath('//tr[@class="odd"]').extract()
    for info in infos:
        val = Selector(text=info)
        ip = val.xpath('//td[2]/text()').extract_first()
        port = val.xpath('//td[3]/text()').extract_first()
        country = val.xpath('//td[4]/a/text()').extract_first()
        anonymity = val.xpath('//td[5]/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )

        self.add_proxy(proxy=proxy)
Example #20
Source File: feed.py From learn_python3_spider with MIT License | 5 votes |
def parse_nodes(self, response, nodes):
    """This method is called for the nodes matching the provided tag name
    (itertag). Receives the response and an Selector for each node.
    Overriding this method is mandatory. Otherwise, you spider won't work.
    This method must return either a BaseItem, a Request, or a list
    containing any of them.
    """
    for selector in nodes:
        ret = iterate_spider_output(self.parse_node(response, selector))
        for result_item in self.process_results(response, ret):
            yield result_item
Example #21
Source File: iterators.py From learn_python3_spider with MIT License | 5 votes |
def xmliter_lxml(obj, nodename, namespace=None, prefix='x'):
    from lxml import etree
    reader = _StreamReader(obj)
    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
    selxpath = '//' + ('%s:%s' % (prefix, nodename) if namespace else nodename)
    for _, node in iterable:
        nodetext = etree.tostring(node, encoding='unicode')
        node.clear()
        xs = Selector(text=nodetext, type='xml')
        if namespace:
            xs.register_namespace(prefix, namespace)
        yield xs.xpath(selxpath)[0]
Example #22
Source File: text.py From learn_python3_spider with MIT License | 5 votes |
def selector(self):
    from scrapy.selector import Selector
    if self._cached_selector is None:
        self._cached_selector = Selector(self)
    return self._cached_selector
Example #23
Source File: text.py From learn_python3_spider with MIT License | 5 votes |
def _url_from_selector(sel):
    # type: (parsel.Selector) -> str
    if isinstance(sel.root, six.string_types):
        # e.g. ::attr(href) result
        return strip_html5_whitespace(sel.root)
    if not hasattr(sel.root, 'tag'):
        raise ValueError("Unsupported selector: %s" % sel)
    if sel.root.tag not in ('a', 'link'):
        raise ValueError("Only <a> and <link> elements are supported; got <%s>" % sel.root.tag)
    href = sel.root.get('href')
    if href is None:
        raise ValueError("<%s> element has no href attribute: %s" % (sel.root.tag, sel))
    return strip_html5_whitespace(href)
Example #24
Source File: text.py From learn_python3_spider with MIT License | 5 votes |
def follow(self, url, callback=None, method='GET', headers=None, body=None,
           cookies=None, meta=None, encoding=None, priority=0,
           dont_filter=False, errback=None, cb_kwargs=None):
    # type: (...) -> Request
    """
    Return a :class:`~.Request` instance to follow a link ``url``.
    It accepts the same arguments as ``Request.__init__`` method,
    but ``url`` can be not only an absolute URL, but also

    * a relative URL;
    * a scrapy.link.Link object (e.g. a link extractor result);
    * an attribute Selector (not SelectorList) - e.g.
      ``response.css('a::attr(href)')[0]`` or
      ``response.xpath('//img/@src')[0]``.
    * a Selector for ``<a>`` or ``<link>`` element, e.g.
      ``response.css('a.my_link')[0]``.

    See :ref:`response-follow-example` for usage examples.
    """
    if isinstance(url, parsel.Selector):
        url = _url_from_selector(url)
    elif isinstance(url, parsel.SelectorList):
        raise ValueError("SelectorList is not supported")
    encoding = self.encoding if encoding is None else encoding
    return super(TextResponse, self).follow(url, callback,
        method=method,
        headers=headers,
        body=body,
        cookies=cookies,
        meta=meta,
        encoding=encoding,
        priority=priority,
        dont_filter=dont_filter,
        errback=errback,
        cb_kwargs=cb_kwargs,
    )
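A brief sketch of how the method above is typically called from a spider callback; the CSS class 'next-page' and the callback name are placeholders, not part of the Scrapy API. Passing the <a> Selector itself lets follow() resolve the relative href against the response URL, so no manual urljoin() is needed.

def parse(self, response):
    next_link = response.css('a.next-page')
    if next_link:
        # follow() accepts the Selector for an <a> element directly
        yield response.follow(next_link[0], callback=self.parse)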
Example #25
Source File: asrock_spider.py From uefi-spider with MIT License | 5 votes |
def parse_downloads(self, response):
    def extract_field(field_sel):
        return field_sel.xpath(".//text()").extract()[0]

    sel = Selector(response)
    updates = []
    rows = sel.css("tr")
    for row in rows:
        cells = row.css("td")
        if len(cells) != 10:
            continue
        item = AsrockUpdateItem()
        item["version"] = extract_field(cells[0])
        item["date"] = extract_field(cells[1])
        item["bios_type"] = extract_field(cells[2])
        if item["bios_type"] not in ["Instant Flash"]:
            continue
        item["desc"] = extract_field(cells[4])
        item["bios_url"] = cells[8].css("a").xpath("@href").extract()[0]
        item["binary_name"] = item["bios_url"].split("/")[-1]
        item["item_id"] = item["binary_name"].replace(".zip", "")
        item["attrs"] = dict(response.meta["item"])
        #print dict(item)
        updates.append(item)

    for update in updates:
        yield Request(url=update["bios_url"], callback=self.parse_binary,
                      meta={"item": update})
        pass
    pass