Python scrapy.selector.Selector() Examples

The following are code examples of scrapy.selector.Selector(), collected from open-source projects. Each example is listed with its source file, the project it comes from, and that project's license.
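Before the examples, here is a minimal sketch (not taken from any of the projects below) of the two usual ways to construct a Selector and query it with CSS or XPath; the HTML string and variable names are illustrative only.

from scrapy.selector import Selector

html = '<ul><li><a href="/a">First</a></li><li><a href="/b">Second</a></li></ul>'
sel = Selector(text=html)                            # build from a raw HTML string
hrefs = sel.css('li a::attr(href)').extract()        # ['/a', '/b']
first = sel.xpath('//li/a/text()').extract_first()   # 'First'

# Inside a spider callback, the downloaded response can be wrapped directly:
# sel = Selector(response)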
Example #1
Source File: spider.py    From google-scholar-crawler with Apache License 2.0
def parse_1(self, response):
        info('Parse '+response.url)
        #sel = Selector(response)
        #v = sel.css('.gs_ggs a::attr(href)').extract()
        #import pdb; pdb.set_trace()
        x = self.parse_with_rules(response, self.list_css_rules, dict)
        items = []
        if len(x) > 0:
            items = x[0]['.gs_r']
            pp.pprint(items)
        import pdb; pdb.set_trace()
        # return self.parse_with_rules(response, self.css_rules, googlescholarItem)

        for item in items:
            if item['related-url'] == '' or item['related-type'] != '[PDF]':
                continue
            url = item['related-url']
            info('pdf-url: ' + url)
            yield Request(url, callback=self.save_pdf) 
Example #2
Source File: linkextractors.py    From snippet with MIT License
def extract_links(self, response):
        hxs = Selector(response)
        list_css = self.get_css("list_css")
        if not list_css:
            return []

        urls = []
        try:
            links = hxs.css(list_css).xpath('@href').extract()
            for url in links:
                urls.append(url)
            next_url = self.extract_next_links(response)
            urls.extend(next_url)
        except Exception as err:
            self.logger.error("%s" % err)

        rtn = []
        for url in urls:
            url = URL.s_get_full_url(URL(url), URL(response.url))
            if url:
                rtn.append(Link(url=url))

        return rtn 
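A note on the pattern above: chaining .css(...) with .xpath('@href') is equivalent to requesting the attribute directly with the ::attr(href) pseudo-element. A small sketch with a made-up selector string:

sel = Selector(text='<div class="list"><a href="/item/1">one</a></div>')
via_chain = sel.css('div.list a').xpath('@href').extract()
via_attr = sel.css('div.list a::attr(href)').extract()
# both return ['/item/1']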
Example #3
Source File: movie_spider.py    From crawler_examples with Apache License 2.0
def parse_item(self, response):
        item = DoubanmovieItem()
        sel = Selector(response)

        title = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0]
        year = sel.xpath('//*[@id="content"]/h1/span[2]/text()').extract()[0]
        commit_num = sel.xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()').extract()[0]
        star = sel.xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()[0]
        director = sel.xpath(
            '//*[@id="info"]/span[1]/span[2]/a/text()').extract()[0]
        screenwriter = sel.xpath(
            '//*[@id="info"]/span[2]/span[2]/a/text()').extract()[0]

        item['title'] = title
        item['date'] = year
        item['star'] = star
        item['commit_num'] = commit_num
        item['director'] = director
        item['screenwriter'] = screenwriter

        return item 
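One caveat about the extract()[0] calls above: they raise an IndexError when an XPath matches nothing. extract_first() (or get() in newer Scrapy/parsel releases) returns None, or a default of your choice, instead. A sketch reusing two of the XPaths from this example:

title = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract_first()
year = sel.xpath('//*[@id="content"]/h1/span[2]/text()').extract_first(default='')
# title is None and year is '' if the markup changes, rather than raising IndexError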
Example #4
Source File: asrock_spider.py    From uefi-spider with MIT License
def parse_machine(self, response):
        sel = Selector(response)

        download_link = None
        list_items = sel.css("#LeftMenu").css("li")
        for item in list_items:
            text = item.xpath(".//text()").extract()[0]
            if text.find("Download") < 0:
                continue
            try:
                download_link = item.css("a").xpath("@href").extract()[0]
            except:
                continue

        if download_link is not None:
            yield Request(url= "http://www.asrock.com%s&os=BIOS" % download_link, 
                callback= self.parse_downloads,
                meta= {"item": response.meta["item"]})
        pass 
Example #5
Source File: crawler.py    From prediction_api with MIT License
def google_parse(html):
    page = Selector(text=html)
    rs = []
    for ans in page.css('div.g'):
        title = ''.join(ans.css('h3').css('*::text').extract())
        content = ''.join(ans.css('span.st').css('*::text').extract())
        url = ans.css('*.r a::attr(href)').extract()
        try:
            url = re.findall('(http.*)', url[0])
            url = re.sub('&.*', '', url[0])
            rs.append({
                'url': url,
                'content': content,
                'title': title,
            })
        except Exception:
            pass
    return rs


# url = 'https://www.baidu.com/s?wd=jie%20tang&usm=1&tn=baidu&f=13&ie=utf-8&nojc=1&rqlang=en'
# html = getHTMLText(url)
# print(baidu_parse(html)) 
Example #6
Source File: pornHubSpider.py    From PornHubBot with MIT License
def parse_ph_key(self,response):
        selector = Selector(response)
        logging.debug('request url:------>' + response.url)
        # logging.info(selector)
        divs = selector.xpath('//div[@class="phimage"]')
        for div in divs:
            viewkey = re.findall('viewkey=(.*?)"',div.extract())
            # logging.debug(viewkey)
            yield Request(url='https://www.pornhub.com/embed/%s' % viewkey[0],callback = self.parse_ph_info)
        url_next = selector.xpath('//a[@class="orangeButton" and text()="Next"]/@href').extract()
        # logging.debug(url_next)
        if url_next:
        # if self.test:
            logging.debug(' next page:---------->' + self.host+url_next[0])
            yield Request(url=self.host+url_next[0],callback=self.parse_ph_key)
            # self.test = False 
Example #7
Source File: pornHubSpider.py    From PornHubBot with MIT License
def parse_ph_info(self,response):
        phItem = PornVideoItem()
        selector = Selector(response)
        _ph_info = re.findall('flashvars_.*?=(.*?);\n',selector.extract())
        logging.debug('PH info JSON:')
        logging.debug(_ph_info)
        _ph_info_json = json.loads(_ph_info[0])
        duration = _ph_info_json.get('video_duration')
        phItem['video_duration'] = duration
        title = _ph_info_json.get('video_title')
        phItem['video_title'] = title
        image_url = _ph_info_json.get('image_url')
        phItem['image_url'] = image_url
        link_url = _ph_info_json.get('link_url')
        phItem['link_url'] = link_url
        quality_480p = _ph_info_json.get('quality_480p')
        phItem['quality_480p'] = quality_480p
        logging.info('duration:' + duration + ' title:' + title + ' image_url:' + image_url + ' link_url:' + link_url)
        yield phItem 
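A side note on the regex step above: a Selector can apply a regular expression directly to the extracted markup via its re() method, which is roughly equivalent to running re.findall over selector.extract(). A one-line sketch of that alternative:

_ph_info = selector.re(r'flashvars_.*?=(.*?);\n')   # same list of matches as the re.findall call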
Example #8
Source File: intel_spider.py    From uefi-spider with MIT License
def parse_form(self, response):
    '''Walking 'to' a form is not required, but just in case, act like a human.'''

    ### The form will respond with HTML, but data is refreshed with an XMLHTTP request.
    url = "https://downloadcenter.intel.com/JSONDataProvider.aspx?DownloadType=BIOS&pg=1&sortDir=descending&Hits=%d&keyword=BIO&lang=eng&refresh=filters&dataType=json&type=GET"

    sel = Selector(response)
    num_results = sel.css("span#num_results")
    if len(num_results) != 1:
      print "Error no results found?"
      return

    ### Example NNNN matching result(s)
    num_results = num_results.css("::text").extract()[0].split(" ")[0]
    try:
      num_results = int(num_results)
    except Exception, e:
      print "Cannot format results count as number? (%s)" % str(e)
      return

    ### Now send an AJAX request for ALL matching items. 
Example #9
Source File: weibo_spider.py    From weibo-analysis-system with MIT License
def parse_follow(self, response):
        """
        抓取关注列表
        """
        # 如果是第1页,一次性获取后面的所有页
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_follow, dont_filter=True, meta=response.meta)
        selector = Selector(response)
        urls = selector.xpath('//a[text()="关注他" or text()="关注她" or text()="取消关注"]/@href').extract()
        uids = re.findall('uid=(\d+)', ";".join(urls), re.S)
        ID = re.findall('(\d+)/follow', response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = datetime.now()
            relationships_item["fan_id"] = ID
            relationships_item["followed_id"] = uid
            relationships_item["_id"] = ID + '-' + uid
            yield relationships_item 
Example #10
Source File: weibo_spider.py    From weibo-analysis-system with MIT License
def parse_fans(self, response):
        """
        抓取粉丝列表
        """
        # 如果是第1页,一次性获取后面的所有页
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_fans, dont_filter=True, meta=response.meta)
        selector = Selector(response)
        urls = selector.xpath('//a[text()="关注他" or text()="关注她" or text()="移除"]/@href').extract()
        uids = re.findall('uid=(\d+)', ";".join(urls), re.S)
        ID = re.findall('(\d+)/fans', response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = datetime.now()
            relationships_item["fan_id"] = uid
            relationships_item["followed_id"] = ID
            relationships_item["_id"] = uid + '-' + ID
            yield relationships_item 
Example #11
Source File: inventus.py    From Inventus with MIT License
def parse_item(self, response):
        item = InventusSpiderItem()
        for url in Selector(text=response.body).xpath('//a/@href').extract():
            if not url.startswith(('http://', 'https://')):
                url = self.base_url + url
            try:
                parsed_uri = urlparse(url)
            except ValueError:
                # If the URL is invalid we can ignore it.
                continue
            if parsed_uri.netloc.endswith('.' + self.domain) and 'mailto:' not in url:
                if not parsed_uri.netloc in self.subdomains:
                    self.subdomains.append(parsed_uri.netloc)
                    item['subdomain'] = parsed_uri.netloc
                    yield item

                    if len(self.subdomains) > int(self.subdomain_limit):
                        break

                yield Request(url, callback=self.parse)

        if len(self.subdomains) >= int(self.subdomain_limit):
            raise CloseSpider('subdomain limit reached') 
Example #12
Source File: dmoz_spider.py    From scrapy_example with MIT License
def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html
        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        sites = sel.xpath('//ul[@class="directory-url"]/li')
        items = []

        for site in sites:
            item = DmozItem()
            item['name'] = site.xpath('a/text()').extract()
            item['url'] = site.xpath('a/@href').extract()
            item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
            items.append(item)

        return items 
Example #13
Source File: zhizhu_user_topic_spider.py    From Zhihu_Spider with Apache License 2.0
def gen_topic_form(self, response):
        # yield the beginning topics
        sel = Selector(response)
        for topic_sel in sel.xpath('//div[@id="zh-profile-topic-list"]/div[contains(@class, "zm-profile-section-item")]'):
            # new user-topic relationship
            yield self.get_UT_item(topic_sel, response.url)

        # get the number of topics of one user
        num_topic = sel.xpath('//div[contains(@class, "zm-profile-section-wrap")]/div[contains(@class, "zm-profile-section-head")]//span[contains(@class, "zm-profile-section-name")]/text()')
        number_str = num_topic.extract()[0]
        # print number_str
        p = re.compile(r'\d+')
        m = p.findall(number_str)
        if m:
            num_topic = int(m[0])
            # crawl the remaining topics of a user
            base_line = 20
            if num_topic > 20:
                while  num_topic > 0:
                    yield FormRequest(
                            url = response.url,
                            formdata = {
                                'start': '0',
                                'offset': str(base_line),
                                '_xsrf': self.xsrf
                                },
                            callback=self.parse
                            )
                    num_topic = num_topic - 20
                    base_line += 20 
Example #14
Source File: iterators.py    From learn_python3_spider with MIT License
def xmliter(obj, nodename):
    """Return a iterator of Selector's over all nodes of a XML document,
       given the name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8
    """
    nodename_patt = re.escape(nodename)

    HEADER_START_RE = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename_patt, re.S)
    HEADER_END_RE = re.compile(r'<\s*/%s\s*>' % nodename_patt, re.S)
    text = _body_or_str(obj)

    header_start = re.search(HEADER_START_RE, text)
    header_start = header_start.group(1).strip() if header_start else ''
    header_end = re_rsearch(HEADER_END_RE, text)
    header_end = text[header_end[1]:].strip() if header_end else ''

    r = re.compile(r'<%(np)s[\s>].*?</%(np)s>' % {'np': nodename_patt}, re.DOTALL)
    for match in r.finditer(text):
        nodetext = header_start + match.group() + header_end
        yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0] 
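A usage sketch for xmliter, assuming a small RSS-style string as input; each yielded value is a Selector positioned on one item node:

feed = ('<rss><channel>'
        '<item><title>First</title></item>'
        '<item><title>Second</title></item>'
        '</channel></rss>')

for node in xmliter(feed, 'item'):
    print(node.xpath('./title/text()').extract_first())
# prints: First, Second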
Example #15
Source File: feed.py    From learn_python3_spider with MIT License
def parse(self, response):
        if not hasattr(self, 'parse_node'):
            raise NotConfigured('You must define parse_node method in order to scrape this XML feed')

        response = self.adapt_response(response)
        if self.iterator == 'iternodes':
            nodes = self._iternodes(response)
        elif self.iterator == 'xml':
            selector = Selector(response, type='xml')
            self._register_namespaces(selector)
            nodes = selector.xpath('//%s' % self.itertag)
        elif self.iterator == 'html':
            selector = Selector(response, type='html')
            self._register_namespaces(selector)
            nodes = selector.xpath('//%s' % self.itertag)
        else:
            raise NotSupported('Unsupported node iterator')

        return self.parse_nodes(response, nodes) 
Example #16
Source File: weibo_spider.py    From weibo-analysis-system with MIT License
def parse_comment(self, response):
        # If this is page 1, queue requests for all the remaining pages at once
        if response.url.endswith('page=1'):
            all_page = re.search(r'/>&nbsp;1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_comment, dont_filter=True, meta=response.meta)
        selector = Selector(response)
        comment_nodes = selector.xpath('//div[@class="c" and contains(@id,"C_")]')
        for comment_node in comment_nodes:
            comment_user_url = comment_node.xpath('.//a[contains(@href,"/u/")]/@href').extract_first()
            if not comment_user_url:
                continue
            comment_item = CommentItem()
            comment_item['crawl_time'] = datetime.now()
            comment_item['weibo_url'] = response.meta['weibo_url']
            comment_item['comment_user_id'] = re.search(r'/u/(\d+)', comment_user_url).group(1)
            comment_item['content'] = comment_node.xpath('.//span[@class="ctt"]').xpath('string(.)').extract_first()
            comment_item['_id'] = comment_node.xpath('./@id').extract_first()
            created_at = comment_node.xpath('.//span[@class="ct"]/text()').extract_first()
            comment_item['created_at'] = time_fix(created_at.split('\xa0')[0])
            yield comment_item 
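The xpath('string(.)') step above returns the full text content of the comment node, including text nested inside child tags, whereas a plain text() step only returns the direct text children. A small sketch with made-up markup:

node = Selector(text='<span class="ctt">reply <a href="/u/1">@user</a>: nice</span>').css('span.ctt')[0]
node.xpath('string(.)').extract_first()   # 'reply @user: nice'
node.xpath('./text()').extract()          # ['reply ', ': nice']  (nested text is lost)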
Example #17
Source File: textspider.py    From snippet with MIT License
def handle_page(self, response):
        hxs = Selector(response)
        # text_css = self.css_selector["text_css"]
        # title_css = self.css_selector["title_css"]
        text_css = self.get_css("text_css")
        title_css = self.get_css("title_css")
        if not text_css or not title_css:
            return []
        item = TextItem()

        try:
            item["title"] = hxs.css(title_css).xpath('text()').extract()[0]
        except Exception:
            return []

        item["texts"] = hxs.css(text_css).xpath('text()').extract()
        if not item["texts"]:
            return []

        return [item] 
Example #18
Source File: post.py    From tieba-crawler with MIT License
def parse(self, response):
        """TODO: Docstring for pass.

        :response: TODO
        :returns: TODO

        """
        for item in self._parse_posts(response):
            if not self.should_stop(item):
                yield item
            else:
                return

        if len(Selector(response).css('#frs_list_pager .next')):
            # Some Tieba pagination links are not absolute URLs
            next_page_url = Selector(response).css('#frs_list_pager .next::attr(href)').extract_first()
            logging.debug('next_page_url %s', next_page_url)
            if -1 != next_page_url.find('http://tieba.baidu.com'):
                yield Request(next_page_url, callback=self.parse)
            else:
                yield Request('http://tieba.baidu.com' + next_page_url, callback=self.parse) 
Example #19
Source File: xicidaili.py    From IPProxyTool with MIT License
def parse_page(self, response):
        sel = Selector(text = response.body)
        infos = sel.xpath('//tr[@class="odd"]').extract()
        for info in infos:
            val = Selector(text = info)
            ip = val.xpath('//td[2]/text()').extract_first()
            port = val.xpath('//td[3]/text()').extract_first()
            country = val.xpath('//td[4]/a/text()').extract_first()
            anonymity = val.xpath('//td[5]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy) 
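Re-wrapping every extracted row string in a new Selector, as this example does, works but is not required: sel.xpath('//tr[@class="odd"]') already returns a list of Selectors that can be queried relative to each row. A sketch of the more direct form:

for row in sel.xpath('//tr[@class="odd"]'):
    ip = row.xpath('./td[2]/text()').extract_first()
    port = row.xpath('./td[3]/text()').extract_first()
    country = row.xpath('./td[4]/a/text()').extract_first()
    anonymity = row.xpath('./td[5]/text()').extract_first()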
Example #20
Source File: feed.py    From learn_python3_spider with MIT License
def parse_nodes(self, response, nodes):
        """This method is called for the nodes matching the provided tag name
        (itertag). Receives the response and a Selector for each node.
        Overriding this method is mandatory. Otherwise, your spider won't work.
        This method must return either a BaseItem, a Request, or a list
        containing any of them.
        """

        for selector in nodes:
            ret = iterate_spider_output(self.parse_node(response, selector))
            for result_item in self.process_results(response, ret):
                yield result_item 
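For context, parse_nodes is driven by the itertag and iterator attributes of XMLFeedSpider (see the feed.py parse example above); a subclass normally only sets those attributes and implements parse_node. A minimal sketch, with a placeholder feed URL:

from scrapy.spiders import XMLFeedSpider

class ExampleFeedSpider(XMLFeedSpider):
    name = 'example_feed'
    start_urls = ['http://example.com/feed.xml']   # placeholder
    iterator = 'iternodes'                         # or 'xml' / 'html'
    itertag = 'item'

    def parse_node(self, response, node):
        # node is a Selector positioned on one <item> element
        return {'title': node.xpath('./title/text()').extract_first()}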
Example #21
Source File: iterators.py    From learn_python3_spider with MIT License
def xmliter_lxml(obj, nodename, namespace=None, prefix='x'):
    from lxml import etree
    reader = _StreamReader(obj)
    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
    selxpath = '//' + ('%s:%s' % (prefix, nodename) if namespace else nodename)
    for _, node in iterable:
        nodetext = etree.tostring(node, encoding='unicode')
        node.clear()
        xs = Selector(text=nodetext, type='xml')
        if namespace:
            xs.register_namespace(prefix, namespace)
        yield xs.xpath(selxpath)[0] 
Example #22
Source File: text.py    From learn_python3_spider with MIT License
def selector(self):
        from scrapy.selector import Selector
        if self._cached_selector is None:
            self._cached_selector = Selector(self)
        return self._cached_selector 
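This cached property is what backs the usual shortcuts: response.css(...) and response.xpath(...) delegate to response.selector, so the Selector is built only once per response. An illustrative sketch inside a callback (the CSS/XPath expressions are placeholders):

def parse(self, response):
    # all three queries go through the same cached Selector instance
    titles = response.selector.css('h1::text').extract()
    same_titles = response.css('h1::text').extract()
    also_same = response.xpath('//h1/text()').extract()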
Example #23
Source File: text.py    From learn_python3_spider with MIT License
def _url_from_selector(sel):
    # type: (parsel.Selector) -> str
    if isinstance(sel.root, six.string_types):
        # e.g. ::attr(href) result
        return strip_html5_whitespace(sel.root)
    if not hasattr(sel.root, 'tag'):
        raise ValueError("Unsupported selector: %s" % sel)
    if sel.root.tag not in ('a', 'link'):
        raise ValueError("Only <a> and <link> elements are supported; got <%s>" %
                         sel.root.tag)
    href = sel.root.get('href')
    if href is None:
        raise ValueError("<%s> element has no href attribute: %s" %
                         (sel.root.tag, sel))
    return strip_html5_whitespace(href) 
Example #24
Source File: text.py    From learn_python3_spider with MIT License
def follow(self, url, callback=None, method='GET', headers=None, body=None,
               cookies=None, meta=None, encoding=None, priority=0,
               dont_filter=False, errback=None, cb_kwargs=None):
        # type: (...) -> Request
        """
        Return a :class:`~.Request` instance to follow a link ``url``.
        It accepts the same arguments as ``Request.__init__`` method,
        but ``url`` can be not only an absolute URL, but also
        
        * a relative URL;
        * a scrapy.link.Link object (e.g. a link extractor result);
        * an attribute Selector (not SelectorList) - e.g.
          ``response.css('a::attr(href)')[0]`` or
          ``response.xpath('//img/@src')[0]``.
        * a Selector for ``<a>`` or ``<link>`` element, e.g.
          ``response.css('a.my_link')[0]``.
          
        See :ref:`response-follow-example` for usage examples.
        """
        if isinstance(url, parsel.Selector):
            url = _url_from_selector(url)
        elif isinstance(url, parsel.SelectorList):
            raise ValueError("SelectorList is not supported")
        encoding = self.encoding if encoding is None else encoding
        return super(TextResponse, self).follow(url, callback,
            method=method,
            headers=headers,
            body=body,
            cookies=cookies,
            meta=meta,
            encoding=encoding,
            priority=priority,
            dont_filter=dont_filter,
            errback=errback,
            cb_kwargs=cb_kwargs,
        ) 
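A usage sketch of follow() for the url types listed in the docstring, inside a spider callback; the CSS selectors and the parse_item callback are placeholders:

def parse(self, response):
    # a relative URL
    yield response.follow('page/2', callback=self.parse)

    # an attribute Selector, e.g. the result of ::attr(href)
    next_href = response.css('a.next::attr(href)')[0]
    yield response.follow(next_href, callback=self.parse)

    # a Selector for an <a> element; follow() reads the href itself
    for link in response.css('a.item'):
        yield response.follow(link, callback=self.parse_item)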
Example #25
Source File: asrock_spider.py    From uefi-spider with MIT License
def parse_downloads(self, response):
        def extract_field(field_sel):
            return field_sel.xpath(".//text()").extract()[0]
        sel = Selector(response)

        updates = []
        rows = sel.css("tr")
        for row in rows:
            cells = row.css("td")
            if len(cells) != 10:
                continue
            item = AsrockUpdateItem()
            item["version"] = extract_field(cells[0])
            item["date"] = extract_field(cells[1])
            item["bios_type"] = extract_field(cells[2])
            if item["bios_type"] not in ["Instant Flash"]:
                continue
            item["desc"] = extract_field(cells[4])
            item["bios_url"] = cells[8].css("a").xpath("@href").extract()[0]
            item["binary_name"] = item["bios_url"].split("/")[-1]
            item["item_id"] = item["binary_name"].replace(".zip", "")

            item["attrs"] = dict(response.meta["item"])
            #print dict(item)
            updates.append(item)

        for update in updates:
            yield Request(url= update["bios_url"], callback= self.parse_binary,
               meta= {"item": update})
            pass
        pass