Python scrapy.Selector() Examples
The following are 29 code examples of scrapy.Selector(), collected from open-source projects. Each example is listed with its source file, project, license, and user vote count. You may also want to check out all available functions and classes of the scrapy module.
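Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the two common ways a Selector is built and queried; the HTML string is made up for illustration:

from scrapy import Selector

# A hypothetical HTML fragment, used only to illustrate the API.
html = '<html><body><p class="msg">Hello</p><p class="msg">World</p></body></html>'
sel = Selector(text=html)

# XPath and CSS queries both return a SelectorList.
print(sel.xpath('//p[@class="msg"]/text()').extract())   # ['Hello', 'World']
print(sel.css('p.msg::text').extract_first())             # 'Hello'

# Inside a spider callback the selector is usually built from the response instead:
#     sel = Selector(response)
# or the response's own shortcuts response.xpath(...) / response.css(...) are used,
# which expose the same API.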
Example #1
Source File: proxylistplus.py From IPProxyTool with MIT License | 7 votes |
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//tr[@class="cells"]').extract()
    for i, info in enumerate(infos):
        self.log(info)
        val = Selector(text=info)
        ip = val.xpath('//td[2]/text()').extract_first()
        port = val.xpath('//td[3]/text()').extract_first()
        country = val.xpath('//td[5]/text()').extract_first()
        anonymity = val.xpath('//td[4]/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )

        self.add_proxy(proxy=proxy)
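A pattern worth noting here, since it recurs in most of the proxy- and stock-data examples on this page: each table row is extracted as an HTML string and re-wrapped in a fresh Selector(text=...), so the row-relative //td[...] queries cannot match cells from other rows. A minimal sketch of the idea (the table HTML below is made up for illustration):

from scrapy import Selector

html = ('<table>'
        '<tr><td>1.2.3.4</td><td>8080</td></tr>'
        '<tr><td>5.6.7.8</td><td>3128</td></tr>'
        '</table>')

for row in Selector(text=html).xpath('//tr').extract():
    row_sel = Selector(text=row)   # re-parse just this row
    ip = row_sel.xpath('//td[1]/text()').extract_first()
    port = row_sel.xpath('//td[2]/text()').extract_first()
    print(ip, port)                # 1.2.3.4 8080 / 5.6.7.8 3128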
Example #2
Source File: hidemy.py From IPProxyTool with MIT License | 6 votes |
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//tbody/tr').extract()
    for i, info in enumerate(infos):
        if i == 0:
            continue

        val = Selector(text=info)
        ip = val.xpath('//td[1]/text()').extract_first()
        port = val.xpath('//td[2]/text()').extract_first()
        country = val.xpath('//td[3]/div/text()').extract_first()
        anonymity = val.xpath('//td[6]/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )

        self.add_proxy(proxy=proxy)
Example #3
Source File: city.py From openslack-crawler with Apache License 2.0 | 6 votes |
def parse(self, response):
    item = DmozItem()
    sel = scrapy.Selector(response)
    conn = pymssql.connect(host="121.42.136.4", user="sa", password="koala19920716!@#", database="test")
    cursor = conn.cursor()
    sites = sel.xpath("//dl[@id='clist']/dd/a/text()").extract()
    item['title'] = [n.encode('utf-8') for n in sites]
    yield item
    # sql = "select ID,CityName from Cities"
    # cursor.execute(sql)
    # for (ID,CityName) in cursor.fetchall():
    #     print ID
    for name in item['title']:
        # print name
        sql = "Insert into Cities(CityName)values('" + name + "')"
        cursor.execute(sql)
        conn.commit()
Example #4
Source File: main.py From python-examples with MIT License | 6 votes |
def parse(self, response):
    print('url:', response.url)

    body = response.body.replace(b'<<+', b'<<+').replace(b'<+', b'<+')

    selector = scrapy.Selector(text=body.decode('utf-8'))

    i = 1
    for x in selector.css('.elem::text').extract():
        if 'Elements' in x:
            print('---', i, '---')
            i += 1
        else:
            print(x)

# --- it runs without project and saves in `output.csv` ---
Example #5
Source File: spider.py From SinaWeiboSpider with MIT License | 6 votes |
def parse_user_1(self, response):
    """Scrape personal information, part 2."""
    user_item = response.meta["item"]
    selector = Selector(response)
    # Join all text() nodes inside the tag into one string.
    text1 = ";".join(selector.xpath('body/div[@class="c"]/text()').extract())

    nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1)              # nickname
    intro = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', text1)                 # bio
    auth = re.findall(u'\u8ba4\u8bc1[:|\uff1a](.*?);', text1)                  # verification info
    gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', text1)                # gender
    place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', text1)                 # region (province and city)
    birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', text1)              # birthday
    sexorientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', text1)  # sexual orientation
    marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', text1)  # relationship status
    url = re.findall(u'\u4e92\u8054\u7f51[:|\uff1a](.*?);', text1)             # homepage link

    if nickname:
        user_item["nickname"] = nickname[0]
    if auth:
        user_item["auth"] = auth[0]
    if intro:
        user_item["intro"] = intro[0]
    user_item['t'] = time.strftime('%Y-%m-%d', time.localtime(time.time()))

    yield user_item
Example #6
Source File: spider.py From SinaWeiboSpider with MIT License | 6 votes |
def parse_user_0(self, response):
    """Scrape personal information, part 1: tweet, follow, and fan counts."""
    user_item = UserItem()
    selector = Selector(response)
    text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()

    if text0:
        num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0)    # number of tweets
        num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)   # number of follows
        num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)      # number of fans

        if num_tweets:
            user_item["ctweets"] = int(num_tweets[0])
        if num_follows:
            user_item["cfollows"] = int(num_follows[0])
        if num_fans:
            user_item["cfans"] = int(num_fans[0])

        user_item["_id"] = response.meta["user_id"]
        url_information1 = "http://weibo.cn/%s/info" % response.meta["user_id"]
        yield Request(url=url_information1, meta={"item": user_item},
                      callback=self.parse_user_1)
Example #7
Source File: sp500_spider.py From fooltrader with MIT License | 6 votes |
def download_sp500_price(self, response):
    trs = response.xpath('//*[@id="datatable"]/tr').extract()

    price_jsons = []

    try:
        for tr in trs[1:]:
            tds = Selector(text=tr).xpath('//td//text()').extract()
            tds = [x.strip() for x in tds if x.strip()]
            price_jsons.append({"timestamp": to_time_str(tds[0]),
                                "close": to_float(tds[1])})

        if price_jsons:
            self.df_close = self.df_close.append(price_jsons, ignore_index=True)
            self.df_close = index_df_with_time(self.df_close)

    except Exception as e:
        self.logger.exception('error when getting sp500 price url={} error={}'.format(response.url, e))
Example #8
Source File: shell.py From learn_python3_spider with MIT License | 6 votes |
def get_help(self):
    b = []
    b.append("Available Scrapy objects:")
    b.append("  scrapy     scrapy module (contains scrapy.Request, scrapy.Selector, etc)")
    for k, v in sorted(self.vars.items()):
        if self._is_relevant(v):
            b.append("  %-10s %s" % (k, v))
    b.append("Useful shortcuts:")
    if self.inthread:
        b.append("  fetch(url[, redirect=True]) "
                 "Fetch URL and update local objects "
                 "(by default, redirects are followed)")
    b.append("  fetch(req)                  "
             "Fetch a scrapy.Request and update local objects ")
    b.append("  shelp()           Shell help (print this help)")
    b.append("  view(response)    View response in a browser")

    return "\n".join("[s] %s" % l for l in b)
Example #9
Source File: comment.py From tieba-crawler with MIT License | 6 votes |
def _get_next_page(self, response):
    """TODO: Docstring for _parse_next_page.

    :response: TODO
    :returns: TODO

    """
    #logging.debug('beginning parsing next page if existed..')
    meta = response.meta
    anchor_sels = Selector(response).css('.j_pager a')
    next_page = 1
    #logging.debug('anchor selectors: %r' % (anchor_sels))
    for sel in anchor_sels:
        #logging.debug('pager anchor text: ' % (sel.css('::text').extract_first()))
        if sel.css('::text').extract_first() == '下一页':  # '下一页' means "next page"
            next_page = sel.css('::attr(href)').extract_first()[1:]
            logging.debug('next page num: %s' % (next_page))
    return int(next_page)
Example #10
Source File: data5u.py From IPProxyTool with MIT License | 6 votes |
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//ul[@class="l2"]').extract()
    for i, info in enumerate(infos):
        val = Selector(text=info)
        ip = val.xpath('//ul[@class="l2"]/span[1]/li/text()').extract_first()
        port = val.xpath('//ul[@class="l2"]/span[2]/li/text()').extract_first()
        anonymity = val.xpath('//ul[@class="l2"]/span[3]/li/text()').extract_first()
        https = val.xpath('//ul[@class="l2"]/span[4]/li/text()').extract_first()
        country = val.xpath('//ul[@class="l2"]/span[5]/li/a/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )

        self.add_proxy(proxy=proxy)
Example #11
Source File: reply.py From tieba-crawler with MIT License | 6 votes |
def _parse_general_post(self, post, response):
    """TODO: Docstring for _parse_general_post.

    :post: TODO
    :response: TODO
    :returns: TODO

    """
    item = Reply()
    # Concatenate the text fragments of the post body.
    item['body'] = ''.join(post.css('cc div::text').extract()).strip()
    item['title'] = Selector(response).css('.core_title_txt::text').extract_first()
    item['post_time'] = json.loads(
        post
        .css('::attr(data-field)')
        .extract_first()
    )['content']['date']
    return item
Example #12
Source File: reply.py From tieba-crawler with MIT License | 6 votes |
def parse(self, response):
    """TODO: Docstring for parse.
    :returns: TODO

    """
    posts = Selector(response).css('.p_postlist .l_post')
    for i, post in enumerate(posts):
        if i == 0:
            yield self._parse_main_post(post, response)
        else:
            item = self._parse_reply(post, response)
            yield item
            if item['reply_num'] != 0:  # number of comments on this reply
                self._parse_comments(post)
Example #13
Source File: user.py From tieba-crawler with MIT License | 6 votes |
def _parse_user_id(self, response):
    """TODO: Docstring for _parse_user_id.

    :response: TODO
    :returns: 32 digits user id hex

    """
    uri = Selector(response).css('.concern_num a::attr(href)').extract_first()
    logging.debug('user id href: %s' % (uri))
    if uri:
        # uri maybe this: /home/concern?id=a3e3474fbda1bfb5bfecc0d6d121?t=1423636759&fr=home
        query_dict = parse_qs(urlparse(uri).query)
        return query_dict['id'][0]
    else:
        return ''
Example #14
Source File: user.py From tieba-crawler with MIT License | 6 votes |
def _parse_following_and_followed(self, response, item):
    """TODO: Docstring for _parse_following_and_followed.

    :response: TODO
    :item: item.following_num item.followed_num
    :returns: TODO

    """
    sels = Selector(response).css('.ihome_aside_title')
    for sel in sels:
        # The first text() is '他关注的人' ("people he follows") or other irrelevant info.
        title = sel.css('::text').extract_first().strip()
        #logging.debug('title: %s' % (title))
        # Some users follow no one or have no followers.
        if title == '他关注的人' or title == '她关注的人':  # people he/she follows
            item['following_num'] = sel.css('a::text').extract_first()
        else:
            item['following_num'] = 0
        if title == '关注他的人' or title == '关注她的人':  # people following him/her
            item['followed_num'] = sel.css('a::text').extract_first()
        else:
            item['followed_num'] = 0
    return item
Example #15
Source File: stock_finance_report_event_spider.py From fooltrader with MIT License | 5 votes |
def download_fi_report_event_data(self, response):
    security_item = response.meta['item']
    period_type = response.meta['period_type']
    path = get_finance_report_event_path(security_item)

    df = pd.DataFrame()

    try:
        report_timestamps = response.xpath('//*[@id="con02-7"]/table[2]/tr/td[2]//ul/text()').extract()
        report_timestamps = [date.strip() for date in report_timestamps if date.strip()]
        report_contents = response.xpath('//*[@id="con02-7"]/table[2]/tr/td[2]//ul//a').extract()
        for i, tr in enumerate(report_contents):
            href = Selector(text=tr).xpath('//@href').extract()[0]
            title = Selector(text=tr).xpath('//text()').extract()[0]
            report_period = self.report_period_from_title(title, period_type, report_timestamps[i])

            df = df.append({
                "securityId": security_item['id'],
                "timestamp": report_timestamps[i],
                "url": "http://vip.stock.finance.sina.com.cn" + href,
                "title": title,
                "reportPeriod": report_period}, ignore_index=True)

        if not df.empty:
            df = df.drop_duplicates()
            df = index_df_with_time(df)
            df.to_csv(path, index=False)
    except Exception as e:
        self.logger.exception('error when getting k data url={}'.format(response.url))
Example #16
Source File: ip181.py From IPProxyTool with MIT License | 5 votes |
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//tbody/tr').extract()
    for i, info in enumerate(infos):
        if i == 0:
            continue

        val = Selector(text=info)
        ip = val.xpath('//td[1]/text()').extract_first()
        port = val.xpath('//td[2]/text()').extract_first()
        country = val.xpath('//td[6]/text()').extract_first()
        anonymity = val.xpath('//td[3]/text()').extract_first()
        https = val.xpath('//td[4]/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )

        self.add_proxy(proxy=proxy)
Example #17
Source File: a51newren.py From openslack-crawler with Apache License 2.0 | 5 votes |
def parse_item(self, response):
    sel = scrapy.Selector(response)
    print(sel.xpath("//span[@class=' user_name_show']/text()").extract())
Example #18
Source File: a51newren.py From openslack-crawler with Apache License 2.0 | 5 votes |
def parse(self, response):
    sel = scrapy.Selector(response)
    item = DmozItem()
    sel = scrapy.Selector(response)
    href = str(response.url)
    hidden = sel.xpath("//input[@name='__RequestVerificationToken']/@value").extract()
    return [FormRequest.from_response(
        response,
        formdata=self.formdata,
        headers=self.headers,
        meta={'__RequestVerificationToken': 'BSDY33UtJXv0XqMkIvAJXAdMXC-jqACBsiZb6-mx4uW8Hr89aArTh9DfLtQFDh6NwQsqHXiZMTzheuim3ETI78PhOzQf263wliXL8ArkTrA1'},
        callback=self.parse_item)]
Example #19
Source File: fans.py From tieba-crawler with MIT License | 5 votes |
def parse_page(self, response):
    """todo: docstring for parse_page.
    :response: todo
    :returns: todo

    """
    logging.debug('fans num: %s' % (len(Selector(response).css('.user'))))
    for sel in Selector(response).css('.user'):
        item = Fan()
        item['name'] = sel.css('.name a::text').extract_first()
        item['baidu_id'] = sel.css('::attr(portrait)').extract_first()
        item['user_name_followed'] = response.meta['row'][1]  # the user whose fan this account is
        yield item
Example #20
Source File: stock_kdata_sina_spider.py From fooltrader with MIT License | 5 votes |
def download_day_k_data(self, response):
    path = response.meta['path']
    item = response.meta['item']
    fuquan = response.meta['fuquan']
    trs = response.xpath('//*[@id="FundHoldSharesTable"]/tr[position()>1 and position()<=last()]').extract()

    try:
        if fuquan == 'hfq':
            df = pd.DataFrame(columns=data_contract.KDATA_COLUMN_SINA_FQ)
        else:
            df = pd.DataFrame(columns=data_contract.KDATA_COLUMN_SINA)

        for idx, tr in enumerate(trs):
            tds = Selector(text=tr).xpath('//td//text()').extract()
            tds = [x.strip() for x in tds if x.strip()]

            securityId = item['id']
            timestamp = tds[0]
            open = float(tds[1])
            high = float(tds[2])
            close = float(tds[3])
            low = float(tds[4])
            volume = tds[5]
            turnover = tds[6]
            if fuquan == 'hfq':
                factor = tds[7]
                df.loc[idx] = [timestamp, item['code'], low, open, close, high, volume,
                               turnover, securityId, factor]
            else:
                df.loc[idx] = [timestamp, item['code'], low, open, close, high, volume,
                               turnover, securityId]
        df.to_csv(path, index=False)
    except Exception as e:
        self.logger.exception('error when getting k data url={} error={}'.format(response.url, e))
Example #21
Source File: stock_summary_spider.py From fooltrader with MIT License | 5 votes |
def download_sz_summary(self, response):
    search_date = response.meta['search_date']
    trs = response.xpath('//table/tr').extract()

    turnOver = None

    if self.security_item['id'] == 'index_sz_399106':
        for tr in trs[1:]:
            str_list = Selector(text=tr).xpath('//td//text()').extract()
            if '股票总市值' in str_list[0]:        # total market cap of stocks
                tCap = to_float(str_list[1], 0.0)
            elif '股票流通市值' in str_list[0]:    # tradable market cap of stocks
                mCap = to_float(str_list[1], 0.0)
            elif '平均市盈率' in str_list[0]:      # average P/E ratio
                pe = to_float(str_list[1], 0.0)
            elif '平均换手率' in str_list[0]:      # average turnover rate
                turnoverRate = to_float(str_list[1], 0.0)
    else:
        for tr in trs[1:]:
            str_list = Selector(text=tr).xpath('//td//text()').extract()
            if '上市公司市价总值' in str_list[0]:      # total market cap of listed companies
                tCap = to_float(str_list[1], 0.0)
            elif '上市公司流通市值' in str_list[0]:    # tradable market cap of listed companies
                mCap = to_float(str_list[1], 0.0)
            elif '平均市盈率' in str_list[0]:          # average P/E ratio
                pe = to_float(str_list[1], 0.0)
            elif '总成交金额' in str_list[0]:          # total turnover amount
                turnOver = to_float(str_list[1], 0.0)

    if turnOver:
        turnoverRate = 100 * turnOver / tCap

    self.file_lock.acquire()
    # Some older data is missing; it defaults to 0.0.
    self.current_df.at[search_date, 'pe'] = pe
    self.current_df.at[search_date, 'tCap'] = tCap
    self.current_df.at[search_date, 'mCap'] = mCap
    self.current_df.at[search_date, 'turnoverRate'] = turnoverRate
    self.file_lock.release()
Example #22
Source File: user.py From tieba-crawler with MIT License | 5 votes |
def _parse_user_following_tieba(self, response):
    """TODO: Docstring for _parse_user_following_tieba.

    :response: TODO
    :returns: TODO

    """
    names = []
    for name in Selector(response).css('.u-f-item span:first-child::text').extract():
        names.append(name)
    return names
Example #23
Source File: user_relation.py From tieba-crawler with MIT License | 5 votes |
def next_page(self, response):
    """todo: docstring for next_page.
    :response: todo
    :returns: todo

    """
    href = Selector(response).css('.next::attr(href)').extract_first()
    return 'http://tieba.baidu.com' + href if href else False
Example #24
Source File: user.py From tieba-crawler with MIT License | 5 votes |
def _parse_user_posts_num(self, response):
    """TODO: Docstring for _parse_user_posts_num.

    :response: TODO
    :returns: TODO

    """
    # Raw text looks like '发贴:(X)X.X万' ("posts: X.X万", 万 = 10,000); strip the '发贴:' prefix and the trailing character.
    num = Selector(response).css('.userinfo_userdata span:nth-child(4)::text').extract_first()[3:-1]
    logging.debug('posts num: %s' % (num))
    if num:
        return num if num.find('.') != -1 else float(num) * 10000
    else:
        return 0
Example #25
Source File: comment.py From tieba-crawler with MIT License | 5 votes |
def parse(self, response):
    """TODO: Docstring for parse.
    :response: TODO
    :returns: TODO

    """
    replies_sel = Selector(response).css('.lzl_single_post')
    for sel in replies_sel:
        item = Comment()
        item['body'] = ''.join(sel.css('.lzl_content_main::text').extract()).strip()
        comment_json_str = sel.css('::attr(data-field)').extract_first()
        comment_json = json.loads(comment_json_str)
        item['id'] = comment_json['spid']  # reuse Baidu's own id directly
        item['author_name'] = comment_json['user_name']
        item['post_time'] = self._fill_time(sel.css('.lzl_time::text').extract_first())
        item['reply_id'] = response.meta['reply_id']
        logging.debug('comment: %r' % (item))
        yield item

    logging.debug('before parsing next page if existed..')
    meta = response.meta
    next_page = self._get_next_page(response)
    if next_page > meta['cur_page']:
        # meta.reply_id  meta.post_id
        yield Request(self.request_url_tmpl % (meta['post_id'], meta['reply_id'], next_page),
                      callback=self.parse,
                      meta={'post_id': meta['post_id'],
                            'reply_id': meta['reply_id'],
                            'cur_page': next_page})
        # tid is the id of the main post, pid is the id of the reply
Example #26
Source File: member.py From tieba-crawler with MIT License | 5 votes |
def empty_page(self, response):
    """TODO: Docstring for empty_page.
    :response: TODO
    :returns: TODO

    """
    return len(Selector(response).css('.user_name').extract()) == 0
Example #27
Source File: member.py From tieba-crawler with MIT License | 5 votes |
def next_page(self, response):
    """TODO: Docstring for next_page.
    :response: TODO
    :returns: TODO

    """
    return 'http://tieba.baidu.com' + Selector(response).css('.next_page::attr(href)').extract_first()
Example #28
Source File: follow.py From tieba-crawler with MIT License | 5 votes |
def parse_page(self, response):
    """todo: docstring for parse_page.
    :response: todo
    :returns: todo

    """
    for sel in Selector(response).css('.user'):
        item = Follow()
        item['name'] = sel.css('.name a::text').extract_first()
        item['baidu_id'] = sel.css('::attr(portrait)').extract_first()
        item['user_name_following'] = response.meta['row'][1]  # the user who follows this account
        yield item
Example #29
Source File: stock_forecast_spider.py From fooltrader with MIT License | 4 votes |
def download_forecast_data(self, response):
    security_item = response.meta['item']
    trs = response.xpath('//*[@id="dataTable"]//tr').extract()

    forecast_jsons = []

    try:
        for tr in trs[1:]:
            tds = Selector(text=tr).xpath('//td//text()').extract()
            tds = [x.strip() for x in tds if x.strip()]

            # Convert the performance-change string to floats (a '~' separates a range; values end with '%').
            change_str = tds[7]
            change_start = None
            if '~' in change_str:
                i = change_str.index('~')
                change_start = change_str[0:i]
                change = change_str[i + 1:]
            else:
                change = change_str

            if change:
                change = change.strip('%')
                change = float(change) / 100
            if change_start:
                change_start = change_start.strip('%')
                change_start = float(change_start) / 100

            # preEPS may be missing.
            preEPS = None
            try:
                preEPS = float(tds[6])
            except Exception as e:
                pass

            json_item = {"securityId": security_item['id'],
                         "timestamp": tds[3],
                         "reportPeriod": tds[4],
                         "type": tds[2],
                         "description": tds[5],
                         "preEPS": preEPS,
                         "changeStart": change_start,
                         "change": change,
                         }
            forecast_jsons.append(json_item)

        if forecast_jsons:
            df = pd.DataFrame(forecast_jsons)
            df = df.drop_duplicates()
            df = df.loc[:, EVENT_STOCK_FINANCE_FORECAST_COL]
            df = index_df_with_time(df)
            df.to_csv(get_finance_forecast_event_path(security_item), index=False)
    except Exception as e:
        self.logger.exception('error when getting k data url={} error={}'.format(response.url, e))