Python scrapy.Selector() Examples

The following are 30 code examples of scrapy.Selector(). Each example is taken from the open-source project and source file named above it. You may also want to check out the other available functions and classes of the scrapy module.
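Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of the API most of them rely on: a Selector can be built from a Response or from a raw text= string, queried with .xpath() or .css(), and .extract() returns a list of strings while .extract_first() returns a single string or None.

from scrapy import Selector

# Minimal illustration using only the public Selector API; the HTML is invented.
html = '<html><body><ul><li class="item">foo</li><li class="item">bar</li></ul></body></html>'
sel = Selector(text=html)

print(sel.xpath('//li[@class="item"]/text()').extract())  # ['foo', 'bar']
print(sel.css('li.item::text').extract_first())           # 'foo'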
Example #1
Source File: proxylistplus.py    From IPProxyTool with MIT License
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//tr[@class="cells"]').extract()
        for i, info in enumerate(infos):
            self.log(info)
            val = Selector(text = info)

            ip = val.xpath('//td[2]/text()').extract_first()
            port = val.xpath('//td[3]/text()').extract_first()
            country = val.xpath('//td[5]/text()').extract_first()
            anonymity = val.xpath('//td[4]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy) 
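The example above extracts each tr to an HTML string and re-parses it with Selector(text = info). A rough equivalent, sketched below with the same (assumed) column positions, iterates the row selectors directly and uses relative XPath (a leading ./) so no second parse is needed; in recent Scrapy versions .get()/.getall() are also available as aliases for .extract_first()/.extract().

def parse_page(self, response):
    # Sketch only: './td[...]' is evaluated relative to the current <tr>,
    # so the Selector(text=...) round-trip from the original is unnecessary.
    for row in response.xpath('//tr[@class="cells"]'):
        ip = row.xpath('./td[2]/text()').extract_first()
        port = row.xpath('./td[3]/text()').extract_first()
        # ... build and store the Proxy item exactly as in the original example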
Example #2
Source File: hidemy.py    From IPProxyTool with MIT License
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//tbody/tr').extract()
        for i, info in enumerate(infos):
            if i == 0:
                continue

            val = Selector(text = info)
            ip = val.xpath('//td[1]/text()').extract_first()
            port = val.xpath('//td[2]/text()').extract_first()
            country = val.xpath('//td[3]/div/text()').extract_first()
            anonymity = val.xpath('//td[6]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy) 
Example #3
Source File: city.py    From openslack-crawler with Apache License 2.0
def parse(self, response):
        item = DmozItem()
        sel = scrapy.Selector(response)
        conn = pymssql.connect(host="121.42.136.4", user="sa", password="koala19920716!@#", database="test")
        cursor = conn.cursor()
        sites = sel.xpath("//dl[@id='clist']/dd/a/text()").extract()
        item['title'] = [n.encode('utf-8') for n in sites]
        yield item
        # sql = "select ID,CityName from Cities"
        # cursor.execute(sql)
        # for (ID,CityName) in cursor.fetchall():
        #     print ID
        for name in item['title']:
            # print name
            sql = "Insert into Cities(CityName)values('" + name + "')"
            cursor.execute(sql)
            conn.commit() 
Example #4
Source File: main.py    From python-examples with MIT License
def parse(self, response):
        print('url:', response.url)

        body = response.body.replace(b'<<+', b'&lt;&lt;+').replace(b'<+', b'&lt;+')
            
        selector = scrapy.Selector(text=body.decode('utf-8'))

        i = 1
        for x  in selector.css('.elem::text').extract():
            if 'Elements' in x:
                print('---', i, '---')
                i += 1
            else:
                print(x)

# --- it runs without project and saves in `output.csv` --- 
Example #5
Source File: spider.py    From SinaWeiboSpider with MIT License
def parse_user_1(self, response):
        """ 抓取个人信息2 """
        user_item = response.meta["item"]
        selector = Selector(response)
        text1 = ";".join(selector.xpath('body/div[@class="c"]/text()').extract())  # 获取标签里的所有text()

        nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1)  # 昵称
        intro = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', text1)  # 简介
        auth = re.findall(u'\u8ba4\u8bc1[:|\uff1a](.*?);', text1)  # 认证信息

        gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', text1)  # 性别
        place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', text1)  # 地区(包括省份和城市)
        birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', text1)  # 生日
        sexorientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', text1)  # 性取向
        marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', text1)  # 婚姻状况
        url = re.findall(u'\u4e92\u8054\u7f51[:|\uff1a](.*?);', text1)  # 首页链接

        if nickname:
            user_item["nickname"] = nickname[0]
        if auth:
            user_item["auth"] = auth[0]
        if intro:
            user_item["intro"] = intro[0]
        user_item['t'] = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        yield user_item 
Example #6
Source File: spider.py    From SinaWeiboSpider with MIT License
def parse_user_0(self, response):
        """ 抓取个人信息-第一部分:微博数、关注数、粉丝数 """
        user_item = UserItem()
        selector = Selector(response)
        text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
        if text0:
            num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0)  # number of weibo posts
            num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)  # number of accounts followed
            num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)  # number of followers
            if num_tweets:
                user_item["ctweets"] = int(num_tweets[0])
            if num_follows:
                user_item["cfollows"] = int(num_follows[0])
            if num_fans:
                user_item["cfans"] = int(num_fans[0])
            user_item["_id"] = response.meta["user_id"]
            url_information1 = "http://weibo.cn/%s/info" % response.meta["user_id"]
            yield Request(url=url_information1, meta={"item": user_item}, callback=self.parse_user_1) 
Example #7
Source File: sp500_spider.py    From fooltrader with MIT License
def download_sp500_price(self, response):
        trs = response.xpath('//*[@id="datatable"]/tr').extract()

        price_jsons = []

        try:
            for tr in trs[1:]:
                tds = Selector(text=tr).xpath('//td//text()').extract()
                tds = [x.strip() for x in tds if x.strip()]

                price_jsons.append({"timestamp": to_time_str(tds[0]),
                                    "close": to_float(tds[1])})

            if price_jsons:
                self.df_close = self.df_close.append(price_jsons, ignore_index=True)
                self.df_close = index_df_with_time(self.df_close)
        except Exception as e:
            self.logger.exception('error when getting sp500 price url={} error={}'.format(response.url, e)) 
Example #8
Source File: shell.py    From learn_python3_spider with MIT License
def get_help(self):
        b = []
        b.append("Available Scrapy objects:")
        b.append("  scrapy     scrapy module (contains scrapy.Request, scrapy.Selector, etc)")
        for k, v in sorted(self.vars.items()):
            if self._is_relevant(v):
                b.append("  %-10s %s" % (k, v))
        b.append("Useful shortcuts:")
        if self.inthread:
            b.append("  fetch(url[, redirect=True]) "
                     "Fetch URL and update local objects "
                     "(by default, redirects are followed)")
            b.append("  fetch(req)                  "
                     "Fetch a scrapy.Request and update local objects ")
        b.append("  shelp()           Shell help (print this help)")
        b.append("  view(response)    View response in a browser")

        return "\n".join("[s] %s" % l for l in b) 
Example #9
Source File: shell.py    From learn_python3_spider with MIT License
def get_help(self):
        b = []
        b.append("Available Scrapy objects:")
        b.append("  scrapy     scrapy module (contains scrapy.Request, scrapy.Selector, etc)")
        for k, v in sorted(self.vars.items()):
            if self._is_relevant(v):
                b.append("  %-10s %s" % (k, v))
        b.append("Useful shortcuts:")
        if self.inthread:
            b.append("  fetch(url[, redirect=True]) "
                     "Fetch URL and update local objects "
                     "(by default, redirects are followed)")
            b.append("  fetch(req)                  "
                     "Fetch a scrapy.Request and update local objects ")
        b.append("  shelp()           Shell help (print this help)")
        b.append("  view(response)    View response in a browser")

        return "\n".join("[s] %s" % l for l in b) 
Example #10
Source File: comment.py    From tieba-crawler with MIT License
def _get_next_page(self, response):
        """TODO: Docstring for _parse_next_page.

        :response: TODO
        :returns: TODO

        """
        #logging.debug('beginning parsing next page if existed..')
        meta = response.meta
        anchor_sels = Selector(response).css('.j_pager a')
        next_page = 1
        #logging.debug('anchor selectors: %r' % (anchor_sels))
        for sel in anchor_sels:
            #logging.debug('pager anchor text: ' % (sel.css('::text').extract_first()))
            if sel.css('::text').extract_first() == '下一页':  # '下一页' = "next page"
                next_page = sel.css('::attr(href)').extract_first()[1:]
                logging.debug('next page num: %s' % (next_page))

        return int(next_page) 
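Several of the tieba-crawler examples above and below combine CSS selectors with the ::text and ::attr(...) pseudo-elements. A small self-contained sketch of that pattern on an inline HTML string (the markup here is invented for illustration):

from scrapy import Selector

# Standalone illustration of ::text and ::attr(href), as used in the pager example above.
pager = Selector(text='<div class="j_pager"><a href="?pn=2">下一页</a></div>')
for a in pager.css('.j_pager a'):
    if a.css('::text').extract_first() == '下一页':  # '下一页' = "next page"
        print(a.css('::attr(href)').extract_first())  # '?pn=2'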
Example #11
Source File: data5u.py    From IPProxyTool with MIT License
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//ul[@class="l2"]').extract()
        for i, info in enumerate(infos):
            val = Selector(text = info)
            ip = val.xpath('//ul[@class="l2"]/span[1]/li/text()').extract_first()
            port = val.xpath('//ul[@class="l2"]/span[2]/li/text()').extract_first()
            anonymity = val.xpath('//ul[@class="l2"]/span[3]/li/text()').extract_first()
            https = val.xpath('//ul[@class="l2"]/span[4]/li/text()').extract_first()
            country = val.xpath('//ul[@class="l2"]/span[5]/li/a/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy) 
Example #12
Source File: reply.py    From tieba-crawler with MIT License
def _parse_general_post(self, post, response):
        """TODO: Docstring for _parse_general_post.

        :post: TODO
        :response: TODO
        :returns: TODO

        """
        item = Reply()
        # join the post body text fragments into one string
        item['body'] = ''.join(post.css('cc div::text').extract()).strip()
        item['title'] = Selector(response).css('.core_title_txt::text').extract_first()
        item['post_time'] = json.loads(
            post
            .css('::attr(data-field)')
            .extract_first()
        )['content']['date']

        return item
Example #13
Source File: reply.py    From tieba-crawler with MIT License
def parse(self, response):
        """TODO: Docstring for parse.
        :returns: TODO

        """
        posts = Selector(response).css('.p_postlist .l_post')

        for i, post in enumerate(posts):
            if i == 0:
                yield self._parse_main_post(post, response)
            else:
                item = self._parse_reply(post, response)
                yield item

                if item['reply_num'] != 0:  # number of comments
                    self._parse_comments(post) 
Example #14
Source File: user.py    From tieba-crawler with MIT License
def _parse_user_id(self, response):
        """TODO: Docstring for _parse_user_id.

        :response: TODO
        :returns: 32-digit hex user id

        """

        uri = Selector(response).css('.concern_num a::attr(href)').extract_first()
        logging.debug('user id href: %s' % (uri))
        if uri:
            query_dict = parse_qs(urlparse(uri).query)
            # uri maybe this: /home/concern?id=a3e3474fbda1bfb5bfecc0d6d121?t=1423636759&fr=home
            return query_dict['id'][0]
        else:
            return '' 
Example #15
Source File: user.py    From tieba-crawler with MIT License
def _parse_following_and_followed(self, response, item):
        """TODO: Docstring for _parse_following_and_followed.

        :response: TODO
        :item: item.following_num item.followed_num
        :returns: TODO

        """
        sels = Selector(response).css('.ihome_aside_title')
        # some users follow nobody or have no followers, so default both counts to 0
        item['following_num'] = 0
        item['followed_num'] = 0
        for sel in sels:
            title = sel.css('::text').extract_first().strip()  # the first text() is a heading such as '他关注的人' ("people he follows") or something irrelevant
            #logging.debug('title: %s' % (title))
            if title == '他关注的人' or title == '她关注的人':
                item['following_num'] = sel.css('a::text').extract_first()
            elif title == '关注他的人' or title == '关注她的人':
                item['followed_num'] = sel.css('a::text').extract_first()

        return item
Example #16
Source File: stock_finance_report_event_spider.py    From fooltrader with MIT License
def download_fi_report_event_data(self, response):
        security_item = response.meta['item']
        period_type = response.meta['period_type']

        path = get_finance_report_event_path(security_item)

        df = pd.DataFrame()

        try:
            report_timestamps = response.xpath('//*[@id="con02-7"]/table[2]/tr/td[2]//ul/text()').extract()
            report_timestamps = [date.strip() for date in report_timestamps if date.strip()]

            report_contents = response.xpath('//*[@id="con02-7"]/table[2]/tr/td[2]//ul//a').extract()

            for i, tr in enumerate(report_contents):
                href = Selector(text=tr).xpath('//@href').extract()[0]
                title = Selector(text=tr).xpath('//text()').extract()[0]
                report_period = self.report_period_from_title(title, period_type, report_timestamps[i])

                df = df.append({
                    "securityId": security_item['id'],
                    "timestamp": report_timestamps[i],
                    "url": "http://vip.stock.finance.sina.com.cn" + href,
                    "title": title,
                    "reportPeriod": report_period}, ignore_index=True)
            if not df.empty:
                df = df.drop_duplicates()
                df = index_df_with_time(df)
                df.to_csv(path, index=False)
        except Exception as e:
            self.logger.exception('error when getting finance report event data url={}'.format(response.url)) 
Example #17
Source File: ip181.py    From IPProxyTool with MIT License
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//tbody/tr').extract()
        for i, info in enumerate(infos):
            if i == 0:
                continue

            val = Selector(text = info)
            ip = val.xpath('//td[1]/text()').extract_first()
            port = val.xpath('//td[2]/text()').extract_first()
            country = val.xpath('//td[6]/text()').extract_first()
            anonymity = val.xpath('//td[3]/text()').extract_first()
            https = val.xpath('//td[4]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy) 
Example #18
Source File: a51newren.py    From openslack-crawler with Apache License 2.0
def parse_item(self, response):
        sel = scrapy.Selector(response)
        print(sel.xpath("//span[@class=' user_name_show']/text()").extract()) 
Example #19
Source File: a51newren.py    From openslack-crawler with Apache License 2.0
def parse(self, response):
        sel = scrapy.Selector(response)
        item = DmozItem()
        href = str(response.url)
        hidden = sel.xpath("//input[@name='__RequestVerificationToken']/@value").extract()
        return [FormRequest.from_response(response, \
                                          formdata=self.formdata, \
                                          headers=self.headers, \
                                          meta={
                                              '__RequestVerificationToken': 'BSDY33UtJXv0XqMkIvAJXAdMXC-jqACBsiZb6-mx4uW8Hr89aArTh9DfLtQFDh6NwQsqHXiZMTzheuim3ETI78PhOzQf263wliXL8ArkTrA1'}, \
                                          callback=self.parse_item)] 
Example #20
Source File: fans.py    From tieba-crawler with MIT License
def parse_page(self, response):
        """todo: docstring for parse_page.

        :response: todo
        :returns: todo

        """
        logging.debug('fans num: %s' % (len(Selector(response).css('.user'))))
        for sel in Selector(response).css('.user'):
            item = Fan()
            item['name'] = sel.css('.name a::text').extract_first()
            item['baidu_id'] = sel.css('::attr(portrait)').extract_first()
            item['user_name_followed'] = response.meta['row'][1]  # whose fan this user is

            yield item 
Example #21
Source File: stock_kdata_sina_spider.py    From fooltrader with MIT License
def download_day_k_data(self, response):
        path = response.meta['path']
        item = response.meta['item']
        fuquan = response.meta['fuquan']
        trs = response.xpath('//*[@id="FundHoldSharesTable"]/tr[position()>1 and position()<=last()]').extract()

        try:
            if fuquan == 'hfq':
                df = pd.DataFrame(
                    columns=data_contract.KDATA_COLUMN_SINA_FQ)

            else:
                df = pd.DataFrame(
                    columns=data_contract.KDATA_COLUMN_SINA)

            for idx, tr in enumerate(trs):
                tds = Selector(text=tr).xpath('//td//text()').extract()
                tds = [x.strip() for x in tds if x.strip()]
                securityId = item['id']
                timestamp = tds[0]
                open = float(tds[1])
                high = float(tds[2])
                close = float(tds[3])
                low = float(tds[4])
                volume = tds[5]
                turnover = tds[6]
                if fuquan == 'hfq':
                    factor = tds[7]
                    df.loc[idx] = [timestamp, item['code'], low, open, close, high, volume, turnover, securityId,
                                   factor]
                else:
                    df.loc[idx] = [timestamp, item['code'], low, open, close, high, volume, turnover, securityId]
            df.to_csv(path, index=False)
        except Exception as e:
            self.logger.exception('error when getting k data url={} error={}'.format(response.url, e)) 
Example #22
Source File: stock_summary_spider.py    From fooltrader with MIT License
def download_sz_summary(self, response):
        search_date = response.meta['search_date']
        trs = response.xpath('//table/tr').extract()

        # some older data is missing fields; default them to 0.0 so the writes below never fail
        pe = tCap = mCap = turnoverRate = 0.0
        turnOver = None
        if self.security_item['id'] == 'index_sz_399106':
            for tr in trs[1:]:
                str_list = Selector(text=tr).xpath('//td//text()').extract()
                if '股票总市值' in str_list[0]:
                    tCap = to_float(str_list[1], 0.0)
                elif '股票流通市值' in str_list[0]:
                    mCap = to_float(str_list[1], 0.0)
                elif '平均市盈率' in str_list[0]:
                    pe = to_float(str_list[1], 0.0)
                elif '平均换手率' in str_list[0]:
                    turnoverRate = to_float(str_list[1], 0.0)
        else:
            for tr in trs[1:]:
                str_list = Selector(text=tr).xpath('//td//text()').extract()
                if '上市公司市价总值' in str_list[0]:
                    tCap = to_float(str_list[1], 0.0)
                elif '上市公司流通市值' in str_list[0]:
                    mCap = to_float(str_list[1], 0.0)
                elif '平均市盈率' in str_list[0]:
                    pe = to_float(str_list[1], 0.0)
                elif '总成交金额' in str_list[0]:
                    turnOver = to_float(str_list[1], 0.0)
            if turnOver:
                turnoverRate = 100 * turnOver / tCap
        self.file_lock.acquire()
        # some older data does not exist; such fields keep the 0.0 defaults set above
        self.current_df.at[search_date, 'pe'] = pe
        self.current_df.at[search_date, 'tCap'] = tCap
        self.current_df.at[search_date, 'mCap'] = mCap
        self.current_df.at[search_date, 'turnoverRate'] = turnoverRate
        self.file_lock.release() 
Example #23
Source File: user.py    From tieba-crawler with MIT License
def _parse_user_following_tieba(self, response):
        """TODO: Docstring for _parse_user_following_tieba.

        :response: TODO
        :returns: TODO

        """
        names = []
        for name in Selector(response).css('.u-f-item span:first-child::text').extract():
            names.append(name)

        return names 
Example #24
Source File: user_relation.py    From tieba-crawler with MIT License
def next_page(self, response):
        """todo: docstring for next_page.

        :response: todo
        :returns: todo

        """
        href = Selector(response).css('.next::attr(href)').extract_first()
        return 'http://tieba.baidu.com' + href if href else False 
Example #25
Source File: user.py    From tieba-crawler with MIT License
def _parse_user_posts_num(self, response):
        """TODO: Docstring for _parse_user_posts_num.

        :response: TODO
        :returns: TODO

        """
        num = Selector(response).css('.userinfo_userdata span:nth-child(4)::text').extract_first()[3:-1]  # raw text looks like 发贴:(X) or 发贴:(X.X万); 万 = 10,000
        logging.debug('posts num: %s' % (num))
        if num:
            return num if num.find('.') != -1 else float(num) * 10000
        else:
            return 0 
Example #26
Source File: comment.py    From tieba-crawler with MIT License
def parse(self, response):
        """TODO: Docstring for parse.

        :response: TODO
        :returns: TODO

        """

        replies_sel = Selector(response).css('.lzl_single_post')
        for sel in replies_sel:
            item = Comment()
            item['body'] = ''.join(sel.css('.lzl_content_main::text').extract()).strip()
            comment_json_str = sel.css('::attr(data-field)').extract_first()
            comment_json = json.loads(comment_json_str)
            item['id'] = comment_json['spid']  # use Baidu's own id directly
            item['author_name'] = comment_json['user_name']
            item['post_time'] = self._fill_time(sel.css('.lzl_time::text').extract_first())
            item['reply_id'] = response.meta['reply_id']
            logging.debug('comment: %r' % (item))
            yield item

        logging.debug('before parsing next page if existed..')
        meta = response.meta
        next_page = self._get_next_page(response)
        if next_page > meta['cur_page']:  # meta.reply_id meta.post_id
            yield Request(self.request_url_tmpl % (meta['post_id'], meta['reply_id'], next_page),
                    callback=self.parse, meta={'post_id': meta['post_id'], 'reply_id': meta['reply_id'], 'cur_page': next_page})  # tid is the main post's id, pid is the reply's id 
Example #27
Source File: member.py    From tieba-crawler with MIT License
def empty_page(self, response):
        """TODO: Docstring for empty_page.

        :response: TODO
        :returns: TODO

        """

        return len(Selector(response).css('.user_name').extract()) == 0 
Example #28
Source File: member.py    From tieba-crawler with MIT License
def next_page(self, response):
        """TODO: Docstring for next_page.

        :response: TODO
        :returns: TODO

        """
        return 'http://tieba.baidu.com' + Selector(response).css('.next_page::attr(href)').extract_first() 
Example #29
Source File: follow.py    From tieba-crawler with MIT License
def parse_page(self, response):
        """todo: docstring for parse_page.

        :response: todo
        :returns: todo

        """
        for sel in Selector(response).css('.user'):
            item = Follow()
            item['name'] = sel.css('.name a::text').extract_first()
            item['baidu_id'] = sel.css('::attr(portrait)').extract_first()
            item['user_name_following'] = response.meta['row'][1]  # the user who follows them

            yield item 
Example #30
Source File: stock_forecast_spider.py    From fooltrader with MIT License
def download_forecast_data(self, response):
        security_item = response.meta['item']
        trs = response.xpath('//*[@id="dataTable"]//tr').extract()

        forecast_jsons = []

        try:
            for tr in trs[1:]:
                tds = Selector(text=tr).xpath('//td//text()').extract()
                tds = [x.strip() for x in tds if x.strip()]

                # convert the earnings-change string to a float
                change_str = tds[7]
                change_start = None

                if '~' in change_str:
                    i = change_str.index('~')
                    change_start = change_str[0:i]
                    change = change_str[i + 1:]
                else:
                    change = change_str

                if change:
                    change = change.strip('%')
                    change = float(change) / 100
                if change_start:
                    change_start = change_start.strip('%')
                    change_start = float(change_start) / 100

                # preEPS may be empty
                preEPS = None
                try:
                    preEPS = float(tds[6])
                except Exception as e:
                    pass

                json_item = {"securityId": security_item['id'],
                             "timestamp": tds[3],
                             "reportPeriod": tds[4],
                             "type": tds[2],
                             "description": tds[5],
                             "preEPS": preEPS,
                             "changeStart": change_start,
                             "change": change,
                             }
                forecast_jsons.append(json_item)

            if forecast_jsons:
                df = pd.DataFrame(forecast_jsons)
                df = df.drop_duplicates()
                df = df.loc[:, EVENT_STOCK_FINANCE_FORECAST_COL]  # select the expected columns
                df = index_df_with_time(df)
                df.to_csv(get_finance_forecast_event_path(security_item), index=False)


        except Exception as e:
            self.logger.exception('error when getting forecast data url={} error={}'.format(response.url, e))