Python scrapy.Selector() Examples
The following are 29 code examples of scrapy.Selector(), collected from open-source projects. Each example is listed with its source file, project, license, and user vote count. You may also want to check out all available functions and classes of the scrapy module.
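Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing the two common ways a Selector is built and queried; the HTML string is made up for illustration:

from scrapy import Selector

# A hypothetical HTML fragment, used only to illustrate the API.
html = '<html><body><p class="msg">Hello</p><p class="msg">World</p></body></html>'
sel = Selector(text=html)

# XPath and CSS queries both return a SelectorList.
print(sel.xpath('//p[@class="msg"]/text()').extract())   # ['Hello', 'World']
print(sel.css('p.msg::text').extract_first())             # 'Hello'

# Inside a spider callback the selector is usually built from the response instead:
#     sel = Selector(response)
# or the response's own shortcuts response.xpath(...) / response.css(...) are used,
# which expose the same API.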
Example #1
Source File: proxylistplus.py From IPProxyTool with MIT License | 7 votes |
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//tr[@class="cells"]').extract()
    for i, info in enumerate(infos):
        self.log(info)
        val = Selector(text=info)
        ip = val.xpath('//td[2]/text()').extract_first()
        port = val.xpath('//td[3]/text()').extract_first()
        country = val.xpath('//td[5]/text()').extract_first()
        anonymity = val.xpath('//td[4]/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )

        self.add_proxy(proxy=proxy)
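A pattern worth noting here, since it recurs in most of the proxy- and stock-data examples on this page: each table row is extracted as an HTML string and re-wrapped in a fresh Selector(text=...), so the row-relative //td[...] queries cannot match cells from other rows. A minimal sketch of the idea (the table HTML below is made up for illustration):

from scrapy import Selector

html = ('<table>'
        '<tr><td>1.2.3.4</td><td>8080</td></tr>'
        '<tr><td>5.6.7.8</td><td>3128</td></tr>'
        '</table>')

for row in Selector(text=html).xpath('//tr').extract():
    row_sel = Selector(text=row)   # re-parse just this row
    ip = row_sel.xpath('//td[1]/text()').extract_first()
    port = row_sel.xpath('//td[2]/text()').extract_first()
    print(ip, port)                # 1.2.3.4 8080 / 5.6.7.8 3128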
Example #2
Source File: hidemy.py From IPProxyTool with MIT License | 6 votes |
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//tbody/tr').extract()
    for i, info in enumerate(infos):
        if i == 0:
            continue

        val = Selector(text=info)
        ip = val.xpath('//td[1]/text()').extract_first()
        port = val.xpath('//td[2]/text()').extract_first()
        country = val.xpath('//td[3]/div/text()').extract_first()
        anonymity = val.xpath('//td[6]/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )

        self.add_proxy(proxy=proxy)
Example #3
Source File: city.py From openslack-crawler with Apache License 2.0 | 6 votes |
def parse(self, response):
    item = DmozItem()
    sel = scrapy.Selector(response)
    conn = pymssql.connect(host="121.42.136.4", user="sa", password="koala19920716!@#", database="test")
    cursor = conn.cursor()
    sites = sel.xpath("//dl[@id='clist']/dd/a/text()").extract()
    item['title'] = [n.encode('utf-8') for n in sites]
    yield item
    # sql = "select ID,CityName from Cities"
    # cursor.execute(sql)
    # for (ID,CityName) in cursor.fetchall():
    #     print ID
    for name in item['title']:
        # print name
        sql = "Insert into Cities(CityName)values('" + name + "')"
        cursor.execute(sql)
        conn.commit()
Example #4
Source File: main.py From python-examples with MIT License | 6 votes |
def parse(self, response):
    print('url:', response.url)

    body = response.body.replace(b'<<+', b'<<+').replace(b'<+', b'<+')

    selector = scrapy.Selector(text=body.decode('utf-8'))

    i = 1
    for x in selector.css('.elem::text').extract():
        if 'Elements' in x:
            print('---', i, '---')
            i += 1
        else:
            print(x)

# --- it runs without project and saves in `output.csv` ---
Example #5
Source File: spider.py From SinaWeiboSpider with MIT License | 6 votes |
def parse_user_1(self, response):
    """Scrape personal information, part 2."""
    user_item = response.meta["item"]
    selector = Selector(response)
    # Join all text() nodes inside the tag into one string.
    text1 = ";".join(selector.xpath('body/div[@class="c"]/text()').extract())

    nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1)              # nickname
    intro = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', text1)                 # bio
    auth = re.findall(u'\u8ba4\u8bc1[:|\uff1a](.*?);', text1)                  # verification info
    gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', text1)                # gender
    place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', text1)                 # region (province and city)
    birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', text1)              # birthday
    sexorientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', text1)  # sexual orientation
    marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', text1)  # relationship status
    url = re.findall(u'\u4e92\u8054\u7f51[:|\uff1a](.*?);', text1)             # homepage link

    if nickname:
        user_item["nickname"] = nickname[0]
    if auth:
        user_item["auth"] = auth[0]
    if intro:
        user_item["intro"] = intro[0]
    user_item['t'] = time.strftime('%Y-%m-%d', time.localtime(time.time()))

    yield user_item
Example #6
Source File: spider.py From SinaWeiboSpider with MIT License | 6 votes |
def parse_user_0(self, response):
    """Scrape personal information, part 1: tweet, follow, and fan counts."""
    user_item = UserItem()
    selector = Selector(response)
    text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()

    if text0:
        num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0)    # number of tweets
        num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0)   # number of follows
        num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0)      # number of fans

        if num_tweets:
            user_item["ctweets"] = int(num_tweets[0])
        if num_follows:
            user_item["cfollows"] = int(num_follows[0])
        if num_fans:
            user_item["cfans"] = int(num_fans[0])

        user_item["_id"] = response.meta["user_id"]
        url_information1 = "http://weibo.cn/%s/info" % response.meta["user_id"]
        yield Request(url=url_information1, meta={"item": user_item},
                      callback=self.parse_user_1)
Example #7
Source File: sp500_spider.py From fooltrader with MIT License | 6 votes |
def download_sp500_price(self, response):
    trs = response.xpath('//*[@id="datatable"]/tr').extract()

    price_jsons = []

    try:
        for tr in trs[1:]:
            tds = Selector(text=tr).xpath('//td//text()').extract()
            tds = [x.strip() for x in tds if x.strip()]
            price_jsons.append({"timestamp": to_time_str(tds[0]),
                                "close": to_float(tds[1])})

        if price_jsons:
            self.df_close = self.df_close.append(price_jsons, ignore_index=True)
            self.df_close = index_df_with_time(self.df_close)

    except Exception as e:
        self.logger.exception('error when getting sp500 price url={} error={}'.format(response.url, e))
Example #8
Source File: shell.py From learn_python3_spider with MIT License | 6 votes |
def get_help(self):
    b = []
    b.append("Available Scrapy objects:")
    b.append("  scrapy     scrapy module (contains scrapy.Request, scrapy.Selector, etc)")
    for k, v in sorted(self.vars.items()):
        if self._is_relevant(v):
            b.append("  %-10s %s" % (k, v))
    b.append("Useful shortcuts:")
    if self.inthread:
        b.append("  fetch(url[, redirect=True]) "
                 "Fetch URL and update local objects "
                 "(by default, redirects are followed)")
    b.append("  fetch(req)                  "
             "Fetch a scrapy.Request and update local objects ")
    b.append("  shelp()           Shell help (print this help)")
    b.append("  view(response)    View response in a browser")

    return "\n".join("[s] %s" % l for l in b)
Example #9
Source File: comment.py From tieba-crawler with MIT License | 6 votes |
def _get_next_page(self, response):
    """TODO: Docstring for _parse_next_page.

    :response: TODO
    :returns: TODO

    """
    #logging.debug('beginning parsing next page if existed..')
    meta = response.meta
    anchor_sels = Selector(response).css('.j_pager a')
    next_page = 1
    #logging.debug('anchor selectors: %r' % (anchor_sels))
    for sel in anchor_sels:
        #logging.debug('pager anchor text: ' % (sel.css('::text').extract_first()))
        if sel.css('::text').extract_first() == '下一页':  # '下一页' means "next page"
            next_page = sel.css('::attr(href)').extract_first()[1:]
            logging.debug('next page num: %s' % (next_page))
    return int(next_page)
Example #10
Source File: data5u.py From IPProxyTool with MIT License | 6 votes |
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//ul[@class="l2"]').extract()
    for i, info in enumerate(infos):
        val = Selector(text=info)
        ip = val.xpath('//ul[@class="l2"]/span[1]/li/text()').extract_first()
        port = val.xpath('//ul[@class="l2"]/span[2]/li/text()').extract_first()
        anonymity = val.xpath('//ul[@class="l2"]/span[3]/li/text()').extract_first()
        https = val.xpath('//ul[@class="l2"]/span[4]/li/text()').extract_first()
        country = val.xpath('//ul[@class="l2"]/span[5]/li/a/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )

        self.add_proxy(proxy=proxy)
Example #11
Source File: reply.py From tieba-crawler with MIT License | 6 votes |
def _parse_general_post(self, post, response):
    """TODO: Docstring for _parse_general_post.

    :post: TODO
    :response: TODO
    :returns: TODO

    """
    item = Reply()
    # Concatenate the text fragments of the post body.
    item['body'] = ''.join(post.css('cc div::text').extract()).strip()
    item['title'] = Selector(response).css('.core_title_txt::text').extract_first()
    item['post_time'] = json.loads(
        post
        .css('::attr(data-field)')
        .extract_first()
    )['content']['date']
    return item
Example #12
Source File: reply.py From tieba-crawler with MIT License | 6 votes |
def parse(self, response):
    """TODO: Docstring for parse.
    :returns: TODO

    """
    posts = Selector(response).css('.p_postlist .l_post')
    for i, post in enumerate(posts):
        if i == 0:
            yield self._parse_main_post(post, response)
        else:
            item = self._parse_reply(post, response)
            yield item
            if item['reply_num'] != 0:  # number of comments on this reply
                self._parse_comments(post)
Example #13
Source File: user.py From tieba-crawler with MIT License | 6 votes |
def _parse_user_id(self, response):
    """TODO: Docstring for _parse_user_id.

    :response: TODO
    :returns: 32 digits user id hex

    """
    uri = Selector(response).css('.concern_num a::attr(href)').extract_first()
    logging.debug('user id href: %s' % (uri))
    if uri:
        # uri maybe this: /home/concern?id=a3e3474fbda1bfb5bfecc0d6d121?t=1423636759&fr=home
        query_dict = parse_qs(urlparse(uri).query)
        return query_dict['id'][0]
    else:
        return ''
Example #14
Source File: user.py From tieba-crawler with MIT License | 6 votes |
def _parse_following_and_followed(self, response, item):
    """TODO: Docstring for _parse_following_and_followed.

    :response: TODO
    :item: item.following_num item.followed_num
    :returns: TODO

    """
    sels = Selector(response).css('.ihome_aside_title')
    for sel in sels:
        # The first text() is '他关注的人' ("people he follows") or other irrelevant info.
        title = sel.css('::text').extract_first().strip()
        #logging.debug('title: %s' % (title))
        # Some users follow no one or have no followers.
        if title == '他关注的人' or title == '她关注的人':  # people he/she follows
            item['following_num'] = sel.css('a::text').extract_first()
        else:
            item['following_num'] = 0
        if title == '关注他的人' or title == '关注她的人':  # people following him/her
            item['followed_num'] = sel.css('a::text').extract_first()
        else:
            item['followed_num'] = 0
    return item
Example #15
Source File: stock_finance_report_event_spider.py From fooltrader with MIT License | 5 votes |
def download_fi_report_event_data(self, response):
    security_item = response.meta['item']
    period_type = response.meta['period_type']
    path = get_finance_report_event_path(security_item)

    df = pd.DataFrame()

    try:
        report_timestamps = response.xpath('//*[@id="con02-7"]/table[2]/tr/td[2]//ul/text()').extract()
        report_timestamps = [date.strip() for date in report_timestamps if date.strip()]
        report_contents = response.xpath('//*[@id="con02-7"]/table[2]/tr/td[2]//ul//a').extract()
        for i, tr in enumerate(report_contents):
            href = Selector(text=tr).xpath('//@href').extract()[0]
            title = Selector(text=tr).xpath('//text()').extract()[0]
            report_period = self.report_period_from_title(title, period_type, report_timestamps[i])

            df = df.append({
                "securityId": security_item['id'],
                "timestamp": report_timestamps[i],
                "url": "http://vip.stock.finance.sina.com.cn" + href,
                "title": title,
                "reportPeriod": report_period}, ignore_index=True)

        if not df.empty:
            df = df.drop_duplicates()
            df = index_df_with_time(df)
            df.to_csv(path, index=False)
    except Exception as e:
        self.logger.exception('error when getting k data url={}'.format(response.url))
Example #16
Source File: ip181.py From IPProxyTool with MIT License | 5 votes |
def parse_page(self, response):
    self.write(response.body)
    sel = Selector(response)
    infos = sel.xpath('//tbody/tr').extract()
    for i, info in enumerate(infos):
        if i == 0:
            continue

        val = Selector(text=info)
        ip = val.xpath('//td[1]/text()').extract_first()
        port = val.xpath('//td[2]/text()').extract_first()
        country = val.xpath('//td[6]/text()').extract_first()
        anonymity = val.xpath('//td[3]/text()').extract_first()
        https = val.xpath('//td[4]/text()').extract_first()

        proxy = Proxy()
        proxy.set_value(
            ip=ip,
            port=port,
            country=country,
            anonymity=anonymity,
            source=self.name,
        )

        self.add_proxy(proxy=proxy)
Example #17
Source File: a51newren.py From openslack-crawler with Apache License 2.0 | 5 votes |
def parse_item(self, response):
    sel = scrapy.Selector(response)
    print(sel.xpath("//span[@class=' user_name_show']/text()").extract())
Example #18
Source File: a51newren.py From openslack-crawler with Apache License 2.0 | 5 votes |
def parse(self, response):
    sel = scrapy.Selector(response)
    item = DmozItem()
    sel = scrapy.Selector(response)
    href = str(response.url)
    hidden = sel.xpath("//input[@name='__RequestVerificationToken']/@value").extract()
    return [FormRequest.from_response(
        response,
        formdata=self.formdata,
        headers=self.headers,
        meta={'__RequestVerificationToken': 'BSDY33UtJXv0XqMkIvAJXAdMXC-jqACBsiZb6-mx4uW8Hr89aArTh9DfLtQFDh6NwQsqHXiZMTzheuim3ETI78PhOzQf263wliXL8ArkTrA1'},
        callback=self.parse_item)]
Example #19
Source File: fans.py From tieba-crawler with MIT License | 5 votes |
def parse_page(self, response):
    """todo: docstring for parse_page.
    :response: todo
    :returns: todo

    """
    logging.debug('fans num: %s' % (len(Selector(response).css('.user'))))
    for sel in Selector(response).css('.user'):
        item = Fan()
        item['name'] = sel.css('.name a::text').extract_first()
        item['baidu_id'] = sel.css('::attr(portrait)').extract_first()
        item['user_name_followed'] = response.meta['row'][1]  # the user whose fan this account is
        yield item
Example #20
Source File: stock_kdata_sina_spider.py From fooltrader with MIT License | 5 votes |
def download_day_k_data(self, response):
    path = response.meta['path']
    item = response.meta['item']
    fuquan = response.meta['fuquan']
    trs = response.xpath('//*[@id="FundHoldSharesTable"]/tr[position()>1 and position()<=last()]').extract()

    try:
        if fuquan == 'hfq':
            df = pd.DataFrame(columns=data_contract.KDATA_COLUMN_SINA_FQ)
        else:
            df = pd.DataFrame(columns=data_contract.KDATA_COLUMN_SINA)

        for idx, tr in enumerate(trs):
            tds = Selector(text=tr).xpath('//td//text()').extract()
            tds = [x.strip() for x in tds if x.strip()]

            securityId = item['id']
            timestamp = tds[0]
            open = float(tds[1])
            high = float(tds[2])
            close = float(tds[3])
            low = float(tds[4])
            volume = tds[5]
            turnover = tds[6]
            if fuquan == 'hfq':
                factor = tds[7]
                df.loc[idx] = [timestamp, item['code'], low, open, close, high, volume,
                               turnover, securityId, factor]
            else:
                df.loc[idx] = [timestamp, item['code'], low, open, close, high, volume,
                               turnover, securityId]
        df.to_csv(path, index=False)
    except Exception as e:
        self.logger.exception('error when getting k data url={} error={}'.format(response.url, e))
Example #21
Source File: stock_summary_spider.py From fooltrader with MIT License | 5 votes |
def download_sz_summary(self, response):
    search_date = response.meta['search_date']
    trs = response.xpath('//table/tr').extract()

    turnOver = None

    if self.security_item['id'] == 'index_sz_399106':
        for tr in trs[1:]:
            str_list = Selector(text=tr).xpath('//td//text()').extract()
            if '股票总市值' in str_list[0]:        # total market cap of stocks
                tCap = to_float(str_list[1], 0.0)
            elif '股票流通市值' in str_list[0]:    # tradable market cap of stocks
                mCap = to_float(str_list[1], 0.0)
            elif '平均市盈率' in str_list[0]:      # average P/E ratio
                pe = to_float(str_list[1], 0.0)
            elif '平均换手率' in str_list[0]:      # average turnover rate
                turnoverRate = to_float(str_list[1], 0.0)
    else:
        for tr in trs[1:]:
            str_list = Selector(text=tr).xpath('//td//text()').extract()
            if '上市公司市价总值' in str_list[0]:      # total market cap of listed companies
                tCap = to_float(str_list[1], 0.0)
            elif '上市公司流通市值' in str_list[0]:    # tradable market cap of listed companies
                mCap = to_float(str_list[1], 0.0)
            elif '平均市盈率' in str_list[0]:          # average P/E ratio
                pe = to_float(str_list[1], 0.0)
            elif '总成交金额' in str_list[0]:          # total turnover amount
                turnOver = to_float(str_list[1], 0.0)

    if turnOver:
        turnoverRate = 100 * turnOver / tCap

    self.file_lock.acquire()
    # Some older data is missing; it defaults to 0.0.
    self.current_df.at[search_date, 'pe'] = pe
    self.current_df.at[search_date, 'tCap'] = tCap
    self.current_df.at[search_date, 'mCap'] = mCap
    self.current_df.at[search_date, 'turnoverRate'] = turnoverRate
    self.file_lock.release()
Example #22
Source File: user.py From tieba-crawler with MIT License | 5 votes |
def _parse_user_following_tieba(self, response):
    """TODO: Docstring for _parse_user_following_tieba.

    :response: TODO
    :returns: TODO

    """
    names = []
    for name in Selector(response).css('.u-f-item span:first-child::text').extract():
        names.append(name)
    return names
Example #23
Source File: user_relation.py From tieba-crawler with MIT License | 5 votes |
def next_page(self, response):
    """todo: docstring for next_page.
    :response: todo
    :returns: todo

    """
    href = Selector(response).css('.next::attr(href)').extract_first()
    return 'http://tieba.baidu.com' + href if href else False
Example #24
Source File: user.py From tieba-crawler with MIT License | 5 votes |
def _parse_user_posts_num(self, response):
    """TODO: Docstring for _parse_user_posts_num.

    :response: TODO
    :returns: TODO

    """
    # Raw text looks like '发贴:(X)X.X万' ("posts: X.X万", 万 = 10,000); strip the '发贴:' prefix and the trailing character.
    num = Selector(response).css('.userinfo_userdata span:nth-child(4)::text').extract_first()[3:-1]
    logging.debug('posts num: %s' % (num))
    if num:
        return num if num.find('.') != -1 else float(num) * 10000
    else:
        return 0
Example #25
Source File: comment.py From tieba-crawler with MIT License | 5 votes |
def parse(self, response):
    """TODO: Docstring for parse.
    :response: TODO
    :returns: TODO

    """
    replies_sel = Selector(response).css('.lzl_single_post')
    for sel in replies_sel:
        item = Comment()
        item['body'] = ''.join(sel.css('.lzl_content_main::text').extract()).strip()
        comment_json_str = sel.css('::attr(data-field)').extract_first()
        comment_json = json.loads(comment_json_str)
        item['id'] = comment_json['spid']  # reuse Baidu's own id directly
        item['author_name'] = comment_json['user_name']
        item['post_time'] = self._fill_time(sel.css('.lzl_time::text').extract_first())
        item['reply_id'] = response.meta['reply_id']
        logging.debug('comment: %r' % (item))
        yield item

    logging.debug('before parsing next page if existed..')
    meta = response.meta
    next_page = self._get_next_page(response)
    if next_page > meta['cur_page']:
        # meta.reply_id  meta.post_id
        yield Request(self.request_url_tmpl % (meta['post_id'], meta['reply_id'], next_page),
                      callback=self.parse,
                      meta={'post_id': meta['post_id'],
                            'reply_id': meta['reply_id'],
                            'cur_page': next_page})
        # tid is the id of the main post, pid is the id of the reply
Example #26
Source File: member.py From tieba-crawler with MIT License | 5 votes |
def empty_page(self, response):
    """TODO: Docstring for empty_page.
    :response: TODO
    :returns: TODO

    """
    return len(Selector(response).css('.user_name').extract()) == 0
Example #27
Source File: member.py From tieba-crawler with MIT License | 5 votes |
def next_page(self, response):
    """TODO: Docstring for next_page.
    :response: TODO
    :returns: TODO

    """
    return 'http://tieba.baidu.com' + Selector(response).css('.next_page::attr(href)').extract_first()
Example #28
Source File: follow.py From tieba-crawler with MIT License | 5 votes |
def parse_page(self, response):
    """todo: docstring for parse_page.
    :response: todo
    :returns: todo

    """
    for sel in Selector(response).css('.user'):
        item = Follow()
        item['name'] = sel.css('.name a::text').extract_first()
        item['baidu_id'] = sel.css('::attr(portrait)').extract_first()
        item['user_name_following'] = response.meta['row'][1]  # the user who follows this account
        yield item
Example #29
Source File: stock_forecast_spider.py From fooltrader with MIT License | 4 votes |
def download_forecast_data(self, response):
    security_item = response.meta['item']
    trs = response.xpath('//*[@id="dataTable"]//tr').extract()

    forecast_jsons = []

    try:
        for tr in trs[1:]:
            tds = Selector(text=tr).xpath('//td//text()').extract()
            tds = [x.strip() for x in tds if x.strip()]

            # Convert the performance-change string to floats (a '~' separates a range; values end with '%').
            change_str = tds[7]
            change_start = None
            if '~' in change_str:
                i = change_str.index('~')
                change_start = change_str[0:i]
                change = change_str[i + 1:]
            else:
                change = change_str

            if change:
                change = change.strip('%')
                change = float(change) / 100
            if change_start:
                change_start = change_start.strip('%')
                change_start = float(change_start) / 100

            # preEPS may be missing.
            preEPS = None
            try:
                preEPS = float(tds[6])
            except Exception as e:
                pass

            json_item = {"securityId": security_item['id'],
                         "timestamp": tds[3],
                         "reportPeriod": tds[4],
                         "type": tds[2],
                         "description": tds[5],
                         "preEPS": preEPS,
                         "changeStart": change_start,
                         "change": change,
                         }
            forecast_jsons.append(json_item)

        if forecast_jsons:
            df = pd.DataFrame(forecast_jsons)
            df = df.drop_duplicates()
            df = df.loc[:, EVENT_STOCK_FINANCE_FORECAST_COL]
            df = index_df_with_time(df)
            df.to_csv(get_finance_forecast_event_path(security_item), index=False)
    except Exception as e:
        self.logger.exception('error when getting k data url={} error={}'.format(response.url, e))