Python lxml.html.xpath() Examples
The following are 30 code examples of lxml.html.xpath(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module lxml.html, or try the search function.
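Before the project examples, here is a minimal, self-contained sketch of the pattern they all share (parse a document, run an XPath query, then read or re-serialize the matched nodes). The HTML snippet and variable names are illustrative only, not taken from any project below:

import lxml.html
from lxml import etree

snippet = '<html><body><div class="item"><p>hello</p><p>world</p></div></body></html>'
doc = lxml.html.fromstring(snippet)                # parse an HTML string into an element tree
nodes = doc.xpath('//div[@class="item"]/p')        # select nodes with an XPath expression
texts = [node.text_content() for node in nodes]    # plain text of each match
markup = [etree.tostring(node).decode('utf-8') for node in nodes]  # or the serialized markup
print(texts)   # ['hello', 'world']
print(markup)  # ['<p>hello</p>', '<p>world</p>']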
Example #1
Source File: belvaParseXML.py From Basic-Expression-Lexicon-Variation-Algorithms-BELVA with GNU General Public License v3.0 | 6 votes |
def parseXMLxpathSearchSingle(xml_source, xpathString):
#---------------------------------------------------------------------------------
    return_values = []
    try:
        root = etree.XML(xml_source)
        data_points = root.xpath(xpathString)
        for data in data_points:
            return_values.append(data)
            data.clear()
    except:
        pass
    return return_values
#---------------------------------------------------------------------------------
# parse HTML and return value asked
Example #2
Source File: newsevent.py From tushare with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _guba_content(url):
    try:
        html = lxml.html.parse(url)
        res = html.xpath('//div[@class="ilt_p"]/p')
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr).replace('　', '')#.replace('\n\n', '\n').
        html_content = lxml.html.fromstring(sarr)
        content = html_content.text_content()
        ptime = html.xpath('//div[@class="fl_left iltp_time"]/span/text()')[0]
        rcounts = html.xpath('//div[@class="fl_right iltp_span"]/span[2]/text()')[0]
        reg = re.compile(r'\((.*?)\)')
        rcounts = reg.findall(rcounts)[0]
        return [content, ptime, rcounts]
    except Exception:
        return ['', '', '0']
Example #3
Source File: newsevent.py From TuShare with BSD 3-Clause "New" or "Revised" License | 6 votes |
def latest_content(url):
    '''
    Fetch the text content of a real-time financial news item.

    Parameter
    --------
        url: link to the news item

    Return
    --------
        string: the text content of the news item
    '''
    try:
        html = lxml.html.parse(url)
        res = html.xpath('//div[@id="artibody"]/p')
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr).replace('　', '')#.replace('\n\n', '\n').
        html_content = lxml.html.fromstring(sarr)
        content = html_content.text_content()
        return content
    except Exception as er:
        print(str(er))
Example #4
Source File: newsevent.py From tushare with BSD 3-Clause "New" or "Revised" License | 6 votes |
def notice_content(url):
    '''
    Fetch the content of a stock notice ("information mine").

    Parameter
    --------
        url: link to the content

    Return
    --------
        string: the notice content
    '''
    try:
        html = lxml.html.parse(url)
        res = html.xpath('//div[@id="content"]/pre/text()')[0]
        return res.strip()
    except Exception as er:
        print(str(er))
Example #5
Source File: trading.py From tushare with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _today_ticks(symbol, tdate, pageNo, retry_count, pause):
    # fetch one page of today's tick data for a symbol from Sina, with retries
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            html = lxml.html.parse(ct.TODAY_TICKS_URL % (ct.P_TYPE['http'],
                                                         ct.DOMAINS['vsf'], ct.PAGES['t_ticks'],
                                                         symbol, tdate, pageNo))
            res = html.xpath('//table[@id="datatbl"]/tbody/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            sarr = sarr.replace('--', '0')
            df = pd.read_html(StringIO(sarr), parse_dates=False)[0]
            df.columns = ct.TODAY_TICK_COLUMNS
            df['pchange'] = df['pchange'].map(lambda x: x.replace('%', ''))
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
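Several of the tushare examples above and below share one trick worth calling out: serialize the matched <tr> elements back to markup, wrap them in a <table>, and let pandas parse the result into a DataFrame. Here is a standalone sketch of that pattern, using a made-up HTML fragment in place of the live page (the ct.* constants in the examples are project-internal and omitted here; the column names are illustrative):

import lxml.html
import pandas as pd
from io import StringIO
from lxml import etree

page = '''<html><body><table id="datatbl"><tbody>
<tr><td>09:30:01</td><td>10.50</td></tr>
<tr><td>09:30:04</td><td>10.52</td></tr>
</tbody></table></body></html>'''
doc = lxml.html.fromstring(page)
rows = doc.xpath('//table[@id="datatbl"]/tbody/tr')              # match the data rows only
sarr = ''.join(etree.tostring(r).decode('utf-8') for r in rows)  # re-serialize them
df = pd.read_html(StringIO('<table>%s</table>' % sarr))[0]       # rebuild a table pandas can read
df.columns = ['time', 'price']                                   # illustrative column names
print(df)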
Example #6
Source File: newsevent.py From TuShare with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _guba_content(url):
    try:
        html = lxml.html.parse(url)
        res = html.xpath('//div[@class="ilt_p"]/p')
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr).replace('　', '')#.replace('\n\n', '\n').
        html_content = lxml.html.fromstring(sarr)
        content = html_content.text_content()
        ptime = html.xpath('//div[@class="fl_left iltp_time"]/span/text()')[0]
        rcounts = html.xpath('//div[@class="fl_right iltp_span"]/span[2]/text()')[0]
        reg = re.compile(r'\((.*?)\)')
        rcounts = reg.findall(rcounts)[0]
        return [content, ptime, rcounts]
    except Exception:
        return ['', '', '0']
Example #7
Source File: trading.py From TuShare with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _today_ticks(symbol, tdate, pageNo, retry_count, pause):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            html = lxml.html.parse(ct.TODAY_TICKS_URL % (ct.P_TYPE['http'],
                                                         ct.DOMAINS['vsf'], ct.PAGES['t_ticks'],
                                                         symbol, tdate, pageNo))
            res = html.xpath('//table[@id="datatbl"]/tbody/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            sarr = sarr.replace('--', '0')
            df = pd.read_html(StringIO(sarr), parse_dates=False)[0]
            df.columns = ct.TODAY_TICK_COLUMNS
            df['pchange'] = df['pchange'].map(lambda x: x.replace('%', ''))
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
Example #8
Source File: sinanews.py From rolling-news with MIT License | 6 votes |
def get_news_content(url):
    """
    Fetch the content of a news article.
    :param url: str, link to the article
    :return: str, the article content
    """
    content = ''
    try:
        text = disk_cache_downloader(url)
        html = lxml.etree.HTML(text)
        res = html.xpath('//*[@id="artibody" or @id="article"]//p')
        p_str_list = [lxml.etree.tostring(node).decode('utf-8') for node in res]
        p_str = ''.join(p_str_list)
        html_content = lxml.html.fromstring(p_str)
        content = html_content.text_content()
        # clean up stray characters and whitespace
        content = re.sub(r'\u3000', '', content)
        content = re.sub(r'[ \xa0?]+', ' ', content)
        content = re.sub(r'\s*\n\s*', '\n', content)
        content = re.sub(r'\s*(\s)', r'\1', content)
        content = content.strip()
    except Exception as e:
        print('get_news_content(%s) error:' % url, e)
    return content
Example #9
Source File: reference.py From tushare with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _profit_divis(pageNo, dataArr, nextPage):
    ct._write_console()
    html = lxml.html.parse('%sdata.cfi.cn/%s' % (ct.P_TYPE['http'], nextPage))
    res = html.xpath('//table[@class="table_data"]/tr')
    if ct.PY3:
        sarr = [etree.tostring(node).decode('utf-8') for node in res]
    else:
        sarr = [etree.tostring(node) for node in res]
    sarr = ''.join(sarr)
    sarr = sarr.replace('--', '0')
    sarr = '<table>%s</table>' % sarr
    df = pd.read_html(sarr, skiprows=[0])[0]
    dataArr = dataArr.append(df, ignore_index=True)
    nextPage = html.xpath('//div[@id="content"]/div[2]/a[last()]/@href')[0]
    np = nextPage.split('&')[2].split('=')[1]
    if pageNo < int(np):
        return _profit_divis(int(np), dataArr, nextPage)
    else:
        return dataArr
Example #10
Source File: client.py From StrategyEase-Python-SDK with MIT License | 6 votes |
def __query_new_stocks(self):
    # scrape the new-stock (IPO) issue table from Sina finance
    DATA_URL = 'http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc'
    html = lxml.html.parse(DATA_URL)
    res = html.xpath('//table[@id="NewStockTable"]/tr')
    if six.PY2:
        sarr = [etree.tostring(node) for node in res]
    else:
        sarr = [etree.tostring(node).decode('utf-8') for node in res]
    sarr = ''.join(sarr)
    sarr = sarr.replace('<font color="red">*</font>', '')
    sarr = '<table>%s</table>' % sarr
    df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
    df = df.select(lambda x: x in [0, 1, 2, 3, 7], axis=1)
    df.columns = ['code', 'xcode', 'name', 'ipo_date', 'price']
    df['code'] = df['code'].map(lambda x: str(x).zfill(6))
    df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
    return df
Example #11
Source File: shipaneclient.py From QUANTAXIS with MIT License | 6 votes |
def __query_new_stocks(self):
    DATA_URL = 'http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc'
    html = lxml.html.parse(DATA_URL)
    res = html.xpath('//table[@id="NewStockTable"]/tr')
    if six.PY2:
        sarr = [etree.tostring(node) for node in res]
    else:
        sarr = [etree.tostring(node).decode('utf-8') for node in res]
    sarr = ''.join(sarr)
    sarr = sarr.replace('<font color="red">*</font>', '')
    sarr = '<table>%s</table>' % sarr
    df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
    df = df.select(lambda x: x in [0, 1, 2, 3, 7], axis=1)
    df.columns = ['code', 'xcode', 'name', 'ipo_date', 'price']
    df['code'] = df['code'].map(lambda x: str(x).zfill(6))
    df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
    return df
Example #12
Source File: belvaParseXML.py From Basic-Expression-Lexicon-Variation-Algorithms-BELVA with GNU General Public License v3.0 | 6 votes |
def parseXMLxpathSearch(xml_source, xpathString):
#---------------------------------------------------------------------------------
    return_values = []
    try:
        root = etree.XML(xml_source)
        data_points = root.xpath(xpathString)
        for data in data_points:
            return_values.append(etree.tostring(data))
            data.clear()
    except:
        pass
    return return_values
#---------------------------------------------------------------------------------
# parse XML and return value asked (designed for errors via stdout)
Example #13
Source File: belvaParseXML.py From Basic-Expression-Lexicon-Variation-Algorithms-BELVA with GNU General Public License v3.0 | 6 votes |
def parseHTMLxpathSearch(http_source, xpathString):
#---------------------------------------------------------------------------------
    return_values = []
    http_source = str(http_source).replace('\x00', '')
    try:
        html = lxml.html.fromstring(http_source)
        for data in html.xpath(xpathString):
            return_values.append(etree.tostring(data.content))
            data.clear()
    except:
        pass
    return return_values
#---------------------------------------------------------------------------------
# parse HTML and return value asked
Example #14
Source File: belvaParseXML.py From Basic-Expression-Lexicon-Variation-Algorithms-BELVA with GNU General Public License v3.0 | 6 votes |
def parseXMLxpathSearchAttribute(xml_source, xpathString):
#---------------------------------------------------------------------------------
    return_values = []
    try:
        root = etree.XML(xml_source)
        data_points = root.xpath(xpathString)
        for data in data_points:
            return_values.append(data)
            data.clear()
    except:
        pass
    return return_values
#---------------------------------------------------------------------------------
# parse HTML and return value asked
Example #15
Source File: aerosol.py From SprayingToolkit with GNU General Public License v3.0 | 5 votes |
def response(self, flow: http.HTTPFlow) -> None:
    # mitmproxy response hook: collect words from HTML responses of the target host
    try:
        if "html" in flow.response.headers["Content-Type"] and len(flow.response.content):
            if ctx.options.target in flow.request.host:
                html = lxml.html.fromstring(flow.response.content)
                the_best_words = set(html.xpath('//text()'))
                ctx.log.info(print_good(f"Got {len(the_best_words)} words, the best words..."))
                self.words |= the_best_words
    except KeyError:
        pass
Example #16
Source File: fundamental.py From TuShare with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _get_report_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.REPORT_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                           ct.PAGES['fd'], year, quarter, pageNo,
                                           ct.PAGE_NUM[1]))
        print(ct.REPORT_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                               ct.PAGES['fd'], year, quarter, pageNo,
                               ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath('//table[@class="list_table"]/tr')
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df = df.drop(11, axis=1)
        df.columns = ct.REPORT_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_report_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
Example #17
Source File: fundamental.py From TuShare with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _get_profit_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.PROFIT_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                           ct.PAGES['fd'], year, quarter, pageNo,
                                           ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath('//table[@class="list_table"]/tr')
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.PROFIT_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_profit_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass
Example #18
Source File: fundamental.py From TuShare with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _get_growth_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.GROWTH_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                           ct.PAGES['fd'], year, quarter, pageNo,
                                           ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath('//table[@class="list_table"]/tr')
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.GROWTH_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_growth_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
Example #19
Source File: fundamental.py From TuShare with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _get_debtpaying_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.DEBTPAYING_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                               ct.PAGES['fd'], year, quarter, pageNo,
                                               ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath('//table[@class="list_table"]/tr')
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.DEBTPAYING_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_debtpaying_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
Example #20
Source File: fundamental.py From TuShare with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _get_cashflow_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.CASHFLOW_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                             ct.PAGES['fd'], year, quarter, pageNo,
                                             ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath('//table[@class="list_table"]/tr')
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.CASHFLOW_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_cashflow_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
Example #21
Source File: reference.py From TuShare with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _newstocks(data, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            html = lxml.html.parse(rv.NEW_STOCKS_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                        ct.PAGES['newstock'], pageNo))
            res = html.xpath('//table[@id="NewStockTable"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = sarr.replace('<font color="red">*</font>', '')
            sarr = '<table>%s</table>' % sarr
            df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
            df = df.drop([df.columns[idx] for idx in [1, 12, 13, 14]], axis=1)
            df.columns = rv.NEW_STOCKS_COLS
            df['code'] = df['code'].map(lambda x: str(x).zfill(6))
            res = html.xpath('//table[@class="table2"]/tr[1]/td[1]/a/text()')
            tag = '下一页' if ct.PY3 else unicode('下一页', 'utf-8')  # '下一页' means "next page"
            hasNext = True if tag in res else False
            data = data.append(df, ignore_index=True)
            pageNo += 1
            if hasNext:
                data = _newstocks(data, pageNo, retry_count, pause)
        except Exception as ex:
            print(ex)
        else:
            return data
Example #22
Source File: newsevent.py From TuShare with BSD 3-Clause "New" or "Revised" License | 5 votes |
def get_notices(code=None, date=None):
    '''
    Stock notices ("information mines") for an individual stock.

    Parameters
    --------
        code: stock code
        date: date the notice was published

    Return
    --------
        DataFrame with columns:
            title: notice title
            type: notice type
            date: announcement date
            url: URL of the notice content
    '''
    if code is None:
        return None
    symbol = 'sh' + code if code[:1] == '6' else 'sz' + code
    url = nv.NOTICE_INFO_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                ct.PAGES['ntinfo'], symbol)
    url = url if date is None else '%s&gg_date=%s' % (url, date)
    html = lxml.html.parse(url)
    res = html.xpath('//table[@class="body_table"]/tbody/tr')
    data = []
    for td in res:
        title = td.xpath('th/a/text()')[0]
        type = td.xpath('td[1]/text()')[0]
        date = td.xpath('td[2]/text()')[0]
        url = '%s%s%s' % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                          td.xpath('th/a/@href')[0])
        data.append([title, type, date, url])
    df = pd.DataFrame(data, columns=nv.NOTICE_INFO_CLS)
    return df
Example #23
Source File: trading.py From TuShare with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _parse_fq_data(url, index, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(url)
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id="FundHoldSharesTable"]')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows=[0, 1])[0]
            if len(df) == 0:
                return pd.DataFrame()
            if index:
                df.columns = ct.HIST_FQ_COLS[0:7]
            else:
                df.columns = ct.HIST_FQ_COLS
            if df['date'].dtypes == np.object:
                df['date'] = df['date'].astype(np.datetime64)
            df = df.drop_duplicates('date')
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
Example #24
Source File: billboard.py From TuShare with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _cap_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                 rv.LHB_KINDS[0], ct.PAGES['fd'],
                                                 last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id="dataTable"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_GGTJ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _cap_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
Example #25
Source File: billboard.py From TuShare with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _inst_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                 rv.LHB_KINDS[2], ct.PAGES['fd'],
                                                 last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id="dataTable"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            df = pd.read_html(sarr)[0]
            df = df.drop([2, 3], axis=1)
            df.columns = rv.LHB_JGZZ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
Example #26
Source File: billboard.py From TuShare with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _inst_detail(pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                 rv.LHB_KINDS[3], ct.PAGES['fd'],
                                                 '', pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id="dataTable"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_JGMX_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_detail(pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
Example #27
Source File: func.py From InplusTrader_Linux with MIT License | 5 votes |
def _parse_fq_data(url, index, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(url)
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id="FundHoldSharesTable"]')
            sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            if sarr == '':
                return None
            df = pd.read_html(sarr, skiprows=[0, 1])[0]
            if len(df) == 0:
                return pd.DataFrame()
            if index:
                df.columns = ct.HIST_FQ_COLS[0:7]
            else:
                df.columns = ct.HIST_FQ_COLS
            if df['date'].dtypes == np.object:
                df['date'] = df['date'].astype(np.datetime64)
            df = df.drop_duplicates('date')
        except ValueError as e:
            # the date is too early; no data can be read any more
            return None
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
Example #28
Source File: google.py From SprayingToolkit with GNU General Public License v3.0 | 5 votes |
def google(content):
    # pull candidate names out of Google search result titles
    names = []
    html = lxml.html.fromstring(content)
    for text in html.xpath('//h3[@class="LC20lb"]//text()'):
        first, last = linkedin_se_name_parser(text)
        names.append((first, last, text))
    return names
Example #29
Source File: bing.py From SprayingToolkit with GNU General Public License v3.0 | 5 votes |
def bing(content):
    # pull candidate names out of Bing search result titles
    names = []
    html = lxml.html.fromstring(content)
    for result in html.xpath('//li[@class="b_algo"]/h2/a'):
        text = ''.join(result.xpath('.//text()'))
        first, last = linkedin_se_name_parser(text)
        names.append((first, last, text))
    return names
Example #30
Source File: fundamental.py From tushare with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _get_cashflow_data(year, quarter, pageNo, dataArr, retry_count=3, pause=0.001):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(ct.CASHFLOW_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                 ct.PAGES['fd'], year, quarter, pageNo,
                                                 ct.PAGE_NUM[1]))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            text = text.replace('--', '')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@class="list_table"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            df = pd.read_html(sarr)[0]
            df.columns = ct.CASHFLOW_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class="pages"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _get_cashflow_data(year, quarter, pageNo, dataArr)
            else:
                return dataArr
        except Exception as e:
            pass
    raise IOError(ct.NETWORK_URL_ERROR_MSG)