Python pandas.read_html() Examples
The following are 30 code examples of pandas.read_html(), drawn from open-source projects. Each example lists its source file, the project it comes from, and that project's license.
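Before the examples, a minimal sketch of the pattern they all share: pandas.read_html() parses every <table> in an HTML document (a URL, file-like object, or markup string) and returns a *list* of DataFrames, so callers index into that list and then clean up columns. The inline markup below is illustrative, not taken from any example on this page.

import pandas as pd
from io import StringIO

# A small inline table; read_html returns a list of DataFrames,
# one per <table> element found in the document.
html = """
<table>
  <tr><th>name</th><th>value</th></tr>
  <tr><td>a</td><td>1</td></tr>
  <tr><td>b</td><td>2</td></tr>
</table>
"""

# Wrapping the literal string in StringIO matches the file-like
# input newer pandas versions prefer.
dfs = pd.read_html(StringIO(html))
df = dfs[0]                  # pick the first (here, only) table
print(df.columns.tolist())   # ['name', 'value'] -- <th> cells become headers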
Example #1
Source File: extract_tables.py From axcell with Apache License 2.0
def fix_span_tables(soup):
    classes = OrderedDict([("ltx_tabular", "table"), ("ltx_tr", "tr"),
                           ("ltx_th", "th"), ("ltx_tbody", "tbody"),
                           ("ltx_thead", "thead"), ("ltx_td", "td"),
                           ("ltx_tfoot", "tfoot")])
    query = ','.join(["span." + c for c in classes.keys()])
    for elem in soup.select(query):
        for k, v in classes.items():
            if k in elem.attrs["class"]:
                elem.name = v
                break

# pandas.read_html treats th differently
# by trying in a few places to get column names
# for now <th>s are changed to <td>s, but we still
# have classes (ltx_th) to distinguish them
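The closing comment is the motivation for the whole function. A quick standalone sketch (with illustrative markup, not axcell's) of what it refers to: read_html promotes a row of <th> cells to column names, while an all-<td> table gets default integer column labels.

import pandas as pd
from io import StringIO

with_th = "<table><tr><th>col</th></tr><tr><td>1</td></tr></table>"
with_td = "<table><tr><td>col</td></tr><tr><td>1</td></tr></table>"

print(pd.read_html(StringIO(with_th))[0].columns.tolist())  # ['col']
print(pd.read_html(StringIO(with_td))[0].columns.tolist())  # [0] -- no header inferred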
Example #2
Source File: populate.py From phageParser with MIT License
def addpositionstodict(gendict):
    print("Downloading position information from web...")
    for accidwithloc in tqdm(gendict):
        if 'Start' in gendict[accidwithloc]:
            continue
        accid = '_'.join(accidwithloc.split('_')[:-1])
        url = ('http://crispr.i2bc.paris-saclay.fr/crispr/crispr_db.php?'
               'checked%5B%5D={}'.format(accid))
        page = requests.get(url)
        htmltable = html.fromstring(page.content).xpath(
            "//table[normalize-space(@class)='primary_table']")[1]
        strtable = etree.tostring(htmltable)
        # converts to pandas df and then to numpy array then drop titles
        arrtable = pandas.read_html(strtable)[0].as_matrix()[2:]
        for row in arrtable:
            if row[0] in gendict:
                gendict[row[0]]['Start'] = row[2]
                gendict[row[0]]['Stop'] = row[3]
            else:
                if row[1] != 'questionable':
                    print("Can't find %s in local files" % row[0])
    return gendict
Example #3
Source File: stock_summary.py From akshare with MIT License
def stock_sse_summary():
    """
    Shanghai Stock Exchange - market overview
    http://www.sse.com.cn/market/stockdata/statistic/
    :return: SSE market overview
    :rtype: pandas.DataFrame
    """
    url = "http://www.sse.com.cn/market/stockdata/statistic/"
    r = requests.get(url)
    r.encoding = "utf-8"
    big_df = pd.DataFrame()
    temp_list = ["总貌", "主板", "科创板"]
    for i in range(len(pd.read_html(r.text))):
        for j in range(0, 2):
            inner_df = pd.read_html(r.text)[i].iloc[:, j].str.split(" ", expand=True)
            inner_df["item"] = temp_list[i]
            big_df = big_df.append(inner_df)
    big_df.dropna(how="any", inplace=True)
    big_df.columns = ["item", "number", "type"]
    big_df = big_df[["type", "item", "number"]]
    return big_df
Example #4
Source File: stock_info.py From akshare with MIT License
def stock_info_change_name(stock="688588"):
    """
    Sina Finance - historical (former) names of a stock
    http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CorpInfo/stockid/300378.phtml
    :param stock: stock code
    :type stock: str
    :return: list of the stock's former names
    :rtype: list
    """
    url = f"http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CorpInfo/stockid/{stock}.phtml"
    r = requests.get(url)
    temp_df = pd.read_html(r.text)[3].iloc[:, :2]
    temp_df.dropna(inplace=True)
    temp_df.columns = ["item", "value"]
    temp_df["item"] = temp_df["item"].str.split(":", expand=True)[0]
    try:
        name_list = temp_df[temp_df["item"] == "证券简称更名历史"].value.tolist()[0].split(" ")
        return name_list
    except:
        return None
Example #5
Source File: time_and_date.py From akshare with MIT License
def sunrise_city_list() -> list:
    """
    List of cities available for sunrise/sunset queries
    :return: list of all cities for which data can be retrieved
    :rtype: list
    """
    url = "https://www.timeanddate.com/sun/china"
    res = requests.get(url)
    city_list = []
    china_city_one_df = pd.read_html(res.text)[0]
    china_city_two_df = pd.read_html(res.text)[1]
    city_list.extend([item.lower() for item in china_city_one_df.iloc[:, 0].tolist()])
    city_list.extend([item.lower() for item in china_city_one_df.iloc[:, 1].tolist()])
    city_list.extend([item.lower() for item in china_city_two_df.iloc[:, 0].tolist()])
    city_list.extend([item.lower() for item in china_city_two_df.iloc[:, 1].tolist()])
    city_list.extend([item.lower() for item in china_city_two_df.iloc[:, 2].tolist()])
    city_list.extend([item.lower() for item in china_city_two_df.iloc[:, 3].tolist()])
    city_list.extend([item.lower() for item in china_city_two_df.iloc[:, 4][:-2].tolist()])
    return city_list
Example #6
Source File: time_and_date.py From akshare with MIT License
def sunrise_daily(date: str = "20200428", city: str = "北京") -> pd.DataFrame:
    """
    Daily sunrise and sunset data
    https://www.timeanddate.com/sun/china/shaoxing
    :param date: date to query, e.g. "20200428"
    :type date: str
    :param city: city to query; mind the input format, e.g. "北京", "上海"
    :type city: str
    :return: sunrise/sunset data for the given date and city
    :rtype: pandas.DataFrame
    """
    if pypinyin.slug(city, separator='') in sunrise_city_list():
        year = date[:4]
        month = date[4:6]
        url = f"https://www.timeanddate.com/sun/china/{pypinyin.slug(city, separator='')}?month={month}&year={year}"
        res = requests.get(url)
        table = pd.read_html(res.text, header=2)[0]
        month_df = table.iloc[:-1, ]
        day_df = month_df[month_df.iloc[:, 0].astype(str).str.zfill(2) == date[6:]]
        day_df.index = pd.to_datetime([date] * len(day_df), format="%Y%m%d")
        return day_df
    else:
        return "请输入正确的城市名称"
Example #7
Source File: shipaneclient.py From QUANTAXIS with MIT License
def __query_new_stocks(self):
    DATA_URL = 'http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc'
    html = lxml.html.parse(DATA_URL)
    res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
    if six.PY2:
        sarr = [etree.tostring(node) for node in res]
    else:
        sarr = [etree.tostring(node).decode('utf-8') for node in res]
    sarr = ''.join(sarr)
    sarr = sarr.replace('<font color="red">*</font>', '')
    sarr = '<table>%s</table>' % sarr
    df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
    df = df.select(lambda x: x in [0, 1, 2, 3, 7], axis=1)
    df.columns = ['code', 'xcode', 'name', 'ipo_date', 'price']
    df['code'] = df['code'].map(lambda x: str(x).zfill(6))
    df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
    return df
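This example, and most of the tushare helpers further down, share one trick worth isolating: they serialize individual <tr> nodes with lxml and then re-wrap them in a synthetic <table> element, because read_html only scans for <table> tags. A stripped-down sketch of just that step, using illustrative markup rather than the Sina page:

import pandas as pd
from io import StringIO
import lxml.html
from lxml import etree

page = lxml.html.fromstring(
    "<html><table id='t'><tr><td>a</td><td>1</td></tr>"
    "<tr><td>b</td><td>2</td></tr></table></html>")
rows = page.xpath('//table[@id="t"]/tr')

# read_html only looks for <table> elements, so the serialized
# rows must be wrapped in one before parsing.
sarr = ''.join(etree.tostring(node).decode('utf-8') for node in rows)
df = pd.read_html(StringIO('<table>%s</table>' % sarr))[0]
print(df)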
Example #8
Source File: update_spark_params.py From spylon with BSD 3-Clause "New" or "Revised" License
def _fetch_documentation(version, base_url="https://spark.apache.org/docs"):
    doc_urls = [
        "{base_url}/{version}/configuration.html",
        "{base_url}/{version}/sql-programming-guide.html",
        "{base_url}/{version}/monitoring.html",
        "{base_url}/{version}/spark-standalone.html",
        "{base_url}/{version}/running-on-mesos.html",
        "{base_url}/{version}/running-on-yarn.html",
    ]
    for url in doc_urls:
        doc_url = url.format(version=version, base_url=base_url)
        # print(url)
        print("Loading spark properties from %s", doc_url)
        dfs = pd.read_html(doc_url, header=0)
        desired_cols = ["Property Name", "Default", "Meaning"]
        for df in dfs:
            if ("Property Name" in df) and ('Default' in df):
                for pn, default, desc in df[desired_cols].itertuples(index=False):
                    if type(default) == numpy.bool_:
                        default = bool(default)
                    yield pn, default, desc
Example #9
Source File: arbitrage_tools.py From bitrader with MIT License
def get_forex_buy_quote(currency_code: str = 'EUR', source: str = 'FNB', order_type: str = 'buy'):
    """Get latest forex from FNB website
    """
    if source == 'FNB':
        tables = pd.read_html(
            'https://www.fnb.co.za/Controller?nav=rates.forex.list.ForexRatesList',
            index_col=1, header=0, match=currency_code)
        df = tables[0]
        types = {
            'buy': 'Bank Selling Rate',
            'sell': 'Bank Buying Rate',
        }
        exchange_rate = df.loc[currency_code, types[order_type]]
        return Decimal("%.4f" % float(exchange_rate))
Example #10
Source File: fransRecon.py From fransRecon with MIT License
def getdatafromViewDNS(searchQuery):
    searchQuery = searchQuery.replace(" ", "+")
    url = "https://viewdns.info/reversewhois/?q=" + searchQuery
    print("[*] Extracting from: " + url)
    try:
        result = pd.read_html(requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text)
        response = result[3][0]
        iter_url = iter(response)
        return iter_url
        # next(iter_url)
        # for url in iter_url:
        #     print(url)
    except Exception as e:
        print(f"[!] Couldn't send query, error: {e} exiting...\n")
        exit()

# Will return the org name for any domain name.
Example #11
Source File: client.py From StrategyEase-Python-SDK with MIT License
def __query_new_stocks(self):
    DATA_URL = 'http://vip.stock.finance.sina.com.cn/corp/view/vRPD_NewStockIssue.php?page=1&cngem=0&orderBy=NetDate&orderType=desc'
    html = lxml.html.parse(DATA_URL)
    res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
    if six.PY2:
        sarr = [etree.tostring(node) for node in res]
    else:
        sarr = [etree.tostring(node).decode('utf-8') for node in res]
    sarr = ''.join(sarr)
    sarr = sarr.replace('<font color="red">*</font>', '')
    sarr = '<table>%s</table>' % sarr
    df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
    df = df.select(lambda x: x in [0, 1, 2, 3, 7], axis=1)
    df.columns = ['code', 'xcode', 'name', 'ipo_date', 'price']
    df['code'] = df['code'].map(lambda x: str(x).zfill(6))
    df['xcode'] = df['xcode'].map(lambda x: str(x).zfill(6))
    return df
Example #12
Source File: reference.py From tushare with BSD 3-Clause "New" or "Revised" License
def _profit_divis(pageNo, dataArr, nextPage):
    ct._write_console()
    html = lxml.html.parse('%sdata.cfi.cn/%s' % (ct.P_TYPE['http'], nextPage))
    res = html.xpath("//table[@class=\"table_data\"]/tr")
    if ct.PY3:
        sarr = [etree.tostring(node).decode('utf-8') for node in res]
    else:
        sarr = [etree.tostring(node) for node in res]
    sarr = ''.join(sarr)
    sarr = sarr.replace('--', '0')
    sarr = '<table>%s</table>' % sarr
    df = pd.read_html(sarr, skiprows=[0])[0]
    dataArr = dataArr.append(df, ignore_index=True)
    nextPage = html.xpath('//div[@id=\"content\"]/div[2]/a[last()]/@href')[0]
    np = nextPage.split('&')[2].split('=')[1]
    if pageNo < int(np):
        return _profit_divis(int(np), dataArr, nextPage)
    else:
        return dataArr
Example #13
Source File: reference.py From TuShare with BSD 3-Clause "New" or "Revised" License
def _sz_hz(date='', retry_count=3, pause=0.001):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            request = Request(rv.MAR_SZ_HZ_URL % (ct.P_TYPE['http'], ct.DOMAINS['szse'],
                                                  ct.PAGES['szsefc'], date))
            lines = urlopen(request, timeout=10).read()
            if len(lines) <= 200:
                return pd.DataFrame()
            df = pd.read_html(lines, skiprows=[0])[0]
            df.columns = rv.MAR_SZ_HZ_COLS
            df['opDate'] = date
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
Example #14
Source File: trading.py From tushare with BSD 3-Clause "New" or "Revised" License
def _today_ticks(symbol, tdate, pageNo, retry_count, pause):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            html = lxml.html.parse(ct.TODAY_TICKS_URL % (ct.P_TYPE['http'],
                                                         ct.DOMAINS['vsf'], ct.PAGES['t_ticks'],
                                                         symbol, tdate, pageNo))
            res = html.xpath('//table[@id=\"datatbl\"]/tbody/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            sarr = sarr.replace('--', '0')
            df = pd.read_html(StringIO(sarr), parse_dates=False)[0]
            df.columns = ct.TODAY_TICK_COLUMNS
            df['pchange'] = df['pchange'].map(lambda x: x.replace('%', ''))
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
Example #15
Source File: universal.py From xalpha with MIT License
def get_portfolio_fromttjj(code, start=None, end=None):
    startobj = dt.datetime.strptime(start, "%Y%m%d")
    endobj = dt.datetime.strptime(end, "%Y%m%d")
    if (endobj - startobj).days < 90:
        return None  # note start is always 1.1 4.1 7.1 10.1 in incremental updates
    if code.startswith("F"):
        code = code[1:]
    r = rget("http://fundf10.eastmoney.com/zcpz_{code}.html".format(code=code))
    s = BeautifulSoup(r.text, "lxml")
    table = s.find("table", class_="tzxq")
    df = pd.read_html(str(table))[0]
    df["date"] = pd.to_datetime(df["报告期"])
    df["stock_ratio"] = df["股票占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    df["bond_ratio"] = df["债券占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    df["cash_ratio"] = df["现金占净比"].replace("---", "0%").apply(lambda s: _float(s[:-1]))
    # df["dr_ratio"] = df["存托凭证占净比"].replace("---", "0%").apply(lambda s: xa.cons._float(s[:-1]))
    df["assets"] = df["净资产(亿元)"]
    df = df[::-1]
    return df[["date", "stock_ratio", "bond_ratio", "cash_ratio", "assets"]]

# this is the most elegant approach to dispatch get_daily; the definition can be this simple
# you don't actually need to bother with start/end, everything is taken care of by ``cahcedio``
Example #16
Source File: test_triangle.py From chainladder-python with Mozilla Public License 2.0
def test_repr():
    tri = cl.load_sample('raa')
    np.testing.assert_array_equal(
        pd.read_html(tri._repr_html_())[0].set_index('Origin').values,
        tri.to_frame().values)
Example #17
Source File: billboard.py From TuShare with BSD 3-Clause "New" or "Revised" License
def _cap_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                 rv.LHB_KINDS[0], ct.PAGES['fd'],
                                                 last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_GGTJ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _cap_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
Example #18
Source File: trading.py From TuShare with BSD 3-Clause "New" or "Revised" License
def _parse_fq_data(url, index, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(url)
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath('//table[@id=\"FundHoldSharesTable\"]')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows=[0, 1])[0]
            if len(df) == 0:
                return pd.DataFrame()
            if index:
                df.columns = ct.HIST_FQ_COLS[0:7]
            else:
                df.columns = ct.HIST_FQ_COLS
            if df['date'].dtypes == np.object:
                df['date'] = df['date'].astype(np.datetime64)
            df = df.drop_duplicates('date')
        except Exception as e:
            print(e)
        else:
            return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
Example #19
Source File: fundamental.py From TuShare with BSD 3-Clause "New" or "Revised" License
def _get_cashflow_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.CASHFLOW_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                             ct.PAGES['fd'], year, quarter, pageNo,
                                             ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.CASHFLOW_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_cashflow_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
Example #20
Source File: test_scrapetable.py From cjworkbench with GNU Affero General Public License v3.0
def test_only_some_colnames(self):
    # pandas read_table() does odd stuff when there are multiple commas at
    # the ends of rows. Test that read_html() doesn't do the same thing.
    fetch_result = fetch(
        url="http://example.org", tablenum=1, first_row_is_header=True
    )
    assert_frame_equal(
        fetch_result.dataframe,
        pd.DataFrame(
            {"A": ["a", "b"], "Unnamed: 1": [1, 2]}  # TODO should be 'Column 2'?
        ),
    )
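The 'Unnamed: 1' expectation comes from pandas' header inference: like read_csv, read_html fills in 'Unnamed: <idx>' for empty header cells. A standalone illustration under that assumption (markup is invented, and the exact label can vary across pandas versions):

import pandas as pd
from io import StringIO

html = ("<table><tr><th>A</th><th></th></tr>"
        "<tr><td>a</td><td>1</td></tr>"
        "<tr><td>b</td><td>2</td></tr></table>")
df = pd.read_html(StringIO(html))[0]
print(df.columns.tolist())  # expected: ['A', 'Unnamed: 1']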
Example #21
Source File: fundamental.py From TuShare with BSD 3-Clause "New" or "Revised" License
def _get_debtpaying_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.DEBTPAYING_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                               ct.PAGES['fd'], year, quarter, pageNo,
                                               ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.DEBTPAYING_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_debtpaying_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
Example #22
Source File: fundamental.py From TuShare with BSD 3-Clause "New" or "Revised" License
def _get_growth_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.GROWTH_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                           ct.PAGES['fd'], year, quarter, pageNo,
                                           ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.GROWTH_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_growth_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
Example #23
Source File: fundamental.py From TuShare with BSD 3-Clause "New" or "Revised" License
def _get_operation_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.OPERATION_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                              ct.PAGES['fd'], year, quarter, pageNo,
                                              ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.OPERATION_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_operation_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
Example #24
Source File: fundamental.py From TuShare with BSD 3-Clause "New" or "Revised" License
def _get_profit_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        request = Request(ct.PROFIT_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                           ct.PAGES['fd'], year, quarter, pageNo,
                                           ct.PAGE_NUM[1]))
        text = urlopen(request, timeout=10).read()
        text = text.decode('GBK')
        text = text.replace('--', '')
        html = lxml.html.parse(StringIO(text))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df.columns = ct.PROFIT_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_profit_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except:
        pass
Example #25
Source File: reference.py From TuShare with BSD 3-Clause "New" or "Revised" License
def _newstocks(data, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        ct._write_console()
        try:
            html = lxml.html.parse(rv.NEW_STOCKS_URL % (ct.P_TYPE['http'],
                                                        ct.DOMAINS['vsf'],
                                                        ct.PAGES['newstock'], pageNo))
            res = html.xpath('//table[@id=\"NewStockTable\"]/tr')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = sarr.replace('<font color="red">*</font>', '')
            sarr = '<table>%s</table>' % sarr
            df = pd.read_html(StringIO(sarr), skiprows=[0, 1])[0]
            df = df.drop([df.columns[idx] for idx in [1, 12, 13, 14]], axis=1)
            df.columns = rv.NEW_STOCKS_COLS
            df['code'] = df['code'].map(lambda x: str(x).zfill(6))
            res = html.xpath('//table[@class=\"table2\"]/tr[1]/td[1]/a/text()')
            tag = '下一页' if ct.PY3 else unicode('下一页', 'utf-8')
            hasNext = True if tag in res else False
            data = data.append(df, ignore_index=True)
            pageNo += 1
            if hasNext:
                data = _newstocks(data, pageNo, retry_count, pause)
        except Exception as ex:
            print(ex)
        else:
            return data
Example #26
Source File: reference.py From TuShare with BSD 3-Clause "New" or "Revised" License
def _get_forecast_data(year, quarter, pageNo, dataArr):
    ct._write_console()
    try:
        html = lxml.html.parse(ct.FORECAST_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                  ct.PAGES['fd'], year, quarter, pageNo,
                                                  ct.PAGE_NUM[1]))
        res = html.xpath("//table[@class=\"list_table\"]/tr")
        if ct.PY3:
            sarr = [etree.tostring(node).decode('utf-8') for node in res]
        else:
            sarr = [etree.tostring(node) for node in res]
        sarr = ''.join(sarr)
        sarr = sarr.replace('--', '0')
        sarr = '<table>%s</table>' % sarr
        df = pd.read_html(sarr)[0]
        df = df.drop([4, 5, 8], axis=1)
        df.columns = ct.FORECAST_COLS
        dataArr = dataArr.append(df, ignore_index=True)
        nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
        if len(nextPage) > 0:
            pageNo = re.findall(r'\d+', nextPage[0])[0]
            return _get_forecast_data(year, quarter, pageNo, dataArr)
        else:
            return dataArr
    except Exception as e:
        print(e)
Example #27
Source File: reference.py From TuShare with BSD 3-Clause "New" or "Revised" License
def _dist_cotent(year, pageNo, retry_count, pause):
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            if pageNo > 0:
                ct._write_console()
            html = lxml.html.parse(rv.DP_163_URL % (ct.P_TYPE['http'], ct.DOMAINS['163'],
                                                    ct.PAGES['163dp'], year, pageNo))
            res = html.xpath('//div[@class=\"fn_rp_list\"]/table')
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            df = pd.read_html(sarr, skiprows=[0])[0]
            df = df.drop(df.columns[0], axis=1)
            df.columns = rv.DP_163_COLS
            df['divi'] = df['plan'].map(_fun_divi)
            df['shares'] = df['plan'].map(_fun_into)
            df = df.drop('plan', axis=1)
            df['code'] = df['code'].astype(object)
            df['code'] = df['code'].map(lambda x: str(x).zfill(6))
            pages = []
            if pageNo == 0:
                page = html.xpath('//div[@class=\"mod_pages\"]/a')
                if len(page) > 1:
                    asr = page[len(page) - 2]
                    pages = asr.xpath('text()')
        except Exception as e:
            print(e)
        else:
            if pageNo == 0:
                return df, pages[0] if len(pages) > 0 else 0
            else:
                return df
    raise IOError(ct.NETWORK_URL_ERROR_MSG)
Example #28
Source File: bitcoin_price.py From deep_learning with MIT License
def get_data():
    bit_url = "https://coinmarketcap.com/currencies/bitcoin/historical-data/?start=20130428&end={}"
    bitcoin_market_info = pd.read_html(bit_url.format(time.strftime("%Y%m%d")))[0]
    bitcoin_market_info = bitcoin_market_info.assign(Date=pd.to_datetime(bitcoin_market_info["Date"]))
    bitcoin_market_info.loc[bitcoin_market_info["Volume"] == "-", "Volume"] = 0
    bitcoin_market_info["Volume"] = bitcoin_market_info["Volume"].astype("int64")
    # print(bitcoin_market_info.head())
    eth_url = "https://coinmarketcap.com/currencies/ethereum/historical-data/?start=20130428&end={}"
    eth_market_info = pd.read_html(eth_url.format(time.strftime("%Y%m%d")))[0]
    eth_market_info = eth_market_info.assign(Date=pd.to_datetime(eth_market_info["Date"]))
    # print(eth_market_info.head())
    return bitcoin_market_info, eth_market_info
Example #29
Source File: billboard.py From tushare with BSD 3-Clause "New" or "Revised" License
def _inst_detail(pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                 rv.LHB_KINDS[3], ct.PAGES['fd'],
                                                 '', pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_JGMX_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _inst_detail(pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)
Example #30
Source File: billboard.py From TuShare with BSD 3-Clause "New" or "Revised" License
def _broker_tops(last=5, pageNo=1, retry_count=3, pause=0.001, dataArr=pd.DataFrame()):
    ct._write_console()
    for _ in range(retry_count):
        time.sleep(pause)
        try:
            request = Request(rv.LHB_SINA_URL % (ct.P_TYPE['http'], ct.DOMAINS['vsf'],
                                                 rv.LHB_KINDS[1], ct.PAGES['fd'],
                                                 last, pageNo))
            text = urlopen(request, timeout=10).read()
            text = text.decode('GBK')
            html = lxml.html.parse(StringIO(text))
            res = html.xpath("//table[@id=\"dataTable\"]/tr")
            if ct.PY3:
                sarr = [etree.tostring(node).decode('utf-8') for node in res]
            else:
                sarr = [etree.tostring(node) for node in res]
            sarr = ''.join(sarr)
            sarr = '<table>%s</table>' % sarr
            df = pd.read_html(sarr)[0]
            df.columns = rv.LHB_YYTJ_COLS
            dataArr = dataArr.append(df, ignore_index=True)
            nextPage = html.xpath('//div[@class=\"pages\"]/a[last()]/@onclick')
            if len(nextPage) > 0:
                pageNo = re.findall(r'\d+', nextPage[0])[0]
                return _broker_tops(last, pageNo, retry_count, pause, dataArr)
            else:
                return dataArr
        except Exception as e:
            print(e)