Python pandas.read_table() Examples
The following are 30
code examples of pandas.read_table().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pandas, or try the search function.
Example #1
Source File: movielens_preprocess.py From striatum with BSD 2-Clause "Simplified" License | 9 votes |
def main():
    """Prepare the MovieLens inputs and write them out as tab-separated files.

    Reads ``movies.dat`` and ``ratings.dat`` (``::``-delimited) from the
    working directory, joins them on ``movie_id``, runs feature extraction
    and writes ``streaming_batch.csv``, ``user_feature.csv``,
    ``actions.csv``, ``reward_list.csv`` and ``action_context.csv``.
    """
    # Both source files share the same parsing options.
    dat_options = {'sep': '::', 'engine': 'python'}

    # Movie metadata, cleaned by the project helper.
    movie = movie_preprocessing(
        pd.read_table('movies.dat',
                      names=['movie_id', 'movie_name', 'tag'],
                      **dat_options))

    # Ratings, joined to the movie metadata.
    rating = pd.read_table("ratings.dat",
                           names=["user_id", "movie_id", "rating", "timestamp"],
                           **dat_options)
    data = pd.merge(rating, movie, on="movie_id")

    # Derive the artefacts from the joined table.
    streaming_batch, user_feature, actions, reward_list = feature_extraction(data)

    streaming_batch.to_csv("streaming_batch.csv", sep='\t', index=False)
    user_feature.to_csv("user_feature.csv", sep='\t')
    actions_frame = pd.DataFrame(actions, columns=['movie_id'])
    actions_frame.to_csv("actions.csv", sep='\t', index=False)
    reward_list.to_csv("reward_list.csv", sep='\t', index=False)

    # Context rows are the movies that appear among the actions.
    is_action = movie['movie_id'].isin(actions)
    movie[is_action].to_csv("action_context.csv", sep='\t', index=False)
Example #2
Source File: test_terminal_method_statistical_analysis.py From cellphonedb with MIT License | 6 votes |
def _assert_result(self,
                   filename: str,
                   data: str,
                   iterations: int,
                   project_name: str,
                   result_means_filename: str,
                   debug_seed: int,
                   threshold: float,
                   result_precision: int
                   ) -> None:
    """Compare a produced means file with its stored reference, then delete it.

    The reference file name is derived from the run parameters; the
    threshold is embedded with its decimal point removed (``0.1`` -> ``01``).
    The produced file is removed only after the comparison has passed.
    """
    threshold_tag = str(threshold).replace('.', '')
    reference_name = (
        'statistical_analysis__{}_result__'
        'data-{}_it-{}_seed-{}_threshold-{}_precision-{}.txt'
    ).format(filename, data, iterations, debug_seed, threshold_tag, result_precision)

    reference_path = os.path.realpath('{}/{}'.format(data_test_dir, reference_name))
    produced_path = '{}/{}/{}'.format(output_test_dir, project_name, result_means_filename)

    reference_means = pd.read_table(reference_path)
    produced_means = pd.read_table(produced_path)

    same_data = dataframe_functions.dataframes_has_same_data(produced_means, reference_means)
    failure_message = 'failed comparing {} with {}'.format(reference_name, result_means_filename)
    self.assertTrue(same_data, msg=failure_message)

    self.remove_file(produced_path)
Example #3
Source File: test_parsers.py From Computable with MIT License | 6 votes |
def test_1000_sep(self):
    """``thousands=','`` must strip grouping commas from numeric fields.

    Both ``read_csv`` and ``read_table`` are expected to parse ``2,334`` in
    a ``|``-separated table as the integer 2334.
    """
    # One record per line. The breaks are written as explicit "\n" so the
    # fixture cannot be flattened into a single row by whitespace mangling.
    data = ('A|B|C\n'
            '1|2,334|5\n'
            '10|13|10.\n')
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334, 13],
        'C': [5, 10.]
    })

    df = self.read_csv(StringIO(data), sep='|', thousands=',')
    tm.assert_frame_equal(df, expected)

    df = self.read_table(StringIO(data), sep='|', thousands=',')
    tm.assert_frame_equal(df, expected)
Example #4
Source File: test_parsers.py From Computable with MIT License | 6 votes |
def test_no_header(self):
    """Header-less input: default, prefixed and explicitly named columns.

    ``header=None`` should give integer column labels, ``prefix='X'`` should
    give ``X0..X4``, and ``names=`` should be used verbatim; the parsed
    values must be the same in every case.
    """
    # Three records, one per line (explicit "\n" keeps the rows separate).
    data = ('1,2,3,4,5\n'
            '6,7,8,9,10\n'
            '11,12,13,14,15\n')

    df = self.read_table(StringIO(data), sep=',', header=None)
    df_pref = self.read_table(StringIO(data), sep=',', prefix='X',
                              header=None)

    names = ['foo', 'bar', 'baz', 'quux', 'panda']
    df2 = self.read_table(StringIO(data), sep=',', names=names)

    expected = [[1, 2, 3, 4, 5.],
                [6, 7, 8, 9, 10],
                [11, 12, 13, 14, 15]]
    tm.assert_almost_equal(df.values, expected)
    tm.assert_almost_equal(df.values, df2.values)

    # assertTrue replaces the deprecated alias assert_ (removed in Python 3.12).
    self.assertTrue(np.array_equal(df_pref.columns,
                                   ['X0', 'X1', 'X2', 'X3', 'X4']))
    self.assertTrue(np.array_equal(df.columns, lrange(5)))
    self.assertTrue(np.array_equal(df2.columns, names))
Example #5
Source File: GetReadDepthDiff.py From smrtsv2 with MIT License | 6 votes |
def get_ref_contig_sizes(altref_file):
    """
    Get a Series of contigs lengths. Includes primary and alt contigs.

    :param altref_file: BED file of contig information where each record spans the whole contig. Must contain
        columns "#CHROM" and "END".

    :return: Series of contig lengths indexed by the contig name.

    :raises KeyError: If "#CHROM" or "END" is missing from the file.
    """

    # Read contig names as text: without the explicit dtype, a file whose
    # contigs are all numeric ("1", "2", ...) would get an integer index and
    # lookups by contig name would fail.
    contig_table = pd.read_table(altref_file, header=0, dtype={'#CHROM': str})

    # Index the END coordinate (the contig length, since each record spans
    # the whole contig) by contig name.
    return contig_table.set_index('#CHROM')['END']
Example #6
Source File: test_parsers.py From Computable with MIT License | 6 votes |
def test_duplicate_columns(self):
    """Duplicate header names are mangled unless ``mangle_dupe_cols=False``.

    Checked for both the python and the C engine: by default (and with
    ``mangle_dupe_cols=True``) repeated names get ``.1``, ``.2`` suffixes;
    with ``mangle_dupe_cols=False`` they are kept as-is.
    """
    # Header row followed by three records (explicit "\n" keeps the rows
    # separate).
    data = ('A,A,B,B,B\n'
            '1,2,3,4,5\n'
            '6,7,8,9,10\n'
            '11,12,13,14,15\n')

    for engine in ['python', 'c']:
        # check default behaviour
        df = self.read_table(StringIO(data), sep=',', engine=engine)
        self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])

        df = self.read_table(StringIO(data), sep=',', engine=engine,
                             mangle_dupe_cols=False)
        self.assertEqual(list(df.columns), ['A', 'A', 'B', 'B', 'B'])

        df = self.read_table(StringIO(data), sep=',', engine=engine,
                             mangle_dupe_cols=True)
        self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2'])
Example #7
Source File: test_parsers.py From Computable with MIT License | 6 votes |
def test_1000_sep_with_decimal(self):
    """``thousands=','`` must work for values that also have a decimal part.

    ``2,334.01`` in a ``|``-separated table is expected to be parsed as the
    float 2334.01 by both ``read_csv`` and ``read_table``.
    """
    # One record per line, with the breaks spelled out as "\n".
    data = ('A|B|C\n'
            '1|2,334.01|5\n'
            '10|13|10.\n')
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334.01, 13],
        'C': [5, 10.]
    })

    df = self.read_csv(StringIO(data), sep='|', thousands=',')
    tm.assert_frame_equal(df, expected)

    df = self.read_table(StringIO(data), sep='|', thousands=',')
    tm.assert_frame_equal(df, expected)
Example #8
Source File: test_parsers.py From Computable with MIT License | 6 votes |
def test_iteration_open_handle(self):
    """Read the remainder of a file handle that was already partly iterated.

    Skipped on Python 3. After lines up to and including ``CCC`` have been
    consumed, the C engine is expected to raise on the handle, while the
    python engine is expected to return the remaining lines as a Series.
    """
    if PY3:
        raise nose.SkipTest("won't work in Python 3 {0}".format(sys.version_info))

    with tm.ensure_clean() as path:
        with open(path, 'wb') as sink:
            sink.write('AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG')

        with open(path, 'rb') as handle:
            # Advance the handle past the 'CCC' line.
            for row in handle:
                if 'CCC' in row:
                    break

            # The C engine must fail here; succeeding is the error case.
            c_engine_raised = False
            try:
                read_table(handle, squeeze=True, header=None, engine='c')
            except Exception:
                c_engine_raised = True
            if not c_engine_raised:
                raise ValueError('this should not happen')

            result = read_table(handle, squeeze=True, header=None,
                                engine='python')

            expected = Series(['DDD', 'EEE', 'FFF', 'GGG'])
            tm.assert_series_equal(result, expected)
Example #9
Source File: fileMerger.py From SUPPA with MIT License | 6 votes |
def merge_files(fl_lst, output, ext):
    """Concatenate several tab-separated tables column-wise into one file.

    Each input is read with its first column as the index and its first row
    as the header. Every column is renamed to ``<file stem>_<column>``,
    where the stem is the base name up to the first dot. The result is
    written to ``<output>.<ext>``: a header line holding only the data
    column names, then the rows with ``nan`` for missing values.
    """
    target = "%s.%s" % (output, ext)

    tables = []
    for path in fl_lst:
        table = pd.read_table(path, sep='\t', index_col=0, header=0)
        stem = os.path.basename(path).split(".")[0]
        renamed = {col: stem + "_" + col for col in table.columns.values}
        table.rename(columns=renamed, inplace=True)
        tables.append(table)

    combined = pd.concat(tables, axis=1)

    # The header line is written by hand (no index label); the rows are
    # appended afterwards without a header.
    with open(target, "w+") as fh:
        fh.write("\t".join(combined.columns.values) + "\n")
    with open(target, "a") as fh:
        combined.to_csv(fh, sep="\t", na_rep="nan", header=False)
Example #10
Source File: lastfm.py From implicit with MIT License | 6 votes |
def _read_dataframe(filename):
    """ Loads the raw TSV dataset into a pandas dataframe with the columns
    user / artist / plays """
    # pandas is imported here rather than at module level so that it is
    # only required when this function is actually called
    import pandas

    began = time.time()
    log.debug("reading data from %s", filename)

    # keep fields 0, 2 and 3 of each row
    frame = pandas.read_table(filename,
                              usecols=[0, 2, 3],
                              names=['user', 'artist', 'plays'],
                              na_filter=False)

    # convert the user and artist columns to the categorical dtype
    for column in ('user', 'artist'):
        frame[column] = frame[column].astype("category")

    log.debug("read data file in %s", time.time() - began)
    return frame
Example #11
Source File: reddit.py From implicit with MIT License | 6 votes |
def _read_dataframe(filename):
    """ Loads the raw TSV dataset into a pandas dataframe with the columns
    user / item / rating """
    # pandas is imported here rather than at module level so that it is
    # only required when this function is actually called
    import pandas

    began = time.time()
    log.debug("reading data from %s", filename)

    # keep fields 0, 1 and 3 of each row
    frame = pandas.read_table(filename,
                              usecols=[0, 1, 3],
                              names=['user', 'item', 'rating'])

    # convert the user and item columns to the categorical dtype
    for column in ('user', 'item'):
        frame[column] = frame[column].astype("category")

    log.debug("read data file in %s", time.time() - began)
    return frame
Example #12
Source File: million_song_dataset.py From implicit with MIT License | 6 votes |
def _read_triplets_dataframe(filename): """ Reads the original dataset TSV as a pandas dataframe """ # delay importing this to avoid another dependency import pandas # read in triples of user/artist/playcount from the input dataset # get a model based off the input params start = time.time() log.debug("reading data from %s", filename) data = pandas.read_table("train_triplets.txt", names=['user', 'track', 'plays']) # map each artist and user to a unique numeric value data['user'] = data['user'].astype("category") data['track'] = data['track'].astype("category") # store as a CSR matrix log.debug("read data file in %s", time.time() - start) return data
Example #13
Source File: test_parsers.py From Computable with MIT License | 5 votes |
def read_table(self, *args, **kwds):
    """Forward to the module-level ``read_table`` with ``engine='c'`` and
    ``low_memory=False`` forced, overriding any caller-supplied values."""
    forced = dict(kwds, engine='c', low_memory=False)
    return read_table(*args, **forced)
Example #14
Source File: test_parsers.py From Computable with MIT License | 5 votes |
def read_table(self, *args, **kwds):
    """Forward to the module-level ``read_table`` with ``engine='c'``,
    ``low_memory=True`` and ``buffer_lines=2`` forced, overriding any
    caller-supplied values."""
    forced = dict(kwds, engine='c', low_memory=True, buffer_lines=2)
    return read_table(*args, **forced)
Example #15
Source File: diff_tools.py From SUPPA with MIT License | 5 votes |
def _dpsi_temp_order(path):
    """Sort key: order temp files by their numeric suffix after the last dot."""
    suffix = path.rsplit(".", 1)[-1]
    if suffix.isdigit():
        return (0, int(suffix), "")
    # No numeric suffix: fall back to ordering by the last character.
    return (1, 0, path[-1:])


def merge_temp_output_files(output):
    """Merge every ``*.dpsi.temp.*`` file into ``<output>.dpsi`` and delete them.

    The temp files are looked up in the directory of ``output`` when it is
    an absolute path, otherwise in the current working directory. They are
    read as tab-separated tables (first column as index, first row as
    header) and outer-joined on the index. The result is written with a
    header line holding only the data column names and ``nan`` for missing
    values.

    :param output: Output path without the ``.dpsi`` extension.
    :return: Absolute path of the written ``.dpsi`` file.
    :raises TypeError: From ``reduce`` when no temp file is found.
    """
    # Set working directory
    if os.path.isabs(output):
        current_path = os.path.dirname(output)
    else:
        current_path = os.getcwd()

    dpsi_files = [os.path.join(current_path, fl)
                  for fl in os.listdir(current_path)
                  if ".dpsi.temp." in fl]

    # Order by the whole numeric suffix. Sorting on the last character only
    # put e.g. "...temp.10" before "...temp.2".
    dpsi_files.sort(key=_dpsi_temp_order)

    df_lst = [pd.read_table(path, sep='\t', index_col=0, header=0)
              for path in dpsi_files]

    merged_dpsi_results = reduce(
        lambda left, right: pd.merge(left, right, left_index=True,
                                     right_index=True, how='outer'),
        df_lst)

    out_path = "%s.dpsi" % output
    with open(out_path, "w") as fh:
        # The header line carries only the data column names (no index label).
        fh.write("\t".join(merged_dpsi_results.columns.values) + "\n")
        merged_dpsi_results.to_csv(fh, sep="\t", na_rep="nan", header=False)

    # Delete the temp files that were merged
    for path in dpsi_files:
        os.remove(path)

    return os.path.abspath(out_path)
Example #16
Source File: test_parsers.py From Computable with MIT License | 5 votes |
def test_regex_separator(self):
    """A whitespace regex separator must match the comma-separated parse.

    The table is read with ``sep=r'\\s+'`` and compared with the same text
    after every run of spaces has been replaced by a comma and read by
    ``read_csv`` with the first column as the index.
    """
    # The header row is indented (it has one label fewer than the data
    # rows), so after the substitution it starts with an empty field.
    # Line breaks are written as explicit "\n".
    data = ('   A   B   C   D\n'
            'a   1   2   3   4\n'
            'b   1   2   3   4\n'
            'c   1   2   3   4\n')

    # Raw string: '\s' in a normal string literal is an invalid escape.
    df = self.read_table(StringIO(data), sep=r'\s+')
    expected = self.read_csv(StringIO(re.sub('[ ]+', ',', data)),
                             index_col=0)

    # assertTrue replaces the deprecated alias assert_ (removed in Python 3.12).
    self.assertTrue(expected.index.name is None)
    tm.assert_frame_equal(df, expected)
Example #17
Source File: diff_tools.py From SUPPA with MIT License | 5 votes |
def write_psivec_file(psi_lst, output):
    """Outer-join several PSI tables on their index and write ``<output>.psivec``.

    Each input is read as a tab-separated table with its first line skipped
    and its first column as the index. Columns are renamed to
    ``<file stem>_<column label>``, where the stem is the base name up to
    the first dot. The output has a header line holding only the data
    column names, then the rows with ``nan`` for missing values.

    Returns the absolute path of the written file.
    """
    target = "%s.psivec" % output

    tables = []
    for path in psi_lst:
        table = pd.read_table(path, sep='\t', skiprows=[0], index_col=0, header=None)
        stem = os.path.basename(path).split(".")[0]
        relabel = {col: stem + "_" + str(col) for col in table.columns.values}
        table.rename(columns=relabel, inplace=True)
        tables.append(table)

    def outer_join(left, right):
        return pd.merge(left, right, left_index=True, right_index=True, how='outer')

    merged = reduce(outer_join, tables)

    # The header line is written by hand (no index label); the rows are
    # appended afterwards without a header.
    with open(target, "w+") as fh:
        fh.write("\t".join(merged.columns.values) + "\n")
    with open(target, "a") as fh:
        merged.to_csv(fh, sep="\t", na_rep="nan", header=False)

    return os.path.abspath(target)
Example #18
Source File: test_parsers.py From Computable with MIT License | 5 votes |
def test_no_header_prefix(self):
    """``prefix='Field'`` with ``header=None`` must label columns Field0..Field4."""
    # Three records, one per line (explicit "\n" keeps the rows separate).
    data = ('1,2,3,4,5\n'
            '6,7,8,9,10\n'
            '11,12,13,14,15\n')
    df_pref = self.read_table(StringIO(data), sep=',', prefix='Field',
                              header=None)

    expected = [[1, 2, 3, 4, 5.],
                [6, 7, 8, 9, 10],
                [11, 12, 13, 14, 15]]
    tm.assert_almost_equal(df_pref.values, expected)

    # assertTrue replaces the deprecated alias assert_ (removed in Python 3.12).
    self.assertTrue(np.array_equal(
        df_pref.columns,
        ['Field0', 'Field1', 'Field2', 'Field3', 'Field4']))
Example #19
Source File: test_parsers.py From Computable with MIT License | 5 votes |
def test_quoting(self):
    """An unterminated quoted field must raise; closing the quote must parse.

    The fourth record opens a quoted field that is not closed within the
    text, so reading it is expected to fail. Appending one closing quote is
    expected to give a frame of three rows.
    """
    # Header plus six tab-separated records, one per line. The breaks are
    # written via join so each record stays on its own row.
    bad_line_small = '\n'.join([
        'printer\tresult\tvariant_name',
        'Klosterdruckerei\tKlosterdruckerei <Salem> (1611-1804)\tMuller, Jacob',
        'Klosterdruckerei\tKlosterdruckerei <Salem> (1611-1804)\tMuller, Jakob',
        'Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\t"Furststiftische Hofdruckerei, <Kempten""',
        'Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\tGaller, Alois',
        'Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\tHochfurstliche Buchhandlung <Kempten>',
    ])
    self.assertRaises(Exception, self.read_table, StringIO(bad_line_small),
                      sep='\t')

    good_line_small = bad_line_small + '"'
    df = self.read_table(StringIO(good_line_small), sep='\t')
    # assertEqual replaces the deprecated alias assert_ (removed in Python
    # 3.12) and reports both values on failure.
    self.assertEqual(len(df), 3)
Example #20
Source File: DyStockDataGateway.py From DevilYuan with MIT License | 5 votes |
def _getDaysFrom163(self, code, startDate, endDate, retry_count=3, pause=0.001): """ 从网易获取个股日线数据,指数和基金(ETF)除外 @code: DevilYuan Code """ symbol = ('0' + code[:6]) if code[-2:] == 'SH' else ('1' + code[:6]) for _ in range(retry_count): sleep(pause) try: url = 'http://quotes.money.163.com/service/chddata.html?code={}&start={}&end={}&fields=TCLOSE;HIGH;LOW;TOPEN;TURNOVER;VOTURNOVER;VATURNOVER' url = url.format(symbol, startDate.replace('-', ''), endDate.replace('-', '')) re = Request(url) lines = urlopen(re, timeout=10).read() lines = lines.decode('GBK') df = pd.read_table(StringIO(lines), sep=',', names=['date', 'code', 'name', 'close', 'high', 'low', 'open', 'turnover', 'volume', 'amount'], skiprows=[0]) except Exception as e: print(e) ex = e else: df = df[['date', 'open', 'high', 'close', 'low', 'volume', 'amount', 'turnover']] # return columns df = df.set_index('date') df = df.sort_index(ascending=False) return df raise ex
Example #21
Source File: test_parsers.py From Computable with MIT License | 5 votes |
def test_squeeze(self):
    """``squeeze=True`` on a single data column must return a Series.

    The first field of each record is used as the index, the second as the
    value.
    """
    # Three two-field records, one per line (explicit "\n" keeps the rows
    # separate).
    data = ('a,1\n'
            'b,2\n'
            'c,3\n')
    expected = Series([1, 2, 3], ['a', 'b', 'c'])

    result = self.read_table(StringIO(data), sep=',', index_col=0,
                             header=None, squeeze=True)
    tm.assert_isinstance(result, Series)
    tm.assert_series_equal(result, expected)
Example #22
Source File: test_parsers.py From Computable with MIT License | 5 votes |
def test_1000_sep_with_decimal(self):
    """``thousands`` and ``decimal`` must work together, in either role.

    The same expected frame has to come out of a table using ``,`` for
    thousands and ``.`` for decimals, and of one using the characters the
    other way round.
    """
    # One record per line, with the breaks spelled out as "\n".
    data = ('A|B|C\n'
            '1|2,334.01|5\n'
            '10|13|10.\n')
    expected = DataFrame({
        'A': [1, 10],
        'B': [2334.01, 13],
        'C': [5, 10.]
    })

    tm.assert_equal(expected.A.dtype, 'int64')
    tm.assert_equal(expected.B.dtype, 'float')
    tm.assert_equal(expected.C.dtype, 'float')

    df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.')
    tm.assert_frame_equal(df, expected)

    df = self.read_table(StringIO(data), sep='|', thousands=',', decimal='.')
    tm.assert_frame_equal(df, expected)

    # Same values with '.' as thousands separator and ',' as decimal mark.
    data_with_odd_sep = ('A|B|C\n'
                         '1|2.334,01|5\n'
                         '10|13|10,\n')

    df = self.read_csv(StringIO(data_with_odd_sep),
                       sep='|', thousands='.', decimal=',')
    tm.assert_frame_equal(df, expected)

    df = self.read_table(StringIO(data_with_odd_sep),
                         sep='|', thousands='.', decimal=',')
    tm.assert_frame_equal(df, expected)
Example #23
Source File: test_parsers.py From Computable with MIT License | 5 votes |
def read_table(self, *args, **kwargs):
    """Placeholder reader: always raises ``NotImplementedError``.

    Accepts any positional and keyword arguments and ignores them.
    """
    raise NotImplementedError
Example #24
Source File: test_paradigm.py From nistats with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_read_events():
    """ test that a events for an experimental paradigm are correctly read.

    Each paradigm is written with ``write_events`` and read back with
    ``pd.read_table``; the ``onset`` column must be unchanged.
    """
    import shutil
    import tempfile
    tmpdir = tempfile.mkdtemp()
    try:
        for events in (block_paradigm(),
                       modulated_event_paradigm(),
                       modulated_block_paradigm(),
                       basic_paradigm()):
            csvfile = write_events(events, tmpdir)
            read_paradigm = pd.read_table(csvfile)
            assert (read_paradigm['onset'] == events['onset']).all()
    finally:
        # Remove the directory created above, whether the checks pass or
        # fail; previously it was left behind on every run.
        shutil.rmtree(tmpdir, ignore_errors=True)
Example #25
Source File: test_parsers.py From vnpy_crypto with MIT License | 5 votes |
def read_table(self, *args, **kwds):
    """Forward to the module-level ``read_table`` with ``engine`` forced to
    ``self.engine``, overriding any caller-supplied value."""
    forced = dict(kwds, engine=self.engine)
    return read_table(*args, **forced)
Example #26
Source File: test_parsers.py From vnpy_crypto with MIT License | 5 votes |
def read_table(self, *args, **kwds):
    """Forward to the module-level ``read_table`` with ``engine`` forced to
    ``self.engine`` and ``low_memory=True``, overriding any caller-supplied
    values."""
    forced = dict(kwds, engine=self.engine, low_memory=True)
    return read_table(*args, **forced)
Example #27
Source File: test_parsers.py From vnpy_crypto with MIT License | 5 votes |
def read_table(self, *args, **kwargs):
    """Placeholder reader: always raises ``NotImplementedError``.

    Accepts any positional and keyword arguments and ignores them.
    """
    raise NotImplementedError
Example #28
Source File: test_multilevel.py From vnpy_crypto with MIT License | 5 votes |
def test_xs_level0(self):
    """``xs('a', level=0)`` must equal ``xs('a')`` on a frame parsed from text.

    The fixture has a header row of five column labels, a row of four
    index level names, and three data rows; two of them start with ``a``.
    """
    from pandas import read_table

    # One row per line; fields are separated by runs of whitespace, so the
    # column alignment is cosmetic. The text has no trailing newline.
    text = ('                      A       B       C       D        E\n'
            'one two three   four\n'
            'a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640\n'
            'a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744\n'
            'x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838')

    df = read_table(StringIO(text), sep=r'\s+', engine='python')

    result = df.xs('a', level=0)
    expected = df.xs('a')
    assert len(result) == 2
    tm.assert_frame_equal(result, expected)
Example #29
Source File: test_multilevel.py From vnpy_crypto with MIT License | 5 votes |
def test_xs_level_multiple(self):
    """``xs`` with several levels must equal chained single-level ``xs`` calls.

    Also checks that writing into the multi-level ``xs`` result raises
    ``SettingWithCopyError``, and (GH2107) that ``xs`` on an integer
    ``date`` level matches ``.loc`` on the same key.
    """
    from pandas import read_table

    # One row per line; fields are separated by runs of whitespace, so the
    # column alignment is cosmetic. The text has no trailing newline.
    text = ('                      A       B       C       D        E\n'
            'one two three   four\n'
            'a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640\n'
            'a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744\n'
            'x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838')

    df = read_table(StringIO(text), sep=r'\s+', engine='python')

    result = df.xs(('a', 4), level=['one', 'four'])
    expected = df.xs('a').xs(4, level='four')
    tm.assert_frame_equal(result, expected)

    # this is a copy in 0.14
    result = df.xs(('a', 4), level=['one', 'four'])

    # setting this will give a SettingWithCopyError
    # as we are trying to write a view
    def f(x):
        x[:] = 10

    pytest.raises(com.SettingWithCopyError, f, result)

    # GH2107
    dates = lrange(20111201, 20111205)
    ids = 'abcde'
    idx = MultiIndex.from_tuples(list(cart_product(dates, ids)))
    idx.names = ['date', 'secid']
    df = DataFrame(np.random.randn(len(idx), 3), idx, ['X', 'Y', 'Z'])

    rs = df.xs(20111201, level='date')
    xp = df.loc[20111201, :]
    tm.assert_frame_equal(rs, xp)
Example #30
Source File: test_formula.py From vnpy_crypto with MIT License | 5 votes |
def test_formula_labels():
    """The fitted values of a formula model must keep the data's index.

    Fits ``prestige ~ income + education`` on the Duncan data set (from the
    R package car) and checks that ``fittedvalues.index`` equals the index
    of the frame that was passed in.
    """
    from pandas import read_table

    # make sure labels pass through patsy as expected
    # data(Duncan) from car in R
    raw = StringIO(
        '"type" "income" "education" "prestige"\n'
        '"accountant" "prof" 62 86 82\n'
        '"pilot" "prof" 72 76 83\n'
        '"architect" "prof" 75 92 90\n'
        '"author" "prof" 55 90 76\n'
        '"chemist" "prof" 64 86 90\n'
        '"minister" "prof" 21 84 87\n'
        '"professor" "prof" 64 93 93\n'
        '"dentist" "prof" 80 100 90\n'
        '"reporter" "wc" 67 87 52\n'
        '"engineer" "prof" 72 86 88\n'
        '"undertaker" "prof" 42 74 57\n'
        '"lawyer" "prof" 76 98 89\n'
        '"physician" "prof" 76 97 97\n'
        '"welfare.worker" "prof" 41 84 59\n'
        '"teacher" "prof" 48 91 73\n'
        '"conductor" "wc" 76 34 38\n'
        '"contractor" "prof" 53 45 76\n'
        '"factory.owner" "prof" 60 56 81\n'
        '"store.manager" "prof" 42 44 45\n'
        '"banker" "prof" 78 82 92\n'
        '"bookkeeper" "wc" 29 72 39\n'
        '"mail.carrier" "wc" 48 55 34\n'
        '"insurance.agent" "wc" 55 71 41\n'
        '"store.clerk" "wc" 29 50 16\n'
        '"carpenter" "bc" 21 23 33\n'
        '"electrician" "bc" 47 39 53\n'
        '"RR.engineer" "bc" 81 28 67\n'
        '"machinist" "bc" 36 32 57\n'
        '"auto.repairman" "bc" 22 22 26\n'
        '"plumber" "bc" 44 25 29\n'
        '"gas.stn.attendant" "bc" 15 29 10\n'
        '"coal.miner" "bc" 7 7 15\n'
        '"streetcar.motorman" "bc" 42 26 19\n'
        '"taxi.driver" "bc" 9 19 10\n'
        '"truck.driver" "bc" 21 15 13\n'
        '"machine.operator" "bc" 21 20 24\n'
        '"barber" "bc" 16 26 20\n'
        '"bartender" "bc" 16 28 7\n'
        '"shoe.shiner" "bc" 9 17 3\n'
        '"cook" "bc" 14 22 16\n'
        '"soda.clerk" "bc" 12 30 6\n'
        '"watchman" "bc" 17 25 11\n'
        '"janitor" "bc" 7 20 8\n'
        '"policeman" "bc" 34 47 41\n'
        '"waiter" "bc" 8 32 10'
    )
    dta = read_table(raw, sep=" ")

    model = ols("prestige ~ income + education", dta).fit()
    assert_equal(model.fittedvalues.index, dta.index)