Python Examples of pandas.read_table

Source File: movielens_preprocess.py From striatum with BSD 2-Clause "Simplified" License

9 votes

def main():
    """Turn the raw ``movies.dat`` / ``ratings.dat`` files into tab-separated outputs.

    Reads both files from the working directory, joins ratings to the
    preprocessed movies on ``movie_id``, runs ``feature_extraction`` on the
    joined table and writes five ``.csv`` files (tab-separated) to the
    working directory.
    """
    # options shared by both '::'-delimited input files
    dat_options = {'sep': '::', 'engine': 'python'}

    # movies: load, then hand to the project's preprocessing step
    movie_columns = ['movie_id', 'movie_name', 'tag']
    movies = movie_preprocessing(
        pd.read_table('movies.dat', names=movie_columns, **dat_options))

    # ratings: load and attach the movie information
    rating_columns = ["user_id", "movie_id", "rating", "timestamp"]
    ratings = pd.read_table("ratings.dat", names=rating_columns, **dat_options)
    rated_movies = pd.merge(ratings, movies, on="movie_id")

    # feature extraction yields four artefacts, each written to its own file
    streaming_batch, user_feature, actions, reward_list = feature_extraction(rated_movies)
    streaming_batch.to_csv("streaming_batch.csv", sep='\t', index=False)
    user_feature.to_csv("user_feature.csv", sep='\t')  # index is kept for this one
    actions_frame = pd.DataFrame(actions, columns=['movie_id'])
    actions_frame.to_csv("actions.csv", sep='\t', index=False)
    reward_list.to_csv("reward_list.csv", sep='\t', index=False)

    # context rows: only the movies that appear among the actions
    is_action = movies['movie_id'].isin(actions)
    movies[is_action].to_csv("action_context.csv", sep='\t', index=False)

Source File: test_terminal_method_statistical_analysis.py From cellphonedb with MIT License

6 votes

def _assert_result(self, filename: str,
                       data: str,
                       iterations: int,
                       project_name: str,
                       result_means_filename: str,
                       debug_seed: int,
                       threshold: float,
                       result_precision: int
                       ) -> None:
        """Check a produced means file against its stored reference, then remove it.

        The reference file name is derived from the run parameters; the
        threshold is embedded with its decimal point stripped.  The produced
        file is read from ``output_test_dir/project_name`` and passed to
        ``self.remove_file`` once the comparison has succeeded.
        """
        threshold_tag = str(threshold).replace('.', '')

        reference_name = (
            'statistical_analysis__{}_result__'
            'data-{}_it-{}_seed-{}_threshold-{}_precision-{}.txt'
        ).format(filename, data, iterations, debug_seed, threshold_tag, result_precision)
        reference_path = os.path.realpath('{}/{}'.format(data_test_dir, reference_name))
        produced_path = '{}/{}/{}'.format(output_test_dir, project_name, result_means_filename)

        original_means = pd.read_table(reference_path)
        result_means = pd.read_table(produced_path)

        matches = dataframe_functions.dataframes_has_same_data(result_means, original_means)
        failure_note = 'failed comparing {} with {}'.format(reference_name, result_means_filename)
        self.assertTrue(matches, msg=failure_note)

        self.remove_file(produced_path)

Source File: test_parsers.py From Computable with MIT License

6 votes

def test_1000_sep(self):
        """``thousands=','`` parses ``2,334`` as 2334 in both readers."""
        data = """A|B|C
1|2,334|5
10|13|10.
"""
        expected = DataFrame({'A': [1, 10], 'B': [2334, 13], 'C': [5, 10.]})

        # read_csv first, then read_table, each on a fresh buffer
        for reader in (self.read_csv, self.read_table):
            parsed = reader(StringIO(data), sep='|', thousands=',')
            tm.assert_frame_equal(parsed, expected)

Source File: test_parsers.py From Computable with MIT License

6 votes

def test_no_header(self):
        """Headerless input: default, prefixed and explicitly named columns.

        The same three-row CSV text is parsed three ways (``header=None``,
        ``header=None`` with ``prefix='X'``, and ``names=...``); the values
        must agree and the column labels must match each mode.
        """
        data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
        df = self.read_table(StringIO(data), sep=',', header=None)
        df_pref = self.read_table(StringIO(data), sep=',', prefix='X',
                                  header=None)

        names = ['foo', 'bar', 'baz', 'quux', 'panda']
        df2 = self.read_table(StringIO(data), sep=',', names=names)
        expected = [[1, 2, 3, 4, 5.],
                    [6, 7, 8, 9, 10],
                    [11, 12, 13, 14, 15]]
        tm.assert_almost_equal(df.values, expected)
        tm.assert_almost_equal(df.values, df2.values)

        # assertTrue replaces the deprecated ``assert_`` alias, which newer
        # unittest versions no longer provide.
        self.assertTrue(np.array_equal(df_pref.columns,
                                       ['X0', 'X1', 'X2', 'X3', 'X4']))
        self.assertTrue(np.array_equal(df.columns, lrange(5)))

        self.assertTrue(np.array_equal(df2.columns, names))

Source File: GetReadDepthDiff.py From smrtsv2 with MIT License

6 votes

def get_ref_contig_sizes(altref_file):
    """
    Load contig lengths from a BED-like table.

    :param altref_file: Tab-delimited file with a header row; it must provide the columns "#CHROM"
        and "END". Each record is expected to span one whole contig (primary or alt).

    :return: Series holding the "END" values, indexed by the "#CHROM" values.
    """

    contig_table = pd.read_table(altref_file, header=0)

    # Take the lengths column and label it with the contig names
    contig_lengths = contig_table['END']
    contig_lengths.index = contig_table['#CHROM']

    return contig_lengths

Source File: test_parsers.py From Computable with MIT License

6 votes

def test_duplicate_columns(self):
        """Duplicate header names are mangled by default and kept when mangling is off."""
        data = """A,A,B,B,B
    1,2,3,4,5
    6,7,8,9,10
    11,12,13,14,15
    """
        mangled = ['A', 'A.1', 'B', 'B.1', 'B.2']
        verbatim = ['A', 'A', 'B', 'B', 'B']

        # (extra keyword arguments, expected labels); the first entry checks
        # the default behaviour
        scenarios = [
            ({}, mangled),
            ({'mangle_dupe_cols': False}, verbatim),
            ({'mangle_dupe_cols': True}, mangled),
        ]

        for engine in ['python', 'c']:
            for extra, labels in scenarios:
                df = self.read_table(StringIO(data), sep=',', engine=engine, **extra)
                self.assertEqual(list(df.columns), labels)

Source File: test_parsers.py From Computable with MIT License

6 votes

def test_1000_sep_with_decimal(self):
        """``thousands=','`` still works when the value also has a decimal part."""
        data = """A|B|C
1|2,334.01|5
10|13|10.
"""

        expected = DataFrame({'A': [1, 10], 'B': [2334.01, 13], 'C': [5, 10.]})

        # read_csv first, then read_table, each on a fresh buffer
        for reader in (self.read_csv, self.read_table):
            parsed = reader(StringIO(data), sep='|', thousands=',')
            tm.assert_frame_equal(parsed, expected)

Source File: test_parsers.py From Computable with MIT License

6 votes

def test_iteration_open_handle(self):
        """Parse the remainder of a file handle that was already partly iterated.

        Skipped when ``PY3`` is true.  After the handle has been advanced past
        the ``CCC`` line, the C engine call is expected to raise, while the
        python engine must return the remaining four lines as a Series.
        """
        if PY3:
            raise nose.SkipTest("won't work in Python 3 {0}".format(sys.version_info))

        with tm.ensure_clean() as path:
            # seven lines, no trailing newline
            with open(path, 'wb') as f:
                f.write('AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG')

            with open(path, 'rb') as f:
                # consume lines up to and including the one containing 'CCC'
                for line in f:
                    if 'CCC' in line:
                        break

                # Any exception from the C engine is accepted here; the
                # ValueError in the else branch fires only when no exception
                # was raised at all (it is not caught by the except above it).
                try:
                    read_table(f, squeeze=True, header=None, engine='c')
                except Exception:
                    pass
                else:
                    raise ValueError('this should not happen')

                # same handle, python engine: must pick up after 'CCC'
                result = read_table(f, squeeze=True, header=None,
                                    engine='python')

                expected = Series(['DDD', 'EEE', 'FFF', 'GGG'])
                tm.assert_series_equal(result, expected)

Source File: fileMerger.py From SUPPA with MIT License

6 votes

def merge_files(fl_lst, output, ext):
    """Join several tab-separated tables column-wise into ``<output>.<ext>``.

    Each input is read with its first column as the index and its first row
    as the header.  Every column is renamed to ``<file stem>_<column>``, where
    the stem is the base name up to its first dot.  The output starts with a
    header line listing only the data columns, followed by the rows with
    missing values written as ``nan``.
    """

    def _load_prefixed(path):
        """Read one table and prefix its column names with the file stem."""
        table = pd.read_table(path, sep='\t', index_col=0, header=0)
        stem = os.path.basename(path).split(".")[0]
        renames = {col_id: stem+"_"+col_id for col_id in table.columns.values}
        return table.rename(columns=renames)

    merged_dfs = pd.concat([_load_prefixed(fl) for fl in fl_lst], axis=1)

    target = "%s.%s" % (output, ext)
    with open(target, "w+") as fh:
        # header line carries no label for the index column
        fh.write("\t".join(merged_dfs.columns.values)+"\n")
        merged_dfs.to_csv(fh, sep="\t", na_rep="nan", header=False)

Source File: lastfm.py From implicit with MIT License

6 votes

def _read_dataframe(filename):
    """ Reads the original dataset TSV as a pandas dataframe """
    # delay importing this to avoid another dependency
    import pandas

    # read in triples of user/artist/playcount from the input dataset
    # get a model based off the input params
    start = time.time()
    log.debug("reading data from %s", filename)
    data = pandas.read_table(filename,
                             usecols=[0, 2, 3],
                             names=['user', 'artist', 'plays'],
                             na_filter=False)

    # map each artist and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['artist'] = data['artist'].astype("category")

    # store as a CSR matrix
    log.debug("read data file in %s", time.time() - start)

    return data

Source File: reddit.py From implicit with MIT License

6 votes

def _read_dataframe(filename):
    """ Reads the original dataset TSV as a pandas dataframe """
    # delay importing this to avoid another dependency
    import pandas

    # read in triples of user/artist/playcount from the input dataset
    # get a model based off the input params
    start = time.time()
    log.debug("reading data from %s", filename)
    data = pandas.read_table(filename, usecols=[0, 1, 3], names=['user', 'item', 'rating'])

    # map each artist and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['item'] = data['item'].astype("category")

    # store as a CSR matrix
    log.debug("read data file in %s", time.time() - start)
    return data

Source File: million_song_dataset.py From implicit with MIT License

6 votes

def _read_triplets_dataframe(filename):
    """ Reads the original dataset TSV as a pandas dataframe """
    # delay importing this to avoid another dependency
    import pandas

    # read in triples of user/artist/playcount from the input dataset
    # get a model based off the input params
    start = time.time()
    log.debug("reading data from %s", filename)
    data = pandas.read_table("train_triplets.txt", names=['user', 'track', 'plays'])

    # map each artist and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['track'] = data['track'].astype("category")

    # store as a CSR matrix
    log.debug("read data file in %s", time.time() - start)

    return data

Source File: test_parsers.py From Computable with MIT License

5 votes

def read_table(self, *args, **kwds):
        """Forward to the module-level ``read_table`` with ``engine='c'`` and ``low_memory=False`` forced."""
        # forced values win over anything the caller supplied
        forced = dict(kwds, engine='c', low_memory=False)
        return read_table(*args, **forced)

Source File: test_parsers.py From Computable with MIT License

5 votes

def read_table(self, *args, **kwds):
        """Forward to the module-level ``read_table`` with ``engine='c'``, ``low_memory=True`` and ``buffer_lines=2`` forced."""
        # forced values win over anything the caller supplied
        forced = dict(kwds, engine='c', low_memory=True, buffer_lines=2)
        return read_table(*args, **forced)

Source File: diff_tools.py From SUPPA with MIT License

5 votes

def merge_temp_output_files(output):
    """Merge all ``*.dpsi.temp.*`` part files into ``<output>.dpsi`` and delete the parts.

    Part files are looked up in the directory of *output* when it is an
    absolute path, otherwise in the current working directory.  Each part is
    read as a tab-separated table indexed by its first column, and the parts
    are outer-joined on that index.  The result is written with a header line
    that lists only the data columns; missing values are written as ``nan``.
    When no part file is found, no output file is written.

    :param output: Output path without the ``.dpsi`` extension.
    :return: Absolute path of ``<output>.dpsi``.
    """

    # Set working directory
    if os.path.isabs(output):
        current_path = os.path.dirname(output)+"/"
    else:
        current_path = os.getcwd()+"/"

    dpsi_files = [current_path+fl for fl in os.listdir(current_path)
                  if ".dpsi.temp." in fl]

    # NOTE(review): parts are ordered by the last character of their path
    # only, which presumably breaks down with ten or more parts - confirm
    # against the code that names the temp files.
    dpsi_files.sort(key=lambda x: x[-1])

    df_lst = [pd.read_table(fl, sep='\t', index_col=0, header=0)
              for fl in dpsi_files]

    # Merge and write once, after every part is loaded.  Doing this inside
    # the read loop rebuilt and rewrote the whole output for each part, and
    # only the last rewrite survived.
    if df_lst:
        merged_dpsi_results = reduce(lambda left, right: pd.merge(left, right,
                                                                  left_index=True, right_index=True,
                                                                  how='outer'), df_lst)

        with open("%s.dpsi" % output, "w") as fh:
            # header line carries no label for the index column
            fh.write("\t".join(merged_dpsi_results.columns.values)+"\n")
            merged_dpsi_results.to_csv(fh, sep="\t", na_rep="nan", header=False)

    # Delete temp files
    for fl in os.listdir(current_path):
        if ".dpsi.temp." in fl:
            os.remove(current_path+fl)

    return os.path.abspath("%s.dpsi" % output)

Source File: test_parsers.py From Computable with MIT License

5 votes

def test_regex_separator(self):
        """A whitespace-regex separator gives the same frame as the comma-separated equivalent.

        The expected frame is built by collapsing each run of spaces in the
        same text to a comma and reading it with the first column as index.
        """
        data = """   A   B   C   D
a   1   2   3   4
b   1   2   3   4
c   1   2   3   4
"""
        # raw string: '\s' is not a valid escape in a normal string literal
        df = self.read_table(StringIO(data), sep=r'\s+')
        expected = self.read_csv(StringIO(re.sub('[ ]+', ',', data)),
                                 index_col=0)
        # assertTrue replaces the deprecated ``assert_`` alias
        self.assertTrue(expected.index.name is None)
        tm.assert_frame_equal(df, expected)

Source File: diff_tools.py From SUPPA with MIT License

5 votes

def write_psivec_file(psi_lst, output):
    """Outer-join several tab-separated tables on their first column into ``<output>.psivec``.

    The first line of every input is skipped and the remaining rows are read
    without a header, so the data columns carry integer labels.  Each label is
    renamed to ``<file stem>_<label>``, where the stem is the base name up to
    its first dot.  The output starts with a header line listing only the
    data columns; missing values are written as ``nan``.

    Returns the absolute path of the written file.
    """

    def _load_prefixed(path):
        """Read one table and prefix its column labels with the file stem."""
        table = pd.read_table(path, sep='\t', skiprows=[0], index_col=0, header=None)
        stem = os.path.basename(path).split(".")[0]
        renames = {col_id: stem+"_"+str(col_id) for col_id in table.columns.values}
        return table.rename(columns=renames)

    def _outer_join(left, right):
        """Join two tables on their index, keeping rows present in either."""
        return pd.merge(left, right, left_index=True, right_index=True, how='outer')

    merged_psi_results = reduce(_outer_join, [_load_prefixed(fl) for fl in psi_lst])

    target = "%s.psivec" % output
    with open(target, "w+") as fh:
        # header line carries no label for the index column
        fh.write("\t".join(merged_psi_results.columns.values)+"\n")
        merged_psi_results.to_csv(fh, sep="\t", na_rep="nan", header=False)

    return os.path.abspath(target)

Source File: test_parsers.py From Computable with MIT License

5 votes

def test_no_header_prefix(self):
        """``prefix='Field'`` with ``header=None`` labels the columns Field0..Field4."""
        data = """1,2,3,4,5
6,7,8,9,10
11,12,13,14,15
"""
        df_pref = self.read_table(StringIO(data), sep=',', prefix='Field',
                                  header=None)

        expected = [[1, 2, 3, 4, 5.],
                    [6, 7, 8, 9, 10],
                    [11, 12, 13, 14, 15]]
        tm.assert_almost_equal(df_pref.values, expected)

        # assertTrue replaces the deprecated ``assert_`` alias
        self.assertTrue(np.array_equal(df_pref.columns,
                                       ['Field0', 'Field1', 'Field2', 'Field3', 'Field4']))

Source File: test_parsers.py From Computable with MIT License

5 votes

def test_quoting(self):
        """Malformed quoting in the input raises; adding one closing quote makes it parse to 3 rows."""
        bad_line_small = """printer\tresult\tvariant_name
Klosterdruckerei\tKlosterdruckerei <Salem> (1611-1804)\tMuller, Jacob
Klosterdruckerei\tKlosterdruckerei <Salem> (1611-1804)\tMuller, Jakob
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\t"Furststiftische Hofdruckerei,  <Kempten""
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\tGaller, Alois
Klosterdruckerei\tKlosterdruckerei <Kempten> (1609-1805)\tHochfurstliche Buchhandlung <Kempten>"""
        self.assertRaises(Exception, self.read_table, StringIO(bad_line_small),
                          sep='\t')

        # appending one quote character is enough for the text to parse
        good_line_small = bad_line_small + '"'
        df = self.read_table(StringIO(good_line_small), sep='\t')
        # assertEqual replaces the deprecated ``assert_`` alias and reports
        # the actual row count on failure
        self.assertEqual(len(df), 3)

Source File: DyStockDataGateway.py From DevilYuan with MIT License

5 votes

def _getDaysFrom163(self, code, startDate, endDate, retry_count=3, pause=0.001):
        """
            从网易获取个股日线数据，指数和基金（ETF）除外
            @code: DevilYuan Code

        """
        symbol = ('0' + code[:6]) if code[-2:] == 'SH' else ('1' + code[:6])

        for _ in range(retry_count):
            sleep(pause)
            try:
                url = 'http://quotes.money.163.com/service/chddata.html?code={}&start={}&end={}&fields=TCLOSE;HIGH;LOW;TOPEN;TURNOVER;VOTURNOVER;VATURNOVER'
                url = url.format(symbol, startDate.replace('-', ''), endDate.replace('-', ''))
                re = Request(url)
                lines = urlopen(re, timeout=10).read()
                lines = lines.decode('GBK') 
                df = pd.read_table(StringIO(lines),
                                   sep=',',
                                   names=['date', 'code', 'name', 'close', 'high', 'low', 'open', 'turnover', 'volume', 'amount'],
                                   skiprows=[0])
            except Exception as e:
                print(e)
                ex = e
            else:
                df = df[['date', 'open', 'high', 'close', 'low', 'volume', 'amount', 'turnover']] # return columns
                df = df.set_index('date')
                df = df.sort_index(ascending=False)
                return df
        raise ex

Source File: test_parsers.py From Computable with MIT License

5 votes

def test_squeeze(self):
        """``squeeze=True`` on index-plus-one-column input returns a Series."""
        data = """\
a,1
b,2
c,3
"""
        parsed = self.read_table(StringIO(data), sep=',', index_col=0,
                                 header=None, squeeze=True)

        tm.assert_isinstance(parsed, Series)
        tm.assert_series_equal(parsed, Series([1, 2, 3], ['a', 'b', 'c']))

Source File: test_parsers.py From Computable with MIT License

5 votes

def test_1000_sep_with_decimal(self):
        """Thousands and decimal markers are honoured, also when their roles are swapped."""
        data = """A|B|C
1|2,334.01|5
10|13|10.
"""
        data_with_odd_sep = """A|B|C
1|2.334,01|5
10|13|10,
"""
        expected = DataFrame({'A': [1, 10], 'B': [2334.01, 13], 'C': [5, 10.]})

        # sanity-check the dtypes of the reference frame itself
        for column, dtype_name in (('A', 'int64'), ('B', 'float'), ('C', 'float')):
            tm.assert_equal(expected[column].dtype, dtype_name)

        # (text, thousands marker, decimal marker)
        cases = [(data, ',', '.'), (data_with_odd_sep, '.', ',')]
        for text, thousands, decimal in cases:
            # read_csv first, then read_table, each on a fresh buffer
            for reader in (self.read_csv, self.read_table):
                df = reader(StringIO(text), sep='|', thousands=thousands, decimal=decimal)
                tm.assert_frame_equal(df, expected)

Source File: test_parsers.py From Computable with MIT License

5 votes

def read_table(self, *args, **kwargs):
        """Placeholder reader: accepts any arguments and always raises ``NotImplementedError``."""
        raise NotImplementedError()

Source File: test_paradigm.py From nistats with BSD 3-Clause "New" or "Revised" License

5 votes

def test_read_events():
    """ test that events for an experimental paradigm are correctly read.

    Each paradigm is written with ``write_events`` into a temporary directory
    and read back with ``pd.read_table``; the ``onset`` column must survive
    the round trip unchanged.
    """
    import shutil
    import tempfile
    tmpdir = tempfile.mkdtemp()
    try:
        for events in (block_paradigm(),
                       modulated_event_paradigm(),
                       modulated_block_paradigm(),
                       basic_paradigm()):
            csvfile = write_events(events, tmpdir)
            read_paradigm = pd.read_table(csvfile)
            assert (read_paradigm['onset'] == events['onset']).all()
    finally:
        # mkdtemp leaves cleanup to the caller; remove the directory even
        # when an assertion fails.  ignore_errors keeps a cleanup problem
        # from masking the real test failure.
        shutil.rmtree(tmpdir, ignore_errors=True)

Source File: test_parsers.py From vnpy_crypto with MIT License

5 votes

def read_table(self, *args, **kwds):
        """Forward to the module-level ``read_table`` with ``engine`` forced to ``self.engine``."""
        # the forced engine wins over anything the caller supplied
        forced = dict(kwds, engine=self.engine)
        return read_table(*args, **forced)

Source File: test_parsers.py From vnpy_crypto with MIT License

5 votes

def read_table(self, *args, **kwds):
        """Forward to the module-level ``read_table`` with ``engine=self.engine`` and ``low_memory=True`` forced."""
        # forced values win over anything the caller supplied
        forced = dict(kwds, engine=self.engine, low_memory=True)
        return read_table(*args, **forced)

Source File: test_parsers.py From vnpy_crypto with MIT License

5 votes

def read_table(self, *args, **kwargs):
        """Placeholder reader: accepts any arguments and always raises ``NotImplementedError``."""
        raise NotImplementedError()

Source File: test_multilevel.py From vnpy_crypto with MIT License

5 votes

def test_xs_level0(self):
        """``xs('a', level=0)`` selects the same two rows as plain ``xs('a')``."""
        from pandas import read_table
        text = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""

        frame = read_table(StringIO(text), sep=r'\s+', engine='python')

        by_level = frame.xs('a', level=0)
        by_key = frame.xs('a')

        assert len(by_level) == 2
        tm.assert_frame_equal(by_level, by_key)

Source File: test_multilevel.py From vnpy_crypto with MIT License

5 votes

def test_xs_level_multiple(self):
        """Cross-sections over several index levels, plus a by-name level lookup (GH2107).

        Three checks run in order: a two-level ``xs`` equals two chained
        single-level ``xs`` calls; writing into the two-level ``xs`` result
        raises ``SettingWithCopyError``; and ``xs`` on the ``date`` level of a
        generated frame equals ``.loc`` on the same key.
        """
        from pandas import read_table
        text = """                      A       B       C       D        E
one two three   four
a   b   10.0032 5    -0.5109 -2.3358 -0.4645  0.05076  0.3640
a   q   20      4     0.4473  1.4152  0.2834  1.00661  0.1744
x   q   30      3    -0.6662 -0.5243 -0.3580  0.89145  2.5838"""

        df = read_table(StringIO(text), sep=r'\s+', engine='python')

        # selecting on two levels at once must equal chaining the selections
        result = df.xs(('a', 4), level=['one', 'four'])
        expected = df.xs('a').xs(4, level='four')
        tm.assert_frame_equal(result, expected)

        # this is a copy in 0.14
        result = df.xs(('a', 4), level=['one', 'four'])

        # setting this will give a SettingWithCopyError
        # as we are trying to write a view
        def f(x):
            x[:] = 10

        pytest.raises(com.SettingWithCopyError, f, result)

        # GH2107
        # index built from every (date, id) pair: 4 dates x 5 ids
        dates = lrange(20111201, 20111205)
        ids = 'abcde'
        idx = MultiIndex.from_tuples([x for x in cart_product(dates, ids)])
        idx.names = ['date', 'secid']
        df = DataFrame(np.random.randn(len(idx), 3), idx, ['X', 'Y', 'Z'])

        # level selected by name must match label-based selection
        rs = df.xs(20111201, level='date')
        xp = df.loc[20111201, :]
        tm.assert_frame_equal(rs, xp)

Source File: test_formula.py From vnpy_crypto with MIT License

5 votes

def test_formula_labels():
    """The fitted values of a formula-based OLS fit keep the index of the input frame."""
    from pandas import read_table

    # make sure labels pass through patsy as expected
    # data(Duncan) from car in R
    raw_table = StringIO(""""type" "income" "education" "prestige"\n"accountant" "prof" 62 86 82\n"pilot" "prof" 72 76 83\n"architect" "prof" 75 92 90\n"author" "prof" 55 90 76\n"chemist" "prof" 64 86 90\n"minister" "prof" 21 84 87\n"professor" "prof" 64 93 93\n"dentist" "prof" 80 100 90\n"reporter" "wc" 67 87 52\n"engineer" "prof" 72 86 88\n"undertaker" "prof" 42 74 57\n"lawyer" "prof" 76 98 89\n"physician" "prof" 76 97 97\n"welfare.worker" "prof" 41 84 59\n"teacher" "prof" 48 91 73\n"conductor" "wc" 76 34 38\n"contractor" "prof" 53 45 76\n"factory.owner" "prof" 60 56 81\n"store.manager" "prof" 42 44 45\n"banker" "prof" 78 82 92\n"bookkeeper" "wc" 29 72 39\n"mail.carrier" "wc" 48 55 34\n"insurance.agent" "wc" 55 71 41\n"store.clerk" "wc" 29 50 16\n"carpenter" "bc" 21 23 33\n"electrician" "bc" 47 39 53\n"RR.engineer" "bc" 81 28 67\n"machinist" "bc" 36 32 57\n"auto.repairman" "bc" 22 22 26\n"plumber" "bc" 44 25 29\n"gas.stn.attendant" "bc" 15 29 10\n"coal.miner" "bc" 7 7 15\n"streetcar.motorman" "bc" 42 26 19\n"taxi.driver" "bc" 9 19 10\n"truck.driver" "bc" 21 15 13\n"machine.operator" "bc" 21 20 24\n"barber" "bc" 16 26 20\n"bartender" "bc" 16 28 7\n"shoe.shiner" "bc" 9 17 3\n"cook" "bc" 14 22 16\n"soda.clerk" "bc" 12 30 6\n"watchman" "bc" 17 25 11\n"janitor" "bc" 7 20 8\n"policeman" "bc" 34 47 41\n"waiter" "bc" 8 32 10""")

    # space-separated, with quoted labels in the header and first columns
    duncan = read_table(raw_table, sep=" ")
    fitted = ols("prestige ~ income + education", duncan).fit()
    assert_equal(fitted.fittedvalues.index, duncan.index)

Python pandas.read_table() Examples