Python pandas.io.parsers.read_csv() Examples
The following are 30 code examples of pandas.io.parsers.read_csv(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas.io.parsers, or try the search function.
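As a quick orientation before the project examples, here is a minimal sketch of calling read_csv() through pandas.io.parsers. The file name data.csv and the 'date' column below are hypothetical placeholders, and the same callable is also exposed as pandas.read_csv().

import pandas as pd
from pandas.io.parsers import read_csv  # same callable as pd.read_csv

# Hypothetical input: a CSV file with a 'date' column used as the index.
df = read_csv('data.csv', index_col='date', parse_dates=True)

# Large files can also be streamed in fixed-size chunks rather than
# loaded all at once; each chunk is an ordinary DataFrame.
for chunk in read_csv('data.csv', chunksize=1000):
    print(chunk.shape)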
Example #1
Source File: test_excel.py From Computable with MIT License | 7 votes |
def test_parse_cols_int(self):
    _skip_if_no_openpyxl()
    _skip_if_no_xlrd()

    suffix = ['xls', 'xlsx', 'xlsm']
    for s in suffix:
        pth = os.path.join(self.dirpath, 'test.%s' % s)
        xls = ExcelFile(pth)
        df = xls.parse('Sheet1', index_col=0, parse_dates=True,
                       parse_cols=3)
        df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True)
        df2 = df2.reindex(columns=['A', 'B', 'C'])
        df3 = xls.parse('Sheet2', skiprows=[1], index_col=0,
                        parse_dates=True, parse_cols=3)
        # TODO add index to xls file
        tm.assert_frame_equal(df, df2, check_names=False)
        tm.assert_frame_equal(df3, df2, check_names=False)
Example #2
Source File: test_stata.py From vnpy_crypto with MIT License | 6 votes |
def test_read_write_reread_dta14(self, file, parsed_114, version):
    file = getattr(self, file)
    parsed = self.read_dta(file)
    parsed.index.name = 'index'

    expected = self.read_csv(self.csv14)
    cols = ['byte_', 'int_', 'long_', 'float_', 'double_']
    for col in cols:
        expected[col] = expected[col]._convert(datetime=True, numeric=True)
    expected['float_'] = expected['float_'].astype(np.float32)
    expected['date_td'] = pd.to_datetime(
        expected['date_td'], errors='coerce')

    tm.assert_frame_equal(parsed_114, parsed)

    with tm.ensure_clean() as path:
        parsed_114.to_stata(path, {'date_td': 'td'}, version=version)
        written_and_read_again = self.read_dta(path)
        tm.assert_frame_equal(
            written_and_read_again.set_index('index'), parsed_114)
Example #3
Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_read_write_reread_dta14(self, file, parsed_114, version):
    file = getattr(self, file)
    parsed = self.read_dta(file)
    parsed.index.name = 'index'

    expected = self.read_csv(self.csv14)
    cols = ['byte_', 'int_', 'long_', 'float_', 'double_']
    for col in cols:
        expected[col] = expected[col]._convert(datetime=True, numeric=True)
    expected['float_'] = expected['float_'].astype(np.float32)
    expected['date_td'] = pd.to_datetime(
        expected['date_td'], errors='coerce')

    tm.assert_frame_equal(parsed_114, parsed)

    with tm.ensure_clean() as path:
        parsed_114.to_stata(path, {'date_td': 'td'}, version=version)
        written_and_read_again = self.read_dta(path)
        tm.assert_frame_equal(
            written_and_read_again.set_index('index'), parsed_114)
Example #4
Source File: test_network.py From recruit with Apache License 2.0 | 6 votes |
def test_read_csv_chunked_download(self, s3_resource, caplog):
    # 8 MB, S3FS uses 5MB chunks
    df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
    buf = BytesIO()
    str_buf = StringIO()

    df.to_csv(str_buf)

    buf = BytesIO(str_buf.getvalue().encode('utf-8'))

    s3_resource.Bucket("pandas-test").put_object(
        Key="large-file.csv", Body=buf)

    with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
        read_csv("s3://pandas-test/large-file.csv", nrows=5)
        # log of fetch_range (start, stop)
        assert ((0, 5505024) in {x.args[-2:] for x in caplog.records})
Example #5
Source File: test_read_fwf.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_read_csv_compat():
    csv_data = """\
A,B,C,D,E
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
    expected = read_csv(StringIO(csv_data), engine="python")

    fwf_data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(fwf_data), colspecs=colspecs)
    tm.assert_frame_equal(result, expected)
Example #6
Source File: test_excel.py From Computable with MIT License | 6 votes |
def test_parse_cols_list(self):
    _skip_if_no_openpyxl()
    _skip_if_no_xlrd()

    suffix = ['xls', 'xlsx', 'xlsm']
    for s in suffix:
        pth = os.path.join(self.dirpath, 'test.%s' % s)
        xls = ExcelFile(pth)
        df = xls.parse('Sheet1', index_col=0, parse_dates=True,
                       parse_cols=[0, 2, 3])
        df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True)
        df2 = df2.reindex(columns=['B', 'C'])
        df3 = xls.parse('Sheet2', skiprows=[1], index_col=0,
                        parse_dates=True, parse_cols=[0, 2, 3])
        # TODO add index to xls file
        tm.assert_frame_equal(df, df2, check_names=False)
        tm.assert_frame_equal(df3, df2, check_names=False)
Example #7
Source File: test_excel.py From Computable with MIT License | 6 votes |
def check_excel_table_sheet_by_index(self, filename, csvfile):
    import xlrd

    pth = os.path.join(self.dirpath, filename)
    xls = ExcelFile(pth)
    df = xls.parse(0, index_col=0, parse_dates=True)
    df2 = self.read_csv(csvfile, index_col=0, parse_dates=True)
    df3 = xls.parse(1, skiprows=[1], index_col=0, parse_dates=True)
    tm.assert_frame_equal(df, df2, check_names=False)
    tm.assert_frame_equal(df3, df2, check_names=False)

    df4 = xls.parse(0, index_col=0, parse_dates=True, skipfooter=1)
    df5 = xls.parse(0, index_col=0, parse_dates=True, skip_footer=1)
    tm.assert_frame_equal(df4, df.ix[:-1])
    tm.assert_frame_equal(df4, df5)

    self.assertRaises(xlrd.XLRDError, xls.parse, 'asdf')
Example #8
Source File: test_network.py From recruit with Apache License 2.0 | 6 votes |
def test_parse_public_s3_bucket_chunked_python(self, tips_df):
    # Read with a chunksize using the Python parser
    chunksize = 5
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                             chunksize=chunksize, compression=comp,
                             engine='python')
        assert df_reader.chunksize == chunksize
        for i_chunk in [0, 1, 2]:
            # Read a couple of chunks and make sure we see them properly.
            df = df_reader.get_chunk()
            assert isinstance(df, DataFrame)
            assert not df.empty
            true_df = tips_df.iloc[
                chunksize * i_chunk: chunksize * (i_chunk + 1)]
            tm.assert_frame_equal(true_df, df)
Example #9
Source File: test_network.py From recruit with Apache License 2.0 | 6 votes |
def test_parse_public_s3_bucket_chunked(self, tips_df):
    # Read with a chunksize
    chunksize = 5
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                             chunksize=chunksize, compression=comp)
        assert df_reader.chunksize == chunksize
        for i_chunk in [0, 1, 2]:
            # Read a couple of chunks and make sure we see them
            # properly.
            df = df_reader.get_chunk()
            assert isinstance(df, DataFrame)
            assert not df.empty
            true_df = tips_df.iloc[
                chunksize * i_chunk: chunksize * (i_chunk + 1)]
            tm.assert_frame_equal(true_df, df)
Example #10
Source File: test_network.py From vnpy_crypto with MIT License | 6 votes |
def test_parse_public_s3_bucket(self, tips_df):
    pytest.importorskip('s3fs')

    # more of an integration test due to the not-public contents portion
    # can probably mock this though.
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df = read_csv('s3://pandas-test/tips.csv' + ext, compression=comp)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(df, tips_df)

    # Read public file from bucket with not-public contents
    df = read_csv('s3://cant_get_it/tips.csv')
    assert isinstance(df, DataFrame)
    assert not df.empty
    tm.assert_frame_equal(df, tips_df)
Example #11
Source File: test_excel.py From Computable with MIT License | 6 votes |
def test_xlsx_table(self):
    _skip_if_no_xlrd()
    _skip_if_no_openpyxl()

    pth = os.path.join(self.dirpath, 'test.xlsx')
    xlsx = ExcelFile(pth)
    df = xlsx.parse('Sheet1', index_col=0, parse_dates=True)
    df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True)
    df3 = xlsx.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True)

    # TODO add index to xlsx file
    tm.assert_frame_equal(df, df2, check_names=False)
    tm.assert_frame_equal(df3, df2, check_names=False)

    df4 = xlsx.parse('Sheet1', index_col=0, parse_dates=True,
                     skipfooter=1)
    df5 = xlsx.parse('Sheet1', index_col=0, parse_dates=True,
                     skip_footer=1)

    tm.assert_frame_equal(df4, df.ix[:-1])
    tm.assert_frame_equal(df4, df5)
Example #12
Source File: test_network.py From recruit with Apache License 2.0 | 6 votes |
def test_parse_public_s3_bucket(self, tips_df):
    pytest.importorskip('s3fs')

    # more of an integration test due to the not-public contents portion
    # can probably mock this though.
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df = read_csv('s3://pandas-test/tips.csv' + ext, compression=comp)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(df, tips_df)

    # Read public file from bucket with not-public contents
    df = read_csv('s3://cant_get_it/tips.csv')
    assert isinstance(df, DataFrame)
    assert not df.empty
    tm.assert_frame_equal(df, tips_df)
Example #13
Source File: test_network.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_parse_public_s3_bucket(self, tips_df):
    pytest.importorskip('s3fs')

    # more of an integration test due to the not-public contents portion
    # can probably mock this though.
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df = read_csv('s3://pandas-test/tips.csv' + ext, compression=comp)
        assert isinstance(df, DataFrame)
        assert not df.empty
        tm.assert_frame_equal(df, tips_df)

    # Read public file from bucket with not-public contents
    df = read_csv('s3://cant_get_it/tips.csv')
    assert isinstance(df, DataFrame)
    assert not df.empty
    tm.assert_frame_equal(df, tips_df)
Example #14
Source File: test_network.py From vnpy_crypto with MIT License | 6 votes |
def test_parse_public_s3_bucket_chunked(self, tips_df):
    # Read with a chunksize
    chunksize = 5
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                             chunksize=chunksize, compression=comp)
        assert df_reader.chunksize == chunksize
        for i_chunk in [0, 1, 2]:
            # Read a couple of chunks and make sure we see them
            # properly.
            df = df_reader.get_chunk()
            assert isinstance(df, DataFrame)
            assert not df.empty
            true_df = tips_df.iloc[
                chunksize * i_chunk: chunksize * (i_chunk + 1)]
            tm.assert_frame_equal(true_df, df)
Example #15
Source File: test_network.py From vnpy_crypto with MIT License | 6 votes |
def test_parse_public_s3_bucket_chunked_python(self, tips_df):
    # Read with a chunksize using the Python parser
    chunksize = 5
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                             chunksize=chunksize, compression=comp,
                             engine='python')
        assert df_reader.chunksize == chunksize
        for i_chunk in [0, 1, 2]:
            # Read a couple of chunks and make sure we see them properly.
            df = df_reader.get_chunk()
            assert isinstance(df, DataFrame)
            assert not df.empty
            true_df = tips_df.iloc[
                chunksize * i_chunk: chunksize * (i_chunk + 1)]
            tm.assert_frame_equal(true_df, df)
Example #16
Source File: test_network.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_parse_public_s3_bucket_chunked(self, tips_df):
    # Read with a chunksize
    chunksize = 5
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                             chunksize=chunksize, compression=comp)
        assert df_reader.chunksize == chunksize
        for i_chunk in [0, 1, 2]:
            # Read a couple of chunks and make sure we see them
            # properly.
            df = df_reader.get_chunk()
            assert isinstance(df, DataFrame)
            assert not df.empty
            true_df = tips_df.iloc[
                chunksize * i_chunk: chunksize * (i_chunk + 1)]
            tm.assert_frame_equal(true_df, df)
Example #17
Source File: test_read_fwf.py From recruit with Apache License 2.0 | 6 votes |
def test_read_csv_compat():
    csv_data = """\
A,B,C,D,E
2011,58,360.242940,149.910199,11950.7
2011,59,444.953632,166.985655,11788.4
2011,60,364.136849,183.628767,11806.2
2011,61,413.836124,184.375703,11916.8
2011,62,502.953953,173.237159,12468.3
"""
    expected = read_csv(StringIO(csv_data), engine="python")

    fwf_data = """\
A   B     C            D            E
201158    360.242940   149.910199   11950.7
201159    444.953632   166.985655   11788.4
201160    364.136849   183.628767   11806.2
201161    413.836124   184.375703   11916.8
201162    502.953953   173.237159   12468.3
"""
    colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
    result = read_fwf(StringIO(fwf_data), colspecs=colspecs)
    tm.assert_frame_equal(result, expected)
Example #18
Source File: test_network.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_parse_public_s3_bucket_chunked_python(self, tips_df):
    # Read with a chunksize using the Python parser
    chunksize = 5
    for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
        df_reader = read_csv('s3://pandas-test/tips.csv' + ext,
                             chunksize=chunksize, compression=comp,
                             engine='python')
        assert df_reader.chunksize == chunksize
        for i_chunk in [0, 1, 2]:
            # Read a couple of chunks and make sure we see them properly.
            df = df_reader.get_chunk()
            assert isinstance(df, DataFrame)
            assert not df.empty
            true_df = tips_df.iloc[
                chunksize * i_chunk: chunksize * (i_chunk + 1)]
            tm.assert_frame_equal(true_df, df)
Example #19
Source File: test_excel.py From vnpy_crypto with MIT License | 6 votes |
def get_csv_refdf(self, basename):
    """
    Obtain the reference data from read_csv with the Python engine.

    Parameters
    ----------
    basename : str
        File base name, excluding file extension.

    Returns
    -------
    dfref : DataFrame
    """
    pref = os.path.join(self.dirpath, basename + '.csv')
    dfref = read_csv(pref, index_col=0, parse_dates=True, engine='python')
    return dfref
Example #20
Source File: test_stata.py From recruit with Apache License 2.0 | 6 votes |
def test_read_write_reread_dta14(self, file, parsed_114, version):
    file = getattr(self, file)
    parsed = self.read_dta(file)
    parsed.index.name = 'index'

    expected = self.read_csv(self.csv14)
    cols = ['byte_', 'int_', 'long_', 'float_', 'double_']
    for col in cols:
        expected[col] = expected[col]._convert(datetime=True, numeric=True)
    expected['float_'] = expected['float_'].astype(np.float32)
    expected['date_td'] = pd.to_datetime(
        expected['date_td'], errors='coerce')

    tm.assert_frame_equal(parsed_114, parsed)

    with tm.ensure_clean() as path:
        parsed_114.to_stata(path, {'date_td': 'td'}, version=version)
        written_and_read_again = self.read_dta(path)
        tm.assert_frame_equal(
            written_and_read_again.set_index('index'), parsed_114)
Example #21
Source File: test_network.py From vnpy_crypto with MIT License | 6 votes |
def test_read_csv_chunked_download(self, s3_resource, caplog):
    # 8 MB, S3FS uses 5MB chunks
    df = DataFrame(np.random.randn(100000, 4), columns=list('abcd'))
    buf = BytesIO()
    str_buf = StringIO()

    df.to_csv(str_buf)

    buf = BytesIO(str_buf.getvalue().encode('utf-8'))

    s3_resource.Bucket("pandas-test").put_object(
        Key="large-file.csv", Body=buf)

    with caplog.at_level(logging.DEBUG, logger='s3fs.core'):
        read_csv("s3://pandas-test/large-file.csv", nrows=5)
        # log of fetch_range (start, stop)
        assert ((0, 5505024) in set(x.args[-2:] for x in caplog.records))
Example #22
Source File: test_network.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def tips_df(datapath):
    """DataFrame with the tips dataset."""
    return read_csv(datapath('io', 'parser', 'data', 'tips.csv'))
Example #23
Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_read_write_reread_dta15(self, file):
    expected = self.read_csv(self.csv15)
    expected['byte_'] = expected['byte_'].astype(np.int8)
    expected['int_'] = expected['int_'].astype(np.int16)
    expected['long_'] = expected['long_'].astype(np.int32)
    expected['float_'] = expected['float_'].astype(np.float32)
    expected['double_'] = expected['double_'].astype(np.float64)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))

    file = getattr(self, file)
    parsed = self.read_dta(file)

    tm.assert_frame_equal(expected, parsed)
Example #24
Source File: test_excel.py From Computable with MIT License | 5 votes |
def read_csv(self, *args, **kwds):
    kwds = kwds.copy()
    kwds['engine'] = 'python'
    return read_csv(*args, **kwds)
Example #25
Source File: test_stata.py From Computable with MIT License | 5 votes |
def test_read_dta8(self):
    expected = read_csv(self.csv8, parse_dates=True, sep='\t')
    parsed = self.read_dta(self.dta8)
    tm.assert_frame_equal(parsed, expected)
Example #26
Source File: test_stata.py From Computable with MIT License | 5 votes |
def test_read_dta7(self):
    expected = read_csv(self.csv7, parse_dates=True, sep='\t')
    parsed = self.read_dta(self.dta7)
    tm.assert_frame_equal(parsed, expected)
Example #27
Source File: test_stata.py From Computable with MIT License | 5 votes |
def test_write_dta6(self):
    skip_if_not_little_endian()

    original = self.read_csv(self.csv3)
    original.index.name = 'index'

    with tm.ensure_clean() as path:
        original.to_stata(path, None, False)
        written_and_read_again = self.read_dta(path)
        tm.assert_frame_equal(written_and_read_again.set_index('index'),
                              original)
Example #28
Source File: test_stata.py From Computable with MIT License | 5 votes |
def test_read_dta3(self):
    parsed = self.read_dta(self.dta3)
    parsed_13 = self.read_dta(self.dta3_13)

    # match stata here
    expected = self.read_csv(self.csv3)
    expected = expected.astype(np.float32)
    expected['year'] = expected['year'].astype(np.int32)
    expected['quarter'] = expected['quarter'].astype(np.int16)

    tm.assert_frame_equal(parsed, expected)
    tm.assert_frame_equal(parsed_13, expected)
Example #29
Source File: test_stata.py From Computable with MIT License | 5 votes |
def read_csv(self, file):
    return read_csv(file, parse_dates=True)
Example #30
Source File: test_network.py From vnpy_crypto with MIT License | 5 votes |
def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file):
    # see gh-16135

    s3_object = s3_resource.meta.client.get_object(
        Bucket='pandas-test', Key='tips.csv')

    result = read_csv(BytesIO(s3_object["Body"].read()), encoding='utf8')
    assert isinstance(result, DataFrame)
    assert not result.empty

    expected = read_csv(tips_file)
    tm.assert_frame_equal(result, expected)