Python Examples of pandas.read

Source File: test_read_fwf.py From vnpy_crypto with MIT License

9 votes

def test_fwf_colspecs_None(self):
        # GH 7079
        data = """\
123456
456789
"""
        colspecs = [(0, 3), (3, None)]
        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
        expected = DataFrame([[123, 456], [456, 789]])
        tm.assert_frame_equal(result, expected)

        colspecs = [(None, 3), (3, 6)]
        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
        expected = DataFrame([[123, 456], [456, 789]])
        tm.assert_frame_equal(result, expected)

        colspecs = [(0, None), (3, None)]
        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
        expected = DataFrame([[123456, 456], [456789, 789]])
        tm.assert_frame_equal(result, expected)

        colspecs = [(None, None), (3, 6)]
        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
        expected = DataFrame([[123456, 456], [456789, 789]])
        tm.assert_frame_equal(result, expected)

Source File: helios.py From heliopy with GNU General Public License v3.0

7 votes

def load_local_file(self, interval):
        # Read in data
        headings = ['probe', 'year', 'doy', 'hour', 'minute', 'second',
                    'naverage', 'Bx', 'By', 'Bz', '|B|',
                    'sigma_Bx', 'sigma_By', 'sigma_Bz']

        colspecs = [(1, 2), (2, 4), (4, 7), (7, 9), (9, 11), (11, 13),
                    (13, 15), (15, 22), (22, 29), (29, 36), (36, 42), (42, 48),
                    (48, 54), (54, 60)]
        data = pd.read_fwf(self.local_path(interval), names=headings,
                           header=None, colspecs=colspecs)

        # Process data
        data['year'] += 1900
        # Convert date info to datetime
        data['Time'] = pd.to_datetime(data['year'], format='%Y') + \
            pd.to_timedelta(data['doy'] - 1, unit='d') + \
            pd.to_timedelta(data['hour'], unit='h') + \
            pd.to_timedelta(data['minute'], unit='m') + \
            pd.to_timedelta(data['second'], unit='s')
        data = data.drop(['year', 'doy', 'hour', 'minute', 'second'], axis=1)
        data = data.set_index('Time', drop=False)
        return data

Source File: marriage.py From DataExploration with MIT License

6 votes

def ReadFemResp1995():
    """Reads respondent data from NSFG Cycle 5.

    returns: DataFrame
    """
    dat_file = '1995FemRespData.dat.gz'
    names = ['cmintvw', 'timesmar', 'cmmarrhx', 'cmbirth', 'finalwgt']
    colspecs = [(12360-1, 12363),
                (4637-1, 4638),
                (11759-1, 11762),
                (14-1, 16),
                (12350-1, 12359)]
    df = pd.read_fwf(dat_file, 
                         compression='gzip', 
                         colspecs=colspecs, 
                         names=names)

    df.timesmar.replace([98, 99], np.nan, inplace=True)
    df['evrmarry'] = (df.timesmar > 0)

    CleanData(df)
    return df

Source File: test_read_fwf.py From vnpy_crypto with MIT License

6 votes

def test_fwf_regression(self):
        # GH 3594
        # turns out 'T060' is parsable as a datetime slice!

        tzlist = [1, 10, 20, 30, 60, 80, 100]
        ntz = len(tzlist)
        tcolspecs = [16] + [8] * ntz
        tcolnames = ['SST'] + ["T%03d" % z for z in tzlist[1:]]
        data = """  2009164202000   9.5403  9.4105  8.6571  7.8372  6.0612  5.8843  5.5192
  2009164203000   9.5435  9.2010  8.6167  7.8176  6.0804  5.8728  5.4869
  2009164204000   9.5873  9.1326  8.4694  7.5889  6.0422  5.8526  5.4657
  2009164205000   9.5810  9.0896  8.4009  7.4652  6.0322  5.8189  5.4379
  2009164210000   9.6034  9.0897  8.3822  7.4905  6.0908  5.7904  5.4039
"""

        df = read_fwf(StringIO(data),
                      index_col=0,
                      header=None,
                      names=tcolnames,
                      widths=tcolspecs,
                      parse_dates=True,
                      date_parser=lambda s: datetime.strptime(s, '%Y%j%H%M%S'))

        for c in df.columns:
            res = df.loc[:, c]
            assert len(res)

Source File: test_read_fwf.py From vnpy_crypto with MIT License

6 votes

def test_fwf_compression(self):
        try:
            import gzip
            import bz2
        except ImportError:
            pytest.skip("Need gzip and bz2 to run this test")

        data = """1111111111
        2222222222
        3333333333""".strip()
        widths = [5, 5]
        names = ['one', 'two']
        expected = read_fwf(StringIO(data), widths=widths, names=names)
        if compat.PY3:
            data = bytes(data, encoding='utf-8')
        comps = [('gzip', gzip.GzipFile), ('bz2', bz2.BZ2File)]
        for comp_name, compresser in comps:
            with tm.ensure_clean() as path:
                tmp = compresser(path, mode='wb')
                tmp.write(data)
                tmp.close()
                result = read_fwf(path, widths=widths, names=names,
                                  compression=comp_name)
                tm.assert_frame_equal(result, expected)

Source File: test_io.py From modin with Apache License 2.0

6 votes

def test_fwf_file_parse_dates():
    dates = pandas.date_range("2000", freq="h", periods=10)
    fwf_data = "col1 col2        col3 col4"
    for i in range(10, 20):
        fwf_data = fwf_data + "\n{col1}   {col2}  {col3}   {col4}".format(
            col1=str(i),
            col2=str(dates[i - 10].date()),
            col3=str(i),
            col4=str(dates[i - 10].time()),
        )

    setup_fwf_file(overwrite=True, fwf_data=fwf_data)

    pandas_df = pandas.read_fwf(TEST_FWF_FILENAME, parse_dates=[["col2", "col4"]])
    modin_df = pd.read_fwf(TEST_FWF_FILENAME, parse_dates=[["col2", "col4"]])
    df_equals(modin_df, pandas_df)

    pandas_df = pandas.read_fwf(
        TEST_FWF_FILENAME, parse_dates={"time": ["col2", "col4"]}
    )
    modin_df = pd.read_fwf(TEST_FWF_FILENAME, parse_dates={"time": ["col2", "col4"]})
    df_equals(modin_df, pandas_df)

    teardown_fwf_file()

Source File: io.py From modin with Apache License 2.0

6 votes

def read_fwf(
        cls, filepath_or_buffer, colspecs="infer", widths=None, infer_nrows=100, **kwds
    ):
        ErrorMessage.default_to_pandas("`read_fwf`")
        pd_obj = pandas.read_fwf(
            filepath_or_buffer,
            colspecs=colspecs,
            widths=widths,
            infer_nrows=infer_nrows,
            **kwds,
        )
        if isinstance(pd_obj, pandas.DataFrame):
            return cls.from_pandas(pd_obj)
        if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
            # Overwriting the read method should return a Modin DataFrame for calls
            # to __next__ and get_chunk
            pd_read = pd_obj.read
            pd_obj.read = lambda *args, **kwargs: cls.from_pandas(
                pd_read(*args, **kwargs)
            )
        return pd_obj

Source File: test_read_fwf.py From elasticintel with GNU General Public License v3.0

6 votes

def test_fwf_colspecs_None(self):
        # GH 7079
        data = """\
123456
456789
"""
        colspecs = [(0, 3), (3, None)]
        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
        expected = DataFrame([[123, 456], [456, 789]])
        tm.assert_frame_equal(result, expected)

        colspecs = [(None, 3), (3, 6)]
        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
        expected = DataFrame([[123, 456], [456, 789]])
        tm.assert_frame_equal(result, expected)

        colspecs = [(0, None), (3, None)]
        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
        expected = DataFrame([[123456, 456], [456789, 789]])
        tm.assert_frame_equal(result, expected)

        colspecs = [(None, None), (3, 6)]
        result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
        expected = DataFrame([[123456, 456], [456789, 789]])
        tm.assert_frame_equal(result, expected)

Source File: test_io.py From modin with Apache License 2.0

6 votes

def test_fwf_file_usecols(usecols):
    fwf_data = """a       b           c          d
id8141  360.242940  149.910199 11950.7
id1594  444.953632  166.985655 11788.4
id1849  364.136849  183.628767 11806.2
id1230  413.836124  184.375703 11916.8
id1948  502.953953  173.237159 12468.3"""

    setup_fwf_file(overwrite=True, fwf_data=fwf_data)

    pandas_df = pandas.read_fwf(TEST_FWF_FILENAME, usecols=usecols)
    modin_df = pd.read_fwf(TEST_FWF_FILENAME, usecols=usecols)

    df_equals(modin_df, pandas_df)

    teardown_fwf_file()

Source File: test_read_fwf.py From elasticintel with GNU General Public License v3.0

6 votes

def test_fwf_compression(self):
        try:
            import gzip
            import bz2
        except ImportError:
            pytest.skip("Need gzip and bz2 to run this test")

        data = """1111111111
        2222222222
        3333333333""".strip()
        widths = [5, 5]
        names = ['one', 'two']
        expected = read_fwf(StringIO(data), widths=widths, names=names)
        if compat.PY3:
            data = bytes(data, encoding='utf-8')
        comps = [('gzip', gzip.GzipFile), ('bz2', bz2.BZ2File)]
        for comp_name, compresser in comps:
            with tm.ensure_clean() as path:
                tmp = compresser(path, mode='wb')
                tmp.write(data)
                tmp.close()
                result = read_fwf(path, widths=widths, names=names,
                                  compression=comp_name)
                tm.assert_frame_equal(result, expected)

Source File: test_read_fwf.py From elasticintel with GNU General Public License v3.0

5 votes

def test_full_file_with_spaces_and_missing(self):
        # File with spaces and missing values in columns
        test = """
Account               Name    Balance     CreditLimit   AccountCreated
101                           10000.00                       1/17/1998
312     Gerard Butler         90.00       1000.00             8/6/2003
868                                                          5/25/1985
761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
317     Bill Murray           789.65
""".strip('\r\n')
        colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
        expected = read_fwf(StringIO(test), colspecs=colspecs)
        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))

Source File: test_read_fwf.py From elasticintel with GNU General Public License v3.0

5 votes

def test_1000_fwf(self):
        data = """
 1 2,334.0    5
10   13     10.
"""
        expected = np.array([[1, 2334., 5],
                             [10, 13, 10]])
        df = read_fwf(StringIO(data), colspecs=[(0, 3), (3, 11), (12, 16)],
                      thousands=',')
        tm.assert_almost_equal(df.values, expected)

Source File: test_read_fwf.py From elasticintel with GNU General Public License v3.0

5 votes

def test_messed_up_data(self):
        # Completely messed up file
        test = """
   Account          Name             Balance     Credit Limit   Account Created
       101                           10000.00                       1/17/1998
       312     Gerard Butler         90.00       1000.00

       761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
  317          Bill Murray           789.65
""".strip('\r\n')
        colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
        expected = read_fwf(StringIO(test), colspecs=colspecs)
        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))

Source File: test_read_fwf.py From elasticintel with GNU General Public License v3.0

5 votes

def test_multiple_delimiters(self):
        test = r"""
col1~~~~~col2  col3++++++++++++++++++col4
~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
  33+++122.33\\\bar.........Gerard Butler
++44~~~~12.01   baz~~Jennifer Love Hewitt
~~55       11+++foo++++Jada Pinkett-Smith
..66++++++.03~~~bar           Bill Murray
""".strip('\r\n')
        colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
        expected = read_fwf(StringIO(test), colspecs=colspecs,
                            delimiter=' +~.\\')
        tm.assert_frame_equal(expected, read_fwf(StringIO(test),
                                                 delimiter=' +~.\\'))

Source File: pfile.py From Ocean-Data-Map-Project with GNU General Public License v3.0

5 votes

def _read_file(self, skip, colspecs, names):
        self.dataframe = pd.read_fwf(
            self.filename,
            skiprows=skip,
            colspecs=colspecs,
            header=None,
            names=names,
            index_col=False
        )

        if 'pres' in self.dataframe.columns.values:
            self.dataframe['depth'] = seawater.eos80.dpth(
                self.dataframe['pres'], self.meta['latitude'])

Source File: test_io.py From modin with Apache License 2.0

5 votes

def test_fwf_file_skiprows():
    setup_fwf_file(overwrite=True)

    pandas_df = pandas.read_fwf(TEST_FWF_FILENAME, skiprows=2)
    modin_df = pd.read_fwf(TEST_FWF_FILENAME, skiprows=2)
    df_equals(modin_df, pandas_df)

    pandas_df = pandas.read_fwf(TEST_FWF_FILENAME, usecols=[0, 4, 7], skiprows=[2, 5])
    modin_df = pd.read_fwf(TEST_FWF_FILENAME, usecols=[0, 4, 7], skiprows=[2, 5])
    df_equals(modin_df, pandas_df)

Source File: test_io.py From modin with Apache License 2.0

5 votes

def test_fwf_file_index_col():
    fwf_data = """a       b           c          d
id8141  360.242940  149.910199 11950.7
id1594  444.953632  166.985655 11788.4
id1849  364.136849  183.628767 11806.2
id1230  413.836124  184.375703 11916.8
id1948  502.953953  173.237159 12468.3"""

    setup_fwf_file(overwrite=True, fwf_data=fwf_data)

    pandas_df = pandas.read_fwf(TEST_FWF_FILENAME, index_col="c")
    modin_df = pd.read_fwf(TEST_FWF_FILENAME, index_col="c")
    df_equals(modin_df, pandas_df)

    teardown_fwf_file()

Source File: test_io.py From modin with Apache License 2.0

5 votes

def test_fwf_file_skipfooter():
    setup_fwf_file(overwrite=True)

    pandas_df = pandas.read_fwf(TEST_FWF_FILENAME, skipfooter=2)
    modin_df = pd.read_fwf(TEST_FWF_FILENAME, skipfooter=2)

    df_equals(modin_df, pandas_df)

Source File: datasets.py From Doubly-Stochastic-DGP with Apache License 2.0

5 votes

def download_data(self):

        url = '{}{}'.format(uci_base, '00316/UCI%20CBM%20Dataset.zip')

        with urlopen(url) as zipresp:
            with ZipFile(BytesIO(zipresp.read())) as zfile:
                zfile.extractall('/tmp/')

        data = pandas.read_fwf('/tmp/UCI CBM Dataset/data.txt', header=None).values
        data = data[:, :-1]

        with open(self.csv_file_path(self.name), 'w') as f:
            csv.writer(f).writerows(data)

Source File: thinkstats2.py From DataExploration with MIT License

5 votes

def ReadFixedWidth(self, filename, **options):
        """Reads a fixed width ASCII file.

        filename: string filename

        returns: DataFrame
        """
        df = pandas.read_fwf(filename,
                             colspecs=self.colspecs, 
                             names=self.names,
                             **options)
        return df

Source File: test_read_fwf.py From elasticintel with GNU General Public License v3.0

5 votes

def test_full_file_with_spaces(self):
        # File with spaces in columns
        test = """
Account                 Name  Balance     CreditLimit   AccountCreated
101     Keanu Reeves          9315.45     10000.00           1/17/1998
312     Gerard Butler         90.00       1000.00             8/6/2003
868     Jennifer Love Hewitt  0           17000.00           5/25/1985
761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
317     Bill Murray           789.65      5000.00             2/5/2007
""".strip('\r\n')
        colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
        expected = read_fwf(StringIO(test), colspecs=colspecs)
        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))

Source File: test_read_fwf.py From elasticintel with GNU General Public License v3.0

5 votes

def test_full_file(self):
        # File with all values
        test = """index                             A    B    C
2000-01-03T00:00:00  0.980268513777    3  foo
2000-01-04T00:00:00  1.04791624281    -4  bar
2000-01-05T00:00:00  0.498580885705   73  baz
2000-01-06T00:00:00  1.12020151869     1  foo
2000-01-07T00:00:00  0.487094399463    0  bar
2000-01-10T00:00:00  0.836648671666    2  baz
2000-01-11T00:00:00  0.157160753327   34  foo"""
        colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
        expected = read_fwf(StringIO(test), colspecs=colspecs)
        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))

Source File: test_read_fwf.py From elasticintel with GNU General Public License v3.0

5 votes

def test_bool_header_arg(self):
        # see gh-6114
        data = """\
MyColumn
   a
   b
   a
   b"""
        for arg in [True, False]:
            with pytest.raises(TypeError):
                read_fwf(StringIO(data), header=arg)

Source File: test_read_fwf.py From elasticintel with GNU General Public License v3.0

5 votes

def test_skiprows_by_index_inference(self):
        test = """
To be skipped
Not  To  Be  Skipped
Once more to be skipped
123  34   8      123
456  78   9      456
""".strip()

        expected = read_csv(StringIO(test), skiprows=[0, 2],
                            delim_whitespace=True)
        tm.assert_frame_equal(expected, read_fwf(
            StringIO(test), skiprows=[0, 2]))

Source File: test_read_fwf.py From elasticintel with GNU General Public License v3.0

5 votes

def test_comment_fwf(self):
        data = """
  1   2.   4  #hello world
  5  NaN  10.0
"""
        expected = np.array([[1, 2., 4],
                             [5, np.nan, 10.]])
        df = read_fwf(StringIO(data), colspecs=[(0, 3), (4, 9), (9, 25)],
                      comment='#')
        tm.assert_almost_equal(df.values, expected)

Source File: test_read_fwf.py From elasticintel with GNU General Public License v3.0

5 votes

def test_fwf_regression(self):
        # GH 3594
        # turns out 'T060' is parsable as a datetime slice!

        tzlist = [1, 10, 20, 30, 60, 80, 100]
        ntz = len(tzlist)
        tcolspecs = [16] + [8] * ntz
        tcolnames = ['SST'] + ["T%03d" % z for z in tzlist[1:]]
        data = """  2009164202000   9.5403  9.4105  8.6571  7.8372  6.0612  5.8843  5.5192
  2009164203000   9.5435  9.2010  8.6167  7.8176  6.0804  5.8728  5.4869
  2009164204000   9.5873  9.1326  8.4694  7.5889  6.0422  5.8526  5.4657
  2009164205000   9.5810  9.0896  8.4009  7.4652  6.0322  5.8189  5.4379
  2009164210000   9.6034  9.0897  8.3822  7.4905  6.0908  5.7904  5.4039
"""

        df = read_fwf(StringIO(data),
                      index_col=0,
                      header=None,
                      names=tcolnames,
                      widths=tcolspecs,
                      parse_dates=True,
                      date_parser=lambda s: datetime.strptime(s, '%Y%j%H%M%S'))

        for c in df.columns:
            res = df.loc[:, c]
            assert len(res)

Source File: test_read_fwf.py From elasticintel with GNU General Public License v3.0

5 votes

def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(self):
        data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
baz,12,13,14,15
qux,12,13,14,15
foo2,12,13,14,15
bar2,12,13,14,15
"""

        with tm.assert_raises_regex(TypeError,
                                    'Each column specification '
                                    'must be.+'):
            read_fwf(StringIO(data), [('a', 1)])

Source File: test_read_fwf.py From elasticintel with GNU General Public License v3.0

5 votes

def test_BytesIO_input(self):
        if not compat.PY3:
            pytest.skip(
                "Bytes-related test - only needs to work on Python 3")

        result = read_fwf(BytesIO("שלום\nשלום".encode('utf8')), widths=[
            2, 2], encoding='utf8')
        expected = DataFrame([["של", "ום"]], columns=["של", "ום"])
        tm.assert_frame_equal(result, expected)

Source File: datasets.py From Conditional_Density_Estimation with MIT License

5 votes

def get_df(self):
        if self.needs_download:
            self.download_dataset()
        df = pd.read_fwf(self.data_file_path, header=None).dropna()
        return df

Source File: datasets.py From Conditional_Density_Estimation with MIT License

5 votes

def get_df(self):
        if self.needs_download:
            self.download_dataset()
        df = pd.read_fwf(self.data_file_path, header=None)
        return df

Python pandas.read_fwf() Examples