Python Examples of pandas.read_stata

Source File: test_stata.py From vnpy_crypto with MIT License

6 votes

def test_encoding(self, version):
    """Check latin-1 handling when reading and round-tripping a Stata file.

    GH 4626, proper encoding handling.
    """
    plain = read_stata(self.dta_encoding)
    encoded = read_stata(self.dta_encoding, encoding="latin-1")
    result = encoded.kreis1849[0]

    if not compat.PY3:
        # Non-PY3 branch: the un-encoded column is decoded as latin-1
        # before it is compared with the encoded read.
        expected = plain.kreis1849.str.decode("latin-1")[0]
        assert result == expected
        assert isinstance(result, unicode)  # noqa
    else:
        expected = plain.kreis1849[0]
        assert result == expected
        assert isinstance(result, compat.string_types)

    # Write the encoded frame back out and make sure it reads back equal.
    with tm.ensure_clean() as path:
        encoded.to_stata(path, write_index=False, version=version,
                         encoding='latin-1')
        reread_encoded = read_stata(path, encoding='latin-1')
        tm.assert_frame_equal(encoded, reread_encoded)

Source File: test_stata.py From recruit with Apache License 2.0

6 votes

def test_mixed_string_strl(self):
    """Round-trip a column mixing a long string with None (GH 23633)."""
    records = [
        {'mixed': 'string' * 500, 'number': 0},
        {'mixed': None, 'number': 1},
    ]
    frame = pd.DataFrame(records)
    frame['number'] = frame['number'].astype('int32')

    with tm.ensure_clean() as path:
        frame.to_stata(path, write_index=False, version=117)
        # The frame read back is compared with the input after its missing
        # values are replaced by ''.
        tm.assert_frame_equal(read_stata(path), frame.fillna(''))

        # Check strl supports all None (null)
        frame.loc[:, 'mixed'] = None
        frame.to_stata(path, write_index=False, version=117,
                       convert_strl=['mixed'])
        tm.assert_frame_equal(read_stata(path), frame.fillna(''))

Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_encoding(self, version):
    """Check that passing ``encoding`` warns and the data still round-trips.

    GH 4626, proper encoding handling.
    """
    raw = read_stata(self.dta_encoding)
    # Supplying ``encoding`` to the reader must emit a FutureWarning.
    with tm.assert_produces_warning(FutureWarning):
        encoded = read_stata(self.dta_encoding, encoding='latin-1')

    result = encoded.kreis1849[0]
    expected = raw.kreis1849[0]
    assert result == expected
    assert isinstance(result, compat.string_types)

    with tm.ensure_clean() as path:
        # The writer must warn about ``encoding`` as well.
        with tm.assert_produces_warning(FutureWarning):
            encoded.to_stata(path, encoding='latin-1',
                             write_index=False, version=version)
        # Read back without an explicit encoding and compare.
        reread_encoded = read_stata(path)
        tm.assert_frame_equal(encoded, reread_encoded)

Source File: test_stata.py From recruit with Apache License 2.0

6 votes

def test_date_parsing_ignores_format_details(self, column):
    """Check that a date column parses the same with or without a format.

    GH 17797

    Test that display formats are ignored when determining if a numeric
    column is a date value.

    All date types are stored as numbers and format associated with the
    column denotes both the type of the date and the display format.

    STATA supports 9 date types which each have distinct units. We test 7
    of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that
    accounts for leap seconds and %tb relies on STATAs business calendar.
    """
    frame = read_stata(self.stata_dates)
    # Compare the first row of ``column`` with its "<column>_fmt" twin.
    plain_value = frame.loc[0, column]
    formatted_value = frame.loc[0, column + "_fmt"]
    assert plain_value == formatted_value

Source File: test_varmax.py From vnpy_crypto with MIT License

6 votes

def setup_class(cls, true, order, trend, error_cov_type, cov_type='approx',
                **kwargs):
    """Build a VARMAX model on the manufac.dta data set and smooth it.

    Stores ``true`` on the class, loads ``results/manufac.dta`` relative to
    ``current_path``, derives the differenced series, constructs
    ``cls.model`` and stores ``cls.results`` from ``model.smooth``.

    Parameters
    ----------
    true : mapping
        Reference values; ``true['params']`` is passed to ``smooth``.
    order, trend, error_cov_type
        Forwarded to ``varmax.VARMAX``.
    cov_type : str, default 'approx'
        Forwarded to ``model.smooth``.
    **kwargs
        Extra keyword arguments forwarded to ``varmax.VARMAX``.
    """
    cls.true = true
    # 1960:Q1 - 1982:Q4
    # os.path.join avoids doubled or misplaced separators that manual
    # ``+ os.sep +`` concatenation produces (e.g. when current_path is '').
    data_path = os.path.join(current_path, 'results', 'manufac.dta')
    with open(data_path, 'rb') as test_data:
        dta = pd.read_stata(test_data)
    dta.index = pd.DatetimeIndex(dta.month, freq='MS')
    dta['dlncaputil'] = dta['lncaputil'].diff()
    dta['dlnhours'] = dta['lnhours'].diff()

    endog = dta.loc['1972-02-01':, ['dlncaputil', 'dlnhours']]

    # Warnings raised while constructing the model are captured and
    # discarded; the recorded list itself is not needed.
    with warnings.catch_warnings(record=True):
        warnings.simplefilter('always')
        cls.model = varmax.VARMAX(endog, order=order, trend=trend,
                                  error_cov_type=error_cov_type, **kwargs)

    cls.results = cls.model.smooth(true['params'], cov_type=cov_type)

Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_dtype_conversion(self):
    """Compare ``read_stata`` output with and without ``preserve_dtypes``."""
    narrowed = (
        ('byte_', np.int8),
        ('int_', np.int16),
        ('long_', np.int32),
        ('float_', np.float32),
        ('double_', np.float64),
    )
    expected = self.read_csv(self.csv15)
    for name, dtype in narrowed:
        expected[name] = expected[name].astype(dtype)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))

    # Default read: compared against the CSV frame with narrowed dtypes.
    preserved = read_stata(self.dta15_117, convert_dates=True)
    tm.assert_frame_equal(expected, preserved)

    converted = read_stata(self.dta15_117, convert_dates=True,
                           preserve_dtypes=False)

    # read_csv types are the same
    expected = self.read_csv(self.csv15)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))

    tm.assert_frame_equal(expected, converted)

Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_out_of_range_float(self):
    """Round-trip float32 extremes and check that infinity is rejected."""
    f32 = np.finfo(np.float32)
    original = DataFrame({
        'ColumnOk': [0.0, f32.eps, f32.max / 10.0],
        'ColumnTooBig': [0.0, f32.eps, f32.max],
    })
    original.index.name = 'index'
    for name in original:
        original[name] = original[name].astype(np.float32)

    with tm.ensure_clean() as path:
        original.to_stata(path)
        reread = read_stata(path)
        # The expected frame carries 'ColumnTooBig' as float64 for the
        # comparison with what was read back.
        widened = original['ColumnTooBig'].astype(np.float64)
        original['ColumnTooBig'] = widened
        tm.assert_frame_equal(original, reread.set_index('index'))

    # An infinite value must make the writer raise.
    original.loc[2, 'ColumnTooBig'] = np.inf
    msg = ("Column ColumnTooBig has a maximum value of infinity which"
           " is outside the range supported by Stata")
    with pytest.raises(ValueError, match=msg), tm.ensure_clean() as path:
        original.to_stata(path)

Source File: test_stata.py From recruit with Apache License 2.0

6 votes

def test_out_of_range_float(self):
    """Round-trip float32 extremes and check that infinity is rejected."""
    limits = np.finfo(np.float32)
    original = DataFrame({
        'ColumnOk': [0.0, limits.eps, limits.max / 10.0],
        'ColumnTooBig': [0.0, limits.eps, limits.max],
    })
    original.index.name = 'index'
    for label in original:
        original[label] = original[label].astype(np.float32)

    with tm.ensure_clean() as path:
        original.to_stata(path)
        reread = read_stata(path)
        # The expected frame carries 'ColumnTooBig' as float64 for the
        # comparison with what was read back.
        as_double = original['ColumnTooBig'].astype(np.float64)
        original['ColumnTooBig'] = as_double
        tm.assert_frame_equal(original, reread.set_index('index'))

    # An infinite value must make the writer raise.
    original.loc[2, 'ColumnTooBig'] = np.inf
    msg = ("Column ColumnTooBig has a maximum value of infinity which"
           " is outside the range supported by Stata")
    with pytest.raises(ValueError, match=msg), tm.ensure_clean() as path:
        original.to_stata(path)

Source File: test_stata.py From vnpy_crypto with MIT License

6 votes

def test_dtype_conversion(self):
    """Compare ``read_stata`` output with and without ``preserve_dtypes``."""
    column_dtypes = (
        ('byte_', np.int8),
        ('int_', np.int16),
        ('long_', np.int32),
        ('float_', np.float32),
        ('double_', np.float64),
    )
    expected = self.read_csv(self.csv15)
    for label, dtype in column_dtypes:
        expected[label] = expected[label].astype(dtype)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))

    # Default read: compared against the CSV frame with narrowed dtypes.
    default_read = read_stata(self.dta15_117, convert_dates=True)
    tm.assert_frame_equal(expected, default_read)

    widened_read = read_stata(self.dta15_117, convert_dates=True,
                              preserve_dtypes=False)

    # read_csv types are the same
    expected = self.read_csv(self.csv15)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))

    tm.assert_frame_equal(expected, widened_read)

Source File: stata.py From vnpy_crypto with MIT License

6 votes

def read_stata(filepath_or_buffer, convert_dates=True,
               convert_categoricals=True, encoding=None, index_col=None,
               convert_missing=False, preserve_dtypes=True, columns=None,
               order_categoricals=True, chunksize=None, iterator=False):
    """Read a Stata file, returning its data or the reader itself.

    A ``StataReader`` is built from the arguments. When ``iterator`` or
    ``chunksize`` is truthy the reader is returned without being closed.
    Otherwise the reader's ``read()`` result is returned and the reader is
    closed, even if reading raises.
    """
    reader_options = dict(
        convert_dates=convert_dates,
        convert_categoricals=convert_categoricals,
        index_col=index_col,
        convert_missing=convert_missing,
        preserve_dtypes=preserve_dtypes,
        columns=columns,
        order_categoricals=order_categoricals,
        chunksize=chunksize,
        encoding=encoding,
    )
    reader = StataReader(filepath_or_buffer, **reader_options)

    # Incremental access: hand the reader back to the caller.
    if iterator or chunksize:
        return reader

    try:
        return reader.read()
    finally:
        reader.close()

Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_date_parsing_ignores_format_details(self, column):
    """Check that a date column parses the same with or without a format.

    GH 17797

    Test that display formats are ignored when determining if a numeric
    column is a date value.

    All date types are stored as numbers and format associated with the
    column denotes both the type of the date and the display format.

    STATA supports 9 date types which each have distinct units. We test 7
    of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that
    accounts for leap seconds and %tb relies on STATAs business calendar.
    """
    dates = read_stata(self.stata_dates)
    # Compare the first row of ``column`` with its "<column>_fmt" twin.
    without_format = dates.loc[0, column]
    with_format = dates.loc[0, column + "_fmt"]
    assert without_format == with_format

Source File: test_stata.py From vnpy_crypto with MIT License

6 votes

def test_out_of_range_float(self):
    """Round-trip float32 extremes and check that infinity is rejected.

    The frame is written and read back, with 'ColumnTooBig' compared as
    float64. An infinite value is then inserted and ``to_stata`` must
    raise ``ValueError`` naming the column and mentioning infinity.
    """
    original = DataFrame({'ColumnOk': [0.0,
                                       np.finfo(np.float32).eps,
                                       np.finfo(np.float32).max / 10.0],
                          'ColumnTooBig': [0.0,
                                           np.finfo(np.float32).eps,
                                           np.finfo(np.float32).max]})
    original.index.name = 'index'
    for col in original:
        original[col] = original[col].astype(np.float32)

    with tm.ensure_clean() as path:
        original.to_stata(path)
        reread = read_stata(path)
        original['ColumnTooBig'] = original['ColumnTooBig'].astype(
            np.float64)
        tm.assert_frame_equal(original,
                              reread.set_index('index'))

    original.loc[2, 'ColumnTooBig'] = np.inf
    with pytest.raises(ValueError) as cm:
        with tm.ensure_clean() as path:
            original.to_stata(path)
    # Inspect the message only after the ``raises`` block has exited:
    # statements placed after the raising call inside the block never run.
    # pytest exposes the caught exception as ``cm.value`` (``cm.exception``
    # is the unittest spelling), and the substring check needs its text.
    message = str(cm.value)
    assert 'ColumnTooBig' in message
    assert 'infinity' in message

Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

6 votes

def test_mixed_string_strl(self):
    """Round-trip a column mixing a long string with None (GH 23633)."""
    rows = [
        {'mixed': 'string' * 500, 'number': 0},
        {'mixed': None, 'number': 1},
    ]
    data = pd.DataFrame(rows)
    data['number'] = data['number'].astype('int32')

    with tm.ensure_clean() as path:
        data.to_stata(path, write_index=False, version=117)
        # The frame read back is compared with the input after its missing
        # values are replaced by ''.
        tm.assert_frame_equal(read_stata(path), data.fillna(''))

        # Check strl supports all None (null)
        data.loc[:, 'mixed'] = None
        data.to_stata(path, write_index=False, version=117,
                      convert_strl=['mixed'])
        tm.assert_frame_equal(read_stata(path), data.fillna(''))

Source File: test_liml.py From econtools with BSD 3-Clause "New" or "Revised" License

6 votes

def setup_class(cls):
    """Run ``ivreg`` with ``iv_method='liml'`` on auto.dta; store the result.

    NOTE(review): an earlier description cited `sysuse auto; reg price mpg`,
    which does not match the instrumented call below; confirm which Stata
    command produced ``liml_std``.
    """
    cls.init(cls)
    # The data file lives in a 'data' directory next to this test module.
    here = path.split(path.relpath(__file__))[0]
    autodata = pd.read_stata(path.join(here, 'data', 'auto.dta'))
    cls.result = ivreg(
        autodata,
        'price',
        ['mpg', 'length'],
        ['trunk', 'weight', 'headroom'],
        [],
        addcons=True,
        iv_method='liml',
        nosingles=True,
    )
    cls.expected = liml_std

Source File: test_liml.py From econtools with BSD 3-Clause "New" or "Revised" License

6 votes

def setup_class(cls):
    """Run robust-VCE LIML ``ivreg`` on auto.dta and store the result.

    NOTE(review): an earlier description cited `sysuse auto; reg price mpg`,
    which does not match the instrumented call below; confirm which Stata
    command produced ``liml_robust``.
    """
    cls.init(cls)
    # Per-statistic comparison precision overrides for this case.
    cls.precision['se'] = 0
    cls.precision['CI_low'] = 0
    cls.precision['CI_high'] = -1
    # The data file lives in a 'data' directory next to this test module.
    here = path.split(path.relpath(__file__))[0]
    autodata = pd.read_stata(path.join(here, 'data', 'auto.dta'))
    cls.result = ivreg(
        autodata,
        'price',
        ['mpg', 'length'],
        ['trunk', 'weight', 'headroom'],
        [],
        addcons=True,
        iv_method='liml',
        vce_type='robust',
        nosingles=True,
    )
    cls.expected = liml_robust

Source File: test_stata.py From recruit with Apache License 2.0

6 votes

def test_dtype_conversion(self):
    """Compare ``read_stata`` output with and without ``preserve_dtypes``."""
    target_dtypes = (
        ('byte_', np.int8),
        ('int_', np.int16),
        ('long_', np.int32),
        ('float_', np.float32),
        ('double_', np.float64),
    )
    expected = self.read_csv(self.csv15)
    for key, dtype in target_dtypes:
        expected[key] = expected[key].astype(dtype)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))

    # Default read: compared against the CSV frame with narrowed dtypes.
    kept = read_stata(self.dta15_117, convert_dates=True)
    tm.assert_frame_equal(expected, kept)

    upcast = read_stata(self.dta15_117, convert_dates=True,
                        preserve_dtypes=False)

    # read_csv types are the same
    expected = self.read_csv(self.csv15)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))

    tm.assert_frame_equal(expected, upcast)

Source File: test_liml.py From econtools with BSD 3-Clause "New" or "Revised" License

6 votes

def setup_class(cls):
    """Run clustered LIML ``ivreg`` on auto.dta and store the result.

    NOTE(review): an earlier description cited `sysuse auto; reg price mpg`,
    which does not match the instrumented call below; confirm which Stata
    command produced ``liml_cluster``.
    """
    cls.init(cls)
    # Per-statistic comparison precision overrides for this case.
    cls.precision['se'] = 0
    cls.precision['CI_low'] = 0
    cls.precision['CI_high'] = 0
    # The data file lives in a 'data' directory next to this test module.
    here = path.split(path.relpath(__file__))[0]
    autodata = pd.read_stata(path.join(here, 'data', 'auto.dta'))
    cls.result = ivreg(
        autodata,
        'price',
        ['mpg', 'length'],
        ['trunk', 'weight', 'headroom'],
        [],
        addcons=True,
        iv_method='liml',
        cluster='gear_ratio',
        nosingles=True,
    )
    cls.expected = liml_cluster

Source File: test_liml.py From econtools with BSD 3-Clause "New" or "Revised" License

6 votes

def setup_class(cls):
    """Run clustered LIML ``ivreg`` with two instruments; store the result.

    The expected values are ``tsls_cluster`` even though ``iv_method`` is
    'liml'. NOTE(review): presumably intentional, since two instruments for
    two endogenous regressors is the exactly identified case; confirm. An
    earlier description cited `sysuse auto; reg price mpg`, which does not
    match the call below.
    """
    cls.init(cls)
    # The data file lives in a 'data' directory next to this test module.
    here = path.split(path.relpath(__file__))[0]
    autodata = pd.read_stata(path.join(here, 'data', 'auto.dta'))
    cls.result = ivreg(
        autodata,
        'price',
        ['mpg', 'length'],
        ['weight', 'trunk'],
        [],
        addcons=True,
        iv_method='liml',
        cluster='gear_ratio',
        nosingles=True,
    )
    cls.expected = tsls_cluster

Source File: test_stata.py From recruit with Apache License 2.0

6 votes

def test_encoding(self, version):
    """Check that passing ``encoding`` warns and the data still round-trips.

    GH 4626, proper encoding handling.
    """
    unencoded = read_stata(self.dta_encoding)
    # Supplying ``encoding`` to the reader must emit a FutureWarning.
    with tm.assert_produces_warning(FutureWarning):
        encoded = read_stata(self.dta_encoding, encoding='latin-1')

    result = encoded.kreis1849[0]
    expected = unencoded.kreis1849[0]
    assert result == expected
    assert isinstance(result, compat.string_types)

    with tm.ensure_clean() as path:
        # The writer must warn about ``encoding`` as well.
        with tm.assert_produces_warning(FutureWarning):
            encoded.to_stata(path, encoding='latin-1',
                             write_index=False, version=version)
        # Read back without an explicit encoding and compare.
        reread_encoded = read_stata(path)
        tm.assert_frame_equal(encoded, reread_encoded)

Source File: test_stata.py From vnpy_crypto with MIT License

6 votes

def test_date_parsing_ignores_format_details(self, column):
    """Check that a date column parses the same with or without a format.

    GH 17797

    Test that display formats are ignored when determining if a numeric
    column is a date value.

    All date types are stored as numbers and format associated with the
    column denotes both the type of the date and the display format.

    STATA supports 9 date types which each have distinct units. We test 7
    of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that
    accounts for leap seconds and %tb relies on STATAs business calendar.
    """
    parsed = read_stata(self.stata_dates)
    # Compare the first row of ``column`` with its "<column>_fmt" twin.
    bare = parsed.loc[0, column]
    decorated = parsed.loc[0, column + "_fmt"]
    assert bare == decorated

Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_pickle_path_localpath(self):
    """Round-trip a frame through ``tm.round_trip_localpath``."""
    frame = tm.makeDataFrame()
    frame.index.name = 'index'

    def read_indexed(target):
        # Restore the written 'index' column as the frame's index.
        return read_stata(target).set_index('index')

    roundtripped = tm.round_trip_localpath(frame.to_stata, read_indexed)
    tm.assert_frame_equal(frame, roundtripped)

Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_drop_column(self):
    """Exercise ``columns`` in ``read_stata``: subset, reorder and errors."""
    typed = self.read_csv(self.csv15)
    for name, dtype in (('byte_', np.int8),
                        ('int_', np.int16),
                        ('long_', np.int32),
                        ('float_', np.float32),
                        ('double_', np.float64)):
        typed[name] = typed[name].astype(dtype)
    typed['date_td'] = typed['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))

    # Reading a subset of the columns.
    subset = ['byte_', 'int_', 'long_']
    expected = typed[subset]
    dropped = read_stata(self.dta15_117, convert_dates=True,
                         columns=subset)
    tm.assert_frame_equal(expected, dropped)

    # See PR 10757
    shuffled = ['int_', 'long_', 'byte_']
    expected = expected[shuffled]
    reordered = read_stata(self.dta15_117, convert_dates=True,
                           columns=shuffled)
    tm.assert_frame_equal(expected, reordered)

    # Duplicate column names are rejected.
    msg = "columns contains duplicate entries"
    with pytest.raises(ValueError, match=msg):
        read_stata(self.dta15_117, convert_dates=True,
                   columns=['byte_', 'byte_'])

    # So are names that are not present in the file.
    msg = ("The following columns were not found in the Stata data set:"
           " not_found")
    with pytest.raises(ValueError, match=msg):
        read_stata(self.dta15_117, convert_dates=True,
                   columns=['byte_', 'int_', 'long_', 'not_found'])

Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_read_chunks_115(self, file, chunksize,
                         convert_categoricals, convert_dates):
    """Check that chunked reads agree with reading the whole file.

    ``file`` names an attribute on ``self`` holding the path. The file is
    read once in full and then through an iterator in pieces of
    ``chunksize`` rows (at most five); each piece is compared with the
    matching slice of the full frame.
    """
    fname = getattr(self, file)

    # Read the whole file
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        parsed = read_stata(
            fname,
            convert_categoricals=convert_categoricals,
            convert_dates=convert_dates)

    # Compare to what we get when reading by chunk
    itr = read_stata(
        fname, iterator=True,
        convert_dates=convert_dates,
        convert_categoricals=convert_categoricals)
    # Close the reader even when a comparison fails, so a failing test
    # does not leave the file handle open.
    try:
        pos = 0
        for _ in range(5):
            with warnings.catch_warnings(record=True):
                warnings.simplefilter("always")
                try:
                    chunk = itr.read(chunksize)
                except StopIteration:
                    break
            from_frame = parsed.iloc[pos:pos + chunksize, :]
            tm.assert_frame_equal(
                from_frame, chunk, check_dtype=False,
                check_datetimelike_compat=True,
                check_categorical=False)

            pos += chunksize
    finally:
        itr.close()

Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_105(self):
    """Compare the first three rows of S4_EDUC1.dta with known values."""
    # Data obtained from:
    # http://go.worldbank.org/ZXY29PVJ21
    actual = pd.read_stata(os.path.join(self.dirpath, 'S4_EDUC1.dta'))

    expected = pd.DataFrame(
        [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]],
        columns=["clustnum", "pri_schl", "psch_num", "psch_dis"])
    expected = expected.astype({
        'clustnum': np.int16,
        'pri_schl': np.int8,
        'psch_num': np.int8,
        'psch_dis': np.float32,
    })
    tm.assert_frame_equal(actual.head(3), expected)

Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def test_read_empty_dta(self, version):
    """Write a frame with one column and no rows, then read it back."""
    written = DataFrame(columns=['unit'])
    # GH 7369, make sure can read a 0-obs dta file
    with tm.ensure_clean() as path:
        written.to_stata(path, version=version, write_index=False)
        reread = read_stata(path)
        tm.assert_frame_equal(written, reread)

Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def read_dta(self, file):
    """Read ``file`` with ``read_stata`` using ``convert_dates=True``."""
    # Legacy default reader configuration
    legacy_options = {'convert_dates': True}
    return read_stata(file, **legacy_options)

Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0

5 votes

def parsed_114(dirpath):
    """Load stata5_114.dta from ``dirpath`` with its index named 'index'."""
    frame = read_stata(os.path.join(dirpath, 'stata5_114.dta'),
                       convert_dates=True)
    frame.index.name = 'index'
    return frame

Source File: run_pandas.py From recipy with Apache License 2.0

5 votes

def read_stata(self):
    """
    Use pandas.read_stata to load dataframe.dta.

    The file is looked up in ``self.data_dir``; the loaded frame is not
    returned or stored.
    """
    pd.read_stata(os.path.join(self.data_dir, "dataframe.dta"))

Source File: test_stata.py From vnpy_crypto with MIT License

5 votes

def test_gzip_writing(self):
    """Write a frame through a gzip file object and read it back."""
    # writing version 117 requires seek and cannot be used with gzip
    frame = tm.makeDataFrame()
    frame.index.name = 'index'
    with tm.ensure_clean() as path:
        with gzip.GzipFile(filename=path, mode='wb') as sink:
            frame.to_stata(sink, version=114)
        with gzip.GzipFile(filename=path, mode='rb') as source:
            reread = pd.read_stata(source, index_col='index')
    tm.assert_frame_equal(frame, reread)

Source File: test_stata.py From vnpy_crypto with MIT License

5 votes

def test_nonfile_writing(self, version):
    """Write a frame to an in-memory buffer, dump it to disk, read it back."""
    # GH 21041
    buffer = io.BytesIO()
    frame = tm.makeDataFrame()
    frame.index.name = 'index'
    with tm.ensure_clean() as path:
        frame.to_stata(buffer, version=version)
        # Copy everything written to the buffer into a real file.
        payload = buffer.getvalue()
        with open(path, 'wb') as dta:
            dta.write(payload)
        reread = pd.read_stata(path, index_col='index')
    tm.assert_frame_equal(frame, reread)

Python pandas.read_stata() Examples