Python pandas.read_stata() Examples

The following are 30 code examples of pandas.read_stata(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas, or try the search function.
Example #1
Source File: test_stata.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_encoding(self, version):
        """Round-trip latin-1 encoded strings through Stata I/O (GH 4626).

        Reads the encoding fixture with and without an explicit
        ``encoding``, compares the first ``kreis1849`` value, then writes
        the decoded frame back out and re-reads it.
        """
        plain = read_stata(self.dta_encoding)
        latin = read_stata(self.dta_encoding, encoding="latin-1")
        result = latin.kreis1849[0]

        # Pick the reference value and text type for the running
        # interpreter, then apply one shared pair of assertions.
        if compat.PY3:
            expected = plain.kreis1849[0]
            text_type = compat.string_types
        else:
            expected = plain.kreis1849.str.decode("latin-1")[0]
            text_type = unicode  # noqa
        assert result == expected
        assert isinstance(result, text_type)

        # Writing with the same encoding and reading it back must
        # reproduce the decoded frame.
        with tm.ensure_clean() as path:
            latin.to_stata(path, encoding='latin-1',
                           write_index=False, version=version)
            tm.assert_frame_equal(
                latin, read_stata(path, encoding='latin-1'))
Example #2
Source File: test_stata.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_mixed_string_strl(self):
        """Write a string column holding ``None`` as Stata 117 (GH 23633)."""
        frame = pd.DataFrame([
            {'mixed': 'string' * 500, 'number': 0},
            {'mixed': None, 'number': 1},
        ])
        frame['number'] = frame['number'].astype('int32')

        with tm.ensure_clean() as path:
            # A None next to a long string is expected back as ''.
            frame.to_stata(path, write_index=False, version=117)
            tm.assert_frame_equal(read_stata(path), frame.fillna(''))

            # Same expectation when every value is None and the column
            # is explicitly requested as strL.
            frame.loc[:, 'mixed'] = None
            frame.to_stata(path, write_index=False, convert_strl=['mixed'],
                           version=117)
            tm.assert_frame_equal(read_stata(path), frame.fillna(''))
Example #3
Source File: test_stata.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_encoding(self, version):
        """Check latin-1 handling when ``encoding`` is passed (GH 4626).

        Both the read and the write that receive ``encoding`` are
        expected to emit a ``FutureWarning``.
        """
        plain = read_stata(self.dta_encoding)
        with tm.assert_produces_warning(FutureWarning):
            latin = read_stata(self.dta_encoding, encoding='latin-1')

        # The explicit encoding must not change the decoded value.
        result = latin.kreis1849[0]
        expected = plain.kreis1849[0]
        assert result == expected
        assert isinstance(result, compat.string_types)

        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(FutureWarning):
                latin.to_stata(path, write_index=False, version=version,
                               encoding='latin-1')
            # Re-read without ``encoding`` and compare to what was written.
            tm.assert_frame_equal(latin, read_stata(path))
Example #4
Source File: test_stata.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_date_parsing_ignores_format_details(self, column):
        """Date detection must not depend on the display format (GH 17797).

        Every date type is stored as a number; the format attached to the
        column carries both the date type and the display format. The
        fixture pairs each ``column`` with a ``column + "_fmt"`` twin, and
        the two must parse to the same value in the first row.

        Per the original test notes, Stata has 9 date types with distinct
        units and 7 are covered here: %tC (a %tc variant that accounts for
        leap seconds) and %tb (business calendars) are skipped.
        """
        frame = read_stata(self.stata_dates)
        fmt_column = "{}_fmt".format(column)
        assert frame.loc[0, column] == frame.loc[0, fmt_column]
Example #5
Source File: test_varmax.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def setup_class(cls, true, order, trend, error_cov_type, cov_type='approx',
                 **kwargs):
        """Build a VARMAX model on ``manufac.dta`` and smooth it.

        Stores ``true`` on the class, loads the dataset from the
        ``results`` directory under ``current_path``, differences the two
        log series, fits ``varmax.VARMAX`` on the rows from 1972-02-01
        onward, and keeps the result of smoothing at ``true['params']``.
        Extra keyword arguments are forwarded to ``VARMAX``.
        """
        cls.true = true

        data_file = os.sep.join([current_path, 'results', 'manufac.dta'])
        with open(data_file, 'rb') as handle:
            dta = pd.read_stata(handle)
        dta.index = pd.DatetimeIndex(dta.month, freq='MS')

        # First differences of the two log series.
        for target, source in (('dlncaputil', 'lncaputil'),
                               ('dlnhours', 'lnhours')):
            dta[target] = dta[source].diff()

        endog = dta.loc['1972-02-01':, ['dlncaputil', 'dlnhours']]

        # Warnings raised while constructing the model are captured and
        # discarded rather than surfaced.
        with warnings.catch_warnings(record=True):
            warnings.simplefilter('always')
            cls.model = varmax.VARMAX(endog, order=order, trend=trend,
                                      error_cov_type=error_cov_type,
                                      **kwargs)

        cls.results = cls.model.smooth(true['params'], cov_type=cov_type)
Example #6
Source File: test_stata.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_dtype_conversion(self):
        """Compare ``read_stata`` output with and without ``preserve_dtypes``."""
        date_args = ('%Y-%m-%d',)
        stata_dtypes = (('byte_', np.int8), ('int_', np.int16),
                        ('long_', np.int32), ('float_', np.float32),
                        ('double_', np.float64))

        # Default read: the CSV reference is cast column by column to the
        # narrower dtypes before comparing.
        expected = self.read_csv(self.csv15)
        for name, dtype in stata_dtypes:
            expected[name] = expected[name].astype(dtype)
        expected['date_td'] = expected['date_td'].apply(datetime.strptime,
                                                        args=date_args)

        no_conversion = read_stata(self.dta15_117, convert_dates=True)
        tm.assert_frame_equal(expected, no_conversion)

        conversion = read_stata(self.dta15_117, convert_dates=True,
                                preserve_dtypes=False)

        # With preserve_dtypes=False the reference is the CSV frame
        # without any casts; only the date column is parsed.
        expected = self.read_csv(self.csv15)
        expected['date_td'] = expected['date_td'].apply(datetime.strptime,
                                                        args=date_args)

        tm.assert_frame_equal(expected, conversion)
Example #7
Source File: test_stata.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_out_of_range_float(self):
        """Exercise ``to_stata`` with float32 values at and beyond the limit."""
        f32 = np.finfo(np.float32)
        original = DataFrame({
            'ColumnOk': [0.0, f32.eps, f32.max / 10.0],
            'ColumnTooBig': [0.0, f32.eps, f32.max],
        })
        original.index.name = 'index'
        for name in original:
            original[name] = original[name].astype(np.float32)

        with tm.ensure_clean() as path:
            original.to_stata(path)
            reread = read_stata(path)
            # The comparison expects the column holding float32 max to
            # come back as float64.
            original['ColumnTooBig'] = original['ColumnTooBig'].astype(
                np.float64)
            tm.assert_frame_equal(original, reread.set_index('index'))

        # Infinity has to be rejected with a descriptive ValueError.
        original.loc[2, 'ColumnTooBig'] = np.inf
        msg = ("Column ColumnTooBig has a maximum value of infinity which"
               " is outside the range supported by Stata")
        with pytest.raises(ValueError, match=msg), \
                tm.ensure_clean() as path:
            original.to_stata(path)
Example #8
Source File: test_stata.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_out_of_range_float(self):
        """Exercise ``to_stata`` with float32 values at and beyond the limit."""
        f32 = np.finfo(np.float32)
        original = DataFrame({
            'ColumnOk': [0.0, f32.eps, f32.max / 10.0],
            'ColumnTooBig': [0.0, f32.eps, f32.max],
        })
        original.index.name = 'index'
        for name in original:
            original[name] = original[name].astype(np.float32)

        with tm.ensure_clean() as path:
            original.to_stata(path)
            reread = read_stata(path)
            # The comparison expects the column holding float32 max to
            # come back as float64.
            original['ColumnTooBig'] = original['ColumnTooBig'].astype(
                np.float64)
            tm.assert_frame_equal(original, reread.set_index('index'))

        # Infinity has to be rejected with a descriptive ValueError.
        original.loc[2, 'ColumnTooBig'] = np.inf
        msg = ("Column ColumnTooBig has a maximum value of infinity which"
               " is outside the range supported by Stata")
        with pytest.raises(ValueError, match=msg), \
                tm.ensure_clean() as path:
            original.to_stata(path)
Example #9
Source File: test_stata.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_dtype_conversion(self):
        """Compare ``read_stata`` output with and without ``preserve_dtypes``."""
        date_args = ('%Y-%m-%d',)
        stata_dtypes = (('byte_', np.int8), ('int_', np.int16),
                        ('long_', np.int32), ('float_', np.float32),
                        ('double_', np.float64))

        # Default read: the CSV reference is cast column by column to the
        # narrower dtypes before comparing.
        expected = self.read_csv(self.csv15)
        for name, dtype in stata_dtypes:
            expected[name] = expected[name].astype(dtype)
        expected['date_td'] = expected['date_td'].apply(datetime.strptime,
                                                        args=date_args)

        no_conversion = read_stata(self.dta15_117, convert_dates=True)
        tm.assert_frame_equal(expected, no_conversion)

        conversion = read_stata(self.dta15_117, convert_dates=True,
                                preserve_dtypes=False)

        # With preserve_dtypes=False the reference is the CSV frame
        # without any casts; only the date column is parsed.
        expected = self.read_csv(self.csv15)
        expected['date_td'] = expected['date_td'].apply(datetime.strptime,
                                                        args=date_args)

        tm.assert_frame_equal(expected, conversion)
Example #10
Source File: stata.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def read_stata(filepath_or_buffer, convert_dates=True,
               convert_categoricals=True, encoding=None, index_col=None,
               convert_missing=False, preserve_dtypes=True, columns=None,
               order_categoricals=True, chunksize=None, iterator=False):
    """Read a Stata file, or hand back a reader for incremental access.

    Every argument except ``iterator`` is forwarded unchanged to
    ``StataReader``.

    Returns
    -------
    The ``StataReader`` itself, left open, when ``iterator`` or
    ``chunksize`` is truthy. Otherwise the result of ``reader.read()``;
    in that case the reader is closed before returning, including when
    ``read()`` raises.
    """
    reader = StataReader(
        filepath_or_buffer,
        convert_dates=convert_dates,
        convert_categoricals=convert_categoricals,
        index_col=index_col,
        convert_missing=convert_missing,
        preserve_dtypes=preserve_dtypes,
        columns=columns,
        order_categoricals=order_categoricals,
        chunksize=chunksize,
        encoding=encoding,
    )

    # Incremental access: the caller owns the reader and must close it.
    if iterator or chunksize:
        return reader

    try:
        return reader.read()
    finally:
        reader.close()
Example #11
Source File: test_stata.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_date_parsing_ignores_format_details(self, column):
        """Date detection must not depend on the display format (GH 17797).

        Every date type is stored as a number; the format attached to the
        column carries both the date type and the display format. The
        fixture pairs each ``column`` with a ``column + "_fmt"`` twin, and
        the two must parse to the same value in the first row.

        Per the original test notes, Stata has 9 date types with distinct
        units and 7 are covered here: %tC (a %tc variant that accounts for
        leap seconds) and %tb (business calendars) are skipped.
        """
        frame = read_stata(self.stata_dates)
        fmt_column = "{}_fmt".format(column)
        assert frame.loc[0, column] == frame.loc[0, fmt_column]
Example #12
Source File: test_stata.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_out_of_range_float(self):
        """Exercise ``to_stata`` with float32 values at and beyond the limit.

        First round-trips a frame whose second column holds float32 max,
        then checks that writing infinity raises a ``ValueError`` whose
        message names the column and mentions infinity.
        """
        original = DataFrame({'ColumnOk': [0.0,
                                           np.finfo(np.float32).eps,
                                           np.finfo(np.float32).max / 10.0],
                              'ColumnTooBig': [0.0,
                                               np.finfo(np.float32).eps,
                                               np.finfo(np.float32).max]})
        original.index.name = 'index'
        for col in original:
            original[col] = original[col].astype(np.float32)

        with tm.ensure_clean() as path:
            original.to_stata(path)
            reread = read_stata(path)
            # The comparison expects the column holding float32 max to
            # come back as float64.
            original['ColumnTooBig'] = original['ColumnTooBig'].astype(
                np.float64)
            tm.assert_frame_equal(original,
                                  reread.set_index('index'))

        original.loc[2, 'ColumnTooBig'] = np.inf
        with pytest.raises(ValueError) as excinfo:
            with tm.ensure_clean() as path:
                original.to_stata(path)

        # These checks must sit outside the ``pytest.raises`` block: code
        # placed after the raising call inside the block never runs. The
        # caught exception is exposed as ``excinfo.value``.
        message = str(excinfo.value)
        assert 'ColumnTooBig' in message
        assert 'infinity' in message
Example #13
Source File: test_stata.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 6 votes vote down vote up
def test_mixed_string_strl(self):
        """Write a string column holding ``None`` as Stata 117 (GH 23633)."""
        frame = pd.DataFrame([
            {'mixed': 'string' * 500, 'number': 0},
            {'mixed': None, 'number': 1},
        ])
        frame['number'] = frame['number'].astype('int32')

        with tm.ensure_clean() as path:
            # A None next to a long string is expected back as ''.
            frame.to_stata(path, write_index=False, version=117)
            tm.assert_frame_equal(read_stata(path), frame.fillna(''))

            # Same expectation when every value is None and the column
            # is explicitly requested as strL.
            frame.loc[:, 'mixed'] = None
            frame.to_stata(path, write_index=False, convert_strl=['mixed'],
                           version=117)
            tm.assert_frame_equal(read_stata(path), frame.fillna(''))
Example #14
Source File: test_liml.py    From econtools with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def setup_class(cls):
        """Run a LIML ``ivreg`` on ``auto.dta`` and pair it with ``liml_std``."""
        cls.init(cls)

        # The dataset lives in the ``data`` folder beside this test file.
        test_dir, _ = path.split(path.relpath(__file__))
        autodata = pd.read_stata(path.join(test_dir, 'data', 'auto.dta'))

        cls.result = ivreg(
            autodata,
            'price',                          # y
            ['mpg', 'length'],                # x_end
            ['trunk', 'weight', 'headroom'],  # z
            [],                               # x_exog
            addcons=True,
            iv_method='liml',
            nosingles=True)
        cls.expected = liml_std
Example #15
Source File: test_liml.py    From econtools with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def setup_class(cls):
        """Run a robust-VCE LIML ``ivreg`` and pair it with ``liml_robust``."""
        cls.init(cls)
        # Per-statistic comparison precision overrides for this case.
        cls.precision['se'] = 0
        cls.precision['CI_low'] = 0
        cls.precision['CI_high'] = -1

        # The dataset lives in the ``data`` folder beside this test file.
        test_dir, _ = path.split(path.relpath(__file__))
        autodata = pd.read_stata(path.join(test_dir, 'data', 'auto.dta'))

        cls.result = ivreg(
            autodata,
            'price',                          # y
            ['mpg', 'length'],                # x_end
            ['trunk', 'weight', 'headroom'],  # z
            [],                               # x_exog
            addcons=True,
            iv_method='liml',
            vce_type='robust',
            nosingles=True)
        cls.expected = liml_robust
Example #16
Source File: test_stata.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_dtype_conversion(self):
        """Compare ``read_stata`` output with and without ``preserve_dtypes``."""
        date_args = ('%Y-%m-%d',)
        stata_dtypes = (('byte_', np.int8), ('int_', np.int16),
                        ('long_', np.int32), ('float_', np.float32),
                        ('double_', np.float64))

        # Default read: the CSV reference is cast column by column to the
        # narrower dtypes before comparing.
        expected = self.read_csv(self.csv15)
        for name, dtype in stata_dtypes:
            expected[name] = expected[name].astype(dtype)
        expected['date_td'] = expected['date_td'].apply(datetime.strptime,
                                                        args=date_args)

        no_conversion = read_stata(self.dta15_117, convert_dates=True)
        tm.assert_frame_equal(expected, no_conversion)

        conversion = read_stata(self.dta15_117, convert_dates=True,
                                preserve_dtypes=False)

        # With preserve_dtypes=False the reference is the CSV frame
        # without any casts; only the date column is parsed.
        expected = self.read_csv(self.csv15)
        expected['date_td'] = expected['date_td'].apply(datetime.strptime,
                                                        args=date_args)

        tm.assert_frame_equal(expected, conversion)
Example #17
Source File: test_liml.py    From econtools with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def setup_class(cls):
        """Run a clustered LIML ``ivreg`` and pair it with ``liml_cluster``."""
        cls.init(cls)
        # Per-statistic comparison precision overrides for this case.
        cls.precision['se'] = 0
        cls.precision['CI_low'] = 0
        cls.precision['CI_high'] = 0

        # The dataset lives in the ``data`` folder beside this test file.
        test_dir, _ = path.split(path.relpath(__file__))
        autodata = pd.read_stata(path.join(test_dir, 'data', 'auto.dta'))

        cls.result = ivreg(
            autodata,
            'price',                          # y
            ['mpg', 'length'],                # x_end
            ['trunk', 'weight', 'headroom'],  # z
            [],                               # x_exog
            addcons=True,
            iv_method='liml',
            cluster='gear_ratio',
            nosingles=True)
        cls.expected = liml_cluster
Example #18
Source File: test_liml.py    From econtools with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def setup_class(cls):
        """Run a clustered LIML ``ivreg`` with two instruments.

        The stored reference for this case is ``tsls_cluster``.
        """
        cls.init(cls)

        # The dataset lives in the ``data`` folder beside this test file.
        test_dir, _ = path.split(path.relpath(__file__))
        autodata = pd.read_stata(path.join(test_dir, 'data', 'auto.dta'))

        cls.result = ivreg(
            autodata,
            'price',              # y
            ['mpg', 'length'],    # x_end
            ['weight', 'trunk'],  # z
            [],                   # x_exog
            addcons=True,
            iv_method='liml',
            cluster='gear_ratio',
            nosingles=True)
        cls.expected = tsls_cluster
Example #19
Source File: test_stata.py    From recruit with Apache License 2.0 6 votes vote down vote up
def test_encoding(self, version):
        """Check latin-1 handling when ``encoding`` is passed (GH 4626).

        Both the read and the write that receive ``encoding`` are
        expected to emit a ``FutureWarning``.
        """
        plain = read_stata(self.dta_encoding)
        with tm.assert_produces_warning(FutureWarning):
            latin = read_stata(self.dta_encoding, encoding='latin-1')

        # The explicit encoding must not change the decoded value.
        result = latin.kreis1849[0]
        expected = plain.kreis1849[0]
        assert result == expected
        assert isinstance(result, compat.string_types)

        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(FutureWarning):
                latin.to_stata(path, write_index=False, version=version,
                               encoding='latin-1')
            # Re-read without ``encoding`` and compare to what was written.
            tm.assert_frame_equal(latin, read_stata(path))
Example #20
Source File: test_stata.py    From vnpy_crypto with MIT License 6 votes vote down vote up
def test_date_parsing_ignores_format_details(self, column):
        """Date detection must not depend on the display format (GH 17797).

        Every date type is stored as a number; the format attached to the
        column carries both the date type and the display format. The
        fixture pairs each ``column`` with a ``column + "_fmt"`` twin, and
        the two must parse to the same value in the first row.

        Per the original test notes, Stata has 9 date types with distinct
        units and 7 are covered here: %tC (a %tc variant that accounts for
        leap seconds) and %tb (business calendars) are skipped.
        """
        frame = read_stata(self.stata_dates)
        fmt_column = "{}_fmt".format(column)
        assert frame.loc[0, column] == frame.loc[0, fmt_column]
Example #21
Source File: test_stata.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_pickle_path_localpath(self):
        """Round-trip a frame through ``tm.round_trip_localpath``."""
        def load_indexed(target):
            # Restore the named index that to_stata wrote as a column.
            return read_stata(target).set_index('index')

        frame = tm.makeDataFrame()
        frame.index.name = 'index'
        roundtripped = tm.round_trip_localpath(frame.to_stata, load_indexed)
        tm.assert_frame_equal(frame, roundtripped)
Example #22
Source File: test_stata.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_drop_column(self):
        """Check ``read_stata(columns=...)``: subset, reorder, and errors."""
        expected = self.read_csv(self.csv15)
        for name, dtype in (('byte_', np.int8), ('int_', np.int16),
                            ('long_', np.int32), ('float_', np.float32),
                            ('double_', np.float64)):
            expected[name] = expected[name].astype(dtype)
        expected['date_td'] = expected['date_td'].apply(datetime.strptime,
                                                        args=('%Y-%m-%d',))

        # Reading a subset returns just those columns.
        subset = ['byte_', 'int_', 'long_']
        expected = expected[subset]
        dropped = read_stata(self.dta15_117, convert_dates=True,
                             columns=subset)
        tm.assert_frame_equal(expected, dropped)

        # See PR 10757: the requested order is honoured.
        reordering = ['int_', 'long_', 'byte_']
        expected = expected[reordering]
        reordered = read_stata(self.dta15_117, convert_dates=True,
                               columns=reordering)
        tm.assert_frame_equal(expected, reordered)

        # Duplicate names are rejected.
        duplicated = ['byte_', 'byte_']
        msg = "columns contains duplicate entries"
        with pytest.raises(ValueError, match=msg):
            read_stata(self.dta15_117, convert_dates=True,
                       columns=duplicated)

        # Unknown names are rejected and listed in the message.
        with_missing = ['byte_', 'int_', 'long_', 'not_found']
        msg = ("The following columns were not found in the Stata data set:"
               " not_found")
        with pytest.raises(ValueError, match=msg):
            read_stata(self.dta15_117, convert_dates=True,
                       columns=with_missing)
Example #23
Source File: test_stata.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_read_chunks_115(self, file, chunksize,
                             convert_categoricals, convert_dates):
        """Reading in chunks must match slices of a whole-file read.

        ``file`` is the name of an attribute on ``self`` that holds the
        path to read. At most five chunks of ``chunksize`` rows are
        compared against the matching ``iloc`` slice of the full frame.
        """
        fname = getattr(self, file)

        # Read the whole file; warnings raised while parsing are captured
        # and discarded.
        with warnings.catch_warnings(record=True):
            warnings.simplefilter("always")
            parsed = read_stata(
                fname,
                convert_categoricals=convert_categoricals,
                convert_dates=convert_dates)

        # Compare to what we get when reading by chunk
        itr = read_stata(
            fname, iterator=True,
            convert_dates=convert_dates,
            convert_categoricals=convert_categoricals)
        # try/finally so the reader is closed even when a comparison
        # fails part-way through the loop.
        try:
            pos = 0
            for _ in range(5):
                with warnings.catch_warnings(record=True):
                    warnings.simplefilter("always")
                    try:
                        chunk = itr.read(chunksize)
                    except StopIteration:
                        # The file ran out before five chunks were read.
                        break
                from_frame = parsed.iloc[pos:pos + chunksize, :]
                tm.assert_frame_equal(
                    from_frame, chunk, check_dtype=False,
                    check_datetimelike_compat=True,
                    check_categorical=False)

                pos += chunksize
        finally:
            itr.close()
Example #24
Source File: test_stata.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_105(self):
        """Read ``S4_EDUC1.dta`` and check its first three rows.

        Data obtained from: http://go.worldbank.org/ZXY29PVJ21
        """
        loaded = pd.read_stata(os.path.join(self.dirpath, 'S4_EDUC1.dta'))

        expected = pd.DataFrame(
            [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]],
            columns=["clustnum", "pri_schl", "psch_num", "psch_dis"])
        # Cast each column to the dtype the comparison expects.
        for name, dtype in (('clustnum', np.int16), ('pri_schl', np.int8),
                            ('psch_num', np.int8), ('psch_dis', np.float32)):
            expected[name] = expected[name].astype(dtype)

        tm.assert_frame_equal(loaded.head(3), expected)
Example #25
Source File: test_stata.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def test_read_empty_dta(self, version):
        """A zero-observation dta file must be readable (GH 7369)."""
        written = DataFrame(columns=['unit'])
        with tm.ensure_clean() as path:
            written.to_stata(path, write_index=False, version=version)
            reloaded = read_stata(path)
            tm.assert_frame_equal(written, reloaded)
Example #26
Source File: test_stata.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def read_dta(self, file):
        """Load ``file`` via ``read_stata`` with ``convert_dates=True``.

        Labelled in the original as the legacy default reader
        configuration.
        """
        frame = read_stata(file, convert_dates=True)
        return frame
Example #27
Source File: test_stata.py    From predictive-maintenance-using-machine-learning with Apache License 2.0 5 votes vote down vote up
def parsed_114(dirpath):
    """Read ``stata5_114.dta`` from ``dirpath`` and name its index 'index'."""
    frame = read_stata(os.path.join(dirpath, 'stata5_114.dta'),
                       convert_dates=True)
    frame.index.name = 'index'
    return frame
Example #28
Source File: run_pandas.py    From recipy with Apache License 2.0 5 votes vote down vote up
def read_stata(self):
        """
        Load dataframe.dta from ``self.data_dir`` with pandas.read_stata.

        The loaded frame is discarded; nothing is returned.
        """
        pd.read_stata(os.path.join(self.data_dir, "dataframe.dta"))
Example #29
Source File: test_stata.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def test_gzip_writing(self):
        """Round-trip a frame through a gzip-wrapped Stata file.

        Version 114 is used because, per the original note, writing
        version 117 requires seek and cannot be used with gzip.
        """
        frame = tm.makeDataFrame()
        frame.index.name = 'index'
        with tm.ensure_clean() as path:
            with gzip.GzipFile(path, 'wb') as sink:
                frame.to_stata(sink, version=114)
            with gzip.GzipFile(path, 'rb') as source:
                reread = pd.read_stata(source, index_col='index')
        tm.assert_frame_equal(frame, reread)
Example #30
Source File: test_stata.py    From vnpy_crypto with MIT License 5 votes vote down vote up
def test_nonfile_writing(self, version):
        """Write to an in-memory buffer, then read it from disk (GH 21041)."""
        frame = tm.makeDataFrame()
        frame.index.name = 'index'
        buffer = io.BytesIO()
        with tm.ensure_clean() as path:
            frame.to_stata(buffer, version=version)
            # Rewind, then copy the buffered bytes into a real file.
            buffer.seek(0)
            with open(path, 'wb') as handle:
                handle.write(buffer.read())
            reread = pd.read_stata(path, index_col='index')
        tm.assert_frame_equal(frame, reread)