Python pandas.read_stata() Examples
The following are 30 code examples of pandas.read_stata().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pandas, or try the search function.
Example #1
Source File: test_stata.py From vnpy_crypto with MIT License | 6 votes |
def test_encoding(self, version):
    """Read the encoding fixture with and without ``encoding='latin-1'``.

    The first ``kreis1849`` value must agree between both reads, and a
    frame written back with ``encoding='latin-1'`` must round-trip.
    """
    # GH 4626, proper encoding handling
    raw_frame = read_stata(self.dta_encoding)
    latin_frame = read_stata(self.dta_encoding, encoding="latin-1")
    result = latin_frame.kreis1849[0]

    if compat.PY3:
        expected = raw_frame.kreis1849[0]
        assert result == expected
        assert isinstance(result, compat.string_types)
    else:
        # On Python 2 the raw read yields bytes that need explicit decoding.
        expected = raw_frame.kreis1849.str.decode("latin-1")[0]
        assert result == expected
        assert isinstance(result, unicode)  # noqa

    with tm.ensure_clean() as path:
        latin_frame.to_stata(
            path,
            encoding='latin-1',
            write_index=False,
            version=version,
        )
        round_tripped = read_stata(path, encoding='latin-1')
        tm.assert_frame_equal(latin_frame, round_tripped)
Example #2
Source File: test_stata.py From recruit with Apache License 2.0 | 6 votes |
def test_mixed_string_strl(self):
    """Round-trip a column mixing a long string with ``None`` (format 117).

    A second pass sets the whole column to ``None`` and writes it via
    ``convert_strl``; in both passes the re-read frame must equal the
    original with missing values filled by ``''``.
    """
    # GH 23633
    records = [
        {'mixed': 'string' * 500, 'number': 0},
        {'mixed': None, 'number': 1},
    ]
    frame = pd.DataFrame(records)
    frame.number = frame.number.astype('int32')

    with tm.ensure_clean() as path:
        frame.to_stata(path, write_index=False, version=117)
        tm.assert_frame_equal(read_stata(path), frame.fillna(''))

        # Check strl supports all None (null)
        frame.loc[:, 'mixed'] = None
        frame.to_stata(
            path,
            write_index=False,
            convert_strl=['mixed'],
            version=117,
        )
        tm.assert_frame_equal(read_stata(path), frame.fillna(''))
Example #3
Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_encoding(self, version):
    """Check that passing ``encoding`` emits a FutureWarning but still works.

    Both the read and the write with ``encoding='latin-1'`` are expected to
    warn; the decoded value must match a plain read and the written file
    must round-trip.
    """
    # GH 4626, proper encoding handling
    plain = read_stata(self.dta_encoding)
    with tm.assert_produces_warning(FutureWarning):
        with_encoding = read_stata(self.dta_encoding, encoding='latin-1')

    result = with_encoding.kreis1849[0]
    expected = plain.kreis1849[0]
    assert result == expected
    assert isinstance(result, compat.string_types)

    with tm.ensure_clean() as path:
        with tm.assert_produces_warning(FutureWarning):
            with_encoding.to_stata(
                path,
                write_index=False,
                version=version,
                encoding='latin-1',
            )
        round_tripped = read_stata(path)
        tm.assert_frame_equal(with_encoding, round_tripped)
Example #4
Source File: test_stata.py From recruit with Apache License 2.0 | 6 votes |
def test_date_parsing_ignores_format_details(self, column):
    """Compare ``column`` with its ``<column>_fmt`` twin in the first row."""
    # GH 17797
    #
    # Test that display formats are ignored when determining if a numeric
    # column is a date value.
    #
    # All date types are stored as numbers and format associated with the
    # column denotes both the type of the date and the display format.
    #
    # STATA supports 9 date types which each have distinct units. We test 7
    # of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that
    # accounts for leap seconds and %tb relies on STATAs business calendar.
    frame = read_stata(self.stata_dates)
    first_row = 0
    plain_value = frame.loc[first_row, column]
    styled_value = frame.loc[first_row, column + "_fmt"]
    assert plain_value == styled_value
Example #5
Source File: test_varmax.py From vnpy_crypto with MIT License | 6 votes |
def setup_class(cls, true, order, trend, error_cov_type, cov_type='approx', **kwargs):
    """Load ``manufac.dta``, build a VARMAX model and smooth at the true params.

    Stores ``true`` on the class, fits nothing: the results come from
    ``model.smooth(true['params'], cov_type=cov_type)``. Extra ``kwargs``
    are forwarded to ``varmax.VARMAX``.
    """
    cls.true = true

    # 1960:Q1 - 1982:Q4
    data_file = os.sep.join([current_path, 'results', 'manufac.dta'])
    with open(data_file, 'rb') as handle:
        dta = pd.read_stata(handle)
    dta.index = pd.DatetimeIndex(dta.month, freq='MS')

    # First differences of the two series used as endogenous variables.
    dta['dlncaputil'] = dta['lncaputil'].diff()
    dta['dlnhours'] = dta['lnhours'].diff()
    endog = dta.loc['1972-02-01':, ['dlncaputil', 'dlnhours']]

    # Warnings raised during model construction are captured, not shown.
    with warnings.catch_warnings(record=True):
        warnings.simplefilter('always')
        cls.model = varmax.VARMAX(
            endog,
            order=order,
            trend=trend,
            error_cov_type=error_cov_type,
            **kwargs
        )

    cls.results = cls.model.smooth(true['params'], cov_type=cov_type)
Example #6
Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_dtype_conversion(self):
    """Compare reads of ``dta15_117`` with and without ``preserve_dtypes``.

    With the default, columns are expected to keep narrow dtypes; with
    ``preserve_dtypes=False`` they are expected to match the CSV read.
    """
    narrow_dtypes = (
        ('byte_', np.int8),
        ('int_', np.int16),
        ('long_', np.int32),
        ('float_', np.float32),
        ('double_', np.float64),
    )

    expected = self.read_csv(self.csv15)
    for name, dtype in narrow_dtypes:
        expected[name] = expected[name].astype(dtype)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))

    preserved = read_stata(self.dta15_117, convert_dates=True)
    tm.assert_frame_equal(expected, preserved)

    widened = read_stata(
        self.dta15_117, convert_dates=True, preserve_dtypes=False)

    # read_csv types are the same
    expected = self.read_csv(self.csv15)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))
    tm.assert_frame_equal(expected, widened)
Example #7
Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_out_of_range_float(self):
    """Round-trip float32 columns, then check that infinity is rejected.

    After the round trip ``ColumnTooBig`` is compared as float64; writing
    ``np.inf`` must raise ``ValueError`` with the expected message.
    """
    f32 = np.finfo(np.float32)
    original = DataFrame({
        'ColumnOk': [0.0, f32.eps, f32.max / 10.0],
        'ColumnTooBig': [0.0, f32.eps, f32.max],
    })
    original.index.name = 'index'
    for name in original:
        original[name] = original[name].astype(np.float32)

    with tm.ensure_clean() as path:
        original.to_stata(path)
        reread = read_stata(path)
        original['ColumnTooBig'] = original['ColumnTooBig'].astype(
            np.float64)
        tm.assert_frame_equal(original, reread.set_index('index'))

    original.loc[2, 'ColumnTooBig'] = np.inf
    msg = ("Column ColumnTooBig has a maximum value of infinity which"
           " is outside the range supported by Stata")
    with pytest.raises(ValueError, match=msg):
        with tm.ensure_clean() as path:
            original.to_stata(path)
Example #8
Source File: test_stata.py From recruit with Apache License 2.0 | 6 votes |
def test_out_of_range_float(self):
    """Round-trip float32 columns, then check that infinity is rejected.

    After the round trip ``ColumnTooBig`` is compared as float64; writing
    ``np.inf`` must raise ``ValueError`` with the expected message.
    """
    f32 = np.finfo(np.float32)
    original = DataFrame({
        'ColumnOk': [0.0, f32.eps, f32.max / 10.0],
        'ColumnTooBig': [0.0, f32.eps, f32.max],
    })
    original.index.name = 'index'
    for name in original:
        original[name] = original[name].astype(np.float32)

    with tm.ensure_clean() as path:
        original.to_stata(path)
        reread = read_stata(path)
        original['ColumnTooBig'] = original['ColumnTooBig'].astype(
            np.float64)
        tm.assert_frame_equal(original, reread.set_index('index'))

    original.loc[2, 'ColumnTooBig'] = np.inf
    msg = ("Column ColumnTooBig has a maximum value of infinity which"
           " is outside the range supported by Stata")
    with pytest.raises(ValueError, match=msg):
        with tm.ensure_clean() as path:
            original.to_stata(path)
Example #9
Source File: test_stata.py From vnpy_crypto with MIT License | 6 votes |
def test_dtype_conversion(self):
    """Compare reads of ``dta15_117`` with and without ``preserve_dtypes``.

    With the default, columns are expected to keep narrow dtypes; with
    ``preserve_dtypes=False`` they are expected to match the CSV read.
    """
    narrow_dtypes = (
        ('byte_', np.int8),
        ('int_', np.int16),
        ('long_', np.int32),
        ('float_', np.float32),
        ('double_', np.float64),
    )

    expected = self.read_csv(self.csv15)
    for name, dtype in narrow_dtypes:
        expected[name] = expected[name].astype(dtype)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))

    preserved = read_stata(self.dta15_117, convert_dates=True)
    tm.assert_frame_equal(expected, preserved)

    widened = read_stata(
        self.dta15_117, convert_dates=True, preserve_dtypes=False)

    # read_csv types are the same
    expected = self.read_csv(self.csv15)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))
    tm.assert_frame_equal(expected, widened)
Example #10
Source File: stata.py From vnpy_crypto with MIT License | 6 votes |
def read_stata(filepath_or_buffer, convert_dates=True,
               convert_categoricals=True, encoding=None, index_col=None,
               convert_missing=False, preserve_dtypes=True, columns=None,
               order_categoricals=True, chunksize=None, iterator=False):
    """Read a Stata file, or return a reader for incremental access.

    All options except ``iterator`` are forwarded to ``StataReader``.

    Returns
    -------
    The ``StataReader`` itself when ``iterator`` or ``chunksize`` is truthy
    (the caller is then responsible for closing it); otherwise the result of
    ``reader.read()``, with the reader closed before returning.
    """
    reader_options = dict(
        convert_dates=convert_dates,
        convert_categoricals=convert_categoricals,
        index_col=index_col,
        convert_missing=convert_missing,
        preserve_dtypes=preserve_dtypes,
        columns=columns,
        order_categoricals=order_categoricals,
        chunksize=chunksize,
        encoding=encoding,
    )
    reader = StataReader(filepath_or_buffer, **reader_options)

    # Incremental access: hand back the open reader untouched.
    if iterator or chunksize:
        return reader

    # One-shot read: always release the reader, even if read() raises.
    try:
        return reader.read()
    finally:
        reader.close()
Example #11
Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_date_parsing_ignores_format_details(self, column):
    """Compare ``column`` with its ``<column>_fmt`` twin in the first row."""
    # GH 17797
    #
    # Test that display formats are ignored when determining if a numeric
    # column is a date value.
    #
    # All date types are stored as numbers and format associated with the
    # column denotes both the type of the date and the display format.
    #
    # STATA supports 9 date types which each have distinct units. We test 7
    # of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that
    # accounts for leap seconds and %tb relies on STATAs business calendar.
    frame = read_stata(self.stata_dates)
    first_row = 0
    plain_value = frame.loc[first_row, column]
    styled_value = frame.loc[first_row, column + "_fmt"]
    assert plain_value == styled_value
Example #12
Source File: test_stata.py From vnpy_crypto with MIT License | 6 votes |
def test_out_of_range_float(self):
    """Round-trip float32 columns, then check that infinity is rejected.

    After the round trip ``ColumnTooBig`` is compared as float64. Writing
    ``np.inf`` must raise ``ValueError`` whose message names the offending
    column and mentions ``infinity``.
    """
    original = DataFrame({'ColumnOk': [0.0,
                                       np.finfo(np.float32).eps,
                                       np.finfo(np.float32).max / 10.0],
                          'ColumnTooBig': [0.0,
                                           np.finfo(np.float32).eps,
                                           np.finfo(np.float32).max]})
    original.index.name = 'index'
    for col in original:
        original[col] = original[col].astype(np.float32)

    with tm.ensure_clean() as path:
        original.to_stata(path)
        reread = read_stata(path)
        original['ColumnTooBig'] = original['ColumnTooBig'].astype(
            np.float64)
        tm.assert_frame_equal(original,
                              reread.set_index('index'))

    original.loc[2, 'ColumnTooBig'] = np.inf
    with pytest.raises(ValueError) as cm:
        with tm.ensure_clean() as path:
            original.to_stata(path)

    # The checks must run after the ``pytest.raises`` block: statements
    # following the raising call inside it never execute. pytest exposes the
    # caught exception as ``.value`` (``.exception`` is the unittest API),
    # and the substring test needs the message text, not the exception.
    message = str(cm.value)
    assert 'ColumnTooBig' in message
    assert 'infinity' in message
Example #13
Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 6 votes |
def test_mixed_string_strl(self):
    """Round-trip a column mixing a long string with ``None`` (format 117).

    A second pass sets the whole column to ``None`` and writes it via
    ``convert_strl``; in both passes the re-read frame must equal the
    original with missing values filled by ``''``.
    """
    # GH 23633
    records = [
        {'mixed': 'string' * 500, 'number': 0},
        {'mixed': None, 'number': 1},
    ]
    frame = pd.DataFrame(records)
    frame.number = frame.number.astype('int32')

    with tm.ensure_clean() as path:
        frame.to_stata(path, write_index=False, version=117)
        tm.assert_frame_equal(read_stata(path), frame.fillna(''))

        # Check strl supports all None (null)
        frame.loc[:, 'mixed'] = None
        frame.to_stata(
            path,
            write_index=False,
            convert_strl=['mixed'],
            version=117,
        )
        tm.assert_frame_equal(read_stata(path), frame.fillna(''))
Example #14
Source File: test_liml.py From econtools with BSD 3-Clause "New" or "Revised" License | 6 votes |
def setup_class(cls):
    """Run ``ivreg`` with ``iv_method='liml'`` on ``data/auto.dta``.

    The outcome is stored on ``cls.result`` and ``liml_std`` is recorded as
    the expected result.
    """
    cls.init(cls)

    # The data directory sits next to this test module.
    here = path.dirname(path.relpath(__file__))
    autodata = pd.read_stata(path.join(here, 'data', 'auto.dta'))

    outcome = 'price'
    endogenous = ['mpg', 'length']
    instruments = ['trunk', 'weight', 'headroom']
    exogenous = []

    cls.result = ivreg(
        autodata, outcome, endogenous, instruments, exogenous,
        addcons=True,
        iv_method='liml',
        nosingles=True,
    )
    cls.expected = liml_std
Example #15
Source File: test_liml.py From econtools with BSD 3-Clause "New" or "Revised" License | 6 votes |
def setup_class(cls):
    """Run ``ivreg`` with LIML and ``vce_type='robust'`` on ``data/auto.dta``.

    Comparison precision for ``se``, ``CI_low`` and ``CI_high`` is
    overridden first; ``liml_robust`` is recorded as the expected result.
    """
    cls.init(cls)
    for stat, digits in (('se', 0), ('CI_low', 0), ('CI_high', -1)):
        cls.precision[stat] = digits

    # The data directory sits next to this test module.
    here = path.dirname(path.relpath(__file__))
    autodata = pd.read_stata(path.join(here, 'data', 'auto.dta'))

    outcome = 'price'
    endogenous = ['mpg', 'length']
    instruments = ['trunk', 'weight', 'headroom']
    exogenous = []

    cls.result = ivreg(
        autodata, outcome, endogenous, instruments, exogenous,
        addcons=True,
        iv_method='liml',
        vce_type='robust',
        nosingles=True,
    )
    cls.expected = liml_robust
Example #16
Source File: test_stata.py From recruit with Apache License 2.0 | 6 votes |
def test_dtype_conversion(self):
    """Compare reads of ``dta15_117`` with and without ``preserve_dtypes``.

    With the default, columns are expected to keep narrow dtypes; with
    ``preserve_dtypes=False`` they are expected to match the CSV read.
    """
    narrow_dtypes = (
        ('byte_', np.int8),
        ('int_', np.int16),
        ('long_', np.int32),
        ('float_', np.float32),
        ('double_', np.float64),
    )

    expected = self.read_csv(self.csv15)
    for name, dtype in narrow_dtypes:
        expected[name] = expected[name].astype(dtype)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))

    preserved = read_stata(self.dta15_117, convert_dates=True)
    tm.assert_frame_equal(expected, preserved)

    widened = read_stata(
        self.dta15_117, convert_dates=True, preserve_dtypes=False)

    # read_csv types are the same
    expected = self.read_csv(self.csv15)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))
    tm.assert_frame_equal(expected, widened)
Example #17
Source File: test_liml.py From econtools with BSD 3-Clause "New" or "Revised" License | 6 votes |
def setup_class(cls):
    """Run ``ivreg`` with LIML clustered on ``gear_ratio`` on ``data/auto.dta``.

    Comparison precision for ``se``, ``CI_low`` and ``CI_high`` is set to 0
    first; ``liml_cluster`` is recorded as the expected result.
    """
    cls.init(cls)
    for stat in ('se', 'CI_low', 'CI_high'):
        cls.precision[stat] = 0

    # The data directory sits next to this test module.
    here = path.dirname(path.relpath(__file__))
    autodata = pd.read_stata(path.join(here, 'data', 'auto.dta'))

    outcome = 'price'
    endogenous = ['mpg', 'length']
    instruments = ['trunk', 'weight', 'headroom']
    exogenous = []

    cls.result = ivreg(
        autodata, outcome, endogenous, instruments, exogenous,
        addcons=True,
        iv_method='liml',
        cluster='gear_ratio',
        nosingles=True,
    )
    cls.expected = liml_cluster
Example #18
Source File: test_liml.py From econtools with BSD 3-Clause "New" or "Revised" License | 6 votes |
def setup_class(cls):
    """Run ``ivreg`` with LIML clustered on ``gear_ratio``, two instruments.

    ``tsls_cluster`` is recorded as the expected result.
    """
    cls.init(cls)

    # The data directory sits next to this test module.
    here = path.dirname(path.relpath(__file__))
    autodata = pd.read_stata(path.join(here, 'data', 'auto.dta'))

    outcome = 'price'
    endogenous = ['mpg', 'length']
    # NOTE(review): two instruments for two endogenous regressors; presumably
    # LIML then coincides with 2SLS, hence the tsls expectation -- confirm.
    instruments = ['weight', 'trunk']
    exogenous = []

    cls.result = ivreg(
        autodata, outcome, endogenous, instruments, exogenous,
        addcons=True,
        iv_method='liml',
        cluster='gear_ratio',
        nosingles=True,
    )
    cls.expected = tsls_cluster
Example #19
Source File: test_stata.py From recruit with Apache License 2.0 | 6 votes |
def test_encoding(self, version):
    """Check that passing ``encoding`` emits a FutureWarning but still works.

    Both the read and the write with ``encoding='latin-1'`` are expected to
    warn; the decoded value must match a plain read and the written file
    must round-trip.
    """
    # GH 4626, proper encoding handling
    plain = read_stata(self.dta_encoding)
    with tm.assert_produces_warning(FutureWarning):
        with_encoding = read_stata(self.dta_encoding, encoding='latin-1')

    result = with_encoding.kreis1849[0]
    expected = plain.kreis1849[0]
    assert result == expected
    assert isinstance(result, compat.string_types)

    with tm.ensure_clean() as path:
        with tm.assert_produces_warning(FutureWarning):
            with_encoding.to_stata(
                path,
                write_index=False,
                version=version,
                encoding='latin-1',
            )
        round_tripped = read_stata(path)
        tm.assert_frame_equal(with_encoding, round_tripped)
Example #20
Source File: test_stata.py From vnpy_crypto with MIT License | 6 votes |
def test_date_parsing_ignores_format_details(self, column):
    """Compare ``column`` with its ``<column>_fmt`` twin in the first row."""
    # GH 17797
    #
    # Test that display formats are ignored when determining if a numeric
    # column is a date value.
    #
    # All date types are stored as numbers and format associated with the
    # column denotes both the type of the date and the display format.
    #
    # STATA supports 9 date types which each have distinct units. We test 7
    # of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that
    # accounts for leap seconds and %tb relies on STATAs business calendar.
    frame = read_stata(self.stata_dates)
    first_row = 0
    plain_value = frame.loc[first_row, column]
    styled_value = frame.loc[first_row, column + "_fmt"]
    assert plain_value == styled_value
Example #21
Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_pickle_path_localpath(self):
    """Round-trip a frame via ``tm.round_trip_localpath`` and compare."""
    frame = tm.makeDataFrame()
    frame.index.name = 'index'

    def load_with_index(target):
        # Restore the written 'index' column as the frame's index.
        return read_stata(target).set_index('index')

    round_tripped = tm.round_trip_localpath(frame.to_stata, load_with_index)
    tm.assert_frame_equal(frame, round_tripped)
Example #22
Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_drop_column(self):
    """Exercise the ``columns`` argument of ``read_stata``.

    Covers selecting a subset, reordering the subset, and the two
    ``ValueError`` cases: duplicate names and a name absent from the file.
    """
    narrow_dtypes = (
        ('byte_', np.int8),
        ('int_', np.int16),
        ('long_', np.int32),
        ('float_', np.float32),
        ('double_', np.float64),
    )
    expected = self.read_csv(self.csv15)
    for name, dtype in narrow_dtypes:
        expected[name] = expected[name].astype(dtype)
    expected['date_td'] = expected['date_td'].apply(
        datetime.strptime, args=('%Y-%m-%d',))

    subset = ['byte_', 'int_', 'long_']
    expected = expected[subset]
    dropped = read_stata(
        self.dta15_117, convert_dates=True, columns=subset)
    tm.assert_frame_equal(expected, dropped)

    # See PR 10757
    shuffled = ['int_', 'long_', 'byte_']
    expected = expected[shuffled]
    reordered = read_stata(
        self.dta15_117, convert_dates=True, columns=shuffled)
    tm.assert_frame_equal(expected, reordered)

    msg = "columns contains duplicate entries"
    with pytest.raises(ValueError, match=msg):
        read_stata(
            self.dta15_117, convert_dates=True,
            columns=['byte_', 'byte_'])

    msg = ("The following columns were not found in the Stata data set:"
           " not_found")
    with pytest.raises(ValueError, match=msg):
        read_stata(
            self.dta15_117, convert_dates=True,
            columns=['byte_', 'int_', 'long_', 'not_found'])
Example #23
Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_read_chunks_115(self, file, chunksize,
                         convert_categoricals, convert_dates):
    """Compare chunked reads against the matching slices of a full read.

    At most five chunks of ``chunksize`` rows are read through the iterator
    interface; the loop stops early on ``StopIteration``.
    """
    fname = getattr(self, file)

    # Read the whole file
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        whole = read_stata(
            fname,
            convert_categoricals=convert_categoricals,
            convert_dates=convert_dates)

    # Compare to what we get when reading by chunk
    reader = read_stata(
        fname, iterator=True,
        convert_dates=convert_dates,
        convert_categoricals=convert_categoricals)
    start = 0
    for _ in range(5):
        with warnings.catch_warnings(record=True):
            warnings.simplefilter("always")
            try:
                chunk = reader.read(chunksize)
            except StopIteration:
                break
        stop = start + chunksize
        tm.assert_frame_equal(
            whole.iloc[start:stop, :], chunk,
            check_dtype=False,
            check_datetimelike_compat=True,
            check_categorical=False)
        start += chunksize
    reader.close()
Example #24
Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_105(self):
    """Check the first three rows read from ``S4_EDUC1.dta``."""
    # Data obtained from:
    # http://go.worldbank.org/ZXY29PVJ21
    actual = pd.read_stata(os.path.join(self.dirpath, 'S4_EDUC1.dta'))

    expected = pd.DataFrame(
        [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]],
        columns=["clustnum", "pri_schl", "psch_num", "psch_dis"])
    column_dtypes = (
        ('clustnum', np.int16),
        ('pri_schl', np.int8),
        ('psch_num', np.int8),
        ('psch_dis', np.float32),
    )
    for name, dtype in column_dtypes:
        expected[name] = expected[name].astype(dtype)

    tm.assert_frame_equal(actual.head(3), expected)
Example #25
Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_read_empty_dta(self, version):
    """Write a frame with one column and no rows, then read it back."""
    written = DataFrame(columns=['unit'])
    # GH 7369, make sure can read a 0-obs dta file
    with tm.ensure_clean() as path:
        written.to_stata(path, write_index=False, version=version)
        tm.assert_frame_equal(written, read_stata(path))
Example #26
Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def read_dta(self, file):
    """Return ``read_stata(file)`` with date conversion switched on."""
    # Legacy default reader configuration
    legacy_options = {'convert_dates': True}
    return read_stata(file, **legacy_options)
Example #27
Source File: test_stata.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def parsed_114(dirpath):
    """Read ``stata5_114.dta`` from ``dirpath`` and name its index 'index'."""
    source = os.path.join(dirpath, 'stata5_114.dta')
    frame = read_stata(source, convert_dates=True)
    frame.index.name = 'index'
    return frame
Example #28
Source File: run_pandas.py From recipy with Apache License 2.0 | 5 votes |
def read_stata(self):
    """
    Use pandas.read_stata to load dataframe.dta.
    """
    # The loaded frame is intentionally discarded; only the call matters.
    pd.read_stata(os.path.join(self.data_dir, "dataframe.dta"))
Example #29
Source File: test_stata.py From vnpy_crypto with MIT License | 5 votes |
def test_gzip_writing(self):
    """Write a frame through a gzip handle and read it back the same way."""
    # writing version 117 requires seek and cannot be used with gzip
    frame = tm.makeDataFrame()
    frame.index.name = 'index'
    with tm.ensure_clean() as path:
        with gzip.GzipFile(path, 'wb') as sink:
            frame.to_stata(sink, version=114)
        with gzip.GzipFile(path, 'rb') as source:
            round_tripped = pd.read_stata(source, index_col='index')
    tm.assert_frame_equal(frame, round_tripped)
Example #30
Source File: test_stata.py From vnpy_crypto with MIT License | 5 votes |
def test_nonfile_writing(self, version):
    """Write to an in-memory buffer, copy it to disk, and read it back."""
    # GH 21041
    buffer = io.BytesIO()
    frame = tm.makeDataFrame()
    frame.index.name = 'index'
    with tm.ensure_clean() as path:
        frame.to_stata(buffer, version=version)
        # Rewind so the full payload is copied to the real file.
        buffer.seek(0)
        with open(path, 'wb') as on_disk:
            on_disk.write(buffer.read())
        round_tripped = pd.read_stata(path, index_col='index')
    tm.assert_frame_equal(frame, round_tripped)