Python pandas.StringDtype() Examples
The following are 8 code examples of pandas.StringDtype().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module pandas, or try the search function.
Example #1
Source File: filtering_fe_autotype.py From dash-docs with MIT License | 6 votes |
def table_type(df_column):
    """Classify a column's dtype as 'datetime', 'text', 'numeric' or 'any'.

    Note: this only works with pandas >= 1.0.0, because it relies on the
    extension dtype classes listed below.
    NOTE(review): the returned labels look like Dash DataTable column
    types -- confirm against the caller.

    Args:
        df_column: Object exposing a ``dtype`` attribute (for example a
            ``pandas.Series``).

    Returns:
        str: ``'datetime'`` for timezone-aware datetime dtypes, ``'text'``
        for string/boolean/categorical/period dtypes, ``'numeric'`` for
        sparse/interval/nullable-integer dtypes, and ``'any'`` otherwise
        (including every column when running on Python 2).
    """
    if sys.version_info < (3, 0):
        # Pandas 1.0.0 does not support Python 2
        return 'any'

    dtype = df_column.dtype

    if isinstance(dtype, pd.DatetimeTZDtype):
        # Must be a bare string like the other branches; a trailing comma
        # here would turn the result into the 1-tuple ('datetime',).
        return 'datetime'

    text_dtypes = (
        pd.StringDtype,
        pd.BooleanDtype,
        pd.CategoricalDtype,
        pd.PeriodDtype,
    )
    if isinstance(dtype, text_dtypes):
        return 'text'

    numeric_dtypes = (
        pd.SparseDtype,
        pd.IntervalDtype,
        pd.Int8Dtype,
        pd.Int16Dtype,
        pd.Int32Dtype,
        pd.Int64Dtype,
    )
    if isinstance(dtype, numeric_dtypes):
        return 'numeric'

    return 'any'
Example #2
Source File: schema.py From mlflow with Apache License 2.0 | 6 votes |
def _pandas_string_type():
    """Return the dtype to use for string data.

    Returns:
        ``pd.StringDtype()`` when the installed pandas provides it;
        otherwise the builtin ``object`` type as a fallback dtype.
    """
    try:
        return pd.StringDtype()
    except AttributeError:
        # This pandas has no StringDtype. Fall back to the builtin
        # ``object``: ``np.object`` was only an alias for it, deprecated in
        # NumPy 1.20 and removed in 1.24, so referencing it here would
        # itself raise AttributeError on current NumPy.
        return object
Example #3
Source File: epacems.py From pudl with MIT License | 5 votes |
def _load_plant_utc_offset(datapkg_dir):
    """Build a table of UTC offsets for EIA plants.

    CEMS times don't change for DST, so each plant's offset is taken from
    its timezone as of a date in January.

    Args:
        datapkg_dir (path-like): Path to the directory of the datapackage
            which is currently being assembled. The file
            ``data/plants_entity_eia.csv`` is read from it.

    Returns:
        pandas.DataFrame: With columns plant_id_eia and utc_offset
    """
    import pytz

    # Only the month matters here; the year is arbitrary.
    reference_date = datetime.datetime(2011, 1, 1)

    def _january_offset(tz_name):
        """Return the UTC offset of *tz_name* on the reference date."""
        return pytz.timezone(tz_name).localize(reference_date).utcoffset()

    csv_path = pathlib.Path(datapkg_dir, 'data/plants_entity_eia.csv')
    plant_tz = pd.read_csv(
        csv_path,
        usecols=["plant_id_eia", "timezone"],
        dtype={"plant_id_eia": "Int64", "timezone": pd.StringDtype()},
    )
    # Treat the literal string "None" as missing, then drop incomplete rows.
    plant_tz = plant_tz.replace(to_replace="None", value=pd.NA).dropna()

    # Swap the timezone column for the computed offset column.
    tz_names = plant_tz.pop("timezone")
    plant_tz["utc_offset"] = tz_names.apply(_january_offset)
    return plant_tz
Example #4
Source File: parser.py From whatstk with GNU General Public License v3.0 | 5 votes |
def _add_schema(df):
    """Cast the chat dataframe's username and message columns to string dtype.

    Args:
        df (pandas.DataFrame): Chat dataframe.

    Returns:
        pandas.DataFrame: Chat dataframe with correct dtypes.
    """
    string_dtype = pd.StringDtype()
    chat_schema = {
        COLNAMES_DF.USERNAME: string_dtype,
        COLNAMES_DF.MESSAGE: string_dtype,
    }
    return df.astype(chat_schema)
Example #5
Source File: test_dtypes.py From pandera with MIT License | 4 votes |
def test_pandas_extension_types():
    """Happy path: each pandas extension dtype validates a matching series."""
    # pylint: disable=no-member
    # Each case is (schema dtype, series to validate, extra schema kwargs).
    cases = [
        (
            pd.CategoricalDtype(),
            pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"),
            {},
        ),
        (
            pd.DatetimeTZDtype(tz='UTC'),
            pd.Series(
                pd.date_range(start="20200101", end="20200301"),
                dtype="datetime64[ns, utc]",
            ),
            {},
        ),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), {}),
        (
            pd.StringDtype(),
            pd.Series(["foo", "bar", "baz"], dtype="string"),
            {},
        ),
        (
            pd.PeriodDtype(freq='D'),
            pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')),
            {},
        ),
        (
            pd.SparseDtype("float"),
            pd.Series(range(100))
            .where(lambda s: s < 5, other=np.nan)
            .astype("Sparse[float]"),
            # All values >= 5 become NaN, so the schema must allow nulls.
            {"nullable": True},
        ),
        (
            pd.BooleanDtype(),
            pd.Series([1, 0, 0, 1, 1], dtype="boolean"),
            {},
        ),
        (
            pd.IntervalDtype(subtype="int64"),
            pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
            {},
        ),
    ]

    for expected_dtype, series, schema_kwargs in cases:
        schema = SeriesSchema(pandas_dtype=expected_dtype, **schema_kwargs)
        validated = schema.validate(series)
        assert isinstance(validated, pd.Series)
Example #6
Source File: ferc714.py From pudl with MIT License | 4 votes |
def electricity_planning_areas(pudl_settings):
    """Electric Planning Area geometries from HIFLD, with normalized dtypes.

    Args:
        pudl_settings (dict-like): Only the ``"data_dir"`` entry is read
            here, to locate ``local/hifld/electric_planning_areas.gdb``.

    Returns:
        The frame produced by ``geopandas.read_file`` after date/numeric
        parsing, dtype casting, and remapping of four ID values.
    """
    gdb_path = (
        pathlib.Path(pudl_settings["data_dir"])
        / "local/hifld/electric_planning_areas.gdb"
    )

    def _parse_with(parser, column):
        """Make an ``assign`` callable that runs *parser* on one column."""
        return lambda frame: parser(frame[column])

    def _cast_to(dtype, column):
        """Make an ``assign`` callable that casts one column to *dtype*."""
        return lambda frame: frame[column].astype(dtype)

    # First pass: parse dates and numbers.
    parsers = {
        "SOURCEDATE": pd.to_datetime,
        "VAL_DATE": pd.to_datetime,
        "ID": pd.to_numeric,
        "NAICS_CODE": pd.to_numeric,
        "YEAR": pd.to_numeric,
    }
    parse_steps = {
        column: _parse_with(parser, column)
        for column, parser in parsers.items()
    }

    # Second pass: cast each column individually through ``assign``.
    # Hack to work around geopanda issue fixed as of v0.8.0
    # https://github.com/geopandas/geopandas/issues/1366
    nullable_int = pd.Int64Dtype()
    string = pd.StringDtype()
    target_dtypes = {
        "ID": nullable_int,
        "NAME": string,
        "COUNTRY": string,
        "NAICS_CODE": nullable_int,
        "NAICS_DESC": string,
        "SOURCE": string,
        "VAL_METHOD": string,
        "WEBSITE": string,
        "ABBRV": string,
        "YEAR": nullable_int,
        "PEAK_LOAD": float,
        "PEAK_RANGE": float,
        "SHAPE_Length": float,
        "SHAPE_Area": float,
    }
    cast_steps = {
        column: _cast_to(dtype, column)
        for column, dtype in target_dtypes.items()
    }

    gdf = (
        geopandas.read_file(gdb_path)
        .assign(**parse_steps)
        .assign(**cast_steps)
    )

    # Need to set these IDs b/c HIFLD geometry uses EIA Balancing Authority IDs
    # (maybe?) FERC 714 is using EIA Utility IDs. This isn't totally resolved
    # and we need to figure out which set of IDs is getting used where.
    id_fixes = (
        (2775, 229),  # CAISO
        (59504, 17690),  # Southwest Power Pool
        (14379, 14354),  # PacifiCorp East + West
        (13670, 39347),  # Northeast TX Electric Co-op
    )
    for hifld_id, replacement_id in id_fixes:
        gdf.loc[gdf.ID == hifld_id, "ID"] = replacement_id

    return gdf
Example #7
Source File: eia860.py From pudl with MIT License | 4 votes |
def ownership(eia860_dfs, eia860_transformed_dfs):
    """
    Pull and transform the ownership table.

    Args:
        eia860_dfs (dict): Each entry in this dictionary of DataFrame objects
            corresponds to a page from the EIA860 form, as reported in the
            Excel spreadsheets they distribute. Only the ``'ownership'``
            entry is read here.
        eia860_transformed_dfs (dict): A dictionary of DataFrame objects in
            which pages from EIA860 form (keys) correspond to normalized
            DataFrames of values from that page (values). It is updated in
            place with an ``'ownership_eia860'`` entry.

    Returns:
        dict: eia860_transformed_dfs, with the transformed ownership table
        stored under ``'ownership_eia860'``.

    Raises:
        ValueError: If the data contains a report year earlier than the
            earliest year in ``pc.working_years["eia860"]``.
    """
    # Work on a copy so the caller's raw frame is left untouched.
    own_df = eia860_dfs['ownership'].copy()
    own_df = pudl.helpers.fix_eia_na(own_df)
    own_df = pudl.helpers.convert_to_date(own_df)

    # The fix we're making here is only known to be valid for 2011 -- if we
    # get older data... then we need to revisit the cleaning function and
    # make sure it also applies to those earlier years.
    earliest_data_year = min(own_df.report_date.dt.year)
    earliest_supported_year = min(pc.working_years["eia860"])
    if earliest_data_year < earliest_supported_year:
        raise ValueError(
            "EIA 860 transform step is only known to work for "
            f"year {earliest_supported_year} and later, but found data "
            f"from year {earliest_data_year}."
        )

    # Prior to 2012, ownership was reported as a percentage, rather than
    # as a proportion, so we need to divide those values by 100.
    reported_as_percent = own_df.report_date.dt.year < 2012
    own_df.loc[reported_as_percent, 'fraction_owned'] = (
        own_df.loc[reported_as_percent, 'fraction_owned'] / 100
    )

    nullable_int = pd.Int64Dtype()
    own_df = own_df.astype({
        "owner_utility_id_eia": nullable_int,
        "utility_id_eia": nullable_int,
        "plant_id_eia": nullable_int,
        "owner_state": pd.StringDtype(),
    })

    eia860_transformed_dfs['ownership_eia860'] = own_df
    return eia860_transformed_dfs
Example #8
Source File: epacems_to_parquet.py From pudl with MIT License | 4 votes |
def create_in_dtypes():
    """
    Build the mapping of input column names to data types.

    Specifying the dtypes of the input columns up front is necessary for
    some cases where, e.g., a column is always NaN.

    Returns:
        dict: mapping columns names to :mod:`pandas` data types.
    """
    # These measurement codes are used by all four of our measurement variables
    shared_codes = (
        "LME",
        "Measured",
        "Measured and Substitute",
        "Other",
        "Substitute",
        "Undetermined",
        "Unknown Code",
        "",
    )
    # NOx measurements allow one extra code on top of the shared ones.
    nox_codes = shared_codes + ("Calculated",)

    mass_code_dtype = pd.CategoricalDtype(
        categories=shared_codes, ordered=False)
    nox_code_dtype = pd.CategoricalDtype(categories=nox_codes, ordered=False)
    state_dtype = pd.CategoricalDtype(
        categories=pc.cems_states.keys(), ordered=False)

    float32 = "float32"
    nullable_int32 = pd.Int32Dtype()

    return dict(
        state=state_dtype,
        plant_id_eia="int32",
        unitid=pd.StringDtype(),
        # "operating_datetime_utc": "datetime",
        operating_time_hours=float32,
        gross_load_mw=float32,
        steam_load_1000_lbs=float32,
        so2_mass_lbs=float32,
        so2_mass_measurement_code=mass_code_dtype,
        nox_rate_lbs_mmbtu=float32,
        nox_rate_measurement_code=nox_code_dtype,
        nox_mass_lbs=float32,
        nox_mass_measurement_code=nox_code_dtype,
        co2_mass_tons=float32,
        co2_mass_measurement_code=mass_code_dtype,
        heat_content_mmbtu=float32,
        facility_id=nullable_int32,
        unit_id_epa=nullable_int32,
    )