Python pyarrow.timestamp() Examples
The following are 30 code examples of pyarrow.timestamp(). The originating project and source file are noted above each example.
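For orientation before the examples: pyarrow.timestamp(unit, tz=None) builds a timestamp DataType from one of the units "s", "ms", "us", or "ns" and an optional time zone string. Below is a minimal usage sketch; the values shown in comments are what recent pyarrow releases print and are an assumption about the installed version, not part of the examples that follow.

import pyarrow as pa
from datetime import datetime

# A timezone-naive nanosecond timestamp type.
t_ns = pa.timestamp("ns")                # timestamp[ns]

# A timezone-aware microsecond timestamp type.
t_us_utc = pa.timestamp("us", tz="UTC")  # timestamp[us, tz=UTC]

# Use the type to build an Arrow array from Python datetimes.
arr = pa.array([datetime(2019, 1, 2, 3, 4, 5), None], type=pa.timestamp("ns"))
print(arr.type)                          # timestamp[ns]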
Example #1
Source File: test_renderprep.py From cjworkbench with GNU Affero General Public License v3.0

def test_list_prompting_error_concatenate_different_type_to_text(self):
    context = self._render_context(
        input_table=arrow_table(
            {"A": [1], "B": pa.array([datetime.now()], pa.timestamp("ns"))}
        )
    )
    schema = ParamDType.List(
        inner_dtype=ParamDType.Column(column_types=frozenset({"text"}))
    )
    with self.assertRaises(PromptingError) as cm:
        clean_value(schema, ["A", "B"], context)
    self.assertEqual(
        cm.exception.errors,
        [PromptingError.WrongColumnType(["A", "B"], None, frozenset({"text"}))],
    )
Example #2
Source File: test_renderprep.py From cjworkbench with GNU Affero General Public License v3.0

def test_list_prompting_error_concatenate_different_type(self):
    context = self._render_context(
        input_table=arrow_table(
            {"A": ["1"], "B": pa.array([datetime.now()], pa.timestamp("ns"))}
        )
    )
    schema = ParamDType.List(
        inner_dtype=ParamDType.Column(column_types=frozenset({"number"}))
    )
    with self.assertRaises(PromptingError) as cm:
        clean_value(schema, ["A", "B"], context)
    self.assertEqual(
        cm.exception.errors,
        [
            PromptingError.WrongColumnType(["A"], "text", frozenset({"number"})),
            PromptingError.WrongColumnType(
                ["B"], "datetime", frozenset({"number"})
            ),
        ],
    )
Example #3
Source File: test_renderprep.py From cjworkbench with GNU Affero General Public License v3.0

def test_dict_prompting_error_concatenate_different_types(self):
    context = self._render_context(
        input_table=arrow_table(
            {"A": ["1"], "B": pa.array([datetime.now()], pa.timestamp("ns"))}
        )
    )
    schema = ParamDType.Dict(
        {
            "x": ParamDType.Column(column_types=frozenset({"number"})),
            "y": ParamDType.Column(column_types=frozenset({"number"})),
        }
    )
    with self.assertRaises(PromptingError) as cm:
        clean_value(schema, {"x": "A", "y": "B"}, context)
    self.assertEqual(
        cm.exception.errors,
        [
            PromptingError.WrongColumnType(["A"], "text", frozenset({"number"})),
            PromptingError.WrongColumnType(
                ["B"], "datetime", frozenset({"number"})
            ),
        ],
    )
Example #4
Source File: test_index.py From kartothek with MIT License

def test_serialization_normalization(key):
    """
    Check that index normalizes values consistently after serializing.

    This is helpful to ensure correct behavior for cases such as when
    key=`datetime.datetime(2018, 1, 1, 12, 30)`, as this would be parsed to
    `pa.timestamp("us")` during index creation, but stored as
    `pa.timestamp("ns")`.
    """
    index = ExplicitSecondaryIndex(
        column="col", index_dct={key: ["part_2", "part_4", "part_1"]}
    )
    index2 = pickle.loads(pickle.dumps(index))

    assert index.normalize_value(index.dtype, key) == index2.normalize_value(
        index2.dtype, key
    )
Example #5
Source File: test_io.py From cjworkbench with GNU Affero General Public License v3.0

def test_metadata_comes_from_db_columns(self):
    columns = [
        Column("A", ColumnType.Number(format="{:,.2f}")),
        Column("B", ColumnType.Datetime()),
        Column("C", ColumnType.Text()),
    ]
    result = RenderResult(
        arrow_table(
            {
                "A": [1],
                "B": pa.array([datetime.datetime.now()], pa.timestamp("ns")),
                "C": ["x"],
            },
            columns=columns,
        )
    )
    cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
    # Delete from disk entirely, to prove we did not read.
    minio.remove(BUCKET, crr_parquet_key(self.wf_module.cached_render_result))

    # Load _new_ CachedRenderResult -- from DB columns, not memory
    fresh_wf_module = WfModule.objects.get(id=self.wf_module.id)
    cached_result = fresh_wf_module.cached_render_result

    self.assertEqual(cached_result.table_metadata, TableMetadata(1, columns))
Example #6
Source File: types.py From LearningApacheSpark with MIT License

def _check_series_localize_timestamps(s, timezone):
    """
    Convert timezone aware timestamps to timezone-naive in the specified
    timezone or local timezone.

    If the input series is not a timestamp series, then the same series is
    returned. If the input series is a timestamp series, then a converted
    series is returned.

    :param s: pandas.Series
    :param timezone: the timezone to convert. if None then use local timezone
    :return pandas.Series that have been converted to tz-naive
    """
    from pyspark.sql.utils import require_minimum_pandas_version
    require_minimum_pandas_version()

    from pandas.api.types import is_datetime64tz_dtype
    tz = timezone or _get_local_timezone()
    # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
    if is_datetime64tz_dtype(s.dtype):
        return s.dt.tz_convert(tz).dt.tz_localize(None)
    else:
        return s
Example #7
Source File: index.py From kartothek with MIT License

def _parquet_bytes_to_dict(column: str, index_buffer: bytes):
    reader = pa.BufferReader(index_buffer)
    # This can be done much more efficient but would take a lot more
    # time to implement so this will be only done on request.
    table = pq.read_table(reader)
    if ARROW_LARGER_EQ_0150:
        column_type = table.schema.field(column).type
    else:
        column_type = table.schema.field_by_name(column).type

    # `datetime.datetime` objects have a precision of up to microseconds only,
    # so arrow parses the type to `pa.timestamp("us")`. Since the values are
    # normalized to `numpy.datetime64[ns]` anyways, we do not care about this
    # and load the column type as `pa.timestamp("ns")`
    if column_type == pa.timestamp("us"):
        column_type = pa.timestamp("ns")

    df = _fix_pyarrow_07992_table(table).to_pandas()  # Could eventually be phased out

    index_dct = dict(
        zip(df[column].values, (list(x) for x in df[_PARTITION_COLUMN_NAME].values))
    )
    return index_dct, column_type
Example #8
Source File: test_renderprep.py From cjworkbench with GNU Affero General Public License v3.0

def test_clean_multichartseries_non_number_is_prompting_error(self):
    context = self._render_context(
        input_table=arrow_table(
            {"A": ["a"], "B": pa.array([datetime.now()], pa.timestamp("ns"))}
        )
    )
    value = [
        {"column": "A", "color": "#aaaaaa"},
        {"column": "B", "color": "#cccccc"},
    ]
    with self.assertRaises(PromptingError) as cm:
        clean_value(ParamDType.Multichartseries(), value, context)
    self.assertEqual(
        cm.exception.errors,
        [
            PromptingError.WrongColumnType(["A"], "text", frozenset({"number"})),
            PromptingError.WrongColumnType(
                ["B"], "datetime", frozenset({"number"})
            ),
        ],
    )
Example #9
Source File: test_wfmodule.py From cjworkbench with GNU Affero General Public License v3.0

def test_wf_module_render_null_datetime(self):
    # Ran into problems 2019-09-06, when switching to Arrow
    cache_render_result(
        self.workflow,
        self.wf_module2,
        self.wf_module2.last_relevant_delta_id,
        RenderResult(
            arrow_table(
                {
                    "A": pa.array(
                        [dt(2019, 1, 2, 3, 4, 5, 6007, None), None],
                        pa.timestamp("ns"),
                    )
                }
            )
        ),
    )

    response = self.client.get("/api/wfmodules/%d/render" % self.wf_module2.id)
    self.assertEqual(response.status_code, status.HTTP_200_OK)
    self.assertEqual(
        json.loads(response.content)["rows"],
        [{"A": "2019-01-02T03:04:05.006007Z"}, {"A": None}],
    )
Example #10
Source File: test_types.py From cjworkbench with GNU Affero General Public License v3.0

def test_arrow_datetime_column(self):
    dataframe, columns = arrow_table_to_dataframe(
        arrow_table(
            {
                "A": pyarrow.array(
                    [dt.fromisoformat("2019-09-17T21:21:00.123456"), None],
                    type=pyarrow.timestamp(unit="ns", tz=None),
                )
            },
            [atypes.Column("A", atypes.ColumnType.Datetime())],
        )
    )
    assert_frame_equal(
        dataframe,
        pd.DataFrame(
            {"A": ["2019-09-17T21:21:00.123456Z", None]}, dtype="datetime64[ns]"
        ),
    )
    self.assertEqual(columns, [Column("A", ColumnType.DATETIME())])
Example #11
Source File: test_types.py From cjworkbench with GNU Affero General Public License v3.0

def test_dataframe_datetime_column(self):
    assert_arrow_table_equals(
        dataframe_to_arrow_table(
            pd.DataFrame(
                {"A": ["2019-09-17T21:21:00.123456Z", None]}, dtype="datetime64[ns]"
            ),
            [Column("A", ColumnType.DATETIME())],
            self.path,
        ),
        arrow_table(
            {
                "A": pyarrow.array(
                    [dt.fromisoformat("2019-09-17T21:21:00.123456"), None],
                    type=pyarrow.timestamp(unit="ns", tz=None),
                )
            },
            [atypes.Column("A", atypes.ColumnType.Datetime())],
        ),
    )
Example #12
Source File: test_module.py From cjworkbench with GNU Affero General Public License v3.0

def test_render_with_input_columns(self):
    def render(table, params, *, input_columns):
        self.assertEqual(
            input_columns,
            {
                "A": ptypes.RenderColumn("A", "text", None),
                "B": ptypes.RenderColumn("B", "number", "{:,.3f}"),
                "C": ptypes.RenderColumn("C", "datetime", None),
            },
        )

    with arrow_table_context(
        {"A": ["x"], "B": [1], "C": pa.array([datetime.now()], pa.timestamp("ns"))},
        columns=[
            Column("A", ColumnType.Text()),
            Column("B", ColumnType.Number("{:,.3f}")),
            Column("C", ColumnType.Datetime()),
        ],
        dir=self.basedir,
    ) as arrow_table:
        self._test_render(render, arrow_table=arrow_table)
Example #13
Source File: test_validate.py From cjworkbench with GNU Affero General Public License v3.0

def test_column_datetime_must_be_ns_resolution(self):
    # [2019-09-17] Pandas only supports datetime64[ns]
    # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
    with self.assertRaises(DatetimeUnitNotAllowed):
        validate_table_metadata(
            pyarrow.table(
                {
                    "A": pyarrow.array(
                        [5298375234], type=pyarrow.timestamp("us", tz=None)
                    )
                }
            ),
            TableMetadata(1, [Datetime("A")]),
        )
Example #14
Source File: types.py From cjworkbench with GNU Affero General Public License v3.0

def _dtype_to_arrow_type(dtype: np.dtype) -> pyarrow.DataType:
    if dtype == np.int8:
        return pyarrow.int8()
    elif dtype == np.int16:
        return pyarrow.int16()
    elif dtype == np.int32:
        return pyarrow.int32()
    elif dtype == np.int64:
        return pyarrow.int64()
    elif dtype == np.uint8:
        return pyarrow.uint8()
    elif dtype == np.uint16:
        return pyarrow.uint16()
    elif dtype == np.uint32:
        return pyarrow.uint32()
    elif dtype == np.uint64:
        return pyarrow.uint64()
    elif dtype == np.float16:
        return pyarrow.float16()
    elif dtype == np.float32:
        return pyarrow.float32()
    elif dtype == np.float64:
        return pyarrow.float64()
    elif dtype.kind == "M":
        # [2019-09-17] Pandas only allows "ns" unit -- as in, datetime64[ns]
        # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
        assert dtype.str.endswith("[ns]")
        return pyarrow.timestamp(unit="ns", tz=None)
    elif dtype == np.object_:
        return pyarrow.string()
    else:
        raise RuntimeError("Unhandled dtype %r" % dtype)
Example #15
Source File: test_client.py From json2parquet with MIT License

def test_ingest_with_datetime_formatted():
    """
    Test ingesting datetime data with a given schema and custom date format
    """
    schema = pa.schema([
        pa.field("foo", pa.int64()),
        pa.field("bar", pa.int64()),
        pa.field("baz", pa.timestamp("ns"))
    ])

    data = [{"foo": 1, "bar": 2, "baz": "2018/01/01 01:02:03"},
            {"foo": 10, "bar": 20, "baz": "2018/01/02 01:02:03"}]

    converted_data = client.ingest_data(data, schema, date_format="%Y/%m/%d %H:%M:%S")
    timestamp_values = [pd.to_datetime("2018-01-01 01:02:03"),
                        pd.to_datetime("2018-01-02 01:02:03")]
    assert converted_data.to_pydict() == {'foo': [1, 10], 'bar': [2, 20], 'baz': timestamp_values}
Example #16
Source File: index.py From kartothek with MIT License

def store(self, store: KeyValueStore, dataset_uuid: str) -> str:
    """
    Store the index as a parquet file

    If compatible, the new keyname will be the name stored under the attribute
    `index_storage_key`. If this attribute is None, a new key will be generated
    of the format

        `{dataset_uuid}/indices/{column}/{timestamp}.by-dataset-index.parquet`

    where the timestamp is in nanosecond accuracy and is created upon Index
    object initialization

    Parameters
    ----------
    store:
    dataset_uuid:
    """
    storage_key = None

    if (
        self.index_storage_key is not None
        and dataset_uuid
        and dataset_uuid in self.index_storage_key
    ):
        storage_key = self.index_storage_key
    if storage_key is None:
        storage_key = "{dataset_uuid}/indices/{column}/{timestamp}{suffix}".format(
            dataset_uuid=dataset_uuid,
            suffix=naming.EXTERNAL_INDEX_SUFFIX,
            column=quote(self.column),
            timestamp=quote(self.creation_time.isoformat()),
        )

    table = _index_dct_to_table(self.index_dct, self.column, self.dtype)
    buf = pa.BufferOutputStream()
    pq.write_table(table, buf)

    store.put(storage_key, buf.getvalue().to_pybytes())
    return storage_key
Example #17
Source File: test_index.py From kartothek with MIT License

def test_index_ts_inference(store):
    index = ExplicitSecondaryIndex(
        column="col",
        index_dct={
            pd.Timestamp("2017-01-01"): ["part_1", "part_2"],
            pd.Timestamp("2017-01-02"): ["part_3"],
        },
    )
    assert index.dtype == pa.timestamp("ns")
Example #18
Source File: array_util_test.py From tfx-bsl with Apache License 2.0

def testUnsupported(self):
    with self.assertRaisesRegex(RuntimeError, "Unimplemented"):
        array_util.GetByteSize(pa.array([], type=pa.timestamp("s")))
Example #19
Source File: test_client.py From json2parquet with MIT License

def test_ingest_with_datetime():
    """
    Test ingesting datetime data with a given schema
    """
    schema = pa.schema([
        pa.field("foo", pa.int64()),
        pa.field("bar", pa.int64()),
        pa.field("baz", pa.timestamp("ns"))
    ])

    data = [{"foo": 1, "bar": 2, "baz": "2018-01-01 01:02:03"},
            {"foo": 10, "bar": 20, "baz": "2018-01-02 01:02:03"}]

    converted_data = client.ingest_data(data, schema)
    timestamp_values = [pd.to_datetime("2018-01-01 01:02:03"),
                        pd.to_datetime("2018-01-02 01:02:03")]
    assert converted_data.to_pydict() == {'foo': [1, 10], 'bar': [2, 20], 'baz': timestamp_values}
Example #20
Source File: csv2parquet.py From csv2parquet with Apache License 2.0

def get_pyarrow_types():
    return {
        'bool': PA_BOOL,
        'float32': PA_FLOAT32,
        'float64': PA_FLOAT64,
        'int8': PA_INT8,
        'int16': PA_INT16,
        'int32': PA_INT32,
        'int64': PA_INT64,
        'string': PA_STRING,
        'timestamp': PA_TIMESTAMP,
        'base64': PA_BINARY
    }

# pylint: disable=too-many-branches,too-many-statements
Example #21
Source File: types.py From LearningApacheSpark with MIT License

def _check_series_convert_timestamps_localize(s, from_timezone, to_timezone):
    """
    Convert timestamp to timezone-naive in the specified timezone or local timezone

    :param s: a pandas.Series
    :param from_timezone: the timezone to convert from. if None then use local timezone
    :param to_timezone: the timezone to convert to. if None then use local timezone
    :return pandas.Series where if it is a timestamp, has been converted to tz-naive
    """
    from pyspark.sql.utils import require_minimum_pandas_version
    require_minimum_pandas_version()

    import pandas as pd
    from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype
    from_tz = from_timezone or _get_local_timezone()
    to_tz = to_timezone or _get_local_timezone()
    # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
    if is_datetime64tz_dtype(s.dtype):
        return s.dt.tz_convert(to_tz).dt.tz_localize(None)
    elif is_datetime64_dtype(s.dtype) and from_tz != to_tz:
        # `s.dt.tz_localize('tzlocal()')` doesn't work properly when including NaT.
        return s.apply(
            lambda ts: ts.tz_localize(from_tz, ambiguous=False).tz_convert(to_tz).tz_localize(None)
            if ts is not pd.NaT else pd.NaT)
    else:
        return s
Example #22
Source File: parquet.py From spectrify with MIT License

def _pa_timestamp_ns():
    """Wrapper function around Arrow's timestamp type function, which is the
    only type function that requires an argument...
    """
    return pa.timestamp('ns')
Example #23
Source File: test_parquet.py From spectrify with MIT License

def setUp(self):
    self.sa_meta = sa.MetaData()
    self.data = [
        [17.124, 1.12, 3.14, 13.37],
        [1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4],
        [True, None, False, True],
        ['string 1', 'string 2', None, 'string 3'],
        [datetime(2007, 7, 13, 1, 23, 34, 123456),
         None,
         datetime(2006, 1, 13, 12, 34, 56, 432539),
         datetime(2010, 8, 13, 5, 46, 57, 437699),
         ],
        ["Test Text", "Some#More#Test# Text", "!@#$%%^&*&", None],
    ]
    self.table = sa.Table(
        'unit_test_table',
        self.sa_meta,
        sa.Column('real_col', sa.REAL),
        sa.Column('bigint_col', sa.BIGINT),
        sa.Column('int_col', sa.INTEGER),
        sa.Column('smallint_col', sa.SMALLINT),
        sa.Column('bool_col', sa.BOOLEAN),
        sa.Column('str_col', sa.VARCHAR),
        sa.Column('timestamp_col', sa.TIMESTAMP),
        sa.Column('plaintext_col', sa.TEXT),
    )
    self.expected_datatypes = [
        pa.float32(),
        pa.int64(),
        pa.int32(),
        pa.int16(),
        pa.bool_(),
        pa.string(),
        pa.timestamp('ns'),
        pa.string(),
    ]
Example #24
Source File: test_parquet.py From spectrify with MIT License

def test_write(self):
    # Write out test file
    with UncloseableBytesIO() as write_buffer:
        with Writer(write_buffer, self.table) as writer:
            writer.write_row_group(self.data)
        file_bytes = write_buffer.getvalue()

    # Read in test file
    read_buffer = BytesIO(file_bytes)
    with pa.PythonFile(read_buffer, mode='r') as infile:

        # Verify data
        parq_table = pq.read_table(infile)
        written_data = list(parq_table.to_pydict().values())

        tuples_by_data_type = zip(self.data, written_data)
        for i in tuples_by_data_type:
            tuples_by_order = zip(i[0], i[1])
            for j in tuples_by_order:
                self.assertAlmostEquals(j[0], j[1], places=5)

        # Verify parquet file schema
        for i, field in enumerate(parq_table.schema):
            self.assertEqual(field.type.id, self.expected_datatypes[i].id)

        # Ensure timestamp column was written with int96; right now
        # there is no way to see except to check that the unit on
        # the timestamp type is 'ns'
        ts_col = parq_table.schema.field_by_name('timestamp_col')
        self.assertEqual(ts_col.type.unit, 'ns')
Example #25
Source File: test_unischema.py From petastorm with Apache License 2.0

def test_arrow_schema_convertion():
    fields = [
        pa.field('string', pa.string()),
        pa.field('int8', pa.int8()),
        pa.field('int16', pa.int16()),
        pa.field('int32', pa.int32()),
        pa.field('int64', pa.int64()),
        pa.field('float', pa.float32()),
        pa.field('double', pa.float64()),
        pa.field('bool', pa.bool_(), False),
        pa.field('fixed_size_binary', pa.binary(10)),
        pa.field('variable_size_binary', pa.binary()),
        pa.field('decimal', pa.decimal128(3, 4)),
        pa.field('timestamp_s', pa.timestamp('s')),
        pa.field('timestamp_ns', pa.timestamp('ns')),
        pa.field('date_32', pa.date32()),
        pa.field('date_64', pa.date64())
    ]
    arrow_schema = pa.schema(fields)

    mock_dataset = _mock_parquet_dataset([], arrow_schema)
    unischema = Unischema.from_arrow_schema(mock_dataset)

    for name in arrow_schema.names:
        assert getattr(unischema, name).name == name
        assert getattr(unischema, name).codec is None
        if name == 'bool':
            assert not getattr(unischema, name).nullable
        else:
            assert getattr(unischema, name).nullable

    # Test schema preserve fields order
    field_name_list = [f.name for f in fields]
    assert list(unischema.fields.keys()) == field_name_list
Example #26
Source File: types.py From LearningApacheSpark with MIT License

def to_arrow_type(dt):
    """ Convert Spark data type to pyarrow type """
    from distutils.version import LooseVersion
    import pyarrow as pa
    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        # TODO: remove version check once minimum pyarrow version is 0.10.0
        if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt) +
                            "\nPlease install pyarrow >= 0.10.0 for BinaryType support.")
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        arrow_type = pa.timestamp('us', tz='UTC')
    elif type(dt) == ArrayType:
        if type(dt.elementType) == TimestampType:
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
        arrow_type = pa.list_(to_arrow_type(dt.elementType))
    else:
        raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
    return arrow_type
Example #27
Source File: test_io.py From cjworkbench with GNU Affero General Public License v3.0

def test_read_cached_render_result_slice_as_text_datetime(self):
    result = RenderResult(
        arrow_table(
            {"A": pa.array([2134213412341232967, None], pa.timestamp("ns"))},
            columns=[Column("A", ColumnType.Datetime())],
        )
    )
    cache_render_result(self.workflow, self.wf_module, self.delta.id, result)
    crr = self.wf_module.cached_render_result
    self.assertEqual(
        read_cached_render_result_slice_as_text(crr, "csv", range(2), range(3)),
        "A\n2037-08-18T13:03:32.341232967Z\n",
    )
Example #28
Source File: types.py From LearningApacheSpark with MIT License

def _check_series_convert_timestamps_local_tz(s, timezone):
    """
    Convert timestamp to timezone-naive in the specified timezone or local timezone

    :param s: a pandas.Series
    :param timezone: the timezone to convert to. if None then use local timezone
    :return pandas.Series where if it is a timestamp, has been converted to tz-naive
    """
    return _check_series_convert_timestamps_localize(s, None, timezone)
Example #29
Source File: test__pandas_helpers.py From python-bigquery with Apache License 2.0

def is_timestamp(type_):
    # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#timestamp-type
    return all_(
        pyarrow.types.is_timestamp,
        lambda type_: type_.unit == "us",
        lambda type_: type_.tz == "UTC",
    )(type_)
Example #30
Source File: test__pandas_helpers.py From python-bigquery with Apache License 2.0

def test_is_datetime():
    assert is_datetime(pyarrow.timestamp("us", tz=None))
    assert not is_datetime(pyarrow.timestamp("ms", tz=None))
    assert not is_datetime(pyarrow.timestamp("us", tz="UTC"))
    assert not is_datetime(pyarrow.timestamp("ns", tz="UTC"))
    assert not is_datetime(pyarrow.string())