Python pyarrow.float64() Examples
The following are 25 code examples of pyarrow.float64(), drawn from open-source projects and ordered by user votes. You can go to the original project or source file listed above each example. You may also want to check out all available functions and classes of the pyarrow module.
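Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) showing what pyarrow.float64() returns and how it is typically used to type an array or a schema field:

import pyarrow as pa

float_type = pa.float64()                             # DataType for 64-bit IEEE-754 floats
arr = pa.array([1.0, 2.5, None], type=float_type)     # typed array with one null
field = pa.field("value", float_type)                 # typed schema field

print(float_type)      # double
print(arr.null_count)  # 1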
Example #1
Source File: test_index.py From kartothek with MIT License | 7 votes |
def test_eval_operators_type_safety():
    # gh66
    ind = IndexBase(column="col", index_dct={1234: ["part"]}, dtype=pa.int64())
    with pytest.raises(
        TypeError,
        match=r"Unexpected type for predicate: Column 'col' has pandas type 'int64', "
        r"but predicate value '1234' has pandas type 'object' \(Python type '<class 'str'>'\).",
    ):
        ind.eval_operator("==", "1234")
    with pytest.raises(
        TypeError,
        match=r"Unexpected type for predicate: Column 'col' has pandas type 'int64', "
        r"but predicate value 1234.0 has pandas type 'float64' \(Python type '<class 'float'>'\).",
    ):
        ind.eval_operator("==", 1234.0)
    assert ind.eval_operator("==", 1234) == {"part"}
Example #2
Source File: types.py From cjworkbench with GNU Affero General Public License v3.0 | 6 votes |
def format(self, value: Union[int, float]) -> str:
    if self._need_int:
        value = int(value)
    else:
        # Format float64 _integers_ as int. For instance, '3.0' should be
        # formatted as though it were the int, '3'.
        #
        # Python would normally format '3.0' as '3.0' by default; that's
        # not acceptable to us because we can't write a JavaScript
        # formatter that would do the same thing. (Javascript doesn't
        # distinguish between float and int.)
        int_value = int(value)
        if int_value == value:
            value = int_value

    return self._prefix + format(value, self._format_spec) + self._suffix
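The trick above can be shown standalone (the value and format spec here are illustrative, not cjworkbench's): routing a float64 integer through int() drops the trailing '.0' that Python's formatter would otherwise keep and that a JavaScript formatter could not reproduce.

value = 3.0
print(format(value, ","))       # '3.0'  -- float keeps the decimal part
print(format(int(value), ","))  # '3'    -- after the int() round-trip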
Example #3
Source File: test__pandas_helpers.py From python-bigquery with Apache License 2.0 | 6 votes |
def test_list_columns_and_indexes_without_named_index(module_under_test):
    df_data = collections.OrderedDict(
        [
            ("a_series", [1, 2, 3, 4]),
            ("b_series", [0.1, 0.2, 0.3, 0.4]),
            ("c_series", ["a", "b", "c", "d"]),
        ]
    )
    dataframe = pandas.DataFrame(df_data)

    columns_and_indexes = module_under_test.list_columns_and_indexes(dataframe)

    expected = [
        ("a_series", pandas.api.types.pandas_dtype("int64")),
        ("b_series", pandas.api.types.pandas_dtype("float64")),
        ("c_series", pandas.api.types.pandas_dtype("object")),
    ]
    assert columns_and_indexes == expected
Example #4
Source File: test__pandas_helpers.py From python-bigquery with Apache License 2.0 | 6 votes |
def test_list_columns_and_indexes_with_named_index_same_as_column_name(
    module_under_test,
):
    df_data = collections.OrderedDict(
        [
            ("a_series", [1, 2, 3, 4]),
            ("b_series", [0.1, 0.2, 0.3, 0.4]),
            ("c_series", ["a", "b", "c", "d"]),
        ]
    )
    dataframe = pandas.DataFrame(
        df_data,
        # Use same name as an integer column but a different datatype so that
        # we can verify that the column is listed but the index isn't.
        index=pandas.Index([0.1, 0.2, 0.3, 0.4], name="a_series"),
    )

    columns_and_indexes = module_under_test.list_columns_and_indexes(dataframe)

    expected = [
        ("a_series", pandas.api.types.pandas_dtype("int64")),
        ("b_series", pandas.api.types.pandas_dtype("float64")),
        ("c_series", pandas.api.types.pandas_dtype("object")),
    ]
    assert columns_and_indexes == expected
Example #5
Source File: test__pandas_helpers.py From python-bigquery with Apache License 2.0 | 6 votes |
def test_list_columns_and_indexes_with_named_index(module_under_test):
    df_data = collections.OrderedDict(
        [
            ("a_series", [1, 2, 3, 4]),
            ("b_series", [0.1, 0.2, 0.3, 0.4]),
            ("c_series", ["a", "b", "c", "d"]),
        ]
    )
    dataframe = pandas.DataFrame(
        df_data, index=pandas.Index([4, 5, 6, 7], name="a_index")
    )

    columns_and_indexes = module_under_test.list_columns_and_indexes(dataframe)

    expected = [
        ("a_index", pandas.api.types.pandas_dtype("int64")),
        ("a_series", pandas.api.types.pandas_dtype("int64")),
        ("b_series", pandas.api.types.pandas_dtype("float64")),
        ("c_series", pandas.api.types.pandas_dtype("object")),
    ]
    assert columns_and_indexes == expected
Example #6
Source File: test_algorithms.py From fletcher with MIT License | 6 votes |
def test_reduce_op_no_identity(data, skipna, op, pandas_op):
    arrow = pa.array(data, type=pa.float64(), from_pandas=True)
    pandas = pd.Series(data, dtype=float)
    should_raise = arrow.null_count == len(arrow) and (skipna or len(arrow) == 0)

    if should_raise:
        with pytest.raises(ValueError):
            assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))
    else:
        assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))

    # Split in the middle and check whether this still works
    if len(data) > 2:
        arrow = pa.chunked_array(
            [
                pa.array(data[: len(data) // 2], type=pa.float64(), from_pandas=True),
                pa.array(data[len(data) // 2 :], type=pa.float64(), from_pandas=True),
            ]
        )
        if should_raise:
            with pytest.raises(ValueError):
                assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))
        else:
            assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))
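A side note on the from_pandas=True flag used above (a standalone illustration, not part of fletcher): it makes pyarrow treat NaN values as nulls when building the float64 array, which is what allows the null_count check above to detect all-missing data.

import numpy as np
import pyarrow as pa

print(pa.array([1.0, np.nan], type=pa.float64(), from_pandas=True).null_count)  # 1
print(pa.array([1.0, np.nan], type=pa.float64()).null_count)                    # 0 (NaN kept as a value)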
Example #7
Source File: test_unit_arrow_chunk_iterator.py From snowflake-connector-python with Apache License 2.0 | 5 votes |
def test_iterate_over_float_chunk():
    random.seed(datetime.datetime.now())
    column_meta = [
        {"logicalType": "REAL"},
        {"logicalType": "FLOAT"}
    ]

    def float_generator():
        return random.uniform(-100.0, 100.0)

    iterate_over_test_chunk([pyarrow.float64(), pyarrow.float64()],
                            column_meta, float_generator)
Example #8
Source File: array_util_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _get_numeric_byte_size_test_cases():
    result = []
    for array_type, sizeof in [
        (pa.int8(), 1),
        (pa.uint8(), 1),
        (pa.int16(), 2),
        (pa.uint16(), 2),
        (pa.int32(), 4),
        (pa.uint32(), 4),
        (pa.int64(), 8),
        (pa.uint64(), 8),
        (pa.float32(), 4),
        (pa.float64(), 8),
    ]:
        result.append(
            dict(
                testcase_name=str(array_type),
                array=pa.array(range(9), type=array_type),
                slice_offset=2,
                slice_length=3,
                expected_size=(_all_false_null_bitmap_size(2) + sizeof * 9),
                expected_sliced_size=(_all_false_null_bitmap_size(1) + sizeof * 3)))
    return result
Example #9
Source File: array_util_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def test_element_lengths_list_array(self, list_type_factory):
    list_lengths = array_util.GetElementLengths(
        pa.array([], type=list_type_factory(pa.int64())))
    self.assertTrue(list_lengths.equals(pa.array([], type=pa.int64())))

    list_lengths = array_util.GetElementLengths(
        pa.array([[1., 2.], [], [3.]], list_type_factory(pa.float32())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64())))

    list_lengths = array_util.GetElementLengths(
        pa.array([[1., 2.], None, [3.]], list_type_factory(pa.float64())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64())))
Example #10
Source File: array_util_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def test_list_lengths(self, list_type_factory):
    list_lengths = array_util.ListLengthsFromListArray(
        pa.array([], type=list_type_factory(pa.int64())))
    self.assertTrue(list_lengths.equals(pa.array([], type=pa.int64())))

    list_lengths = array_util.ListLengthsFromListArray(
        pa.array([[1., 2.], [], [3.]], type=list_type_factory(pa.float32())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64())))

    list_lengths = array_util.ListLengthsFromListArray(
        pa.array([[1., 2.], None, [3.]], type=list_type_factory(pa.float64())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64())))
Example #11
Source File: test_common_metadata.py From kartothek with MIT License | 5 votes |
def test_validate_empty_dataframe_corrupt_raises(
    df_all_types,
    df_all_types_schema,
    df_all_types_empty_schema,
    corrupt_column,
    corrupt_value,
    corrupt_dtype,
):
    # In case there is something wrong with the schema, raise!
    # First, an integer column carries a float or an object.
    df_corrupt = df_all_types.copy()
    # for value, dtype in [(-1.1, np.float64), ('a', np.object)]:
    df_corrupt[corrupt_column] = pd.Series([corrupt_value], dtype=corrupt_dtype)
    df_corrupt_meta = make_meta(df_corrupt, origin="1")

    # Raise when comparing the proper to the corrupt schema
    for schemas in permutations([df_all_types_schema, df_corrupt_meta]):
        with pytest.raises(ValueError):
            validate_compatible(schemas)

    # Also raise if there is a schema originating from an empty DF to make
    # sure the emptiness doesn't cancel the validation
    for schemas in permutations(
        [df_all_types_schema, df_corrupt_meta, df_all_types_empty_schema]
    ):
        with pytest.raises(ValueError):
            validate_compatible(schemas)
Example #12
Source File: types.py From cjworkbench with GNU Affero General Public License v3.0 | 5 votes |
def _dtype_to_arrow_type(dtype: np.dtype) -> pyarrow.DataType:
    if dtype == np.int8:
        return pyarrow.int8()
    elif dtype == np.int16:
        return pyarrow.int16()
    elif dtype == np.int32:
        return pyarrow.int32()
    elif dtype == np.int64:
        return pyarrow.int64()
    elif dtype == np.uint8:
        return pyarrow.uint8()
    elif dtype == np.uint16:
        return pyarrow.uint16()
    elif dtype == np.uint32:
        return pyarrow.uint32()
    elif dtype == np.uint64:
        return pyarrow.uint64()
    elif dtype == np.float16:
        return pyarrow.float16()
    elif dtype == np.float32:
        return pyarrow.float32()
    elif dtype == np.float64:
        return pyarrow.float64()
    elif dtype.kind == "M":
        # [2019-09-17] Pandas only allows "ns" unit -- as in, datetime64[ns]
        # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
        assert dtype.str.endswith("[ns]")
        return pyarrow.timestamp(unit="ns", tz=None)
    elif dtype == np.object_:
        return pyarrow.string()
    else:
        raise RuntimeError("Unhandled dtype %r" % dtype)
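A hypothetical call of the mapping above (the assertions are illustrative, not part of cjworkbench):

import numpy as np
import pyarrow

# float64 and datetime64[ns] dtypes map to the corresponding Arrow types
assert _dtype_to_arrow_type(np.dtype(np.float64)) == pyarrow.float64()
assert _dtype_to_arrow_type(np.dtype("datetime64[ns]")) == pyarrow.timestamp("ns")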
Example #13
Source File: parquet_pio.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def _get_numba_typ_from_pa_typ(pa_typ):
    import pyarrow as pa
    _typ_map = {
        # boolean
        pa.bool_(): types.bool_,
        # signed int types
        pa.int8(): types.int8,
        pa.int16(): types.int16,
        pa.int32(): types.int32,
        pa.int64(): types.int64,
        # unsigned int types
        pa.uint8(): types.uint8,
        pa.uint16(): types.uint16,
        pa.uint32(): types.uint32,
        pa.uint64(): types.uint64,
        # float types (TODO: float16?)
        pa.float32(): types.float32,
        pa.float64(): types.float64,
        # String
        pa.string(): string_type,
        # date
        pa.date32(): types.NPDatetime('ns'),
        pa.date64(): types.NPDatetime('ns'),
        # time (TODO: time32, time64, ...)
        pa.timestamp('ns'): types.NPDatetime('ns'),
        pa.timestamp('us'): types.NPDatetime('ns'),
        pa.timestamp('ms'): types.NPDatetime('ns'),
        pa.timestamp('s'): types.NPDatetime('ns'),
    }
    if pa_typ not in _typ_map:
        raise ValueError("Arrow data type {} not supported yet".format(pa_typ))
    return _typ_map[pa_typ]
Example #14
Source File: test_algorithms.py From fletcher with MIT License | 5 votes |
def test_reduce_op(data, skipna, op, pandas_op):
    arrow = pa.array(data, type=pa.float64(), from_pandas=True)
    pandas = pd.Series(data, dtype=float)
    assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))

    # Split in the middle and check whether this still works
    if len(data) > 2:
        arrow = pa.chunked_array(
            [
                pa.array(data[: len(data) // 2], type=pa.float64(), from_pandas=True),
                pa.array(data[len(data) // 2 :], type=pa.float64(), from_pandas=True),
            ]
        )
        assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))
Example #15
Source File: test_table.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_to_dataframe_column_dtypes(self):
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("start_timestamp", "TIMESTAMP"),
        SchemaField("seconds", "INT64"),
        SchemaField("miles", "FLOAT64"),
        SchemaField("km", "FLOAT64"),
        SchemaField("payment_type", "STRING"),
        SchemaField("complete", "BOOL"),
        SchemaField("date", "DATE"),
    ]
    row_data = [
        ["1.4338368E9", "420", "1.1", "1.77", u"Cash", "true", "1999-12-01"],
        ["1.3878117E9", "2580", "17.7", "28.5", u"Cash", "false", "1953-06-14"],
        ["1.3855653E9", "2280", "4.4", "7.1", u"Credit", "true", "1981-11-04"],
    ]
    rows = [{"f": [{"v": field} for field in row]} for row in row_data]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    df = row_iterator.to_dataframe(
        dtypes={"km": "float16"},
        create_bqstorage_client=False,
    )

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 3)  # verify the number of rows
    exp_columns = [field.name for field in schema]
    self.assertEqual(list(df), exp_columns)  # verify the column names
    self.assertEqual(df.start_timestamp.dtype.name, "datetime64[ns, UTC]")
    self.assertEqual(df.seconds.dtype.name, "int64")
    self.assertEqual(df.miles.dtype.name, "float64")
    self.assertEqual(df.km.dtype.name, "float16")
    self.assertEqual(df.payment_type.dtype.name, "object")
    self.assertEqual(df.complete.dtype.name, "bool")
    self.assertEqual(df.date.dtype.name, "object")
Example #16
Source File: test__pandas_helpers.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="REPEATED", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected_value_type = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_list(actual)
    assert pyarrow.types.is_struct(actual.value_type)
    assert actual.value_type.num_children == len(fields)
    assert actual.value_type.equals(expected_value_type)
Example #17
Source File: test__pandas_helpers.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_struct(actual)
    assert actual.num_children == len(fields)
    assert actual.equals(expected)
Example #18
Source File: types.py From LearningApacheSpark with MIT License | 5 votes |
def to_arrow_type(dt):
    """ Convert Spark data type to pyarrow type """
    from distutils.version import LooseVersion
    import pyarrow as pa
    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        # TODO: remove version check once minimum pyarrow version is 0.10.0
        if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt) +
                            "\nPlease install pyarrow >= 0.10.0 for BinaryType support.")
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        arrow_type = pa.timestamp('us', tz='UTC')
    elif type(dt) == ArrayType:
        if type(dt.elementType) == TimestampType:
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
        arrow_type = pa.list_(to_arrow_type(dt.elementType))
    else:
        raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
    return arrow_type
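A hypothetical call of to_arrow_type above (DoubleType and ArrayType are the Spark SQL type classes defined alongside the function; the assertions are illustrative, not from the project):

import pyarrow as pa

# Spark's DoubleType maps to Arrow float64, element-wise for arrays as well
assert to_arrow_type(DoubleType()) == pa.float64()
assert to_arrow_type(ArrayType(DoubleType())) == pa.list_(pa.float64())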
Example #19
Source File: test_unischema.py From petastorm with Apache License 2.0 | 5 votes |
def test_arrow_schema_convertion():
    fields = [
        pa.field('string', pa.string()),
        pa.field('int8', pa.int8()),
        pa.field('int16', pa.int16()),
        pa.field('int32', pa.int32()),
        pa.field('int64', pa.int64()),
        pa.field('float', pa.float32()),
        pa.field('double', pa.float64()),
        pa.field('bool', pa.bool_(), False),
        pa.field('fixed_size_binary', pa.binary(10)),
        pa.field('variable_size_binary', pa.binary()),
        pa.field('decimal', pa.decimal128(3, 4)),
        pa.field('timestamp_s', pa.timestamp('s')),
        pa.field('timestamp_ns', pa.timestamp('ns')),
        pa.field('date_32', pa.date32()),
        pa.field('date_64', pa.date64())
    ]
    arrow_schema = pa.schema(fields)

    mock_dataset = _mock_parquet_dataset([], arrow_schema)

    unischema = Unischema.from_arrow_schema(mock_dataset)
    for name in arrow_schema.names:
        assert getattr(unischema, name).name == name
        assert getattr(unischema, name).codec is None

        if name == 'bool':
            assert not getattr(unischema, name).nullable
        else:
            assert getattr(unischema, name).nullable

    # Test schema preserve fields order
    field_name_list = [f.name for f in fields]
    assert list(unischema.fields.keys()) == field_name_list
Example #20
Source File: test_unischema.py From petastorm with Apache License 2.0 | 5 votes |
def test_dict_to_spark_row_order():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('float_col', np.float64, ()),
        UnischemaField('int_col', np.int64, ()),
    ])
    row_dict = {
        TestSchema.int_col.name: 3,
        TestSchema.float_col.name: 2.0,
    }
    spark_row = dict_to_spark_row(TestSchema, row_dict)
    schema_field_names = list(TestSchema.fields)
    assert spark_row[0] == row_dict[schema_field_names[0]]
    assert spark_row[1] == row_dict[schema_field_names[1]]
Example #21
Source File: csv2parquet.py From csv2parquet with Apache License 2.0 | 5 votes |
def get_pyarrow_types():
    return {
        'bool': PA_BOOL,
        'float32': PA_FLOAT32,
        'float64': PA_FLOAT64,
        'int8': PA_INT8,
        'int16': PA_INT16,
        'int32': PA_INT32,
        'int64': PA_INT64,
        'string': PA_STRING,
        'timestamp': PA_TIMESTAMP,
        'base64': PA_BINARY
    }

# pylint: disable=too-many-branches,too-many-statements
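The PA_* constants are defined elsewhere in csv2parquet; presumably they are bound to the corresponding pyarrow type factories, roughly like this (a sketch of what the constants likely are, not the project's actual code):

import pyarrow as pa

PA_BOOL = pa.bool_()
PA_FLOAT32 = pa.float32()
PA_FLOAT64 = pa.float64()
PA_INT8 = pa.int8()
PA_INT16 = pa.int16()
PA_INT32 = pa.int32()
PA_INT64 = pa.int64()
PA_STRING = pa.string()
PA_TIMESTAMP = pa.timestamp('ns')   # the unit here is a guess
PA_BINARY = pa.binary()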
Example #22
Source File: test_table.py From python-bigquery with Apache License 2.0 | 4 votes |
def test_to_dataframe_w_bqstorage_empty_streams(self):
    from google.cloud.bigquery import schema
    from google.cloud.bigquery import table as mut
    from google.cloud.bigquery_storage_v1 import reader

    arrow_fields = [
        pyarrow.field("colA", pyarrow.int64()),
        # Not alphabetical to test column order.
        pyarrow.field("colC", pyarrow.float64()),
        pyarrow.field("colB", pyarrow.utf8()),
    ]
    arrow_schema = pyarrow.schema(arrow_fields)

    bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient)
    session = bigquery_storage_v1.types.ReadSession(
        streams=[{"name": "/projects/proj/dataset/dset/tables/tbl/streams/1234"}],
        arrow_schema={"serialized_schema": arrow_schema.serialize().to_pybytes()},
    )
    bqstorage_client.create_read_session.return_value = session

    mock_rowstream = mock.create_autospec(reader.ReadRowsStream)
    bqstorage_client.read_rows.return_value = mock_rowstream

    mock_rows = mock.create_autospec(reader.ReadRowsIterable)
    mock_rowstream.rows.return_value = mock_rows
    mock_pages = mock.PropertyMock(return_value=())
    type(mock_rows).pages = mock_pages

    # Schema is required when there are no record batches in the stream.
    schema = [
        schema.SchemaField("colA", "INTEGER"),
        schema.SchemaField("colC", "FLOAT"),
        schema.SchemaField("colB", "STRING"),
    ]

    row_iterator = mut.RowIterator(
        _mock_client(),
        None,  # api_request: ignored
        None,  # path: ignored
        schema,
        table=mut.TableReference.from_string("proj.dset.tbl"),
        selected_fields=schema,
    )
    got = row_iterator.to_dataframe(bqstorage_client)

    column_names = ["colA", "colC", "colB"]
    self.assertEqual(list(got), column_names)
    self.assertTrue(got.empty)
Example #23
Source File: common_metadata.py From kartothek with MIT License | 4 votes |
def normalize_type(t_pa, t_pd, t_np, metadata):
    """
    This will normalize types as follows:

    - all signed integers (``int8``, ``int16``, ``int32``, ``int64``) will be converted to ``int64``
    - all unsigned integers (``uint8``, ``uint16``, ``uint32``, ``uint64``) will be converted to ``uint64``
    - all floats (``float32``, ``float64``) will be converted to ``float64``
    - all list value types will be normalized (e.g. ``list[int16]`` to ``list[int64]``,
      ``list[list[uint8]]`` to ``list[list[uint64]]``)
    - all dict value types will be normalized (e.g. ``dictionary<values=float32, indices=int16, ordered=0>``
      to ``float64``)

    Parameters
    ----------
    t_pa: pyarrow.Type
        pyarrow type object, e.g. ``pa.list_(pa.int8())``.
    t_pd: string
        pandas type identifier, e.g. ``"list[int8]"``.
    t_np: string
        numpy type identifier, e.g. ``"object"``.
    metadata: Union[None, Dict[String, Any]]
        metadata associated with the type, e.g. information about categoricals.

    Returns
    -------
    type_tuple: Tuple[pyarrow.Type, string, string, Union[None, Dict[String, Any]]]
        tuple of ``t_pa``, ``t_pd``, ``t_np``, ``metadata`` for normalized type
    """
    if pa.types.is_signed_integer(t_pa):
        return pa.int64(), "int64", "int64", None
    elif pa.types.is_unsigned_integer(t_pa):
        return pa.uint64(), "uint64", "uint64", None
    elif pa.types.is_floating(t_pa):
        return pa.float64(), "float64", "float64", None
    elif pa.types.is_list(t_pa):
        t_pa2, t_pd2, t_np2, metadata2 = normalize_type(
            t_pa.value_type, t_pd[len("list[") : -1], None, None
        )
        return pa.list_(t_pa2), "list[{}]".format(t_pd2), "object", None
    elif pa.types.is_dictionary(t_pa):
        # downcast to dictionary content, `t_pd` is useless in that case
        if ARROW_LARGER_EQ_0141:
            return normalize_type(t_pa.value_type, t_np, t_np, None)
        else:
            return normalize_type(t_pa.dictionary.type, t_np, t_np, None)
    else:
        return t_pa, t_pd, t_np, metadata
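A hypothetical call showing the float and list paths of normalize_type above (illustrative, not a kartothek test):

import pyarrow as pa

# float32 is widened to float64, both at the top level and inside lists
assert normalize_type(pa.float32(), "float32", "float32", None) == (
    pa.float64(), "float64", "float64", None
)
assert normalize_type(pa.list_(pa.float32()), "list[float32]", "object", None) == (
    pa.list_(pa.float64()), "list[float64]", "object", None
)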
Example #24
Source File: test_common_metadata.py From kartothek with MIT License | 4 votes |
def test_store_schema_metadata(store, df_all_types):
    store_schema_metadata(
        schema=make_meta(df_all_types, origin="df_all_types"),
        dataset_uuid="some_uuid",
        store=store,
        table="some_table",
    )

    key = "some_uuid/some_table/_common_metadata"
    assert key in store.keys()
    pq_file = pq.ParquetFile(store.open(key))
    actual_schema = pq_file.schema.to_arrow_schema()
    fields = [
        pa.field("array_float32", pa.list_(pa.float64())),
        pa.field("array_float64", pa.list_(pa.float64())),
        pa.field("array_int16", pa.list_(pa.int64())),
        pa.field("array_int32", pa.list_(pa.int64())),
        pa.field("array_int64", pa.list_(pa.int64())),
        pa.field("array_int8", pa.list_(pa.int64())),
        pa.field("array_uint16", pa.list_(pa.uint64())),
        pa.field("array_uint32", pa.list_(pa.uint64())),
        pa.field("array_uint64", pa.list_(pa.uint64())),
        pa.field("array_uint8", pa.list_(pa.uint64())),
        pa.field("array_unicode", pa.list_(pa.string())),
        pa.field("bool", pa.bool_()),
        pa.field("byte", pa.binary()),
        pa.field("date", pa.date32()),
        pa.field("datetime64", pa.timestamp("us")),
        pa.field("float32", pa.float64()),
        pa.field("float64", pa.float64()),
        pa.field("int16", pa.int64()),
        pa.field("int32", pa.int64()),
        pa.field("int64", pa.int64()),
        pa.field("int8", pa.int64()),
        pa.field("null", pa.null()),
        pa.field("uint16", pa.uint64()),
        pa.field("uint32", pa.uint64()),
        pa.field("uint64", pa.uint64()),
        pa.field("uint8", pa.uint64()),
        pa.field("unicode", pa.string()),
    ]
    expected_schema = pa.schema(fields)

    assert actual_schema.remove_metadata() == expected_schema
Example #25
Source File: client.py From json2parquet with MIT License | 4 votes |
def _convert_data_with_schema(data, schema, date_format=None, field_aliases=None):
    column_data = {}
    array_data = []
    schema_names = []
    for row in data:
        for column in schema.names:
            _col = column_data.get(column, [])
            _col.append(row.get(column))
            column_data[column] = _col
    for column in schema:
        _col = column_data.get(column.name)
        if isinstance(column.type, pa.lib.TimestampType):
            _converted_col = []
            for t in _col:
                try:
                    _converted_col.append(pd.to_datetime(t, format=date_format))
                except pd._libs.tslib.OutOfBoundsDatetime:
                    _converted_col.append(pd.Timestamp.max)
            array_data.append(pa.Array.from_pandas(pd.to_datetime(_converted_col), type=pa.timestamp('ns')))
        elif column.type.id == pa.date32().id:
            _converted_col = map(_date_converter, _col)
            array_data.append(pa.array(_converted_col, type=pa.date32()))
        # Float types are ambiguous for conversions, need to specify the exact type
        elif column.type.id == pa.float64().id:
            array_data.append(pa.array(_col, type=pa.float64()))
        elif column.type.id == pa.float32().id:
            # Python doesn't have a native float32 type
            # and PyArrow cannot cast float64 -> float32
            _col = pd.to_numeric(_col, downcast='float')
            array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
        elif column.type.id == pa.int32().id:
            # PyArrow 0.8.0 can cast int64 -> int32
            _col64 = pa.array(_col, type=pa.int64())
            array_data.append(_col64.cast(pa.int32()))
        elif column.type.id == pa.bool_().id:
            _col = map(_boolean_converter, _col)
            array_data.append(pa.array(_col, type=column.type))
        else:
            array_data.append(pa.array(_col, type=column.type))
        if isinstance(field_aliases, dict):
            schema_names.append(field_aliases.get(column.name, column.name))
        else:
            schema_names.append(column.name)
    return pa.RecordBatch.from_arrays(array_data, schema_names)
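A hypothetical call of the converter above: because float types are ambiguous to infer, a float64 column in the schema is built explicitly with pa.array(values, type=pa.float64()) instead of letting pyarrow guess the type from the row values (the schema, column names, and data here are illustrative):

import pyarrow as pa

schema = pa.schema([
    pa.field("price", pa.float64()),
    pa.field("qty", pa.int32()),
])
data = [{"price": 1.5, "qty": 2}, {"price": 3.25, "qty": 4}]

batch = _convert_data_with_schema(data, schema)
print(batch.schema)   # price: double, qty: int32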