Python Examples of pyarrow.uint64

Source File: parquet_pio.py From sdc with BSD 2-Clause "Simplified" License

5 votes

def _get_numba_typ_from_pa_typ(pa_typ):
    """Return the numba type that corresponds to a pyarrow data type.

    Supported inputs are the boolean type, the 8/16/32/64-bit signed and
    unsigned integers, 32/64-bit floats, strings, ``date32``/``date64`` and
    timestamps with unit ``ns``, ``us``, ``ms`` or ``s``.  Every date and
    timestamp flavour is mapped to ``types.NPDatetime('ns')``.

    Raises
    ------
    ValueError
        If ``pa_typ`` is not one of the supported pyarrow types.
    """
    import pyarrow as pa

    supported = {pa.bool_(): types.bool_, pa.string(): string_type}

    # Signed/unsigned ints and floats carry the same name on both sides
    # (TODO: float16?).
    for name in ('int8', 'int16', 'int32', 'int64',
                 'uint8', 'uint16', 'uint32', 'uint64',
                 'float32', 'float64'):
        supported[getattr(pa, name)()] = getattr(types, name)

    # Dates.
    for make_date in (pa.date32, pa.date64):
        supported[make_date()] = types.NPDatetime('ns')

    # Timestamps (TODO: time32, time64, ...).
    for unit in ('ns', 'us', 'ms', 's'):
        supported[pa.timestamp(unit)] = types.NPDatetime('ns')

    # A private sentinel distinguishes "no entry" from any mapped value.
    missing = object()
    numba_typ = supported.get(pa_typ, missing)
    if numba_typ is missing:
        raise ValueError("Arrow data type {} not supported yet".format(pa_typ))
    return numba_typ

Source File: types.py From cjworkbench with GNU Affero General Public License v3.0

5 votes

def _dtype_to_arrow_type(dtype: np.dtype) -> pyarrow.DataType:
    """Return the pyarrow data type used to represent a numpy ``dtype``.

    Fixed-width integers and floats map to the pyarrow type of the same
    name, ``datetime64[ns]`` maps to a timezone-less nanosecond timestamp
    and ``object`` maps to ``pyarrow.string()``.

    Raises
    ------
    RuntimeError
        If ``dtype`` is none of the handled kinds.
    """
    # (numpy scalar type, pyarrow factory) pairs, probed in order with ``==``.
    numeric_factories = (
        (np.int8, pyarrow.int8),
        (np.int16, pyarrow.int16),
        (np.int32, pyarrow.int32),
        (np.int64, pyarrow.int64),
        (np.uint8, pyarrow.uint8),
        (np.uint16, pyarrow.uint16),
        (np.uint32, pyarrow.uint32),
        (np.uint64, pyarrow.uint64),
        (np.float16, pyarrow.float16),
        (np.float32, pyarrow.float32),
        (np.float64, pyarrow.float64),
    )
    for numpy_type, make_arrow_type in numeric_factories:
        if dtype == numpy_type:
            return make_arrow_type()

    if dtype.kind == "M":
        # [2019-09-17] Pandas only allows "ns" unit -- as in, datetime64[ns]
        # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
        assert dtype.str.endswith("[ns]")
        return pyarrow.timestamp(unit="ns", tz=None)

    if dtype == np.object_:
        return pyarrow.string()

    raise RuntimeError("Unhandled dtype %r" % dtype)

Source File: test_common_metadata.py From kartothek with MIT License

5 votes

def test_empty_dataframe_from_schema(df_all_types):
    """An empty frame built from the fixture's schema matches the expected empty frame.

    The expected frame is the zero-row slice of ``df_all_types`` with the
    ``date`` column replaced by ``datetime64[ns]`` and every ``float*``,
    ``int*`` and ``uint*`` column replaced by an empty ``float``, ``int``
    and ``np.uint64`` series respectively.
    """
    schema = make_meta(df_all_types, origin="1")
    actual_df = empty_dataframe_from_schema(schema)

    expected_df = df_all_types.loc[[]]
    expected_df["date"] = pd.Series([], dtype="datetime64[ns]")

    # Column-name prefix -> dtype of the empty replacement column.
    dtype_by_prefix = {"float": float, "int": int, "uint": np.uint64}
    for column in expected_df.columns:
        for prefix, replacement_dtype in dtype_by_prefix.items():
            if column.startswith(prefix):
                expected_df[column] = pd.Series([], dtype=replacement_dtype)

    pdt.assert_frame_equal(actual_df, expected_df)

Source File: test_common_metadata.py From kartothek with MIT License

5 votes

def test_empty_dataframe_from_schema_columns(df_all_types):
    """Requesting specific columns yields an empty frame with only those columns."""
    wanted_columns = ("uint64", "int64")

    actual_df = empty_dataframe_from_schema(
        make_meta(df_all_types, origin="1"),
        list(wanted_columns),
    )

    # Zero rows, restricted to the same columns in the same order.
    expected_df = df_all_types.loc[[], list(wanted_columns)]
    pdt.assert_frame_equal(actual_df, expected_df)

Source File: array_util_test.py From tfx-bsl with Apache License 2.0

5 votes

def _get_numeric_byte_size_test_cases():
  """Build one parameterized test case per numeric arrow type.

  Each case holds a 9-element array of the type, a slice window (offset 2,
  length 3) and the expected sizes of the full and sliced array: the value
  returned by `_all_false_null_bitmap_size` plus the element width times
  the number of elements.
  """
  # (arrow type, width of one element in bytes)
  element_widths = (
      (pa.int8(), 1),
      (pa.uint8(), 1),
      (pa.int16(), 2),
      (pa.uint16(), 2),
      (pa.int32(), 4),
      (pa.uint32(), 4),
      (pa.int64(), 8),
      (pa.uint64(), 8),
      (pa.float32(), 4),
      (pa.float64(), 8),
  )
  return [
      {
          "testcase_name": str(arrow_type),
          "array": pa.array(range(9), type=arrow_type),
          "slice_offset": 2,
          "slice_length": 3,
          "expected_size": _all_false_null_bitmap_size(2) + width * 9,
          "expected_sliced_size": _all_false_null_bitmap_size(1) + width * 3,
      }
      for arrow_type, width in element_widths
  ]

Source File: common_metadata.py From kartothek with MIT License

4 votes

def normalize_type(t_pa, t_pd, t_np, metadata):
    """
    Normalize a type description to its widest equivalent.

    The following rules are applied:

    - signed integers (``int8``, ``int16``, ``int32``, ``int64``) become ``int64``
    - unsigned integers (``uint8``, ``uint16``, ``uint32``, ``uint64``) become ``uint64``
    - floating point types (e.g. ``float32``, ``float64``) become ``float64``
    - list value types are normalized recursively (e.g. ``list[int16]`` to ``list[int64]``,
      ``list[list[uint8]]`` to ``list[list[uint64]]``)
    - dictionary types are replaced by their normalized value type (e.g.
      ``dictionary<values=float32, indices=int16, ordered=0>`` to ``float64``)
    - anything else is returned unchanged

    Parameters
    ----------
    t_pa: pyarrow.Type
        pyarrow type object, e.g. ``pa.list_(pa.int8())``.
    t_pd: string
        pandas type identifier, e.g. ``"list[int8]"``.
    t_np: string
        numpy type identifier, e.g. ``"object"``.
    metadata: Union[None, Dict[String, Any]]
        metadata associated with the type, e.g. information about categoricals.

    Returns
    -------
    type_tuple: Tuple[pyarrow.Type, string, string, Union[None, Dict[String, Any]]]
        tuple of ``t_pa``, ``t_pd``, ``t_np``, ``metadata`` for the normalized type
    """
    if pa.types.is_signed_integer(t_pa):
        return pa.int64(), "int64", "int64", None

    if pa.types.is_unsigned_integer(t_pa):
        return pa.uint64(), "uint64", "uint64", None

    if pa.types.is_floating(t_pa):
        return pa.float64(), "float64", "float64", None

    if pa.types.is_list(t_pa):
        # Strip the surrounding "list[" ... "]" to get the element identifier.
        inner_pd = t_pd[len("list[") : -1]
        inner_pa_norm, inner_pd_norm, _, _ = normalize_type(
            t_pa.value_type, inner_pd, None, None
        )
        return pa.list_(inner_pa_norm), "list[{}]".format(inner_pd_norm), "object", None

    if pa.types.is_dictionary(t_pa):
        # downcast to dictionary content, `t_pd` is useless in that case
        content_type = (
            t_pa.value_type if ARROW_LARGER_EQ_0141 else t_pa.dictionary.type
        )
        return normalize_type(content_type, t_np, t_np, None)

    return t_pa, t_pd, t_np, metadata

Source File: test_common_metadata.py From kartothek with MIT License

4 votes

def test_store_schema_metadata(store, df_all_types):
    """Storing schema metadata writes a ``_common_metadata`` file with the expected schema.

    The schema read back from the store (with its metadata removed) must
    equal the expected arrow schema listed below.
    """
    store_schema_metadata(
        schema=make_meta(df_all_types, origin="df_all_types"),
        dataset_uuid="some_uuid",
        store=store,
        table="some_table",
    )

    key = "some_uuid/some_table/_common_metadata"
    assert key in store.keys()

    actual_schema = pq.ParquetFile(store.open(key)).schema.to_arrow_schema()

    # (column name, expected arrow type) in schema order.
    expected_columns = [
        ("array_float32", pa.list_(pa.float64())),
        ("array_float64", pa.list_(pa.float64())),
        ("array_int16", pa.list_(pa.int64())),
        ("array_int32", pa.list_(pa.int64())),
        ("array_int64", pa.list_(pa.int64())),
        ("array_int8", pa.list_(pa.int64())),
        ("array_uint16", pa.list_(pa.uint64())),
        ("array_uint32", pa.list_(pa.uint64())),
        ("array_uint64", pa.list_(pa.uint64())),
        ("array_uint8", pa.list_(pa.uint64())),
        ("array_unicode", pa.list_(pa.string())),
        ("bool", pa.bool_()),
        ("byte", pa.binary()),
        ("date", pa.date32()),
        ("datetime64", pa.timestamp("us")),
        ("float32", pa.float64()),
        ("float64", pa.float64()),
        ("int16", pa.int64()),
        ("int32", pa.int64()),
        ("int64", pa.int64()),
        ("int8", pa.int64()),
        ("null", pa.null()),
        ("uint16", pa.uint64()),
        ("uint32", pa.uint64()),
        ("uint64", pa.uint64()),
        ("uint8", pa.uint64()),
        ("unicode", pa.string()),
    ]
    expected_schema = pa.schema(
        [pa.field(name, arrow_type) for name, arrow_type in expected_columns]
    )

    assert actual_schema.remove_metadata() == expected_schema

Source File: test_index.py From kartothek with MIT License

4 votes

def test_eq_explicit():
    """Check ``==`` / ``!=`` on ``ExplicitSecondaryIndex`` in both operand orders.

    An unmodified copy compares equal to the original.  A copy whose
    ``column``, ``dtype`` or ``index_dct`` was changed compares unequal.
    Two copies that both had ``dtype`` set to ``None`` compare equal.
    """

    def check_equal(left, right):
        assert left == right
        assert right == left
        assert not (left != right)
        assert not (right != left)

    def check_different(left, right):
        assert left != right
        assert right != left
        assert not (left == right)
        assert not (right == left)

    reference = ExplicitSecondaryIndex(
        column="col",
        index_dct={1: ["part_1"]},
        dtype=pa.int64(),
        index_storage_key="dataset_uuid/some_index.parquet",
    )

    check_equal(reference.copy(), reference)

    # Each entry overrides one attribute on a fresh copy; any single change
    # must make the copy differ from the reference.
    single_changes = [
        ("column", "col2"),
        ("dtype", pa.uint64()),
        ("index_dct", {1: ["part_1"], 2: ["part_2"]}),
        ("index_dct", {1: ["part_1", "part_2"]}),
        ("index_dct", {1: ["part_2"]}),
        ("index_dct", {2: ["part_1"]}),
        ("dtype", None),
    ]
    for attribute, new_value in single_changes:
        changed = reference.copy()
        setattr(changed, attribute, new_value)
        check_different(changed, reference)

    untyped_a = reference.copy()
    untyped_b = reference.copy()
    untyped_a.dtype = None
    untyped_b.dtype = None
    check_equal(untyped_a, untyped_b)

Python pyarrow.uint64() Examples