Python pyarrow.float64() Examples

The following are code examples of pyarrow.float64(), collected from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the pyarrow module.
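
Before the project-specific examples, here is a minimal, self-contained sketch of how pyarrow.float64() is typically used: it returns a DataType object that can be passed to pa.array(), pa.field(), and pa.schema(). The column names and values below are purely illustrative.

import pyarrow as pa

# pa.float64() returns the DataType for 64-bit (double-precision) floats.
dtype = pa.float64()

# Build an array with an explicit float64 type; None becomes a null value.
arr = pa.array([1.0, 2.5, None], type=dtype)
assert arr.type == pa.float64()
assert arr.null_count == 1

# The same type object is used when declaring fields and schemas.
schema = pa.schema([
    pa.field("price", pa.float64()),
    pa.field("quantity", pa.int64()),
])
print(schema)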
Example #1
Source File: test_index.py    From kartothek with MIT License
def test_eval_operators_type_safety():
    # gh66
    ind = IndexBase(column="col", index_dct={1234: ["part"]}, dtype=pa.int64())
    with pytest.raises(
        TypeError,
        match=r"Unexpected type for predicate: Column 'col' has pandas type 'int64', "
        r"but predicate value '1234' has pandas type 'object' \(Python type '<class 'str'>'\).",
    ):
        ind.eval_operator("==", "1234")
    with pytest.raises(
        TypeError,
        match=r"Unexpected type for predicate: Column 'col' has pandas type 'int64', "
        r"but predicate value 1234.0 has pandas type 'float64' \(Python type '<class 'float'>'\).",
    ):
        ind.eval_operator("==", 1234.0)

    assert ind.eval_operator("==", 1234) == {"part"} 
Example #2
Source File: types.py    From cjworkbench with GNU Affero General Public License v3.0
def format(self, value: Union[int, float]) -> str:
        if self._need_int:
            value = int(value)
        else:
            # Format float64 _integers_ as int. For instance, '3.0' should be
            # formatted as though it were the int, '3'.
            #
            # Python would normally format '3.0' as '3.0' by default; that's
            # not acceptable to us because we can't write a JavaScript
            # formatter that would do the same thing. (Javascript doesn't
            # distinguish between float and int.)
            int_value = int(value)
            if int_value == value:
                value = int_value

        return self._prefix + format(value, self._format_spec) + self._suffix 
Example #3
Source File: test__pandas_helpers.py    From python-bigquery with Apache License 2.0
def test_list_columns_and_indexes_without_named_index(module_under_test):
    df_data = collections.OrderedDict(
        [
            ("a_series", [1, 2, 3, 4]),
            ("b_series", [0.1, 0.2, 0.3, 0.4]),
            ("c_series", ["a", "b", "c", "d"]),
        ]
    )
    dataframe = pandas.DataFrame(df_data)

    columns_and_indexes = module_under_test.list_columns_and_indexes(dataframe)
    expected = [
        ("a_series", pandas.api.types.pandas_dtype("int64")),
        ("b_series", pandas.api.types.pandas_dtype("float64")),
        ("c_series", pandas.api.types.pandas_dtype("object")),
    ]
    assert columns_and_indexes == expected 
Example #4
Source File: test__pandas_helpers.py    From python-bigquery with Apache License 2.0
def test_list_columns_and_indexes_with_named_index_same_as_column_name(
    module_under_test,
):
    df_data = collections.OrderedDict(
        [
            ("a_series", [1, 2, 3, 4]),
            ("b_series", [0.1, 0.2, 0.3, 0.4]),
            ("c_series", ["a", "b", "c", "d"]),
        ]
    )
    dataframe = pandas.DataFrame(
        df_data,
        # Use same name as an integer column but a different datatype so that
        # we can verify that the column is listed but the index isn't.
        index=pandas.Index([0.1, 0.2, 0.3, 0.4], name="a_series"),
    )

    columns_and_indexes = module_under_test.list_columns_and_indexes(dataframe)
    expected = [
        ("a_series", pandas.api.types.pandas_dtype("int64")),
        ("b_series", pandas.api.types.pandas_dtype("float64")),
        ("c_series", pandas.api.types.pandas_dtype("object")),
    ]
    assert columns_and_indexes == expected 
Example #5
Source File: test__pandas_helpers.py    From python-bigquery with Apache License 2.0
def test_list_columns_and_indexes_with_named_index(module_under_test):
    df_data = collections.OrderedDict(
        [
            ("a_series", [1, 2, 3, 4]),
            ("b_series", [0.1, 0.2, 0.3, 0.4]),
            ("c_series", ["a", "b", "c", "d"]),
        ]
    )
    dataframe = pandas.DataFrame(
        df_data, index=pandas.Index([4, 5, 6, 7], name="a_index")
    )

    columns_and_indexes = module_under_test.list_columns_and_indexes(dataframe)
    expected = [
        ("a_index", pandas.api.types.pandas_dtype("int64")),
        ("a_series", pandas.api.types.pandas_dtype("int64")),
        ("b_series", pandas.api.types.pandas_dtype("float64")),
        ("c_series", pandas.api.types.pandas_dtype("object")),
    ]
    assert columns_and_indexes == expected 
Example #6
Source File: test_algorithms.py    From fletcher with MIT License
def test_reduce_op_no_identity(data, skipna, op, pandas_op):
    arrow = pa.array(data, type=pa.float64(), from_pandas=True)
    pandas = pd.Series(data, dtype=float)
    should_raise = arrow.null_count == len(arrow) and (skipna or len(arrow) == 0)

    if should_raise:
        with pytest.raises(ValueError):
            assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))
    else:
        assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))

    # Split in the middle and check whether this still works
    if len(data) > 2:
        arrow = pa.chunked_array(
            [
                pa.array(data[: len(data) // 2], type=pa.float64(), from_pandas=True),
                pa.array(data[len(data) // 2 :], type=pa.float64(), from_pandas=True),
            ]
        )
        if should_raise:
            with pytest.raises(ValueError):
                assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))
        else:
            assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna)) 
Example #7
Source File: test_unit_arrow_chunk_iterator.py    From snowflake-connector-python with Apache License 2.0
def test_iterate_over_float_chunk():
    random.seed(datetime.datetime.now())
    column_meta = [
            {"logicalType": "REAL"},
            {"logicalType": "FLOAT"}
    ]

    def float_generator():
        return random.uniform(-100.0, 100.0)

    iterate_over_test_chunk([pyarrow.float64(), pyarrow.float64()],
                            column_meta, float_generator) 
Example #8
Source File: array_util_test.py    From tfx-bsl with Apache License 2.0
def _get_numeric_byte_size_test_cases():
  result = []
  for array_type, sizeof in [
      (pa.int8(), 1),
      (pa.uint8(), 1),
      (pa.int16(), 2),
      (pa.uint16(), 2),
      (pa.int32(), 4),
      (pa.uint32(), 4),
      (pa.int64(), 8),
      (pa.uint64(), 8),
      (pa.float32(), 4),
      (pa.float64(), 8),
  ]:
    result.append(
        dict(
            testcase_name=str(array_type),
            array=pa.array(range(9), type=array_type),
            slice_offset=2,
            slice_length=3,
            expected_size=(_all_false_null_bitmap_size(2) + sizeof * 9),
            expected_sliced_size=(_all_false_null_bitmap_size(1) + sizeof * 3)))
  return result 
Example #9
Source File: array_util_test.py    From tfx-bsl with Apache License 2.0
def test_element_lengths_list_array(self, list_type_factory):
    list_lengths = array_util.GetElementLengths(
        pa.array([], type=list_type_factory(pa.int64())))
    self.assertTrue(list_lengths.equals(pa.array([], type=pa.int64())))
    list_lengths = array_util.GetElementLengths(
        pa.array([[1., 2.], [], [3.]], list_type_factory(pa.float32())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64())))
    list_lengths = array_util.GetElementLengths(
        pa.array([[1., 2.], None, [3.]], list_type_factory(pa.float64())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64()))) 
Example #10
Source File: array_util_test.py    From tfx-bsl with Apache License 2.0
def test_list_lengths(self, list_type_factory):
    list_lengths = array_util.ListLengthsFromListArray(
        pa.array([], type=list_type_factory(pa.int64())))
    self.assertTrue(list_lengths.equals(pa.array([], type=pa.int64())))
    list_lengths = array_util.ListLengthsFromListArray(
        pa.array([[1., 2.], [], [3.]], type=list_type_factory(pa.float32())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64())))
    list_lengths = array_util.ListLengthsFromListArray(
        pa.array([[1., 2.], None, [3.]], type=list_type_factory(pa.float64())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64()))) 
Example #11
Source File: test_common_metadata.py    From kartothek with MIT License
def test_validate_empty_dataframe_corrupt_raises(
    df_all_types,
    df_all_types_schema,
    df_all_types_empty_schema,
    corrupt_column,
    corrupt_value,
    corrupt_dtype,
):
    # In case there is something wrong with the schema, raise!

    # First, an integer column carries a float or an object.
    df_corrupt = df_all_types.copy()
    # for value, dtype in [(-1.1, np.float64), ('a', np.object)]:
    df_corrupt[corrupt_column] = pd.Series([corrupt_value], dtype=corrupt_dtype)
    df_corrupt_meta = make_meta(df_corrupt, origin="1")
    # Raise when comparing the proper to the corrupt schema
    for schemas in permutations([df_all_types_schema, df_corrupt_meta]):
        with pytest.raises(ValueError):
            validate_compatible(schemas)
    # Also raise if there is a schema originating from an empty DF to make
    # sure the emptiness doesn't cancel the validation
    for schemas in permutations(
        [df_all_types_schema, df_corrupt_meta, df_all_types_empty_schema]
    ):
        with pytest.raises(ValueError):
            validate_compatible(schemas) 
Example #12
Source File: types.py    From cjworkbench with GNU Affero General Public License v3.0
def _dtype_to_arrow_type(dtype: np.dtype) -> pyarrow.DataType:
    if dtype == np.int8:
        return pyarrow.int8()
    elif dtype == np.int16:
        return pyarrow.int16()
    elif dtype == np.int32:
        return pyarrow.int32()
    elif dtype == np.int64:
        return pyarrow.int64()
    elif dtype == np.uint8:
        return pyarrow.uint8()
    elif dtype == np.uint16:
        return pyarrow.uint16()
    elif dtype == np.uint32:
        return pyarrow.uint32()
    elif dtype == np.uint64:
        return pyarrow.uint64()
    elif dtype == np.float16:
        return pyarrow.float16()
    elif dtype == np.float32:
        return pyarrow.float32()
    elif dtype == np.float64:
        return pyarrow.float64()
    elif dtype.kind == "M":
        # [2019-09-17] Pandas only allows "ns" unit -- as in, datetime64[ns]
        # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
        assert dtype.str.endswith("[ns]")
        return pyarrow.timestamp(unit="ns", tz=None)
    elif dtype == np.object_:
        return pyarrow.string()
    else:
        raise RuntimeError("Unhandled dtype %r" % dtype) 
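
The mapping above can be exercised directly; the calls below are a hedged usage sketch (assuming _dtype_to_arrow_type is importable and numpy is available as np):

import numpy as np

# A float64 dtype compares equal to np.float64, so it maps to pyarrow.float64().
assert _dtype_to_arrow_type(np.dtype("float64")) == pyarrow.float64()
# datetime64[ns] takes the dtype.kind == "M" branch.
assert _dtype_to_arrow_type(np.dtype("datetime64[ns]")) == pyarrow.timestamp("ns", tz=None)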
Example #13
Source File: parquet_pio.py    From sdc with BSD 2-Clause "Simplified" License
def _get_numba_typ_from_pa_typ(pa_typ):
    import pyarrow as pa
    _typ_map = {
        # boolean
        pa.bool_(): types.bool_,
        # signed int types
        pa.int8(): types.int8,
        pa.int16(): types.int16,
        pa.int32(): types.int32,
        pa.int64(): types.int64,
        # unsigned int types
        pa.uint8(): types.uint8,
        pa.uint16(): types.uint16,
        pa.uint32(): types.uint32,
        pa.uint64(): types.uint64,
        # float types (TODO: float16?)
        pa.float32(): types.float32,
        pa.float64(): types.float64,
        # String
        pa.string(): string_type,
        # date
        pa.date32(): types.NPDatetime('ns'),
        pa.date64(): types.NPDatetime('ns'),
        # time (TODO: time32, time64, ...)
        pa.timestamp('ns'): types.NPDatetime('ns'),
        pa.timestamp('us'): types.NPDatetime('ns'),
        pa.timestamp('ms'): types.NPDatetime('ns'),
        pa.timestamp('s'): types.NPDatetime('ns'),
    }
    if pa_typ not in _typ_map:
        raise ValueError("Arrow data type {} not supported yet".format(pa_typ))
    return _typ_map[pa_typ] 
Example #14
Source File: test_algorithms.py    From fletcher with MIT License
def test_reduce_op(data, skipna, op, pandas_op):
    arrow = pa.array(data, type=pa.float64(), from_pandas=True)
    pandas = pd.Series(data, dtype=float)

    assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna))

    # Split in the middle and check whether this still works
    if len(data) > 2:
        arrow = pa.chunked_array(
            [
                pa.array(data[: len(data) // 2], type=pa.float64(), from_pandas=True),
                pa.array(data[len(data) // 2 :], type=pa.float64(), from_pandas=True),
            ]
        )
        assert_allclose_na(op(arrow, skipna), pandas_op(pandas, skipna=skipna)) 
Example #15
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_column_dtypes(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("start_timestamp", "TIMESTAMP"),
            SchemaField("seconds", "INT64"),
            SchemaField("miles", "FLOAT64"),
            SchemaField("km", "FLOAT64"),
            SchemaField("payment_type", "STRING"),
            SchemaField("complete", "BOOL"),
            SchemaField("date", "DATE"),
        ]
        row_data = [
            ["1.4338368E9", "420", "1.1", "1.77", u"Cash", "true", "1999-12-01"],
            ["1.3878117E9", "2580", "17.7", "28.5", u"Cash", "false", "1953-06-14"],
            ["1.3855653E9", "2280", "4.4", "7.1", u"Credit", "true", "1981-11-04"],
        ]
        rows = [{"f": [{"v": field} for field in row]} for row in row_data]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        df = row_iterator.to_dataframe(
            dtypes={"km": "float16"}, create_bqstorage_client=False,
        )

        self.assertIsInstance(df, pandas.DataFrame)
        self.assertEqual(len(df), 3)  # verify the number of rows
        exp_columns = [field.name for field in schema]
        self.assertEqual(list(df), exp_columns)  # verify the column names

        self.assertEqual(df.start_timestamp.dtype.name, "datetime64[ns, UTC]")
        self.assertEqual(df.seconds.dtype.name, "int64")
        self.assertEqual(df.miles.dtype.name, "float64")
        self.assertEqual(df.km.dtype.name, "float16")
        self.assertEqual(df.payment_type.dtype.name, "object")
        self.assertEqual(df.complete.dtype.name, "bool")
        self.assertEqual(df.date.dtype.name, "object") 
Example #16
Source File: test__pandas_helpers.py    From python-bigquery with Apache License 2.0
def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="REPEATED", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected_value_type = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_list(actual)
    assert pyarrow.types.is_struct(actual.value_type)
    assert actual.value_type.num_children == len(fields)
    assert actual.value_type.equals(expected_value_type) 
Example #17
Source File: test__pandas_helpers.py    From python-bigquery with Apache License 2.0
def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_struct(actual)
    assert actual.num_children == len(fields)
    assert actual.equals(expected) 
Example #18
Source File: types.py    From LearningApacheSpark with MIT License
def to_arrow_type(dt):
    """ Convert Spark data type to pyarrow type
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        # TODO: remove version check once minimum pyarrow version is 0.10.0
        if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt) +
                            "\nPlease install pyarrow >= 0.10.0 for BinaryType support.")
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        arrow_type = pa.timestamp('us', tz='UTC')
    elif type(dt) == ArrayType:
        if type(dt.elementType) == TimestampType:
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
        arrow_type = pa.list_(to_arrow_type(dt.elementType))
    else:
        raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
    return arrow_type 
Example #19
Source File: test_unischema.py    From petastorm with Apache License 2.0
def test_arrow_schema_convertion():
    fields = [
        pa.field('string', pa.string()),
        pa.field('int8', pa.int8()),
        pa.field('int16', pa.int16()),
        pa.field('int32', pa.int32()),
        pa.field('int64', pa.int64()),
        pa.field('float', pa.float32()),
        pa.field('double', pa.float64()),
        pa.field('bool', pa.bool_(), False),
        pa.field('fixed_size_binary', pa.binary(10)),
        pa.field('variable_size_binary', pa.binary()),
        pa.field('decimal', pa.decimal128(3, 4)),
        pa.field('timestamp_s', pa.timestamp('s')),
        pa.field('timestamp_ns', pa.timestamp('ns')),
        pa.field('date_32', pa.date32()),
        pa.field('date_64', pa.date64())
    ]
    arrow_schema = pa.schema(fields)

    mock_dataset = _mock_parquet_dataset([], arrow_schema)

    unischema = Unischema.from_arrow_schema(mock_dataset)
    for name in arrow_schema.names:
        assert getattr(unischema, name).name == name
        assert getattr(unischema, name).codec is None

        if name == 'bool':
            assert not getattr(unischema, name).nullable
        else:
            assert getattr(unischema, name).nullable

    # Test schema preserve fields order
    field_name_list = [f.name for f in fields]
    assert list(unischema.fields.keys()) == field_name_list 
Example #20
Source File: test_unischema.py    From petastorm with Apache License 2.0
def test_dict_to_spark_row_order():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('float_col', np.float64, ()),
        UnischemaField('int_col', np.int64, ()),
    ])
    row_dict = {
        TestSchema.int_col.name: 3,
        TestSchema.float_col.name: 2.0,
    }
    spark_row = dict_to_spark_row(TestSchema, row_dict)
    schema_field_names = list(TestSchema.fields)
    assert spark_row[0] == row_dict[schema_field_names[0]]
    assert spark_row[1] == row_dict[schema_field_names[1]] 
Example #21
Source File: csv2parquet.py    From csv2parquet with Apache License 2.0
def get_pyarrow_types():
    return {
        'bool': PA_BOOL,
        'float32': PA_FLOAT32,
        'float64': PA_FLOAT64,
        'int8': PA_INT8,
        'int16': PA_INT16,
        'int32': PA_INT32,
        'int64': PA_INT64,
        'string': PA_STRING,
        'timestamp': PA_TIMESTAMP,
        'base64': PA_BINARY
    }

# pylint: disable=too-many-branches,too-many-statements 
Example #22
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_w_bqstorage_empty_streams(self):
        from google.cloud.bigquery import schema
        from google.cloud.bigquery import table as mut
        from google.cloud.bigquery_storage_v1 import reader

        arrow_fields = [
            pyarrow.field("colA", pyarrow.int64()),
            # Not alphabetical to test column order.
            pyarrow.field("colC", pyarrow.float64()),
            pyarrow.field("colB", pyarrow.utf8()),
        ]
        arrow_schema = pyarrow.schema(arrow_fields)

        bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient)
        session = bigquery_storage_v1.types.ReadSession(
            streams=[{"name": "/projects/proj/dataset/dset/tables/tbl/streams/1234"}],
            arrow_schema={"serialized_schema": arrow_schema.serialize().to_pybytes()},
        )
        bqstorage_client.create_read_session.return_value = session

        mock_rowstream = mock.create_autospec(reader.ReadRowsStream)
        bqstorage_client.read_rows.return_value = mock_rowstream

        mock_rows = mock.create_autospec(reader.ReadRowsIterable)
        mock_rowstream.rows.return_value = mock_rows
        mock_pages = mock.PropertyMock(return_value=())
        type(mock_rows).pages = mock_pages

        # Schema is required when there are no record batches in the stream.
        schema = [
            schema.SchemaField("colA", "INTEGER"),
            schema.SchemaField("colC", "FLOAT"),
            schema.SchemaField("colB", "STRING"),
        ]

        row_iterator = mut.RowIterator(
            _mock_client(),
            None,  # api_request: ignored
            None,  # path: ignored
            schema,
            table=mut.TableReference.from_string("proj.dset.tbl"),
            selected_fields=schema,
        )

        got = row_iterator.to_dataframe(bqstorage_client)

        column_names = ["colA", "colC", "colB"]
        self.assertEqual(list(got), column_names)
        self.assertTrue(got.empty) 
Example #23
Source File: common_metadata.py    From kartothek with MIT License
def normalize_type(t_pa, t_pd, t_np, metadata):
    """
    This will normalize types as follows:

    - all signed integers (``int8``, ``int16``, ``int32``, ``int64``) will be converted to ``int64``
    - all unsigned integers (``uint8``, ``uint16``, ``uint32``, ``uint64``) will be converted to ``uint64``
    - all floats (``float32``, ``float64``) will be converted to ``float64``
    - all list value types will be normalized (e.g. ``list[int16]`` to ``list[int64]``, ``list[list[uint8]]`` to
      ``list[list[uint64]]``)
    - all dict value types will be normalized (e.g. ``dictionary<values=float32, indices=int16, ordered=0>`` to
      ``float64``)

    Parameters
    ----------
    t_pa: pyarrow.Type
        pyarrow type object, e.g. ``pa.list_(pa.int8())``.
    t_pd: string
        pandas type identifier, e.g. ``"list[int8]"``.
    t_np: string
        numpy type identifier, e.g. ``"object"``.
    metadata: Union[None, Dict[String, Any]]
        metadata associated with the type, e.g. information about categoricals.

    Returns
    -------
    type_tuple: Tuple[pyarrow.Type, string, string, Union[None, Dict[String, Any]]]
        tuple of ``t_pa``, ``t_pd``, ``t_np``, ``metadata`` for normalized type
    """
    if pa.types.is_signed_integer(t_pa):
        return pa.int64(), "int64", "int64", None
    elif pa.types.is_unsigned_integer(t_pa):
        return pa.uint64(), "uint64", "uint64", None
    elif pa.types.is_floating(t_pa):
        return pa.float64(), "float64", "float64", None
    elif pa.types.is_list(t_pa):
        t_pa2, t_pd2, t_np2, metadata2 = normalize_type(
            t_pa.value_type, t_pd[len("list[") : -1], None, None
        )
        return pa.list_(t_pa2), "list[{}]".format(t_pd2), "object", None
    elif pa.types.is_dictionary(t_pa):
        # downcast to dictionary content, `t_pd` is useless in that case
        if ARROW_LARGER_EQ_0141:
            return normalize_type(t_pa.value_type, t_np, t_np, None)
        else:
            return normalize_type(t_pa.dictionary.type, t_np, t_np, None)
    else:
        return t_pa, t_pd, t_np, metadata 
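
As a brief, hedged illustration of the normalization rules described in the docstring above (assuming normalize_type and pyarrow, imported as pa, are available in the test environment):

# Signed integers and floats are widened to their 64-bit variants.
assert normalize_type(pa.int8(), "int8", "int8", None) == (pa.int64(), "int64", "int64", None)
assert normalize_type(pa.float32(), "float32", "float32", None) == (pa.float64(), "float64", "float64", None)
# List value types are normalized recursively.
assert normalize_type(pa.list_(pa.int16()), "list[int16]", "object", None) == (
    pa.list_(pa.int64()),
    "list[int64]",
    "object",
    None,
)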
Example #24
Source File: test_common_metadata.py    From kartothek with MIT License
def test_store_schema_metadata(store, df_all_types):
    store_schema_metadata(
        schema=make_meta(df_all_types, origin="df_all_types"),
        dataset_uuid="some_uuid",
        store=store,
        table="some_table",
    )

    key = "some_uuid/some_table/_common_metadata"
    assert key in store.keys()
    pq_file = pq.ParquetFile(store.open(key))
    actual_schema = pq_file.schema.to_arrow_schema()
    fields = [
        pa.field("array_float32", pa.list_(pa.float64())),
        pa.field("array_float64", pa.list_(pa.float64())),
        pa.field("array_int16", pa.list_(pa.int64())),
        pa.field("array_int32", pa.list_(pa.int64())),
        pa.field("array_int64", pa.list_(pa.int64())),
        pa.field("array_int8", pa.list_(pa.int64())),
        pa.field("array_uint16", pa.list_(pa.uint64())),
        pa.field("array_uint32", pa.list_(pa.uint64())),
        pa.field("array_uint64", pa.list_(pa.uint64())),
        pa.field("array_uint8", pa.list_(pa.uint64())),
        pa.field("array_unicode", pa.list_(pa.string())),
        pa.field("bool", pa.bool_()),
        pa.field("byte", pa.binary()),
        pa.field("date", pa.date32()),
        pa.field("datetime64", pa.timestamp("us")),
        pa.field("float32", pa.float64()),
        pa.field("float64", pa.float64()),
        pa.field("int16", pa.int64()),
        pa.field("int32", pa.int64()),
        pa.field("int64", pa.int64()),
        pa.field("int8", pa.int64()),
        pa.field("null", pa.null()),
        pa.field("uint16", pa.uint64()),
        pa.field("uint32", pa.uint64()),
        pa.field("uint64", pa.uint64()),
        pa.field("uint8", pa.uint64()),
        pa.field("unicode", pa.string()),
    ]
    expected_schema = pa.schema(fields)

    assert actual_schema.remove_metadata() == expected_schema 
Example #25
Source File: client.py    From json2parquet with MIT License
def _convert_data_with_schema(data, schema, date_format=None, field_aliases=None):
    column_data = {}
    array_data = []
    schema_names = []
    for row in data:
        for column in schema.names:
            _col = column_data.get(column, [])
            _col.append(row.get(column))
            column_data[column] = _col
    for column in schema:
        _col = column_data.get(column.name)
        if isinstance(column.type, pa.lib.TimestampType):
            _converted_col = []
            for t in _col:
                try:
                    _converted_col.append(pd.to_datetime(t, format=date_format))
                except pd._libs.tslib.OutOfBoundsDatetime:
                    _converted_col.append(pd.Timestamp.max)
            array_data.append(pa.Array.from_pandas(pd.to_datetime(_converted_col), type=pa.timestamp('ns')))
        elif column.type.id == pa.date32().id:
            _converted_col = map(_date_converter, _col)
            array_data.append(pa.array(_converted_col, type=pa.date32()))
        # Float types are ambiguous for conversions, need to specify the exact type
        elif column.type.id == pa.float64().id:
            array_data.append(pa.array(_col, type=pa.float64()))
        elif column.type.id == pa.float32().id:
            # Python doesn't have a native float32 type
            # and PyArrow cannot cast float64 -> float32
            _col = pd.to_numeric(_col, downcast='float')
            array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
        elif column.type.id == pa.int32().id:
            # PyArrow 0.8.0 can cast int64 -> int32
            _col64 = pa.array(_col, type=pa.int64())
            array_data.append(_col64.cast(pa.int32()))
        elif column.type.id == pa.bool_().id:
            _col = map(_boolean_converter, _col)
            array_data.append(pa.array(_col, type=column.type))
        else:
            array_data.append(pa.array(_col, type=column.type))
        if isinstance(field_aliases, dict):
            schema_names.append(field_aliases.get(column.name, column.name))
        else:
            schema_names.append(column.name)
    return pa.RecordBatch.from_arrays(array_data, schema_names)