Python pyarrow.float32() Examples

The following are 23 code examples of pyarrow.float32(), collected from open-source projects. The source file and project are noted above each example. You may also want to check out all available functions/classes of the module pyarrow, or try the search function.
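Before the examples, a minimal sketch of what pa.float32() itself returns and how it is typically used:

import pyarrow as pa

t = pa.float32()          # a DataType describing 32-bit IEEE 754 floats
print(t)                  # float
print(t.bit_width)        # 32

# The type is usually passed to array constructors:
arr = pa.array([1.5, 2.5, None], type=pa.float32())
print(arr.type)           # float
print(arr.null_count)     # 1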
Example #1
Source File: array_util_test.py    From tfx-bsl with Apache License 2.0
def test_get_flattened_array_parent_indices(self, list_type_factory,
                                              parent_indices_type):
    indices = array_util.GetFlattenedArrayParentIndices(
        pa.array([], type=list_type_factory(pa.int32())))
    self.assertTrue(indices.equals(pa.array([], type=parent_indices_type)))

    indices = array_util.GetFlattenedArrayParentIndices(
        pa.array([[1.], [2.], [], [3., 4.]],
                 type=list_type_factory(pa.float32())))
    self.assertTrue(
        indices.equals(pa.array([0, 1, 3, 3], type=parent_indices_type)))

    indices = array_util.GetFlattenedArrayParentIndices(
        pa.array([[1.], [2.], [], [3., 4.]],
                 type=list_type_factory(pa.float32())).slice(1))
    self.assertTrue(
        indices.equals(pa.array([0, 2, 2], type=parent_indices_type)))

    indices = array_util.GetFlattenedArrayParentIndices(
        pa.array([list(range(1024))],
                 type=list_type_factory(pa.int64())))
    self.assertTrue(
        indices.equals(pa.array([0] * 1024, type=parent_indices_type))) 
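For reference, recent pyarrow releases ship a built-in compute kernel with the same semantics as the tfx-bsl helper above; a small standalone sketch (no tfx-bsl required):

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([[1.], [2.], [], [3., 4.]], type=pa.list_(pa.float32()))
# Maps each flattened value back to the index of its parent list.
print(pc.list_parent_indices(arr))  # [0, 1, 3, 3]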
Example #2
Source File: test_pandas_integration.py    From fletcher with MIT License
def array_chunked_nulls(request):
    case_dict = {
        "all": pa.chunked_array([pa.array([None] * 4) for _ in range(10)]),
        "all_float": pa.chunked_array(
            [pa.array([None] * 4, type=pa.float32()) for _ in range(10)]
        ),
        "some_in_all_chunks": pa.chunked_array(
            [pa.array(["a", "b", None] * 4), pa.array(["a", None, "b"] * 4)]
        ),
        "only_in_some_chunk": pa.chunked_array(
            [
                pa.array(["a", "x"]),
                pa.array(["a", "b", None] * 4),
                pa.array(["a", "b"] * 4),
            ]
        ),
        "none": pa.chunked_array([pa.array(["a", "b"] * 4) for _ in range(10)]),
    }
    return case_dict[request.param]


# ----------------------------------------------------------------------------
# Block Methods
# ---------------------------------------------------------------------------- 
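For context on what this fixture feeds into tests: null_count on a ChunkedArray aggregates nulls across all chunks. A small sketch using the "all_float" case:

import pyarrow as pa

chunked = pa.chunked_array(
    [pa.array([None] * 4, type=pa.float32()) for _ in range(10)]
)
print(chunked.type)        # float
print(chunked.null_count)  # 40
print(chunked.num_chunks)  # 10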
Example #3
Source File: tf_sequence_example_record_test.py    From tfx-bsl with Apache License 2.0
def _GetExpectedColumnValues(tfxio):
  if tfxio._can_produce_large_types:
    list_factory = pa.large_list
    bytes_type = pa.large_binary()
  else:
    list_factory = pa.list_
    bytes_type = pa.binary()

  return {
      path.ColumnPath(["int_feature"]):
          pa.array([[1], [2], [3]], type=list_factory(pa.int64())),
      path.ColumnPath(["float_feature"]):
          pa.array([[1, 2, 3, 4], [2, 3, 4, 5], None],
                   type=list_factory(pa.float32())),
      path.ColumnPath([_SEQUENCE_COLUMN_NAME, "int_feature"]):
          pa.array([[[1, 2], [3]], None, [[4]]],
                   list_factory(list_factory(pa.int64()))),
      path.ColumnPath([_SEQUENCE_COLUMN_NAME, "string_feature"]):
          pa.array([None, [[b"foo", b"bar"], []], [[b"baz"]]],
                   list_factory(list_factory(bytes_type)))
  } 
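The large_list/large_binary variants used above differ from list_/binary only in using 64-bit rather than 32-bit offsets, so they can address children longer than 2**31 - 1 elements. A quick comparison:

import pyarrow as pa

small = pa.array([[1.0, 2.0]], type=pa.list_(pa.float32()))
large = pa.array([[1.0, 2.0]], type=pa.large_list(pa.float32()))
print(small.type)  # list<item: float>
print(large.type)  # large_list<item: float>
# Same values, different offset width, so the types compare unequal:
print(small.type == large.type)  # False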
Example #4
Source File: test_integration.py    From pymapd with Apache License 2.0
def test_load_empty_table_arrow(self, con):

        con.execute("drop table if exists baz;")
        con.execute("create table baz (a int, b float, c text);")

        data = [(1, 1.1, 'a'), (2, 2.2, '2'), (3, 3.3, '3')]

        df = pd.DataFrame(data, columns=list('abc')).astype(
            {'a': 'int32', 'b': 'float32'}
        )

        table = pa.Table.from_pandas(df, preserve_index=False)
        con.load_table("baz", table, method='arrow')
        result = sorted(con.execute("select * from baz"))
        self.check_empty_insert(result, data)
        con.execute("drop table if exists baz;") 
Example #5
Source File: test_integration.py    From pymapd with Apache License 2.0
def test_select_ipc_gpu(self, con, query, parameters):

        from cudf.core.dataframe import DataFrame

        c = con.cursor()
        c.execute('drop table if exists stocks;')
        create = (
            'create table stocks (date_ text, trans text, symbol text, '
            'qty int, price float, vol float);'
        )
        c.execute(create)
        i1 = "INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14,1.1);"  # noqa
        i2 = "INSERT INTO stocks VALUES ('2006-01-05','BUY','GOOG',100,12.14,1.2);"  # noqa

        c.execute(i1)
        c.execute(i2)

        result = con.select_ipc_gpu("select qty, price from stocks")
        assert isinstance(result, DataFrame)

        dtypes = dict(qty=np.int32, price=np.float32)
        expected = pd.DataFrame(
            [[100, 35.14], [100, 12.14]], columns=['qty', 'price']
        ).astype(dtypes)

        result = result.to_pandas()[['qty', 'price']]  # column order
        pd.testing.assert_frame_equal(result, expected)
        c.execute('drop table if exists stocks;') 
Example #6
Source File: array_util_test.py    From tfx-bsl with Apache License 2.0
def test_element_lengths_list_array(self, list_type_factory):
    list_lengths = array_util.GetElementLengths(
        pa.array([], type=list_type_factory(pa.int64())))
    self.assertTrue(list_lengths.equals(pa.array([], type=pa.int64())))
    list_lengths = array_util.GetElementLengths(
        pa.array([[1., 2.], [], [3.]], list_type_factory(pa.float32())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64())))
    list_lengths = array_util.GetElementLengths(
        pa.array([[1., 2.], None, [3.]], list_type_factory(pa.float64())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64()))) 
Example #7
Source File: array_util_test.py    From tfx-bsl with Apache License 2.0
def test_list_lengths(self, list_type_factory):
    list_lengths = array_util.ListLengthsFromListArray(
        pa.array([], type=list_type_factory(pa.int64())))
    self.assertTrue(list_lengths.equals(pa.array([], type=pa.int64())))
    list_lengths = array_util.ListLengthsFromListArray(
        pa.array([[1., 2.], [], [3.]], type=list_type_factory(pa.float32())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64())))
    list_lengths = array_util.ListLengthsFromListArray(
        pa.array([[1., 2.], None, [3.]], type=list_type_factory(pa.float64())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64()))) 
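pyarrow itself ships a comparable kernel, pyarrow.compute.list_value_length. Note one difference from the tfx-bsl helpers above: it returns null, not 0, for null list entries:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([[1., 2.], None, [3.]], type=pa.list_(pa.float32()))
print(pc.list_value_length(arr))  # [2, null, 1]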
Example #8
Source File: types.py    From cjworkbench with GNU Affero General Public License v3.0
def _dtype_to_arrow_type(dtype: np.dtype) -> pyarrow.DataType:
    if dtype == np.int8:
        return pyarrow.int8()
    elif dtype == np.int16:
        return pyarrow.int16()
    elif dtype == np.int32:
        return pyarrow.int32()
    elif dtype == np.int64:
        return pyarrow.int64()
    elif dtype == np.uint8:
        return pyarrow.uint8()
    elif dtype == np.uint16:
        return pyarrow.uint16()
    elif dtype == np.uint32:
        return pyarrow.uint32()
    elif dtype == np.uint64:
        return pyarrow.uint64()
    elif dtype == np.float16:
        return pyarrow.float16()
    elif dtype == np.float32:
        return pyarrow.float32()
    elif dtype == np.float64:
        return pyarrow.float64()
    elif dtype.kind == "M":
        # [2019-09-17] Pandas only allows "ns" unit -- as in, datetime64[ns]
        # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
        assert dtype.str.endswith("[ns]")
        return pyarrow.timestamp(unit="ns", tz=None)
    elif dtype == np.object_:
        return pyarrow.string()
    else:
        raise RuntimeError("Unhandled dtype %r" % dtype) 
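Applied to concrete dtypes, the chain above is a straight lookup; for example (a usage sketch, assuming numpy is imported as np alongside the module's own imports):

import numpy as np

print(_dtype_to_arrow_type(np.dtype("float32")))         # float
print(_dtype_to_arrow_type(np.dtype("float64")))         # double
print(_dtype_to_arrow_type(np.dtype("datetime64[ns]")))  # timestamp[ns]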
Example #9
Source File: parquet_pio.py    From sdc with BSD 2-Clause "Simplified" License
def _get_numba_typ_from_pa_typ(pa_typ):
    import pyarrow as pa
    _typ_map = {
        # boolean
        pa.bool_(): types.bool_,
        # signed int types
        pa.int8(): types.int8,
        pa.int16(): types.int16,
        pa.int32(): types.int32,
        pa.int64(): types.int64,
        # unsigned int types
        pa.uint8(): types.uint8,
        pa.uint16(): types.uint16,
        pa.uint32(): types.uint32,
        pa.uint64(): types.uint64,
        # float types (TODO: float16?)
        pa.float32(): types.float32,
        pa.float64(): types.float64,
        # String
        pa.string(): string_type,
        # date
        pa.date32(): types.NPDatetime('ns'),
        pa.date64(): types.NPDatetime('ns'),
        # time (TODO: time32, time64, ...)
        pa.timestamp('ns'): types.NPDatetime('ns'),
        pa.timestamp('us'): types.NPDatetime('ns'),
        pa.timestamp('ms'): types.NPDatetime('ns'),
        pa.timestamp('s'): types.NPDatetime('ns'),
    }
    if pa_typ not in _typ_map:
        raise ValueError("Arrow data type {} not supported yet".format(pa_typ))
    return _typ_map[pa_typ] 
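A usage sketch: Arrow DataType instances hash and compare by value, which is what lets a freshly constructed pa.float32() match the dictionary key built above (types here is numba's types module, per the surrounding sdc code):

import pyarrow as pa

# pa.float32() built here hashes equal to the pa.float32() key in _typ_map,
# so the lookup succeeds and returns numba's types.float32.
numba_typ = _get_numba_typ_from_pa_typ(pa.float32())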
Example #10
Source File: test_integration.py    From pymapd with Apache License 2.0
def test_load_table_creates(self, con):

        data = pd.DataFrame(
            {
                "boolean_": [True, False],
                "smallint_cast": np.array([0, 1], dtype=np.int8),
                "smallint_": np.array([0, 1], dtype=np.int16),
                "int_": np.array([0, 1], dtype=np.int32),
                "bigint_": np.array([0, 1], dtype=np.int64),
                "float_": np.array([0, 1], dtype=np.float32),
                "double_": np.array([0, 1], dtype=np.float64),
                "varchar_": ["a", "b"],
                "text_": ['a', 'b'],
                "time_": [datetime.time(0, 11, 59), datetime.time(13)],
                "timestamp_": [pd.Timestamp("2016"), pd.Timestamp("2017")],
                "date_": [
                    datetime.date(2016, 1, 1),
                    datetime.date(2017, 1, 1),
                ],
            },
            columns=[
                'boolean_',
                'smallint_',
                'int_',
                'bigint_',
                'float_',
                'double_',
                'varchar_',
                'text_',
                'time_',
                'timestamp_',
                'date_',
            ],
        )

        con.execute("drop table if exists test_load_table_creates;")
        con.load_table("test_load_table_creates", data, create=True)
        con.execute("drop table if exists test_load_table_creates;") 
Example #11
Source File: test_integration.py    From pymapd with Apache License 2.0
def test_load_infer(self, con):

        con.execute("drop table if exists baz;")
        con.execute("create table baz (a int, b float, c text);")

        data = pd.DataFrame(
            {
                'a': np.array([0, 1], dtype=np.int32),
                'b': np.array([1.1, 2.2], dtype=np.float32),
                'c': ['a', 'b'],
            }
        )
        con.load_table("baz", data)
        con.execute("drop table if exists baz;") 
Example #12
Source File: test_integration.py    From pymapd with Apache License 2.0
def test_select_ipc_parametrized(self, con, query, parameters):

        c = con.cursor()
        c.execute('drop table if exists stocks;')
        create = (
            'create table stocks (date_ text, trans text, symbol text, '
            'qty int, price float, vol float);'
        )
        c.execute(create)
        i1 = "INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14,1.1);"  # noqa
        i2 = "INSERT INTO stocks VALUES ('2006-01-05','BUY','GOOG',100,12.14,1.2);"  # noqa

        c.execute(i1)
        c.execute(i2)

        result = con.select_ipc(query, parameters=parameters)
        expected = pd.DataFrame(
            {
                "qty": np.array([100, 100], dtype=np.int32),
                "price": np.array(
                    [35.13999938964844, 12.140000343322754], dtype=np.float32
                ),
            }
        )[['qty', 'price']]
        tm.assert_frame_equal(result, expected)
        c.execute('drop table if exists stocks;') 
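The odd-looking expected values are not typos: they are the exact float32 representations of 35.14 and 12.14, which single precision cannot store exactly:

import numpy as np

print(float(np.float32(35.14)))  # 35.13999938964844
print(float(np.float32(12.14)))  # 12.140000343322754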
Example #13
Source File: array_util_test.py    From tfx-bsl with Apache License 2.0
def _get_numeric_byte_size_test_cases():
  result = []
  for array_type, sizeof in [
      (pa.int8(), 1),
      (pa.uint8(), 1),
      (pa.int16(), 2),
      (pa.uint16(), 2),
      (pa.int32(), 4),
      (pa.uint32(), 4),
      (pa.int64(), 8),
      (pa.uint64(), 8),
      (pa.float32(), 4),
      (pa.float64(), 8),
  ]:
    result.append(
        dict(
            testcase_name=str(array_type),
            array=pa.array(range(9), type=array_type),
            slice_offset=2,
            slice_length=3,
            expected_size=(_all_false_null_bitmap_size(2) + sizeof * 9),
            expected_sliced_size=(_all_false_null_bitmap_size(1) + sizeof * 3)))
  return result 
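As a rough cross-check of the byte counts above, pyarrow reports an array's buffer footprint directly; a 9-element float32 array with no nulls carries only 9 * 4 value bytes:

import pyarrow as pa

arr = pa.array(range(9), type=pa.float32())
print(arr.nbytes)  # 36 -- 9 values * 4 bytes; no null bitmap, since nothing is null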
Example #14
Source File: types.py    From LearningApacheSpark with MIT License
def to_arrow_type(dt):
    """ Convert Spark data type to pyarrow type
    """
    from distutils.version import LooseVersion
    import pyarrow as pa
    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        # TODO: remove version check once minimum pyarrow version is 0.10.0
        if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt) +
                            "\nPlease install pyarrow >= 0.10.0 for BinaryType support.")
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        arrow_type = pa.timestamp('us', tz='UTC')
    elif type(dt) == ArrayType:
        if type(dt.elementType) == TimestampType:
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
        arrow_type = pa.list_(to_arrow_type(dt.elementType))
    else:
        raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
    return arrow_type 
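A usage sketch, assuming a pyspark installation; FloatType maps to pa.float32(), and arrays of it to a list of float:

from pyspark.sql.types import ArrayType, FloatType

print(to_arrow_type(FloatType()))             # float
print(to_arrow_type(ArrayType(FloatType())))  # list<item: float>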
Example #15
Source File: test_unischema.py    From petastorm with Apache License 2.0
def test_arrow_schema_convertion():
    fields = [
        pa.field('string', pa.string()),
        pa.field('int8', pa.int8()),
        pa.field('int16', pa.int16()),
        pa.field('int32', pa.int32()),
        pa.field('int64', pa.int64()),
        pa.field('float', pa.float32()),
        pa.field('double', pa.float64()),
        pa.field('bool', pa.bool_(), False),
        pa.field('fixed_size_binary', pa.binary(10)),
        pa.field('variable_size_binary', pa.binary()),
        pa.field('decimal', pa.decimal128(3, 4)),
        pa.field('timestamp_s', pa.timestamp('s')),
        pa.field('timestamp_ns', pa.timestamp('ns')),
        pa.field('date_32', pa.date32()),
        pa.field('date_64', pa.date64())
    ]
    arrow_schema = pa.schema(fields)

    mock_dataset = _mock_parquet_dataset([], arrow_schema)

    unischema = Unischema.from_arrow_schema(mock_dataset)
    for name in arrow_schema.names:
        assert getattr(unischema, name).name == name
        assert getattr(unischema, name).codec is None

        if name == 'bool':
            assert not getattr(unischema, name).nullable
        else:
            assert getattr(unischema, name).nullable

    # Test schema preserve fields order
    field_name_list = [f.name for f in fields]
    assert list(unischema.fields.keys()) == field_name_list 
Example #16
Source File: test_unischema.py    From petastorm with Apache License 2.0
def test_make_named_tuple():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_scalar', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('int32_scalar', np.int32, (), ScalarCodec(ShortType()), False),
        UnischemaField('uint8_scalar', np.uint8, (), ScalarCodec(ShortType()), False),
        UnischemaField('int32_matrix', np.float32, (10, 20, 3), NdarrayCodec(), True),
        UnischemaField('decimal_scalar', Decimal, (10, 20, 3), ScalarCodec(DecimalType(10, 9)), False),
    ])

    TestSchema.make_namedtuple(string_scalar='abc', int32_scalar=10, uint8_scalar=20,
                               int32_matrix=np.int32((10, 20, 3)), decimal_scalar=Decimal(123) / Decimal(10))

    TestSchema.make_namedtuple(string_scalar=None, int32_scalar=10, uint8_scalar=20,
                               int32_matrix=None, decimal_scalar=Decimal(123) / Decimal(10)) 
Example #17
Source File: test_unischema.py    From petastorm with Apache License 2.0
def test_dict_to_spark_row_field_validation_ndarrays():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('tensor3d', np.float32, (10, 20, 30), NdarrayCodec(), False),
    ])

    assert isinstance(dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((10, 20, 30), dtype=np.float32)}), Row)

    # Null value into not nullable field
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'tensor3d': None}), Row)

    # Wrong dimensions
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((1, 2, 3), dtype=np.float32)}), Row)
Example #18
Source File: test_parquet.py    From spectrify with MIT License
def setUp(self):
        self.sa_meta = sa.MetaData()
        self.data = [
            [17.124, 1.12, 3.14, 13.37],
            [1, 2, 3, 4],
            [1, 2, 3, 4],
            [1, 2, 3, 4],
            [True, None, False, True],
            ['string 1', 'string 2', None, 'string 3'],
            [datetime(2007, 7, 13, 1, 23, 34, 123456),
             None,
             datetime(2006, 1, 13, 12, 34, 56, 432539),
             datetime(2010, 8, 13, 5, 46, 57, 437699), ],
            ["Test Text", "Some#More#Test#  Text", "!@#$%%^&*&", None],
        ]
        self.table = sa.Table(
            'unit_test_table',
            self.sa_meta,
            sa.Column('real_col', sa.REAL),
            sa.Column('bigint_col', sa.BIGINT),
            sa.Column('int_col', sa.INTEGER),
            sa.Column('smallint_col', sa.SMALLINT),
            sa.Column('bool_col', sa.BOOLEAN),
            sa.Column('str_col', sa.VARCHAR),
            sa.Column('timestamp_col', sa.TIMESTAMP),
            sa.Column('plaintext_col', sa.TEXT),
        )

        self.expected_datatypes = [
            pa.float32(),
            pa.int64(),
            pa.int32(),
            pa.int16(),
            pa.bool_(),
            pa.string(),
            pa.timestamp('ns'),
            pa.string(),
        ] 
Example #19
Source File: csv2parquet.py    From csv2parquet with Apache License 2.0
def get_pyarrow_types():
    return {
        'bool': PA_BOOL,
        'float32': PA_FLOAT32,
        'float64': PA_FLOAT64,
        'int8': PA_INT8,
        'int16': PA_INT16,
        'int32': PA_INT32,
        'int64': PA_INT64,
        'string': PA_STRING,
        'timestamp': PA_TIMESTAMP,
        'base64': PA_BINARY
    }

# pylint: disable=too-many-branches,too-many-statements 
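The PA_* names are module-level constants defined elsewhere in csv2parquet; presumably they are bound to the corresponding pyarrow type singletons, roughly as in this sketch (the exact timestamp unit is an assumption):

import pyarrow as pa

PA_BOOL = pa.bool_()
PA_FLOAT32 = pa.float32()
PA_FLOAT64 = pa.float64()
PA_INT8 = pa.int8()
PA_INT16 = pa.int16()
PA_INT32 = pa.int32()
PA_INT64 = pa.int64()
PA_STRING = pa.string()
PA_TIMESTAMP = pa.timestamp('ns')  # unit not shown in the snippet; 'ns' is a guess
PA_BINARY = pa.binary()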
Example #20
Source File: client.py    From json2parquet with MIT License
def _convert_data_with_schema(data, schema, date_format=None, field_aliases=None):
    column_data = {}
    array_data = []
    schema_names = []
    for row in data:
        for column in schema.names:
            _col = column_data.get(column, [])
            _col.append(row.get(column))
            column_data[column] = _col
    for column in schema:
        _col = column_data.get(column.name)
        if isinstance(column.type, pa.lib.TimestampType):
            _converted_col = []
            for t in _col:
                try:
                    _converted_col.append(pd.to_datetime(t, format=date_format))
                except pd._libs.tslib.OutOfBoundsDatetime:
                    _converted_col.append(pd.Timestamp.max)
            array_data.append(pa.Array.from_pandas(pd.to_datetime(_converted_col), type=pa.timestamp('ns')))
        elif column.type.id == pa.date32().id:
            _converted_col = map(_date_converter, _col)
            array_data.append(pa.array(_converted_col, type=pa.date32()))
        # Float types are ambiguous for conversions, need to specify the exact type
        elif column.type.id == pa.float64().id:
            array_data.append(pa.array(_col, type=pa.float64()))
        elif column.type.id == pa.float32().id:
            # Python doesn't have a native float32 type
            # and PyArrow cannot cast float64 -> float32
            _col = pd.to_numeric(_col, downcast='float')
            array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
        elif column.type.id == pa.int32().id:
            # PyArrow 0.8.0 can cast int64 -> int32
            _col64 = pa.array(_col, type=pa.int64())
            array_data.append(_col64.cast(pa.int32()))
        elif column.type.id == pa.bool_().id:
            _col = map(_boolean_converter, _col)
            array_data.append(pa.array(_col, type=column.type))
        else:
            array_data.append(pa.array(_col, type=column.type))
        if isinstance(field_aliases, dict):
            schema_names.append(field_aliases.get(column.name, column.name))
        else:
            schema_names.append(column.name)
    return pa.RecordBatch.from_arrays(array_data, schema_names) 
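A usage sketch with a hypothetical float32 field; such a column is routed through the downcast branch above (assuming the module's own imports, pa and pd, are in scope):

import pyarrow as pa

schema = pa.schema([('price', pa.float32())])
data = [{'price': 1.25}, {'price': 2.5}]
batch = _convert_data_with_schema(data, schema)
print(batch.schema.field('price').type)  # float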
Example #21
Source File: epacems_to_parquet.py    From pudl with MIT License
def create_cems_schema():
    """Make an explicit Arrow schema for the EPA CEMS data.

    Make changes in the types of the generated parquet files by editing this
    function.

    Note that parquet's internal representation doesn't use unsigned numbers or
    16-bit ints, so just keep things simple here and always use int32 and
    float32.

    Returns:
        pyarrow.schema: An Arrow schema for the EPA CEMS data.

    """
    int_nullable = partial(pa.field, type=pa.int32(), nullable=True)
    int_not_null = partial(pa.field, type=pa.int32(), nullable=False)
    str_not_null = partial(pa.field, type=pa.string(), nullable=False)
    # Timestamp resolution is hourly, but millisecond is the coarsest unit
    # Parquet allows.
    timestamp = partial(pa.field, type=pa.timestamp(
        "ms", tz="utc"), nullable=False)
    float_nullable = partial(pa.field, type=pa.float32(), nullable=True)
    float_not_null = partial(pa.field, type=pa.float32(), nullable=False)
    # (float32 can accurately hold integers up to 16,777,216 so no need for
    # float64)
    dict_nullable = partial(
        pa.field,
        type=pa.dictionary(pa.int8(), pa.string(), ordered=False),
        nullable=True
    )
    return pa.schema([
        int_not_null("year"),
        dict_nullable("state"),
        int_not_null("plant_id_eia"),
        str_not_null("unitid"),
        timestamp("operating_datetime_utc"),
        float_nullable("operating_time_hours"),
        float_not_null("gross_load_mw"),
        float_nullable("steam_load_1000_lbs"),
        float_nullable("so2_mass_lbs"),
        dict_nullable("so2_mass_measurement_code"),
        float_nullable("nox_rate_lbs_mmbtu"),
        dict_nullable("nox_rate_measurement_code"),
        float_nullable("nox_mass_lbs"),
        dict_nullable("nox_mass_measurement_code"),
        float_nullable("co2_mass_tons"),
        dict_nullable("co2_mass_measurement_code"),
        float_not_null("heat_content_mmbtu"),
        int_nullable("facility_id"),
        int_nullable("unit_id_epa"),
    ]) 
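A quick spot-check of the resulting schema, e.g. confirming which columns come out as float32 (a sketch):

schema = create_cems_schema()
assert schema.field("gross_load_mw").type == pa.float32()
assert not schema.field("gross_load_mw").nullable
assert schema.field("so2_mass_lbs").nullable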
Example #22
Source File: epacems_to_parquet.py    From pudl with MIT License
def create_in_dtypes():
    """
    Create a dictionary of input data types.

    This specifies the dtypes of the input columns, which is necessary for some
    cases where, e.g., a column is always NaN.

    Returns:
        dict: mapping columns names to :mod:`pandas` data types.

    """
    # These measurement codes are used by all four of our measurement variables
    common_codes = (
        "LME",
        "Measured",
        "Measured and Substitute",
        "Other",
        "Substitute",
        "Undetermined",
        "Unknown Code",
        "",
    )
    co2_so2_cats = pd.CategoricalDtype(categories=common_codes, ordered=False)
    nox_cats = pd.CategoricalDtype(
        categories=common_codes + ("Calculated",), ordered=False
    )
    state_cats = pd.CategoricalDtype(
        categories=pc.cems_states.keys(), ordered=False)
    in_dtypes = {
        "state": state_cats,
        "plant_id_eia": "int32",
        "unitid": pd.StringDtype(),
        # "operating_datetime_utc": "datetime",
        "operating_time_hours": "float32",
        "gross_load_mw": "float32",
        "steam_load_1000_lbs": "float32",
        "so2_mass_lbs": "float32",
        "so2_mass_measurement_code": co2_so2_cats,
        "nox_rate_lbs_mmbtu": "float32",
        "nox_rate_measurement_code": nox_cats,
        "nox_mass_lbs": "float32",
        "nox_mass_measurement_code": nox_cats,
        "co2_mass_tons": "float32",
        "co2_mass_measurement_code": co2_so2_cats,
        "heat_content_mmbtu": "float32",
        "facility_id": pd.Int32Dtype(),
        "unit_id_epa": pd.Int32Dtype(),
    }
    return in_dtypes 
Example #23
Source File: batch_util_test.py    From data-validation with Apache License 2.0
def test_batch_examples(self):
    examples = [
        {
            'a': np.array([1.0, 2.0], dtype=np.float32),
            'b': np.array(['a', 'b', 'c', 'e'])
        },
        {
            'a': np.array([3.0, 4.0, 5.0], dtype=np.float32),
        },
        {
            'b': np.array(['d', 'e', 'f']),
            'd': np.array([10, 20, 30], dtype=np.int64),
        },
        {
            'b': np.array(['a', 'b', 'c'])
        },
        {
            'c': np.array(['d', 'e', 'f'])
        }
    ]
    expected_record_batches = [
        pa.RecordBatch.from_arrays([
            pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]], type=pa.list_(
                pa.float32())),
            pa.array([['a', 'b', 'c', 'e'], None])
        ], ['a', 'b']),
        pa.RecordBatch.from_arrays([
            pa.array([['d', 'e', 'f'], ['a', 'b', 'c']]),
            pa.array([[10, 20, 30], None], type=pa.list_(pa.int64()))
        ], ['b', 'd']),
        pa.RecordBatch.from_arrays([pa.array([['d', 'e', 'f']])], ['c']),
    ]

    with beam.Pipeline() as p:
      result = (
          p
          | beam.Create(examples, reshuffle=False)
          | batch_util.BatchExamplesToArrowRecordBatches(desired_batch_size=2))
      util.assert_that(
          result,
          test_util.make_arrow_record_batches_equal_fn(self,
                                                       expected_record_batches))