Python pyarrow.float32() Examples
The following are 23 code examples of pyarrow.float32().
Follow the links above each example to go to the original project or source file.
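As a quick orientation before the examples: pyarrow.float32() returns a pyarrow.DataType describing single-precision floats, which can be passed anywhere a type is expected (arrays, fields, schemas). A minimal sketch:

import pyarrow as pa

t = pa.float32()                          # DataType for 32-bit floats
arr = pa.array([1.0, 2.5, None], type=t)  # FloatArray with one null
field = pa.field("price", t)              # a named, typed field
schema = pa.schema([field])               # a one-column schema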
Example #1
Source File: array_util_test.py From tfx-bsl with Apache License 2.0 | 6 votes |
def test_get_flattened_array_parent_indices(self, list_type_factory,
                                            parent_indices_type):
    indices = array_util.GetFlattenedArrayParentIndices(
        pa.array([], type=list_type_factory(pa.int32())))
    self.assertTrue(indices.equals(pa.array([], type=parent_indices_type)))

    indices = array_util.GetFlattenedArrayParentIndices(
        pa.array([[1.], [2.], [], [3., 4.]],
                 type=list_type_factory(pa.float32())))
    self.assertTrue(
        indices.equals(pa.array([0, 1, 3, 3], type=parent_indices_type)))

    indices = array_util.GetFlattenedArrayParentIndices(
        pa.array([[1.], [2.], [], [3., 4.]],
                 type=list_type_factory(pa.float32())).slice(1))
    self.assertTrue(
        indices.equals(pa.array([0, 2, 2], type=parent_indices_type)))

    indices = array_util.GetFlattenedArrayParentIndices(
        pa.array([list(range(1024))], type=list_type_factory(pa.int64())))
    self.assertTrue(
        indices.equals(pa.array([0] * 1024, type=parent_indices_type)))
Example #2
Source File: test_pandas_integration.py From fletcher with MIT License | 6 votes |
def array_chunked_nulls(request):
    case_dict = {
        "all": pa.chunked_array([pa.array([None] * 4) for _ in range(10)]),
        "all_float": pa.chunked_array(
            [pa.array([None] * 4, type=pa.float32()) for _ in range(10)]
        ),
        "some_in_all_chunks": pa.chunked_array(
            [pa.array(["a", "b", None] * 4), pa.array(["a", None, "b"] * 4)]
        ),
        "only_in_some_chunk": pa.chunked_array(
            [
                pa.array(["a", "x"]),
                pa.array(["a", "b", None] * 4),
                pa.array(["a", "b"] * 4),
            ]
        ),
        "none": pa.chunked_array([pa.array(["a", "b"] * 4) for _ in range(10)]),
    }
    return case_dict[request.param]


# ----------------------------------------------------------------------------
# Block Methods
# ----------------------------------------------------------------------------
Example #3
Source File: tf_sequence_example_record_test.py From tfx-bsl with Apache License 2.0 | 6 votes |
def _GetExpectedColumnValues(tfxio):
    if tfxio._can_produce_large_types:
        list_factory = pa.large_list
        bytes_type = pa.large_binary()
    else:
        list_factory = pa.list_
        bytes_type = pa.binary()
    return {
        path.ColumnPath(["int_feature"]):
            pa.array([[1], [2], [3]], type=list_factory(pa.int64())),
        path.ColumnPath(["float_feature"]):
            pa.array([[1, 2, 3, 4], [2, 3, 4, 5], None],
                     type=list_factory(pa.float32())),
        path.ColumnPath([_SEQUENCE_COLUMN_NAME, "int_feature"]):
            pa.array([[[1, 2], [3]], None, [[4]]],
                     list_factory(list_factory(pa.int64()))),
        path.ColumnPath([_SEQUENCE_COLUMN_NAME, "string_feature"]):
            pa.array([None, [[b"foo", b"bar"], []], [[b"baz"]]],
                     list_factory(list_factory(bytes_type)))
    }
Example #4
Source File: test_integration.py From pymapd with Apache License 2.0 | 6 votes |
def test_load_empty_table_arrow(self, con):
    con.execute("drop table if exists baz;")
    con.execute("create table baz (a int, b float, c text);")
    data = [(1, 1.1, 'a'), (2, 2.2, '2'), (3, 3.3, '3')]
    df = pd.DataFrame(data, columns=list('abc')).astype(
        {'a': 'int32', 'b': 'float32'}
    )
    table = pa.Table.from_pandas(df, preserve_index=False)
    con.load_table("baz", table, method='arrow')
    result = sorted(con.execute("select * from baz"))
    self.check_empty_insert(result, data)
    con.execute("drop table if exists baz;")
Example #5
Source File: test_integration.py From pymapd with Apache License 2.0 | 5 votes |
def test_select_ipc_gpu(self, con, query, parameters):
    from cudf.core.dataframe import DataFrame

    c = con.cursor()
    c.execute('drop table if exists stocks;')
    create = (
        'create table stocks (date_ text, trans text, symbol text, '
        'qty int, price float, vol float);'
    )
    c.execute(create)
    i1 = "INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14,1.1);"  # noqa
    i2 = "INSERT INTO stocks VALUES ('2006-01-05','BUY','GOOG',100,12.14,1.2);"  # noqa
    c.execute(i1)
    c.execute(i2)

    result = con.select_ipc_gpu("select qty, price from stocks")
    assert isinstance(result, DataFrame)

    dtypes = dict(qty=np.int32, price=np.float32)
    expected = pd.DataFrame(
        [[100, 35.14], [100, 12.14]], columns=['qty', 'price']
    ).astype(dtypes)

    result = result.to_pandas()[['qty', 'price']]  # column order
    pd.testing.assert_frame_equal(result, expected)
    c.execute('drop table if exists stocks;')
Example #6
Source File: array_util_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def test_element_lengths_list_array(self, list_type_factory):
    list_lengths = array_util.GetElementLengths(
        pa.array([], type=list_type_factory(pa.int64())))
    self.assertTrue(list_lengths.equals(pa.array([], type=pa.int64())))

    list_lengths = array_util.GetElementLengths(
        pa.array([[1., 2.], [], [3.]], list_type_factory(pa.float32())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64())))

    list_lengths = array_util.GetElementLengths(
        pa.array([[1., 2.], None, [3.]], list_type_factory(pa.float64())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64())))
Example #7
Source File: array_util_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def test_list_lengths(self, list_type_factory):
    list_lengths = array_util.ListLengthsFromListArray(
        pa.array([], type=list_type_factory(pa.int64())))
    self.assertTrue(list_lengths.equals(pa.array([], type=pa.int64())))

    list_lengths = array_util.ListLengthsFromListArray(
        pa.array([[1., 2.], [], [3.]], type=list_type_factory(pa.float32())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64())))

    list_lengths = array_util.ListLengthsFromListArray(
        pa.array([[1., 2.], None, [3.]], type=list_type_factory(pa.float64())))
    self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64())))
Example #8
Source File: types.py From cjworkbench with GNU Affero General Public License v3.0 | 5 votes |
def _dtype_to_arrow_type(dtype: np.dtype) -> pyarrow.DataType:
    if dtype == np.int8:
        return pyarrow.int8()
    elif dtype == np.int16:
        return pyarrow.int16()
    elif dtype == np.int32:
        return pyarrow.int32()
    elif dtype == np.int64:
        return pyarrow.int64()
    elif dtype == np.uint8:
        return pyarrow.uint8()
    elif dtype == np.uint16:
        return pyarrow.uint16()
    elif dtype == np.uint32:
        return pyarrow.uint32()
    elif dtype == np.uint64:
        return pyarrow.uint64()
    elif dtype == np.float16:
        return pyarrow.float16()
    elif dtype == np.float32:
        return pyarrow.float32()
    elif dtype == np.float64:
        return pyarrow.float64()
    elif dtype.kind == "M":
        # [2019-09-17] Pandas only allows "ns" unit -- as in, datetime64[ns]
        # https://github.com/pandas-dev/pandas/issues/7307#issuecomment-224180563
        assert dtype.str.endswith("[ns]")
        return pyarrow.timestamp(unit="ns", tz=None)
    elif dtype == np.object_:
        return pyarrow.string()
    else:
        raise RuntimeError("Unhandled dtype %r" % dtype)
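Under the mapping above, a NumPy float32 dtype resolves to pyarrow.float32(), and a nanosecond datetime64 resolves to a timezone-naive nanosecond timestamp. A hypothetical check (not part of the original source):

import numpy as np
import pyarrow

# Illustrative assertions against the function defined above
assert _dtype_to_arrow_type(np.dtype("float32")) == pyarrow.float32()
assert _dtype_to_arrow_type(np.dtype("datetime64[ns]")) == pyarrow.timestamp("ns", tz=None)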
Example #9
Source File: parquet_pio.py From sdc with BSD 2-Clause "Simplified" License | 5 votes |
def _get_numba_typ_from_pa_typ(pa_typ):
    import pyarrow as pa
    _typ_map = {
        # boolean
        pa.bool_(): types.bool_,
        # signed int types
        pa.int8(): types.int8,
        pa.int16(): types.int16,
        pa.int32(): types.int32,
        pa.int64(): types.int64,
        # unsigned int types
        pa.uint8(): types.uint8,
        pa.uint16(): types.uint16,
        pa.uint32(): types.uint32,
        pa.uint64(): types.uint64,
        # float types (TODO: float16?)
        pa.float32(): types.float32,
        pa.float64(): types.float64,
        # String
        pa.string(): string_type,
        # date
        pa.date32(): types.NPDatetime('ns'),
        pa.date64(): types.NPDatetime('ns'),
        # time (TODO: time32, time64, ...)
        pa.timestamp('ns'): types.NPDatetime('ns'),
        pa.timestamp('us'): types.NPDatetime('ns'),
        pa.timestamp('ms'): types.NPDatetime('ns'),
        pa.timestamp('s'): types.NPDatetime('ns'),
    }
    if pa_typ not in _typ_map:
        raise ValueError("Arrow data type {} not supported yet".format(pa_typ))
    return _typ_map[pa_typ]
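Given the lookup table above, pa.float32() resolves to the corresponding Numba type, and any Arrow type missing from _typ_map raises ValueError. A hypothetical check against the function as defined:

import pyarrow as pa

assert _get_numba_typ_from_pa_typ(pa.float32()) == types.float32
try:
    _get_numba_typ_from_pa_typ(pa.list_(pa.float32()))  # list types are not in _typ_map
except ValueError as err:
    print(err)  # "Arrow data type ... not supported yet"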
Example #10
Source File: test_integration.py From pymapd with Apache License 2.0 | 5 votes |
def test_load_table_creates(self, con):
    data = pd.DataFrame(
        {
            "boolean_": [True, False],
            "smallint_cast": np.array([0, 1], dtype=np.int8),
            "smallint_": np.array([0, 1], dtype=np.int16),
            "int_": np.array([0, 1], dtype=np.int32),
            "bigint_": np.array([0, 1], dtype=np.int64),
            "float_": np.array([0, 1], dtype=np.float32),
            "double_": np.array([0, 1], dtype=np.float64),
            "varchar_": ["a", "b"],
            "text_": ['a', 'b'],
            "time_": [datetime.time(0, 11, 59), datetime.time(13)],
            "timestamp_": [pd.Timestamp("2016"), pd.Timestamp("2017")],
            "date_": [
                datetime.date(2016, 1, 1),
                datetime.date(2017, 1, 1),
            ],
        },
        columns=[
            'boolean_',
            'smallint_',
            'int_',
            'bigint_',
            'float_',
            'double_',
            'varchar_',
            'text_',
            'time_',
            'timestamp_',
            'date_',
        ],
    )
    con.execute("drop table if exists test_load_table_creates;")
    con.load_table("test_load_table_creates", data, create=True)
    con.execute("drop table if exists test_load_table_creates;")
Example #11
Source File: test_integration.py From pymapd with Apache License 2.0 | 5 votes |
def test_load_infer(self, con):
    con.execute("drop table if exists baz;")
    con.execute("create table baz (a int, b float, c text);")
    data = pd.DataFrame(
        {
            'a': np.array([0, 1], dtype=np.int32),
            'b': np.array([1.1, 2.2], dtype=np.float32),
            'c': ['a', 'b'],
        }
    )
    con.load_table("baz", data)
    con.execute("drop table if exists baz;")
Example #12
Source File: test_integration.py From pymapd with Apache License 2.0 | 5 votes |
def test_select_ipc_parametrized(self, con, query, parameters):
    c = con.cursor()
    c.execute('drop table if exists stocks;')
    create = (
        'create table stocks (date_ text, trans text, symbol text, '
        'qty int, price float, vol float);'
    )
    c.execute(create)
    i1 = "INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14,1.1);"  # noqa
    i2 = "INSERT INTO stocks VALUES ('2006-01-05','BUY','GOOG',100,12.14,1.2);"  # noqa
    c.execute(i1)
    c.execute(i2)

    result = con.select_ipc(query, parameters=parameters)
    expected = pd.DataFrame(
        {
            "qty": np.array([100, 100], dtype=np.int32),
            "price": np.array(
                [35.13999938964844, 12.140000343322754], dtype=np.float32
            ),
        }
    )[['qty', 'price']]
    tm.assert_frame_equal(result, expected)
    c.execute('drop table if exists stocks;')
Example #13
Source File: array_util_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _get_numeric_byte_size_test_cases():
    result = []
    for array_type, sizeof in [
        (pa.int8(), 1),
        (pa.uint8(), 1),
        (pa.int16(), 2),
        (pa.uint16(), 2),
        (pa.int32(), 4),
        (pa.uint32(), 4),
        (pa.int64(), 8),
        (pa.uint64(), 8),
        (pa.float32(), 4),
        (pa.float64(), 8),
    ]:
        result.append(
            dict(
                testcase_name=str(array_type),
                array=pa.array(range(9), type=array_type),
                slice_offset=2,
                slice_length=3,
                expected_size=(_all_false_null_bitmap_size(2) + sizeof * 9),
                expected_sliced_size=(_all_false_null_bitmap_size(1) + sizeof * 3)))
    return result
Example #14
Source File: types.py From LearningApacheSpark with MIT License | 5 votes |
def to_arrow_type(dt):
    """ Convert Spark data type to pyarrow type """
    from distutils.version import LooseVersion
    import pyarrow as pa
    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        # TODO: remove version check once minimum pyarrow version is 0.10.0
        if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt) +
                            "\nPlease install pyarrow >= 0.10.0 for BinaryType support.")
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        arrow_type = pa.timestamp('us', tz='UTC')
    elif type(dt) == ArrayType:
        if type(dt.elementType) == TimestampType:
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
        arrow_type = pa.list_(to_arrow_type(dt.elementType))
    else:
        raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
    return arrow_type
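A minimal illustration of the Spark-to-Arrow mapping above, assuming FloatType and ArrayType are the Spark SQL type classes already available in the surrounding module (not shown here):

# Illustrative only; follows directly from the branches in to_arrow_type
assert to_arrow_type(FloatType()) == pa.float32()
assert to_arrow_type(ArrayType(FloatType())) == pa.list_(pa.float32())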
Example #15
Source File: test_unischema.py From petastorm with Apache License 2.0 | 5 votes |
def test_arrow_schema_convertion():
    fields = [
        pa.field('string', pa.string()),
        pa.field('int8', pa.int8()),
        pa.field('int16', pa.int16()),
        pa.field('int32', pa.int32()),
        pa.field('int64', pa.int64()),
        pa.field('float', pa.float32()),
        pa.field('double', pa.float64()),
        pa.field('bool', pa.bool_(), False),
        pa.field('fixed_size_binary', pa.binary(10)),
        pa.field('variable_size_binary', pa.binary()),
        pa.field('decimal', pa.decimal128(3, 4)),
        pa.field('timestamp_s', pa.timestamp('s')),
        pa.field('timestamp_ns', pa.timestamp('ns')),
        pa.field('date_32', pa.date32()),
        pa.field('date_64', pa.date64())
    ]
    arrow_schema = pa.schema(fields)

    mock_dataset = _mock_parquet_dataset([], arrow_schema)

    unischema = Unischema.from_arrow_schema(mock_dataset)
    for name in arrow_schema.names:
        assert getattr(unischema, name).name == name
        assert getattr(unischema, name).codec is None

        if name == 'bool':
            assert not getattr(unischema, name).nullable
        else:
            assert getattr(unischema, name).nullable

    # Test schema preserve fields order
    field_name_list = [f.name for f in fields]
    assert list(unischema.fields.keys()) == field_name_list
Example #16
Source File: test_unischema.py From petastorm with Apache License 2.0 | 5 votes |
def test_make_named_tuple():
    TestSchema = Unischema('TestSchema', [
        UnischemaField('string_scalar', np.string_, (), ScalarCodec(StringType()), True),
        UnischemaField('int32_scalar', np.int32, (), ScalarCodec(ShortType()), False),
        UnischemaField('uint8_scalar', np.uint8, (), ScalarCodec(ShortType()), False),
        UnischemaField('int32_matrix', np.float32, (10, 20, 3), NdarrayCodec(), True),
        UnischemaField('decimal_scalar', Decimal, (10, 20, 3), ScalarCodec(DecimalType(10, 9)), False),
    ])

    TestSchema.make_namedtuple(string_scalar='abc', int32_scalar=10, uint8_scalar=20,
                               int32_matrix=np.int32((10, 20, 3)),
                               decimal_scalar=Decimal(123) / Decimal(10))

    TestSchema.make_namedtuple(string_scalar=None, int32_scalar=10, uint8_scalar=20,
                               int32_matrix=None,
                               decimal_scalar=Decimal(123) / Decimal(10))
Example #17
Source File: test_unischema.py From petastorm with Apache License 2.0 | 5 votes |
def test_dict_to_spark_row_field_validation_ndarrays():
    """Test various validations done on data types when converting a dictionary to a spark row"""
    TestSchema = Unischema('TestSchema', [
        UnischemaField('tensor3d', np.float32, (10, 20, 30), NdarrayCodec(), False),
    ])

    assert isinstance(
        dict_to_spark_row(TestSchema, {'tensor3d': np.zeros((10, 20, 30), dtype=np.float32)}), Row)

    # Null value into not nullable field
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': None}), Row)

    # Wrong dimensions
    with pytest.raises(ValueError):
        isinstance(dict_to_spark_row(TestSchema, {'string_field': np.zeros((1, 2, 3), dtype=np.float32)}), Row)
Example #18
Source File: test_parquet.py From spectrify with MIT License | 5 votes |
def setUp(self):
    self.sa_meta = sa.MetaData()
    self.data = [
        [17.124, 1.12, 3.14, 13.37],
        [1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4],
        [True, None, False, True],
        ['string 1', 'string 2', None, 'string 3'],
        [datetime(2007, 7, 13, 1, 23, 34, 123456),
         None,
         datetime(2006, 1, 13, 12, 34, 56, 432539),
         datetime(2010, 8, 13, 5, 46, 57, 437699),
         ],
        ["Test Text", "Some#More#Test# Text", "!@#$%%^&*&", None],
    ]
    self.table = sa.Table(
        'unit_test_table',
        self.sa_meta,
        sa.Column('real_col', sa.REAL),
        sa.Column('bigint_col', sa.BIGINT),
        sa.Column('int_col', sa.INTEGER),
        sa.Column('smallint_col', sa.SMALLINT),
        sa.Column('bool_col', sa.BOOLEAN),
        sa.Column('str_col', sa.VARCHAR),
        sa.Column('timestamp_col', sa.TIMESTAMP),
        sa.Column('plaintext_col', sa.TEXT),
    )
    self.expected_datatypes = [
        pa.float32(),
        pa.int64(),
        pa.int32(),
        pa.int16(),
        pa.bool_(),
        pa.string(),
        pa.timestamp('ns'),
        pa.string(),
    ]
Example #19
Source File: csv2parquet.py From csv2parquet with Apache License 2.0 | 5 votes |
def get_pyarrow_types():
    return {
        'bool': PA_BOOL,
        'float32': PA_FLOAT32,
        'float64': PA_FLOAT64,
        'int8': PA_INT8,
        'int16': PA_INT16,
        'int32': PA_INT32,
        'int64': PA_INT64,
        'string': PA_STRING,
        'timestamp': PA_TIMESTAMP,
        'base64': PA_BINARY
    }

# pylint: disable=too-many-branches,too-many-statements
Example #20
Source File: client.py From json2parquet with MIT License | 4 votes |
def _convert_data_with_schema(data, schema, date_format=None, field_aliases=None):
    column_data = {}
    array_data = []
    schema_names = []
    for row in data:
        for column in schema.names:
            _col = column_data.get(column, [])
            _col.append(row.get(column))
            column_data[column] = _col
    for column in schema:
        _col = column_data.get(column.name)
        if isinstance(column.type, pa.lib.TimestampType):
            _converted_col = []
            for t in _col:
                try:
                    _converted_col.append(pd.to_datetime(t, format=date_format))
                except pd._libs.tslib.OutOfBoundsDatetime:
                    _converted_col.append(pd.Timestamp.max)
            array_data.append(pa.Array.from_pandas(pd.to_datetime(_converted_col),
                                                   type=pa.timestamp('ns')))
        elif column.type.id == pa.date32().id:
            _converted_col = map(_date_converter, _col)
            array_data.append(pa.array(_converted_col, type=pa.date32()))
        # Float types are ambiguous for conversions, need to specify the exact type
        elif column.type.id == pa.float64().id:
            array_data.append(pa.array(_col, type=pa.float64()))
        elif column.type.id == pa.float32().id:
            # Python doesn't have a native float32 type
            # and PyArrow cannot cast float64 -> float32
            _col = pd.to_numeric(_col, downcast='float')
            array_data.append(pa.Array.from_pandas(_col, type=pa.float32()))
        elif column.type.id == pa.int32().id:
            # PyArrow 0.8.0 can cast int64 -> int32
            _col64 = pa.array(_col, type=pa.int64())
            array_data.append(_col64.cast(pa.int32()))
        elif column.type.id == pa.bool_().id:
            _col = map(_boolean_converter, _col)
            array_data.append(pa.array(_col, type=column.type))
        else:
            array_data.append(pa.array(_col, type=column.type))
        if isinstance(field_aliases, dict):
            schema_names.append(field_aliases.get(column.name, column.name))
        else:
            schema_names.append(column.name)
    return pa.RecordBatch.from_arrays(array_data, schema_names)
Example #21
Source File: epacems_to_parquet.py From pudl with MIT License | 4 votes |
def create_cems_schema():
    """Make an explicit Arrow schema for the EPA CEMS data.

    Make changes in the types of the generated parquet files by editing this
    function.

    Note that parquet's internal representation doesn't use unsigned numbers or
    16-bit ints, so just keep things simple here and always use int32 and
    float32.

    Returns:
        pyarrow.schema: An Arrow schema for the EPA CEMS data.
    """
    int_nullable = partial(pa.field, type=pa.int32(), nullable=True)
    int_not_null = partial(pa.field, type=pa.int32(), nullable=False)
    str_not_null = partial(pa.field, type=pa.string(), nullable=False)
    # Timestamp resolution is hourly, but millisecond is the largest allowed.
    timestamp = partial(pa.field, type=pa.timestamp(
        "ms", tz="utc"), nullable=False)
    float_nullable = partial(pa.field, type=pa.float32(), nullable=True)
    float_not_null = partial(pa.field, type=pa.float32(), nullable=False)
    # (float32 can accurately hold integers up to 16,777,216 so no need for
    # float64)
    dict_nullable = partial(
        pa.field,
        type=pa.dictionary(pa.int8(), pa.string(), ordered=False),
        nullable=True
    )
    return pa.schema([
        int_not_null("year"),
        dict_nullable("state"),
        int_not_null("plant_id_eia"),
        str_not_null("unitid"),
        timestamp("operating_datetime_utc"),
        float_nullable("operating_time_hours"),
        float_not_null("gross_load_mw"),
        float_nullable("steam_load_1000_lbs"),
        float_nullable("so2_mass_lbs"),
        dict_nullable("so2_mass_measurement_code"),
        float_nullable("nox_rate_lbs_mmbtu"),
        dict_nullable("nox_rate_measurement_code"),
        float_nullable("nox_mass_lbs"),
        dict_nullable("nox_mass_measurement_code"),
        float_nullable("co2_mass_tons"),
        dict_nullable("co2_mass_measurement_code"),
        float_not_null("heat_content_mmbtu"),
        int_nullable("facility_id"),
        int_nullable("unit_id_epa"),
    ])
Example #22
Source File: epacems_to_parquet.py From pudl with MIT License | 4 votes |
def create_in_dtypes():
    """
    Create a dictionary of input data types.

    This specifies the dtypes of the input columns, which is necessary for
    some cases where, e.g., a column is always NaN.

    Returns:
        dict: mapping columns names to :mod:`pandas` data types.
    """
    # These measurement codes are used by all four of our measurement variables
    common_codes = (
        "LME",
        "Measured",
        "Measured and Substitute",
        "Other",
        "Substitute",
        "Undetermined",
        "Unknown Code",
        "",
    )
    co2_so2_cats = pd.CategoricalDtype(categories=common_codes, ordered=False)
    nox_cats = pd.CategoricalDtype(
        categories=common_codes + ("Calculated",), ordered=False
    )
    state_cats = pd.CategoricalDtype(
        categories=pc.cems_states.keys(), ordered=False)
    in_dtypes = {
        "state": state_cats,
        "plant_id_eia": "int32",
        "unitid": pd.StringDtype(),
        # "operating_datetime_utc": "datetime",
        "operating_time_hours": "float32",
        "gross_load_mw": "float32",
        "steam_load_1000_lbs": "float32",
        "so2_mass_lbs": "float32",
        "so2_mass_measurement_code": co2_so2_cats,
        "nox_rate_lbs_mmbtu": "float32",
        "nox_rate_measurement_code": nox_cats,
        "nox_mass_lbs": "float32",
        "nox_mass_measurement_code": nox_cats,
        "co2_mass_tons": "float32",
        "co2_mass_measurement_code": co2_so2_cats,
        "heat_content_mmbtu": "float32",
        "facility_id": pd.Int32Dtype(),
        "unit_id_epa": pd.Int32Dtype(),
    }
    return in_dtypes
Example #23
Source File: batch_util_test.py From data-validation with Apache License 2.0 | 4 votes |
def test_batch_examples(self):
    examples = [
        {
            'a': np.array([1.0, 2.0], dtype=np.float32),
            'b': np.array(['a', 'b', 'c', 'e'])
        },
        {
            'a': np.array([3.0, 4.0, 5.0], dtype=np.float32),
        },
        {
            'b': np.array(['d', 'e', 'f']),
            'd': np.array([10, 20, 30], dtype=np.int64),
        },
        {
            'b': np.array(['a', 'b', 'c'])
        },
        {
            'c': np.array(['d', 'e', 'f'])
        }
    ]
    expected_record_batches = [
        pa.RecordBatch.from_arrays([
            pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]], type=pa.list_(pa.float32())),
            pa.array([['a', 'b', 'c', 'e'], None])
        ], ['a', 'b']),
        pa.RecordBatch.from_arrays([
            pa.array([['d', 'e', 'f'], ['a', 'b', 'c']]),
            pa.array([[10, 20, 30], None], type=pa.list_(pa.int64()))
        ], ['b', 'd']),
        pa.RecordBatch.from_arrays([pa.array([['d', 'e', 'f']])], ['c']),
    ]

    with beam.Pipeline() as p:
        result = (
            p
            | beam.Create(examples, reshuffle=False)
            | batch_util.BatchExamplesToArrowRecordBatches(desired_batch_size=2))
        util.assert_that(
            result,
            test_util.make_arrow_record_batches_equal_fn(self,
                                                         expected_record_batches))