Python pyarrow.binary() Examples
The following are 22 code examples of pyarrow.binary(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all other available functions and classes of the module pyarrow.
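Before working through the examples, it helps to know what pyarrow.binary() returns: called with no argument it gives the variable-length binary data type, while pyarrow.binary(length) gives a fixed-size binary type (both forms appear in the examples below). Here is a minimal sketch of typical usage; the byte values, field names, and variable names are illustrative only and not taken from any of the projects below.

import pyarrow as pa

# Variable-length binary: each value may contain a different number of bytes.
var_binary = pa.binary()
# Fixed-size binary: every value must be exactly 16 bytes (illustrative width).
fixed_binary = pa.binary(16)

# Build an array of raw bytes; None becomes a null entry.
payloads = pa.array([b"\x00\x01", b"abc", None], type=var_binary)
assert pa.types.is_binary(payloads.type)

# The type can also be used when declaring fields and schemas.
schema = pa.schema([pa.field("payload", var_binary)])

Several of the examples below also use pa.large_binary(), which stores 64-bit offsets instead of 32-bit ones and is therefore preferred when a single column chunk may exceed 2 GB of binary data.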
Example #1
Source File: test_unit_arrow_chunk_iterator.py From snowflake-connector-python with Apache License 2.0 | 6 votes |
def test_iterate_over_binary_chunk():
    random.seed(datetime.datetime.now())
    column_meta = {
        "byteLength": "100",
        "logicalType": "BINARY",
        "precision": "0",
        "scale": "0",
        "charLength": "0"
    }

    def byte_array_generator():
        return bytearray(os.urandom(1000))

    iterate_over_test_chunk([pyarrow.binary(), pyarrow.binary()],
                            [column_meta, column_meta],
                            byte_array_generator)
Example #2
Source File: tensor_adapter_test.py From tfx-bsl with Apache License 2.0 | 6 votes |
def testRaggedTensorStructTypeNonLeaf(self):
  tensor_representation = text_format.Parse(
      """
      ragged_tensor {
        feature_path {
          step: "ragged_feature"
        }
      }
      """, schema_pb2.TensorRepresentation())
  record_batch = pa.RecordBatch.from_arrays([
      pa.StructArray.from_arrays([
          pa.array([[1, 2, 3]], pa.list_(pa.int64())),
          pa.array([["a", "b", "c"]], pa.list_(pa.binary()))
      ], ["inner_feature", "x2"])
  ], ["ragged_feature"])
  with self.assertRaisesRegex(ValueError,
                              ".*Unable to handle tensor output.*"):
    tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(
            record_batch.schema, {"output": tensor_representation}))
Example #3
Source File: tensor_adapter_test.py From tfx-bsl with Apache License 2.0 | 6 votes |
def testRaggedTensorStructTypeInvalidSteps(self):
  tensor_representation = text_format.Parse(
      """
      ragged_tensor {
        feature_path {
          step: "ragged_feature"
          step: "wrong_step"
        }
      }
      """, schema_pb2.TensorRepresentation())
  record_batch = pa.RecordBatch.from_arrays([
      pa.StructArray.from_arrays([
          pa.array([[1, 2, 3]], pa.list_(pa.int64())),
          pa.array([["a", "b", "c"]], pa.list_(pa.binary()))
      ], ["inner_feature", "x2"])
  ], ["ragged_feature"])
  with self.assertRaisesRegex(ValueError,
                              ".*Unable to handle tensor output.*"):
    tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(
            record_batch.schema, {"output": tensor_representation}))
Example #4
Source File: tf_sequence_example_record_test.py From tfx-bsl with Apache License 2.0 | 6 votes |
def _GetExpectedColumnValues(tfxio):
  if tfxio._can_produce_large_types:
    list_factory = pa.large_list
    bytes_type = pa.large_binary()
  else:
    list_factory = pa.list_
    bytes_type = pa.binary()
  return {
      path.ColumnPath(["int_feature"]):
          pa.array([[1], [2], [3]], type=list_factory(pa.int64())),
      path.ColumnPath(["float_feature"]):
          pa.array([[1, 2, 3, 4], [2, 3, 4, 5], None],
                   type=list_factory(pa.float32())),
      path.ColumnPath([_SEQUENCE_COLUMN_NAME, "int_feature"]):
          pa.array([[[1, 2], [3]], None, [[4]]],
                   list_factory(list_factory(pa.int64()))),
      path.ColumnPath([_SEQUENCE_COLUMN_NAME, "string_feature"]):
          pa.array([None, [[b"foo", b"bar"], []], [[b"baz"]]],
                   list_factory(list_factory(bytes_type)))
  }
Example #5
Source File: sequence_example_coder_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _test_decode(self, schema_text_proto, sequence_examples_text_proto,
                 create_expected, use_large_types):
  serialized_sequence_examples = [
      text_format.Parse(pbtxt, tf.train.SequenceExample()).SerializeToString()
      for pbtxt in sequence_examples_text_proto
  ]
  serialized_schema = None
  if schema_text_proto is not None:
    serialized_schema = text_format.Parse(
        schema_text_proto, schema_pb2.Schema()).SerializeToString()

  if serialized_schema:
    coder = sequence_example_coder.SequenceExamplesToRecordBatchDecoder(
        _TEST_SEQUENCE_COLUMN_NAME,
        serialized_schema,
        use_large_types=use_large_types)
  else:
    coder = sequence_example_coder.SequenceExamplesToRecordBatchDecoder(
        _TEST_SEQUENCE_COLUMN_NAME, use_large_types=use_large_types)

  result = coder.DecodeBatch(serialized_sequence_examples)
  self.assertIsInstance(result, pa.RecordBatch)
  if use_large_types:
    expected = create_expected(pa.large_list, pa.large_binary())
  else:
    expected = create_expected(pa.list_, pa.binary())
  self.assertTrue(
      result.equals(expected),
      "actual: {}\n expected:{}".format(result, expected))
  if serialized_schema is not None:
    self.assertTrue(coder.ArrowSchema().equals(result.schema))
Example #6
Source File: record_based_tfxio.py From tfx-bsl with Apache License 2.0 | 5 votes |
def CreateRawRecordColumn(
    raw_records: List[bytes], produce_large_types: bool) -> pa.Array:
  """Returns an Array that satisfies the requirement of a raw record column."""
  list_array_factory = (
      pa.LargeListArray.from_arrays
      if produce_large_types else pa.ListArray.from_arrays)
  binary_type = pa.large_binary() if produce_large_types else pa.binary()
  return list_array_factory(
      np.arange(0, len(raw_records) + 1, dtype=np.int64),
      pa.array(raw_records, type=binary_type))
Example #7
Source File: record_based_tfxio.py From tfx-bsl with Apache License 2.0 | 5 votes |
def ArrowSchema(self) -> pa.Schema:
  schema = self._ArrowSchemaNoRawRecordColumn()
  if self._raw_record_column_name is not None:
    column_type = (pa.large_list(pa.large_binary())
                   if self._can_produce_large_types
                   else pa.list_(pa.binary()))
    if schema.get_field_index(self._raw_record_column_name) != -1:
      raise ValueError(
          "Raw record column name {} collided with a column in the schema."
          .format(self._raw_record_column_name))
    schema = schema.append(
        pa.field(self._raw_record_column_name, column_type))
  return schema
Example #8
Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _GetConvertToBinaryFn(
    array_type: pa.DataType) -> Optional[Callable[[pa.Array], pa.Array]]:
  """Returns a function that converts a StringArray to BinaryArray."""
  if pa.types.is_string(array_type):
    return lambda array: array.view(pa.binary())
  if pa.types.is_large_string(array_type):
    return lambda array: array.view(pa.large_binary())
  return None
Example #9
Source File: tensor_to_arrow.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _tf_dtype_to_arrow_type(dtype: tf.DType):
  """Maps a tf Dtype to an Arrow type."""
  if dtype == tf.string:
    return pa.binary()
  elif dtype == tf.bool:
    raise TypeError("Unable to handle bool tensors -- consider casting it to a "
                    "tf.uint8")
  return pa.from_numpy_dtype(dtype.as_numpy_dtype)
Example #10
Source File: tf_sequence_example_record_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _ValidateRecordBatch(
    self, tfxio, record_batch, raw_record_column_name=None):
  self.assertIsInstance(record_batch, pa.RecordBatch)
  self.assertEqual(record_batch.num_rows, 3)
  expected_column_values = _GetExpectedColumnValues(tfxio)
  for i, field in enumerate(record_batch.schema):
    if field.name == raw_record_column_name:
      continue
    if field.name == _SEQUENCE_COLUMN_NAME:
      self.assertTrue(pa.types.is_struct(field.type))
      for seq_column, seq_field in zip(
          record_batch.column(i).flatten(), list(field.type)):
        expected_array = expected_column_values[path.ColumnPath(
            [_SEQUENCE_COLUMN_NAME, seq_field.name])]
        self.assertTrue(
            seq_column.equals(expected_array),
            "Sequence column {} did not match ({} vs {})".format(
                seq_field.name, seq_column, expected_array))
      continue
    self.assertTrue(
        record_batch.column(i).equals(expected_column_values[path.ColumnPath(
            [field.name])]),
        "Column {} did not match ({} vs {}).".format(
            field.name, record_batch.column(i),
            expected_column_values[path.ColumnPath([field.name])]))

  if raw_record_column_name is not None:
    if tfxio._can_produce_large_types:
      raw_record_column_type = pa.large_list(pa.large_binary())
    else:
      raw_record_column_type = pa.list_(pa.binary())
    self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
    self.assertTrue(
        record_batch.columns[-1].type.equals(raw_record_column_type))
    self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                     _SERIALIZED_EXAMPLES)
Example #11
Source File: table_util_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def test_success(self, row_indices, expected_output):
  record_batch = pa.RecordBatch.from_arrays([
      pa.array([[1, 2, 3], None, [4], [], [5, 6], [7], [8, 9], [10], []],
               type=pa.list_(pa.int32())),
      pa.array(
          [["a"], ["b", "c"], None, [], None, ["d", "e"], ["f"], None, ["g"]],
          type=pa.list_(pa.binary())),
  ], ["f1", "f2"])
  for row_indices_type in (pa.int32(), pa.int64()):
    sliced = table_util.RecordBatchTake(
        record_batch, pa.array(row_indices, type=row_indices_type))
    self.assertTrue(
        sliced.equals(expected_output),
        "Expected {}, got {}".format(expected_output, sliced))
Example #12
Source File: array_util_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _get_binary_like_byte_size_test_cases():
  result = []
  for array_type, sizeof_offsets in [
      (pa.binary(), 4),
      (pa.string(), 4),
      (pa.large_binary(), 8),
      (pa.large_string(), 8),
  ]:
    result.append(
        dict(
            testcase_name=str(array_type),
            array=pa.array([
                "a", "bb", "ccc", "dddd", "eeeee", "ffffff", "ggggggg",
                "hhhhhhhh", "iiiiiiiii"
            ], type=array_type),
            slice_offset=1,
            slice_length=3,
            # contents: 45
            # offsets: 10 * sizeof_offsets
            # null bitmap: 2
            expected_size=(45 + sizeof_offsets * 10 +
                           _all_false_null_bitmap_size(2)),
            # contents: 9
            # offsets: 4 * sizeof_offsets
            # null bitmap: 1
            expected_sliced_size=(9 + sizeof_offsets * 4 +
                                  _all_false_null_bitmap_size(1))))
  return result
Example #13
Source File: test__pandas_helpers.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="REPEATED", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected_value_type = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_list(actual)
    assert pyarrow.types.is_struct(actual.value_type)
    assert actual.value_type.num_children == len(fields)
    assert actual.value_type.equals(expected_value_type)
Example #14
Source File: test__pandas_helpers.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_struct(actual)
    assert actual.num_children == len(fields)
    assert actual.equals(expected)
Example #15
Source File: types.py From LearningApacheSpark with MIT License | 5 votes |
def to_arrow_type(dt):
    """ Convert Spark data type to pyarrow type """
    from distutils.version import LooseVersion
    import pyarrow as pa
    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        # TODO: remove version check once minimum pyarrow version is 0.10.0
        if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt) +
                            "\nPlease install pyarrow >= 0.10.0 for BinaryType support.")
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        arrow_type = pa.timestamp('us', tz='UTC')
    elif type(dt) == ArrayType:
        if type(dt.elementType) == TimestampType:
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
        arrow_type = pa.list_(to_arrow_type(dt.elementType))
    else:
        raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
    return arrow_type
Example #16
Source File: test_unischema.py From petastorm with Apache License 2.0 | 5 votes |
def test_arrow_schema_convertion():
    fields = [
        pa.field('string', pa.string()),
        pa.field('int8', pa.int8()),
        pa.field('int16', pa.int16()),
        pa.field('int32', pa.int32()),
        pa.field('int64', pa.int64()),
        pa.field('float', pa.float32()),
        pa.field('double', pa.float64()),
        pa.field('bool', pa.bool_(), False),
        pa.field('fixed_size_binary', pa.binary(10)),
        pa.field('variable_size_binary', pa.binary()),
        pa.field('decimal', pa.decimal128(3, 4)),
        pa.field('timestamp_s', pa.timestamp('s')),
        pa.field('timestamp_ns', pa.timestamp('ns')),
        pa.field('date_32', pa.date32()),
        pa.field('date_64', pa.date64())
    ]
    arrow_schema = pa.schema(fields)

    mock_dataset = _mock_parquet_dataset([], arrow_schema)

    unischema = Unischema.from_arrow_schema(mock_dataset)
    for name in arrow_schema.names:
        assert getattr(unischema, name).name == name
        assert getattr(unischema, name).codec is None

        if name == 'bool':
            assert not getattr(unischema, name).nullable
        else:
            assert getattr(unischema, name).nullable

    # Test schema preserve fields order
    field_name_list = [f.name for f in fields]
    assert list(unischema.fields.keys()) == field_name_list
Example #17
Source File: test_db.py From aws-data-wrangler with Apache License 2.0 | 5 votes |
def test_redshift_category(bucket, databases_parameters):
    path = f"s3://{bucket}/test_redshift_category/"
    df = get_df_category().drop(["binary"], axis=1, inplace=False)
    engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")
    wr.db.copy_to_redshift(
        df=df,
        path=path,
        con=engine,
        schema="public",
        table="test_redshift_category",
        mode="overwrite",
        iam_role=databases_parameters["redshift"]["role"],
    )
    df2 = wr.db.unload_redshift(
        sql="SELECT * FROM public.test_redshift_category",
        con=engine,
        iam_role=databases_parameters["redshift"]["role"],
        path=path,
        keep_files=False,
        categories=df.columns,
    )
    ensure_data_types_category(df2)
    dfs = wr.db.unload_redshift(
        sql="SELECT * FROM public.test_redshift_category",
        con=engine,
        iam_role=databases_parameters["redshift"]["role"],
        path=path,
        keep_files=False,
        categories=df.columns,
        chunked=True,
    )
    for df2 in dfs:
        ensure_data_types_category(df2)
    wr.s3.delete_objects(path=path)
Example #18
Source File: arrow_util_test.py From data-validation with Apache License 2.0 | 5 votes |
def testEnumerateArraysStringWeight(self):
  # The arrow type of a string changes between py2 and py3 so we accept either
  with self.assertRaisesRegex(
      ValueError,
      r'Weight column "w" must be of numeric type. Found (string|binary).*'):
    for _ in arrow_util.enumerate_arrays(
        pa.RecordBatch.from_arrays(
            [pa.array([[1], [2, 3]]),
             pa.array([["a"], ["b"]])], ["v", "w"]),
        weight_column="w",
        enumerate_leaves_only=True):
      pass
Example #19
Source File: arrow_util_test.py From data-validation with Apache License 2.0 | 5 votes |
def testIsBinaryLike(self):
  for t in (pa.binary(), pa.large_binary(), pa.string(), pa.large_string()):
    self.assertTrue(arrow_util.is_binary_like(t))
  for t in (pa.list_(pa.binary()), pa.large_list(pa.string())):
    self.assertFalse(arrow_util.is_binary_like(t))
Example #20
Source File: arrow_util_test.py From data-validation with Apache License 2.0 | 5 votes |
def testIsListLike(self):
  for t in (pa.list_(pa.int64()), pa.large_list(pa.int64())):
    self.assertTrue(arrow_util.is_list_like(t))
  for t in (pa.binary(), pa.int64(), pa.large_string()):
    self.assertFalse(arrow_util.is_list_like(t))
Example #21
Source File: test_common_metadata.py From kartothek with MIT License | 4 votes |
def test_store_schema_metadata(store, df_all_types):
    store_schema_metadata(
        schema=make_meta(df_all_types, origin="df_all_types"),
        dataset_uuid="some_uuid",
        store=store,
        table="some_table",
    )

    key = "some_uuid/some_table/_common_metadata"
    assert key in store.keys()
    pq_file = pq.ParquetFile(store.open(key))
    actual_schema = pq_file.schema.to_arrow_schema()
    fields = [
        pa.field("array_float32", pa.list_(pa.float64())),
        pa.field("array_float64", pa.list_(pa.float64())),
        pa.field("array_int16", pa.list_(pa.int64())),
        pa.field("array_int32", pa.list_(pa.int64())),
        pa.field("array_int64", pa.list_(pa.int64())),
        pa.field("array_int8", pa.list_(pa.int64())),
        pa.field("array_uint16", pa.list_(pa.uint64())),
        pa.field("array_uint32", pa.list_(pa.uint64())),
        pa.field("array_uint64", pa.list_(pa.uint64())),
        pa.field("array_uint8", pa.list_(pa.uint64())),
        pa.field("array_unicode", pa.list_(pa.string())),
        pa.field("bool", pa.bool_()),
        pa.field("byte", pa.binary()),
        pa.field("date", pa.date32()),
        pa.field("datetime64", pa.timestamp("us")),
        pa.field("float32", pa.float64()),
        pa.field("float64", pa.float64()),
        pa.field("int16", pa.int64()),
        pa.field("int32", pa.int64()),
        pa.field("int64", pa.int64()),
        pa.field("int8", pa.int64()),
        pa.field("null", pa.null()),
        pa.field("uint16", pa.uint64()),
        pa.field("uint32", pa.uint64()),
        pa.field("uint64", pa.uint64()),
        pa.field("uint8", pa.uint64()),
        pa.field("unicode", pa.string()),
    ]
    expected_schema = pa.schema(fields)

    assert actual_schema.remove_metadata() == expected_schema
Example #22
Source File: test_db.py From aws-data-wrangler with Apache License 2.0 | 4 votes |
def test_redshift_copy_unload(bucket, databases_parameters):
    path = f"s3://{bucket}/test_redshift_copy/"
    df = get_df().drop(["iint8", "binary"], axis=1, inplace=False)
    engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")
    wr.db.copy_to_redshift(
        df=df,
        path=path,
        con=engine,
        schema="public",
        table="__test_redshift_copy",
        mode="overwrite",
        iam_role=databases_parameters["redshift"]["role"],
    )
    df2 = wr.db.unload_redshift(
        sql="SELECT * FROM public.__test_redshift_copy",
        con=engine,
        iam_role=databases_parameters["redshift"]["role"],
        path=path,
        keep_files=False,
    )
    assert len(df2.index) == 3
    ensure_data_types(df=df2, has_list=False)
    wr.db.copy_to_redshift(
        df=df,
        path=path,
        con=engine,
        schema="public",
        table="__test_redshift_copy",
        mode="append",
        iam_role=databases_parameters["redshift"]["role"],
    )
    df2 = wr.db.unload_redshift(
        sql="SELECT * FROM public.__test_redshift_copy",
        con=engine,
        iam_role=databases_parameters["redshift"]["role"],
        path=path,
        keep_files=False,
    )
    assert len(df2.index) == 6
    ensure_data_types(df=df2, has_list=False)
    dfs = wr.db.unload_redshift(
        sql="SELECT * FROM public.__test_redshift_copy",
        con=engine,
        iam_role=databases_parameters["redshift"]["role"],
        path=path,
        keep_files=False,
        chunked=True,
    )
    for chunk in dfs:
        ensure_data_types(df=chunk, has_list=False)