Python pyarrow.binary() Examples
The following are 22 code examples of pyarrow.binary(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all other available functions and classes of the module pyarrow.
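Before working through the examples, it helps to know what pyarrow.binary() returns: called with no argument it gives the variable-length binary data type, while pyarrow.binary(length) gives a fixed-size binary type (both forms appear in the examples below). Here is a minimal sketch of typical usage; the byte values, field names, and variable names are illustrative only and not taken from any of the projects below.

import pyarrow as pa

# Variable-length binary: each value may contain a different number of bytes.
var_binary = pa.binary()
# Fixed-size binary: every value must be exactly 16 bytes (illustrative width).
fixed_binary = pa.binary(16)

# Build an array of raw bytes; None becomes a null entry.
payloads = pa.array([b"\x00\x01", b"abc", None], type=var_binary)
assert pa.types.is_binary(payloads.type)

# The type can also be used when declaring fields and schemas.
schema = pa.schema([pa.field("payload", var_binary)])

Several of the examples below also use pa.large_binary(), which stores 64-bit offsets instead of 32-bit ones and is therefore preferred when a single column chunk may exceed 2 GB of binary data.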
Example #1
Source File: test_unit_arrow_chunk_iterator.py From snowflake-connector-python with Apache License 2.0 | 6 votes |
def test_iterate_over_binary_chunk():
    random.seed(datetime.datetime.now())
    column_meta = {
        "byteLength": "100",
        "logicalType": "BINARY",
        "precision": "0",
        "scale": "0",
        "charLength": "0"
    }

    def byte_array_generator():
        return bytearray(os.urandom(1000))

    iterate_over_test_chunk([pyarrow.binary(), pyarrow.binary()],
                            [column_meta, column_meta],
                            byte_array_generator)
Example #2
Source File: tensor_adapter_test.py From tfx-bsl with Apache License 2.0 | 6 votes |
def testRaggedTensorStructTypeNonLeaf(self):
  tensor_representation = text_format.Parse(
      """
      ragged_tensor {
        feature_path {
          step: "ragged_feature"
        }
      }
      """, schema_pb2.TensorRepresentation())
  record_batch = pa.RecordBatch.from_arrays([
      pa.StructArray.from_arrays([
          pa.array([[1, 2, 3]], pa.list_(pa.int64())),
          pa.array([["a", "b", "c"]], pa.list_(pa.binary()))
      ], ["inner_feature", "x2"])
  ], ["ragged_feature"])
  with self.assertRaisesRegex(ValueError,
                              ".*Unable to handle tensor output.*"):
    tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(
            record_batch.schema, {"output": tensor_representation}))
Example #3
Source File: tensor_adapter_test.py From tfx-bsl with Apache License 2.0 | 6 votes |
def testRaggedTensorStructTypeInvalidSteps(self):
  tensor_representation = text_format.Parse(
      """
      ragged_tensor {
        feature_path {
          step: "ragged_feature"
          step: "wrong_step"
        }
      }
      """, schema_pb2.TensorRepresentation())
  record_batch = pa.RecordBatch.from_arrays([
      pa.StructArray.from_arrays([
          pa.array([[1, 2, 3]], pa.list_(pa.int64())),
          pa.array([["a", "b", "c"]], pa.list_(pa.binary()))
      ], ["inner_feature", "x2"])
  ], ["ragged_feature"])
  with self.assertRaisesRegex(ValueError,
                              ".*Unable to handle tensor output.*"):
    tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(
            record_batch.schema, {"output": tensor_representation}))
Example #4
Source File: tf_sequence_example_record_test.py From tfx-bsl with Apache License 2.0 | 6 votes |
def _GetExpectedColumnValues(tfxio):
  if tfxio._can_produce_large_types:
    list_factory = pa.large_list
    bytes_type = pa.large_binary()
  else:
    list_factory = pa.list_
    bytes_type = pa.binary()
  return {
      path.ColumnPath(["int_feature"]):
          pa.array([[1], [2], [3]], type=list_factory(pa.int64())),
      path.ColumnPath(["float_feature"]):
          pa.array([[1, 2, 3, 4], [2, 3, 4, 5], None],
                   type=list_factory(pa.float32())),
      path.ColumnPath([_SEQUENCE_COLUMN_NAME, "int_feature"]):
          pa.array([[[1, 2], [3]], None, [[4]]],
                   list_factory(list_factory(pa.int64()))),
      path.ColumnPath([_SEQUENCE_COLUMN_NAME, "string_feature"]):
          pa.array([None, [[b"foo", b"bar"], []], [[b"baz"]]],
                   list_factory(list_factory(bytes_type)))
  }
Example #5
Source File: sequence_example_coder_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _test_decode(self, schema_text_proto, sequence_examples_text_proto,
                 create_expected, use_large_types):
  serialized_sequence_examples = [
      text_format.Parse(pbtxt, tf.train.SequenceExample()).SerializeToString()
      for pbtxt in sequence_examples_text_proto
  ]
  serialized_schema = None
  if schema_text_proto is not None:
    serialized_schema = text_format.Parse(
        schema_text_proto, schema_pb2.Schema()).SerializeToString()

  if serialized_schema:
    coder = sequence_example_coder.SequenceExamplesToRecordBatchDecoder(
        _TEST_SEQUENCE_COLUMN_NAME,
        serialized_schema,
        use_large_types=use_large_types)
  else:
    coder = sequence_example_coder.SequenceExamplesToRecordBatchDecoder(
        _TEST_SEQUENCE_COLUMN_NAME, use_large_types=use_large_types)

  result = coder.DecodeBatch(serialized_sequence_examples)
  self.assertIsInstance(result, pa.RecordBatch)
  if use_large_types:
    expected = create_expected(pa.large_list, pa.large_binary())
  else:
    expected = create_expected(pa.list_, pa.binary())
  self.assertTrue(
      result.equals(expected),
      "actual: {}\n expected:{}".format(result, expected))
  if serialized_schema is not None:
    self.assertTrue(coder.ArrowSchema().equals(result.schema))
Example #6
Source File: record_based_tfxio.py From tfx-bsl with Apache License 2.0 | 5 votes |
def CreateRawRecordColumn(
    raw_records: List[bytes], produce_large_types: bool) -> pa.Array:
  """Returns an Array that satisfies the requirement of a raw record column."""
  list_array_factory = (
      pa.LargeListArray.from_arrays
      if produce_large_types else pa.ListArray.from_arrays)
  binary_type = pa.large_binary() if produce_large_types else pa.binary()
  return list_array_factory(
      np.arange(0, len(raw_records) + 1, dtype=np.int64),
      pa.array(raw_records, type=binary_type))
Example #7
Source File: record_based_tfxio.py From tfx-bsl with Apache License 2.0 | 5 votes |
def ArrowSchema(self) -> pa.Schema:
  schema = self._ArrowSchemaNoRawRecordColumn()
  if self._raw_record_column_name is not None:
    column_type = (pa.large_list(pa.large_binary())
                   if self._can_produce_large_types
                   else pa.list_(pa.binary()))
    if schema.get_field_index(self._raw_record_column_name) != -1:
      raise ValueError(
          "Raw record column name {} collided with a column in the schema."
          .format(self._raw_record_column_name))
    schema = schema.append(
        pa.field(self._raw_record_column_name, column_type))
  return schema
Example #8
Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _GetConvertToBinaryFn(
    array_type: pa.DataType) -> Optional[Callable[[pa.Array], pa.Array]]:
  """Returns a function that converts a StringArray to BinaryArray."""
  if pa.types.is_string(array_type):
    return lambda array: array.view(pa.binary())
  if pa.types.is_large_string(array_type):
    return lambda array: array.view(pa.large_binary())
  return None
Example #9
Source File: tensor_to_arrow.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _tf_dtype_to_arrow_type(dtype: tf.DType):
  """Maps a tf Dtype to an Arrow type."""
  if dtype == tf.string:
    return pa.binary()
  elif dtype == tf.bool:
    raise TypeError("Unable to handle bool tensors -- consider casting it to a "
                    "tf.uint8")
  return pa.from_numpy_dtype(dtype.as_numpy_dtype)
Example #10
Source File: tf_sequence_example_record_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _ValidateRecordBatch(
    self, tfxio, record_batch, raw_record_column_name=None):
  self.assertIsInstance(record_batch, pa.RecordBatch)
  self.assertEqual(record_batch.num_rows, 3)
  expected_column_values = _GetExpectedColumnValues(tfxio)
  for i, field in enumerate(record_batch.schema):
    if field.name == raw_record_column_name:
      continue
    if field.name == _SEQUENCE_COLUMN_NAME:
      self.assertTrue(pa.types.is_struct(field.type))
      for seq_column, seq_field in zip(
          record_batch.column(i).flatten(), list(field.type)):
        expected_array = expected_column_values[path.ColumnPath(
            [_SEQUENCE_COLUMN_NAME, seq_field.name])]
        self.assertTrue(
            seq_column.equals(expected_array),
            "Sequence column {} did not match ({} vs {})".format(
                seq_field.name, seq_column, expected_array))
      continue
    self.assertTrue(
        record_batch.column(i).equals(expected_column_values[path.ColumnPath(
            [field.name])]),
        "Column {} did not match ({} vs {}).".format(
            field.name, record_batch.column(i),
            expected_column_values[path.ColumnPath([field.name])]))

  if raw_record_column_name is not None:
    if tfxio._can_produce_large_types:
      raw_record_column_type = pa.large_list(pa.large_binary())
    else:
      raw_record_column_type = pa.list_(pa.binary())
    self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
    self.assertTrue(
        record_batch.columns[-1].type.equals(raw_record_column_type))
    self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                     _SERIALIZED_EXAMPLES)
Example #11
Source File: table_util_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def test_success(self, row_indices, expected_output):
  record_batch = pa.RecordBatch.from_arrays([
      pa.array([[1, 2, 3], None, [4], [], [5, 6], [7], [8, 9], [10], []],
               type=pa.list_(pa.int32())),
      pa.array(
          [["a"], ["b", "c"], None, [], None, ["d", "e"], ["f"], None, ["g"]],
          type=pa.list_(pa.binary())),
  ], ["f1", "f2"])
  for row_indices_type in (pa.int32(), pa.int64()):
    sliced = table_util.RecordBatchTake(
        record_batch, pa.array(row_indices, type=row_indices_type))
    self.assertTrue(
        sliced.equals(expected_output),
        "Expected {}, got {}".format(expected_output, sliced))
Example #12
Source File: array_util_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _get_binary_like_byte_size_test_cases():
  result = []
  for array_type, sizeof_offsets in [
      (pa.binary(), 4),
      (pa.string(), 4),
      (pa.large_binary(), 8),
      (pa.large_string(), 8),
  ]:
    result.append(
        dict(
            testcase_name=str(array_type),
            array=pa.array([
                "a", "bb", "ccc", "dddd", "eeeee", "ffffff", "ggggggg",
                "hhhhhhhh", "iiiiiiiii"
            ], type=array_type),
            slice_offset=1,
            slice_length=3,
            # contents: 45
            # offsets: 10 * sizeof_offsets
            # null bitmap: 2
            expected_size=(45 + sizeof_offsets * 10 +
                           _all_false_null_bitmap_size(2)),
            # contents: 9
            # offsets: 4 * sizeof_offsets
            # null bitmap: 1
            expected_sliced_size=(9 + sizeof_offsets * 4 +
                                  _all_false_null_bitmap_size(1))))
  return result
Example #13
Source File: test__pandas_helpers.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_bq_to_arrow_data_type_w_array_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="REPEATED", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected_value_type = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_list(actual)
    assert pyarrow.types.is_struct(actual.value_type)
    assert actual.value_type.num_children == len(fields)
    assert actual.value_type.equals(expected_value_type)
Example #14
Source File: test__pandas_helpers.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_bq_to_arrow_data_type_w_struct(module_under_test, bq_type):
    fields = (
        schema.SchemaField("field01", "STRING"),
        schema.SchemaField("field02", "BYTES"),
        schema.SchemaField("field03", "INTEGER"),
        schema.SchemaField("field04", "INT64"),
        schema.SchemaField("field05", "FLOAT"),
        schema.SchemaField("field06", "FLOAT64"),
        schema.SchemaField("field07", "NUMERIC"),
        schema.SchemaField("field08", "BOOLEAN"),
        schema.SchemaField("field09", "BOOL"),
        schema.SchemaField("field10", "TIMESTAMP"),
        schema.SchemaField("field11", "DATE"),
        schema.SchemaField("field12", "TIME"),
        schema.SchemaField("field13", "DATETIME"),
        schema.SchemaField("field14", "GEOGRAPHY"),
    )
    field = schema.SchemaField("ignored_name", bq_type, mode="NULLABLE", fields=fields)
    actual = module_under_test.bq_to_arrow_data_type(field)
    expected = pyarrow.struct(
        (
            pyarrow.field("field01", pyarrow.string()),
            pyarrow.field("field02", pyarrow.binary()),
            pyarrow.field("field03", pyarrow.int64()),
            pyarrow.field("field04", pyarrow.int64()),
            pyarrow.field("field05", pyarrow.float64()),
            pyarrow.field("field06", pyarrow.float64()),
            pyarrow.field("field07", module_under_test.pyarrow_numeric()),
            pyarrow.field("field08", pyarrow.bool_()),
            pyarrow.field("field09", pyarrow.bool_()),
            pyarrow.field("field10", module_under_test.pyarrow_timestamp()),
            pyarrow.field("field11", pyarrow.date32()),
            pyarrow.field("field12", module_under_test.pyarrow_time()),
            pyarrow.field("field13", module_under_test.pyarrow_datetime()),
            pyarrow.field("field14", pyarrow.string()),
        )
    )
    assert pyarrow.types.is_struct(actual)
    assert actual.num_children == len(fields)
    assert actual.equals(expected)
Example #15
Source File: types.py From LearningApacheSpark with MIT License | 5 votes |
def to_arrow_type(dt):
    """ Convert Spark data type to pyarrow type """
    from distutils.version import LooseVersion
    import pyarrow as pa
    if type(dt) == BooleanType:
        arrow_type = pa.bool_()
    elif type(dt) == ByteType:
        arrow_type = pa.int8()
    elif type(dt) == ShortType:
        arrow_type = pa.int16()
    elif type(dt) == IntegerType:
        arrow_type = pa.int32()
    elif type(dt) == LongType:
        arrow_type = pa.int64()
    elif type(dt) == FloatType:
        arrow_type = pa.float32()
    elif type(dt) == DoubleType:
        arrow_type = pa.float64()
    elif type(dt) == DecimalType:
        arrow_type = pa.decimal128(dt.precision, dt.scale)
    elif type(dt) == StringType:
        arrow_type = pa.string()
    elif type(dt) == BinaryType:
        # TODO: remove version check once minimum pyarrow version is 0.10.0
        if LooseVersion(pa.__version__) < LooseVersion("0.10.0"):
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt) +
                            "\nPlease install pyarrow >= 0.10.0 for BinaryType support.")
        arrow_type = pa.binary()
    elif type(dt) == DateType:
        arrow_type = pa.date32()
    elif type(dt) == TimestampType:
        # Timestamps should be in UTC, JVM Arrow timestamps require a timezone to be read
        arrow_type = pa.timestamp('us', tz='UTC')
    elif type(dt) == ArrayType:
        if type(dt.elementType) == TimestampType:
            raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
        arrow_type = pa.list_(to_arrow_type(dt.elementType))
    else:
        raise TypeError("Unsupported type in conversion to Arrow: " + str(dt))
    return arrow_type
Example #16
Source File: test_unischema.py From petastorm with Apache License 2.0 | 5 votes |
def test_arrow_schema_convertion():
    fields = [
        pa.field('string', pa.string()),
        pa.field('int8', pa.int8()),
        pa.field('int16', pa.int16()),
        pa.field('int32', pa.int32()),
        pa.field('int64', pa.int64()),
        pa.field('float', pa.float32()),
        pa.field('double', pa.float64()),
        pa.field('bool', pa.bool_(), False),
        pa.field('fixed_size_binary', pa.binary(10)),
        pa.field('variable_size_binary', pa.binary()),
        pa.field('decimal', pa.decimal128(3, 4)),
        pa.field('timestamp_s', pa.timestamp('s')),
        pa.field('timestamp_ns', pa.timestamp('ns')),
        pa.field('date_32', pa.date32()),
        pa.field('date_64', pa.date64())
    ]
    arrow_schema = pa.schema(fields)

    mock_dataset = _mock_parquet_dataset([], arrow_schema)

    unischema = Unischema.from_arrow_schema(mock_dataset)
    for name in arrow_schema.names:
        assert getattr(unischema, name).name == name
        assert getattr(unischema, name).codec is None

        if name == 'bool':
            assert not getattr(unischema, name).nullable
        else:
            assert getattr(unischema, name).nullable

    # Test schema preserve fields order
    field_name_list = [f.name for f in fields]
    assert list(unischema.fields.keys()) == field_name_list
Example #17
Source File: test_db.py From aws-data-wrangler with Apache License 2.0 | 5 votes |
def test_redshift_category(bucket, databases_parameters):
    path = f"s3://{bucket}/test_redshift_category/"
    df = get_df_category().drop(["binary"], axis=1, inplace=False)
    engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")
    wr.db.copy_to_redshift(
        df=df,
        path=path,
        con=engine,
        schema="public",
        table="test_redshift_category",
        mode="overwrite",
        iam_role=databases_parameters["redshift"]["role"],
    )
    df2 = wr.db.unload_redshift(
        sql="SELECT * FROM public.test_redshift_category",
        con=engine,
        iam_role=databases_parameters["redshift"]["role"],
        path=path,
        keep_files=False,
        categories=df.columns,
    )
    ensure_data_types_category(df2)
    dfs = wr.db.unload_redshift(
        sql="SELECT * FROM public.test_redshift_category",
        con=engine,
        iam_role=databases_parameters["redshift"]["role"],
        path=path,
        keep_files=False,
        categories=df.columns,
        chunked=True,
    )
    for df2 in dfs:
        ensure_data_types_category(df2)
    wr.s3.delete_objects(path=path)
Example #18
Source File: arrow_util_test.py From data-validation with Apache License 2.0 | 5 votes |
def testEnumerateArraysStringWeight(self):
  # The arrow type of a string changes between py2 and py3 so we accept either
  with self.assertRaisesRegex(
      ValueError,
      r'Weight column "w" must be of numeric type. Found (string|binary).*'):
    for _ in arrow_util.enumerate_arrays(
        pa.RecordBatch.from_arrays(
            [pa.array([[1], [2, 3]]),
             pa.array([["a"], ["b"]])], ["v", "w"]),
        weight_column="w",
        enumerate_leaves_only=True):
      pass
Example #19
Source File: arrow_util_test.py From data-validation with Apache License 2.0 | 5 votes |
def testIsBinaryLike(self):
  for t in (pa.binary(), pa.large_binary(), pa.string(), pa.large_string()):
    self.assertTrue(arrow_util.is_binary_like(t))
  for t in (pa.list_(pa.binary()), pa.large_list(pa.string())):
    self.assertFalse(arrow_util.is_binary_like(t))
Example #20
Source File: arrow_util_test.py From data-validation with Apache License 2.0 | 5 votes |
def testIsListLike(self):
  for t in (pa.list_(pa.int64()), pa.large_list(pa.int64())):
    self.assertTrue(arrow_util.is_list_like(t))
  for t in (pa.binary(), pa.int64(), pa.large_string()):
    self.assertFalse(arrow_util.is_list_like(t))
Example #21
Source File: test_common_metadata.py From kartothek with MIT License | 4 votes |
def test_store_schema_metadata(store, df_all_types):
    store_schema_metadata(
        schema=make_meta(df_all_types, origin="df_all_types"),
        dataset_uuid="some_uuid",
        store=store,
        table="some_table",
    )

    key = "some_uuid/some_table/_common_metadata"
    assert key in store.keys()
    pq_file = pq.ParquetFile(store.open(key))
    actual_schema = pq_file.schema.to_arrow_schema()
    fields = [
        pa.field("array_float32", pa.list_(pa.float64())),
        pa.field("array_float64", pa.list_(pa.float64())),
        pa.field("array_int16", pa.list_(pa.int64())),
        pa.field("array_int32", pa.list_(pa.int64())),
        pa.field("array_int64", pa.list_(pa.int64())),
        pa.field("array_int8", pa.list_(pa.int64())),
        pa.field("array_uint16", pa.list_(pa.uint64())),
        pa.field("array_uint32", pa.list_(pa.uint64())),
        pa.field("array_uint64", pa.list_(pa.uint64())),
        pa.field("array_uint8", pa.list_(pa.uint64())),
        pa.field("array_unicode", pa.list_(pa.string())),
        pa.field("bool", pa.bool_()),
        pa.field("byte", pa.binary()),
        pa.field("date", pa.date32()),
        pa.field("datetime64", pa.timestamp("us")),
        pa.field("float32", pa.float64()),
        pa.field("float64", pa.float64()),
        pa.field("int16", pa.int64()),
        pa.field("int32", pa.int64()),
        pa.field("int64", pa.int64()),
        pa.field("int8", pa.int64()),
        pa.field("null", pa.null()),
        pa.field("uint16", pa.uint64()),
        pa.field("uint32", pa.uint64()),
        pa.field("uint64", pa.uint64()),
        pa.field("uint8", pa.uint64()),
        pa.field("unicode", pa.string()),
    ]
    expected_schema = pa.schema(fields)

    assert actual_schema.remove_metadata() == expected_schema
Example #22
Source File: test_db.py From aws-data-wrangler with Apache License 2.0 | 4 votes |
def test_redshift_copy_unload(bucket, databases_parameters):
    path = f"s3://{bucket}/test_redshift_copy/"
    df = get_df().drop(["iint8", "binary"], axis=1, inplace=False)
    engine = wr.catalog.get_engine(connection="aws-data-wrangler-redshift")
    wr.db.copy_to_redshift(
        df=df,
        path=path,
        con=engine,
        schema="public",
        table="__test_redshift_copy",
        mode="overwrite",
        iam_role=databases_parameters["redshift"]["role"],
    )
    df2 = wr.db.unload_redshift(
        sql="SELECT * FROM public.__test_redshift_copy",
        con=engine,
        iam_role=databases_parameters["redshift"]["role"],
        path=path,
        keep_files=False,
    )
    assert len(df2.index) == 3
    ensure_data_types(df=df2, has_list=False)
    wr.db.copy_to_redshift(
        df=df,
        path=path,
        con=engine,
        schema="public",
        table="__test_redshift_copy",
        mode="append",
        iam_role=databases_parameters["redshift"]["role"],
    )
    df2 = wr.db.unload_redshift(
        sql="SELECT * FROM public.__test_redshift_copy",
        con=engine,
        iam_role=databases_parameters["redshift"]["role"],
        path=path,
        keep_files=False,
    )
    assert len(df2.index) == 6
    ensure_data_types(df=df2, has_list=False)
    dfs = wr.db.unload_redshift(
        sql="SELECT * FROM public.__test_redshift_copy",
        con=engine,
        iam_role=databases_parameters["redshift"]["role"],
        path=path,
        keep_files=False,
        chunked=True,
    )
    for chunk in dfs:
        ensure_data_types(df=chunk, has_list=False)