Python pyarrow.list_() Examples

The following are 30 code examples of pyarrow.list_(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyarrow, or try the search function.
Example #1
Source File: tf_sequence_example_record_test.py    From tfx-bsl with Apache License 2.0 6 votes vote down vote up
def _GetExpectedColumnValues(tfxio):
  """Builds the expected Arrow array for every column path.

  Args:
    tfxio: object whose `_can_produce_large_types` attribute selects between
      the large (`large_list` / `large_binary`) and the regular (`list_` /
      `binary`) Arrow types.

  Returns:
    A dict mapping `path.ColumnPath` to the expected `pa.Array`.
  """
  use_large = tfxio._can_produce_large_types
  make_list = pa.large_list if use_large else pa.list_
  binary_type = pa.large_binary() if use_large else pa.binary()

  # Top-level paths are singly nested; paths under the sequence column are
  # doubly nested.
  nested_int_type = make_list(make_list(pa.int64()))
  nested_bytes_type = make_list(make_list(binary_type))

  expected = {}
  expected[path.ColumnPath(["int_feature"])] = pa.array(
      [[1], [2], [3]], type=make_list(pa.int64()))
  expected[path.ColumnPath(["float_feature"])] = pa.array(
      [[1, 2, 3, 4], [2, 3, 4, 5], None], type=make_list(pa.float32()))
  expected[path.ColumnPath([_SEQUENCE_COLUMN_NAME, "int_feature"])] = pa.array(
      [[[1, 2], [3]], None, [[4]]], nested_int_type)
  expected[path.ColumnPath(
      [_SEQUENCE_COLUMN_NAME, "string_feature"])] = pa.array(
          [None, [[b"foo", b"bar"], []], [[b"baz"]]], nested_bytes_type)
  return expected
Example #2
Source File: basic_stats_generator_test.py    From data-validation with Apache License 2.0 6 votes vote down vote up
def test_basic_stats_generator_empty_batch(self):
  """Checks the stats produced for a batch whose only column has no rows."""
  # Zero-row column 'a'; the list<binary> type is given explicitly.
  empty_column = pa.array([], type=pa.list_(pa.binary()))
  batches = [pa.RecordBatch.from_arrays([empty_column], ['a'])]

  expected_stats = text_format.Parse(
      """
            path {
              step: 'a'
            }
            type: STRING
            string_stats {
              common_stats {
                num_non_missing: 0
                tot_num_values: 0
              }
            }
            """, statistics_pb2.FeatureNameStatistics())
  expected_result = {types.FeaturePath(['a']): expected_stats}

  generator = basic_stats_generator.BasicStatsGenerator()
  self.assertCombinerOutputEqual(batches, generator, expected_result)
Example #3
Source File: example_coder_test.py    From tfx-bsl with Apache License 2.0 6 votes vote down vote up
def test_decode(self, schema_text_proto, examples_text_proto,
                create_expected):
  """Decodes serialized tf.Examples and compares with the expected batch.

  Args:
    schema_text_proto: text-format Schema, or None to decode without one.
    examples_text_proto: iterable of text-format tf.train.Example protos.
    create_expected: callable taking (list factory, binary type) and
      returning the expected record batch.
  """
  serialized_examples = []
  for pbtxt in examples_text_proto:
    example = text_format.Parse(pbtxt, tf.train.Example())
    serialized_examples.append(example.SerializeToString())

  if schema_text_proto is None:
    serialized_schema = None
  else:
    serialized_schema = text_format.Parse(
        schema_text_proto, schema_pb2.Schema()).SerializeToString()

  # A falsy serialized schema (None or empty) means "construct the decoder
  # without a schema".
  decoder_args = (serialized_schema,) if serialized_schema else ()
  coder = example_coder.ExamplesToRecordBatchDecoder(*decoder_args)

  result = coder.DecodeBatch(serialized_examples)
  self.assertIsInstance(result, pa.RecordBatch)

  expected = create_expected(pa.list_, pa.binary())
  mismatch_message = "actual: {}\n expected:{}".format(result, expected)
  self.assertTrue(result.equals(expected), mismatch_message)

  if serialized_schema:
    self.assertTrue(expected.schema.equals(coder.ArrowSchema()))
Example #4
Source File: tf_example_record_test.py    From tfx-bsl with Apache License 2.0 6 votes vote down vote up
def GetExpectedColumnValues(tfxio):
  """Returns the expected Arrow array for each feature name.

  Args:
    tfxio: object whose `_can_produce_large_types` attribute selects between
      the large and the regular Arrow list/binary types.

  Returns:
    A dict mapping feature name to the expected `pa.Array`.
  """
  if tfxio._can_produce_large_types:
    list_factory, binary_type = pa.large_list, pa.large_binary()
  else:
    list_factory, binary_type = pa.list_, pa.binary()

  int_values = [[1], [2], [3]]
  float_values = [[1, 2, 3, 4], [2, 3, 4, 5], [4, 5, 6, 7]]
  string_values = [None, ["foo", "bar"], None]

  return {
      "int_feature":
          pa.array(int_values, type=list_factory(pa.int64())),
      "float_feature":
          pa.array(float_values, type=list_factory(pa.float32())),
      "string_feature":
          pa.array(string_values, type=list_factory(binary_type)),
  }
Example #5
Source File: table_util_test.py    From tfx-bsl with Apache License 2.0 6 votes vote down vote up
def test_simple(self, factory):
  """Checks TotalByteSize over a list column plus a struct column.

  Args:
    factory: callable taking (arrays, names) and returning the entity that is
      passed to `table_util.TotalByteSize`.
  """
  # List column accounting:
  #   3 int64 values
  #   5 int32 offsets
  #   1 null bitmap byte for outer ListArray
  #   1 null bitmap byte for inner Int64Array
  # 46 bytes in total.
  list_bytes = 46
  list_type = pa.list_(pa.int64())
  list_array = pa.array([[1, 2], [None], None, None], type=list_type)

  # Struct column accounting:
  #   1 null bitmap byte for outer StructArray.
  #   1 null bitmap byte for inner Int64Array.
  #   4 int64 values.
  # 34 bytes in total.
  struct_bytes = 34
  struct_type = pa.struct([pa.field("a", pa.int64())])
  struct_array = pa.array([{"a": 1}, {"a": 2}, {"a": None}, None],
                          type=struct_type)

  entity = factory([list_array, struct_array], ["a1", "a2"])
  self.assertEqual(list_bytes + struct_bytes,
                   table_util.TotalByteSize(entity))
Example #6
Source File: csv_tfxio_test.py    From tfx-bsl with Apache License 2.0 6 votes vote down vote up
def _GetExpectedArrowSchema(tfxio, raw_record_column_name=None):
  """Builds the expected Arrow schema for the three test features.

  Args:
    tfxio: object whose `_can_produce_large_types` attribute selects between
      the large and the regular Arrow list/binary types.
    raw_record_column_name: if not None, an extra column of this name with
      the same type as "string_feature" is appended to the schema.

  Returns:
    A `pa.Schema`.
  """
  large = tfxio._can_produce_large_types
  list_factory = pa.large_list if large else pa.list_
  binary_type = pa.large_binary() if large else pa.binary()
  bytes_list_type = list_factory(binary_type)

  named_types = [
      ("int_feature", list_factory(pa.int64())),
      ("float_feature", list_factory(pa.float32())),
      ("string_feature", bytes_list_type),
  ]
  if raw_record_column_name is not None:
    named_types.append((raw_record_column_name, bytes_list_type))

  return pa.schema(
      [pa.field(name, arrow_type) for name, arrow_type in named_types])
Example #7
Source File: tensor_adapter_test.py    From tfx-bsl with Apache License 2.0 6 votes vote down vote up
def testRaggedTensorStructTypeInvalidSteps(self):
  """A feature path naming a step absent from the struct must be rejected."""
  # Struct column "ragged_feature" with children "inner_feature" and "x2".
  inner_feature = pa.array([[1, 2, 3]], pa.list_(pa.int64()))
  x2 = pa.array([["a", "b", "c"]], pa.list_(pa.binary()))
  struct_column = pa.StructArray.from_arrays([inner_feature, x2],
                                             ["inner_feature", "x2"])
  record_batch = pa.RecordBatch.from_arrays([struct_column],
                                            ["ragged_feature"])

  tensor_representation = text_format.Parse(
      """
        ragged_tensor {
          feature_path {
            step: "ragged_feature"
            step: "wrong_step"
          }
        }
        """, schema_pb2.TensorRepresentation())
  representations = {"output": tensor_representation}

  with self.assertRaisesRegex(ValueError,
                              ".*Unable to handle tensor output.*"):
    tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(record_batch.schema,
                                           representations))
Example #8
Source File: tensor_adapter_test.py    From tfx-bsl with Apache License 2.0 6 votes vote down vote up
def testRaggedTensorStructTypeTooManySteps(self):
  """A feature path with a step past the struct's child must be rejected."""
  # Struct column "ragged_feature" with children "inner_feature" and "x2".
  inner_feature = pa.array([[1, 2, 3]], pa.list_(pa.int64()))
  x2 = pa.array([["a", "b", "c"]], pa.list_(pa.binary()))
  struct_column = pa.StructArray.from_arrays([inner_feature, x2],
                                             ["inner_feature", "x2"])
  record_batch = pa.RecordBatch.from_arrays([struct_column],
                                            ["ragged_feature"])

  tensor_representation = text_format.Parse(
      """
        ragged_tensor {
          feature_path {
            step: "ragged_feature"
            step: "inner_feature"
            step: "non_existant_feature"
          }
        }
        """, schema_pb2.TensorRepresentation())
  representations = {"output": tensor_representation}

  with self.assertRaisesRegex(ValueError,
                              ".*Unable to handle tensor output.*"):
    tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(record_batch.schema,
                                           representations))
Example #9
Source File: tensor_adapter_test.py    From tfx-bsl with Apache License 2.0 6 votes vote down vote up
def testRaiseOnNoMatchingHandler(self):
  """Building an adapter for a column type with no handler must raise."""
  # `assertRaisesRegex` replaces the deprecated `assertRaisesRegexp` alias,
  # which was removed from unittest in Python 3.12.
  with self.assertRaisesRegex(ValueError, "Unable to handle tensor"):
    tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(
            # nested lists are not supported now.
            pa.schema([pa.field("unsupported_column",
                                pa.list_(pa.list_(pa.int64())))]),
            {
                "tensor":
                    text_format.Parse(
                        """
                  dense_tensor {
                    column_name: "unsupported_column"
                    shape: {}
                  }
                  """, schema_pb2.TensorRepresentation())
            }))
Example #10
Source File: tensor_adapter_test.py    From tfx-bsl with Apache License 2.0 6 votes vote down vote up
def testRaggedTensorStructTypeNonLeaf(self):
  """A feature path that stops at the struct column must be rejected."""
  # Struct column "ragged_feature" with children "inner_feature" and "x2".
  inner_feature = pa.array([[1, 2, 3]], pa.list_(pa.int64()))
  x2 = pa.array([["a", "b", "c"]], pa.list_(pa.binary()))
  struct_column = pa.StructArray.from_arrays([inner_feature, x2],
                                             ["inner_feature", "x2"])
  record_batch = pa.RecordBatch.from_arrays([struct_column],
                                            ["ragged_feature"])

  tensor_representation = text_format.Parse(
      """
        ragged_tensor {
          feature_path {
            step: "ragged_feature"
          }
        }
        """, schema_pb2.TensorRepresentation())
  representations = {"output": tensor_representation}

  with self.assertRaisesRegex(ValueError,
                              ".*Unable to handle tensor output.*"):
    tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(record_batch.schema,
                                           representations))
Example #11
Source File: _pandas_helpers.py    From python-bigquery with Apache License 2.0 6 votes vote down vote up
def bq_to_arrow_data_type(field):
    """Return the Arrow data type, corresponding to a given BigQuery column.

    Args:
        field: BigQuery schema field. Its ``mode``, ``field_type``, ``name``
            and ``fields`` attributes are read.

    Returns:
        The Arrow data type for the column, or ``None`` if default Arrow type
        inspection should be used.
    """
    if field.mode is not None and field.mode.upper() == "REPEATED":
        # Resolve the element type from a field rebuilt without the mode
        # argument (assumes SchemaField's default mode is not REPEATED, so the
        # recursive call does not take this branch again -- TODO confirm).
        inner_type = bq_to_arrow_data_type(
            schema.SchemaField(field.name, field.field_type, fields=field.fields)
        )
        # Compare against None explicitly: None is the only "no type" signal,
        # and an Arrow type object that happens to be falsy (e.g. one whose
        # length is zero) must still be wrapped in a list type.
        if inner_type is None:
            return None
        return pyarrow.list_(inner_type)

    field_type_upper = field.field_type.upper() if field.field_type else ""
    if field_type_upper in schema._STRUCT_TYPES:
        return bq_to_arrow_struct_data_type(field)

    data_type_constructor = BQ_TO_ARROW_SCALARS.get(field_type_upper)
    if data_type_constructor is None:
        return None
    return data_type_constructor()
Example #12
Source File: tensor_adapter_test.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def testRaiseOnInvalidDefaultValue(self, value_type, default_value_pbtxt,
                                   exception_regexp):
  """An invalid dense-tensor default value must make the adapter raise.

  Args:
    value_type: Arrow value type; the column under test is a list of it.
    default_value_pbtxt: text-format TensorRepresentation.DefaultValue.
    exception_regexp: regex the raised ValueError message must match.
  """
  tensor_representation = text_format.Parse("""
                  dense_tensor {
                    column_name: "column"
                    shape {}
                  }""", schema_pb2.TensorRepresentation())
  tensor_representation.dense_tensor.default_value.CopyFrom(
      text_format.Parse(default_value_pbtxt,
                        schema_pb2.TensorRepresentation.DefaultValue()))
  # `assertRaisesRegex` replaces the deprecated `assertRaisesRegexp` alias,
  # which was removed from unittest in Python 3.12.
  with self.assertRaisesRegex(ValueError, exception_regexp):
    tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(
            pa.schema([pa.field("column", pa.list_(value_type))]),
            {"tensor": tensor_representation}))
Example #13
Source File: tensor_adapter_test.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def _MakeFloatingDefaultFilledDenseTensorFromListArrayTestCases():
  """Returns one test-case dict per supported floating value type.

  Each case pairs a list array containing nulls with the dense output that is
  expected once the nulls are replaced by the default value (-1).
  """
  tensor_representation_textpb = """
  dense_tensor {
    column_name: "input"
    shape {
      dim {
        size: 2
      }
      dim {
        size: 1
      }
    }
    default_value {
      float_value: -1
    }
  }
  """
  test_cases = []
  for value_type in _ALL_SUPPORTED_FLOATING_VALUE_TYPES:
    arrow_array = pa.array([None, [1, 2], None], type=pa.list_(value_type))
    # The expected value is a tf constant in eager mode, a numpy array
    # otherwise.
    if not tf.executing_eagerly():
      flat_values = np.array([-1, -1, 1, 2, -1, -1],
                             dtype=_ARROW_TYPE_TO_NP_TYPE[value_type])
      expected_output = flat_values.reshape((3, 2, 1))
    else:
      expected_output = tf.constant([[-1, -1], [1, 2], [-1, -1]],
                                    dtype=_ARROW_TYPE_TO_TF_TYPE[value_type],
                                    shape=(3, 2, 1))
    expected_type_spec = tf.TensorSpec(
        [None, 2, 1], dtype=_ARROW_TYPE_TO_TF_TYPE[value_type])
    test_cases.append(dict(
        testcase_name="default_filled_dense_from_list_array_{}".format(
            value_type),
        tensor_representation_textpb=tensor_representation_textpb,
        arrow_array=arrow_array,
        expected_output=expected_output,
        expected_type_spec=expected_type_spec))
  return test_cases
Example #14
Source File: tensor_adapter_test.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def _MakeStringDefaultFilledDenseTensorFromListArrayTestCases():
  """Returns one test-case dict per supported string value type.

  Each case pairs a list array containing nulls with the dense output that is
  expected once the nulls are replaced by the default value ("nil").
  """
  tensor_representation_textpb = """
  dense_tensor {
    column_name: "input"
    shape {
    }
    default_value {
      bytes_value: "nil"
    }
  }
  """
  test_cases = []
  for value_type in _ALL_SUPPORTED_STRING_VALUE_TYPES:
    arrow_array = pa.array([None, ["hello"], None],
                           type=pa.list_(value_type))
    # The expected value is a tf constant in eager mode, a numpy array
    # otherwise.
    if not tf.executing_eagerly():
      expected_output = np.array([b"nil", b"hello", b"nil"],
                                 dtype=_ARROW_TYPE_TO_NP_TYPE[value_type])
    else:
      expected_output = tf.constant(["nil", "hello", "nil"],
                                    dtype=_ARROW_TYPE_TO_TF_TYPE[value_type])
    expected_type_spec = tf.TensorSpec([None],
                                       _ARROW_TYPE_TO_TF_TYPE[value_type])
    test_cases.append(dict(
        testcase_name="default_filled_dense_from_list_array_{}".format(
            value_type),
        tensor_representation_textpb=tensor_representation_textpb,
        arrow_array=arrow_array,
        expected_output=expected_output,
        expected_type_spec=expected_type_spec))
  return test_cases
Example #15
Source File: tensor_adapter_test.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def test2DSparseTensor(self):
  """Converts a values column and two index columns to a sparse tensor."""
  tensor_representation = text_format.Parse(
      """
        sparse_tensor {
          value_column_name: "values"
          index_column_names: ["d0", "d1"]
          dense_shape {
            dim {
              size: 10
            }
            dim {
              size: 20
            }
          }
        }
        """, schema_pb2.TensorRepresentation())

  values_column = pa.array([[1], None, [2], [3, 4, 5], []],
                           type=pa.list_(pa.int64()))
  # Also test that the index column can be of an integral type other
  # than int64.
  d0_column = pa.array([[9], None, [9], [7, 8, 9], []],
                       type=pa.list_(pa.uint32()))
  d1_column = pa.array([[0], None, [0], [0, 1, 2], []],
                       type=pa.list_(pa.int64()))
  record_batch = pa.RecordBatch.from_arrays(
      [values_column, d0_column, d1_column], ["values", "d0", "d1"])

  adapter_config = tensor_adapter.TensorAdapterConfig(
      record_batch.schema, {"output": tensor_representation})
  adapter = tensor_adapter.TensorAdapter(adapter_config)

  converted = adapter.ToBatchTensors(record_batch)
  self.assertLen(converted, 1)
  self.assertIn("output", converted)

  actual_output = converted["output"]
  accepted_types = (tf.SparseTensor, tf.compat.v1.SparseTensorValue)
  self.assertIsInstance(actual_output, accepted_types)

  expected_output = tf.compat.v1.SparseTensorValue(
      dense_shape=[5, 10, 20],
      indices=[[0, 9, 0], [2, 9, 0], [3, 7, 0], [3, 8, 1], [3, 9, 2]],
      values=tf.convert_to_tensor([1, 2, 3, 4, 5], dtype=tf.int64))
  self.assertSparseAllEqual(expected_output, actual_output)

  self.assertAdapterCanProduceNonEagerInEagerMode(adapter, record_batch)
Example #16
Source File: tensor_adapter_test.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def testRaiseOnUnsupportedTensorRepresentation(self):
  """A default-constructed TensorRepresentation must make the adapter raise."""
  # `assertRaisesRegex` replaces the deprecated `assertRaisesRegexp` alias,
  # which was removed from unittest in Python 3.12.
  with self.assertRaisesRegex(ValueError, "Unable to handle tensor"):
    tensor_adapter.TensorAdapter(
        tensor_adapter.TensorAdapterConfig(
            pa.schema([pa.field("a", pa.list_(pa.int64()))]),
            {"tensor": schema_pb2.TensorRepresentation()}))
Example #17
Source File: tensor_to_arrow_test.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def _make_2d_varlen_sparse_tensor_test_cases():
  """Returns one 2-D varlen sparse tensor test case per supported TF dtype."""
  test_cases = []
  for tf_type, arrow_type in _TF_TYPE_TO_ARROW_TYPE.items():
    list_type = pa.list_(arrow_type)
    # String tensors carry bytes values; every other dtype carries integers.
    if tf_type != tf.string:
      values = tf.constant([1, 2, 3], dtype=tf_type)
      expected_array = pa.array([[1], [], [2, 3], []], type=list_type)
    else:
      values = tf.constant([b"1", b"2", b"3"], dtype=tf.string)
      expected_array = pa.array([[b"1"], [], [b"2", b"3"], []],
                                type=list_type)

    sparse_input = tf.SparseTensor(
        values=values,
        indices=[[0, 0], [2, 0], [2, 1]],
        dense_shape=[4, 2])
    test_case = {
        "testcase_name": "2d_varlen_sparse_tensor_%s" % tf_type.name,
        "type_specs": {"sp": tf.SparseTensorSpec([None, None], tf_type)},
        "expected_schema": {"sp": pa.list_(arrow_type)},
        "expected_tensor_representations": {
            "sp": """varlen_sparse_tensor { column_name: "sp" }""",
        },
        "tensor_input": {"sp": sparse_input},
        "expected_record_batch": {"sp": expected_array},
    }
    test_cases.append(test_case)
  return test_cases
Example #18
Source File: raw_tf_record_test.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def testRecordBatchAndTensorAdapter(self):
  """Checks the schema, record batches, tensors and telemetry of the TFXIO."""
  column_name = "raw_record"
  telemetry_descriptors = ["some", "component"]
  tfxio = raw_tf_record.RawTfRecordTFXIO(
      self._raw_record_file, column_name,
      telemetry_descriptors=telemetry_descriptors)

  if _ProducesLargeTypes(tfxio):
    expected_type = pa.large_list(pa.large_binary())
  else:
    expected_type = pa.list_(pa.binary())

  got_schema = tfxio.ArrowSchema()
  expected_schema = pa.schema([pa.field(column_name, expected_type)])
  self.assertTrue(got_schema.equals(expected_schema),
                  "got: {}".format(got_schema))

  def _AssertFn(record_batches):
    # Exactly one batch is expected, holding each raw record in its own
    # single-element list.
    self.assertLen(record_batches, 1)
    record_batch = record_batches[0]
    self.assertTrue(record_batch.schema.equals(tfxio.ArrowSchema()))
    expected_column = pa.array([[record] for record in _RAW_RECORDS],
                               type=expected_type)
    self.assertTrue(record_batch.columns[0].equals(expected_column))
    adapter = tfxio.TensorAdapter()
    tensors = adapter.ToBatchTensors(record_batch)
    self.assertLen(tensors, 1)
    self.assertIn(column_name, tensors)

  pipeline = beam.Pipeline()
  source = tfxio.BeamSource(batch_size=len(_RAW_RECORDS))
  record_batch_pcoll = pipeline | source
  beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
  pipeline_result = pipeline.run()
  pipeline_result.wait_until_finish()
  telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                      telemetry_descriptors, "bytes",
                                      "tfrecords_gzip")
Example #19
Source File: tensor_to_arrow.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def arrow_fields(self) -> List[pa.Field]:
  """Returns a one-element list with the list-typed field for this tensor."""
  value_type = _tf_dtype_to_arrow_type(self._type_spec.dtype)
  field = pa.field(self._tensor_name, pa.list_(value_type))
  return [field]
Example #20
Source File: csv_tfxio_test.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def _GetExpectedColumnValues(tfxio):
  """Returns the expected Arrow array for each feature name.

  Args:
    tfxio: object whose `_can_produce_large_types` attribute selects between
      the large and the regular Arrow list/binary types.

  Returns:
    A dict mapping feature name to the expected `pa.Array`.
  """
  large = tfxio._can_produce_large_types
  list_factory = pa.large_list if large else pa.list_
  binary_type = pa.large_binary() if large else pa.binary()

  expected = {}
  expected["int_feature"] = pa.array(
      [[1], [2]], type=list_factory(pa.int64()))
  expected["float_feature"] = pa.array(
      [[2.0], [3.0]], type=list_factory(pa.float32()))
  expected["string_feature"] = pa.array(
      [[b"abc"], [b"xyz"]], type=list_factory(binary_type))
  return expected
Example #21
Source File: record_based_tfxio.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def ArrowSchema(self) -> pa.Schema:
  """Returns the Arrow schema, plus the raw record column if one is set.

  Raises:
    ValueError: if the raw record column name is already a column of the
      schema.
  """
  schema = self._ArrowSchemaNoRawRecordColumn()
  raw_column_name = self._raw_record_column_name
  if raw_column_name is None:
    return schema

  if self._can_produce_large_types:
    column_type = pa.large_list(pa.large_binary())
  else:
    column_type = pa.list_(pa.binary())

  # get_field_index() yields -1 when no field has that name.
  name_is_taken = schema.get_field_index(raw_column_name) != -1
  if name_is_taken:
    raise ValueError(
        "Raw record column name {} collided with a column in the schema."
        .format(raw_column_name))
  return schema.append(pa.field(raw_column_name, column_type))
Example #22
Source File: sequence_example_coder_test.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def _test_decode(self, schema_text_proto, sequence_examples_text_proto,
                 create_expected, use_large_types):
  """Decodes serialized SequenceExamples and compares with the expectation.

  Args:
    schema_text_proto: text-format Schema, or None to decode without one.
    sequence_examples_text_proto: iterable of text-format
      tf.train.SequenceExample protos.
    create_expected: callable taking (list factory, binary type) and
      returning the expected record batch.
    use_large_types: forwarded to the decoder; also selects the large or the
      regular Arrow types used to build the expectation.
  """
  serialized_sequence_examples = []
  for pbtxt in sequence_examples_text_proto:
    sequence_example = text_format.Parse(pbtxt, tf.train.SequenceExample())
    serialized_sequence_examples.append(sequence_example.SerializeToString())

  if schema_text_proto is None:
    serialized_schema = None
  else:
    serialized_schema = text_format.Parse(
        schema_text_proto, schema_pb2.Schema()).SerializeToString()

  # The schema is passed positionally only when it is truthy.
  decoder_args = [_TEST_SEQUENCE_COLUMN_NAME]
  if serialized_schema:
    decoder_args.append(serialized_schema)
  coder = sequence_example_coder.SequenceExamplesToRecordBatchDecoder(
      *decoder_args, use_large_types=use_large_types)

  result = coder.DecodeBatch(serialized_sequence_examples)
  self.assertIsInstance(result, pa.RecordBatch)

  if use_large_types:
    list_factory, binary_type = pa.large_list, pa.large_binary()
  else:
    list_factory, binary_type = pa.list_, pa.binary()
  expected = create_expected(list_factory, binary_type)
  mismatch_message = "actual: {}\n expected:{}".format(result, expected)
  self.assertTrue(result.equals(expected), mismatch_message)

  if serialized_schema is not None:
    self.assertTrue(coder.ArrowSchema().equals(result.schema))
Example #23
Source File: csv_decoder.py    From tfx-bsl with Apache License 2.0 5 votes vote down vote up
def _GetFeatureTypeToArrowTypeMapping(
    large_types: bool) -> Dict[int, pa.DataType]:
  """Maps each `ColumnType` to the Arrow type used for it.

  Args:
    large_types: if true, use `large_list` / `large_binary` in place of
      `list_` / `binary`.

  Returns:
    A dict keyed by `ColumnType`; UNKNOWN maps to the Arrow null type.
  """
  list_factory = pa.large_list if large_types else pa.list_
  binary_type = pa.large_binary() if large_types else pa.binary()
  mapping = {ColumnType.UNKNOWN: pa.null()}
  mapping[ColumnType.INT] = list_factory(pa.int64())
  mapping[ColumnType.FLOAT] = list_factory(pa.float32())
  mapping[ColumnType.STRING] = list_factory(binary_type)
  return mapping
Example #24
Source File: impl_use_tfxio_test.py    From transform with Apache License 2.0 5 votes vote down vote up
def testPassthroughKeys(self):
  """The passthrough column must skip preprocessing_fn but reach the output."""
  passthrough_key = '__passthrough__'
  x_data = [0., 1., 2.]
  passthrough_data = [1, None, 3]

  def preprocessing_fn(inputs):
    # The passthrough column must not be visible to the preprocessing fn.
    self.assertNotIn(passthrough_key, inputs)
    return {'x_scaled': tft.scale_to_0_1(inputs['x'])}

  x_column = pa.array([[x] for x in x_data], type=pa.list_(pa.float32()))
  passthrough_column = pa.array(
      [[p] if p is not None else None for p in passthrough_data],
      type=pa.list_(pa.int64()))
  input_record_batch = pa.RecordBatch.from_arrays(
      [x_column, passthrough_column], ['x', passthrough_key])

  x_representation = text_format.Parse(
      'dense_tensor { column_name: "x" shape {} }',
      schema_pb2.TensorRepresentation())
  tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
      input_record_batch.schema, {'x': x_representation})

  expected_data = []
  for x, p in zip(x_data, passthrough_data):
    expected_data.append({'x_scaled': x / 2.0, passthrough_key: p})

  with self._makeTestPipeline() as pipeline:
    input_data = pipeline | beam.Create([input_record_batch])
    with beam_impl.Context(
        temp_dir=self.get_temp_dir(),
        passthrough_keys={passthrough_key}):
      (transformed_data, _), _ = (
          (input_data, tensor_adapter_config)
          | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

      def _assert_fn(output_data):
        self.assertCountEqual(expected_data, output_data)

      beam_test_util.assert_that(transformed_data, _assert_fn)
Example #25
Source File: arrow_util_test.py    From data-validation with Apache License 2.0 5 votes vote down vote up
def testIsListLike(self):
  """is_list_like accepts list / large_list types and rejects the others."""
  list_types = [pa.list_(pa.int64()), pa.large_list(pa.int64())]
  for arrow_type in list_types:
    self.assertTrue(arrow_util.is_list_like(arrow_type))

  non_list_types = [pa.binary(), pa.int64(), pa.large_string()]
  for arrow_type in non_list_types:
    self.assertFalse(arrow_util.is_list_like(arrow_type))
Example #26
Source File: arrow_util_test.py    From data-validation with Apache License 2.0 5 votes vote down vote up
def testIsBinaryLike(self):
  """is_binary_like accepts binary / string types and rejects lists of them."""
  binary_like_types = [
      pa.binary(), pa.large_binary(), pa.string(), pa.large_string()
  ]
  for arrow_type in binary_like_types:
    self.assertTrue(arrow_util.is_binary_like(arrow_type))

  list_types = [pa.list_(pa.binary()), pa.large_list(pa.string())]
  for arrow_type in list_types:
    self.assertFalse(arrow_util.is_binary_like(arrow_type))
Example #27
Source File: basic_stats_generator_test.py    From data-validation with Apache License 2.0 5 votes vote down vote up
def test_basic_stats_generator_invalid_value_numpy_dtype(self):
  """A list<date32> column must be rejected with a TypeError."""
  date_column = pa.array([[]], type=pa.list_(pa.date32()))
  batches = [pa.RecordBatch.from_arrays([date_column], ['a'])]
  generator = basic_stats_generator.BasicStatsGenerator()
  with self.assertRaisesRegex(  # pylint: disable=g-error-prone-assert-raises
      TypeError, 'Feature a has unsupported arrow type'):
    self.assertCombinerOutputEqual(batches, generator, None)
Example #28
Source File: top_k_uniques_combiner_stats_generator_test.py    From data-validation with Apache License 2.0 5 votes vote down vote up
def test_topk_uniques_combiner_zero_row(self):
  """A batch with zero rows must produce an empty result."""
  empty_column = pa.array([], type=pa.list_(pa.binary()))
  batches = [pa.RecordBatch.from_arrays([empty_column], ['f1'])]
  generator_cls = (
      top_k_uniques_combiner_stats_generator.TopKUniquesCombinerStatsGenerator)
  generator = generator_cls(num_top_values=4, num_rank_histogram_buckets=3)
  expected_result = {}
  self.assertCombinerOutputEqual(batches, generator, expected_result)
Example #29
Source File: lift_stats_generator_test.py    From data-validation with Apache License 2.0 5 votes vote down vote up
def test_lift_missing_x_and_y(self):
  """Zero-row x and y columns must produce an empty result list."""
  # explicitly construct type to avoid treating as null type
  empty_type = pa.list_(pa.binary())
  empty_x = pa.array([], type=empty_type)
  empty_y = pa.array([], type=empty_type)
  examples = [
      pa.RecordBatch.from_arrays([empty_x, empty_y],
                                 ['categorical_x', 'string_y']),
  ]

  schema = text_format.Parse(
      """
        feature {
          name: 'categorical_x'
          type: BYTES
        }
        feature {
          name: 'string_y'
          type: BYTES
        }
        """, schema_pb2.Schema())
  y_path = types.FeaturePath(['string_y'])
  generator = lift_stats_generator.LiftStatsGenerator(
      schema=schema, y_path=y_path)

  expected_result = []
  self.assertSlicingAwareTransformOutputEqual(
      examples,
      generator,
      expected_result,
      add_default_slice_key_to_input=True,
      add_default_slice_key_to_output=True)
Example #30
Source File: stats_impl_test.py    From data-validation with Apache License 2.0 5 votes vote down vote up
def test_filter_features(self):
  """_filter_features must keep only the requested column names."""
  all_names = ['a', 'b', 'c']
  kept_names = ['a', 'c']

  # Every column holds one empty int64 list.
  input_columns = [
      pa.array([[]], type=pa.list_(pa.int64())) for _ in all_names
  ]
  input_record_batch = pa.RecordBatch.from_arrays(input_columns, all_names)

  actual = stats_impl._filter_features(input_record_batch, ['a', 'c'])

  expected_columns = [
      pa.array([[]], type=pa.list_(pa.int64())) for _ in kept_names
  ]
  expected = pa.RecordBatch.from_arrays(expected_columns, kept_names)

  # Only the sets of column names are compared.
  self.assertEqual(set(actual.schema.names), set(expected.schema.names))