Python Examples of pyarrow.array

Source File: arrow_util_test.py From data-validation with Apache License 2.0

6 votes

def testGetArrayReturnExampleIndices(self):
    """Checks `get_array` output when `return_example_indices=True`.

    For the nested path f.sf.ssf the test expects the leaf lists
    [[1], [2], [3, 4]] together with the indices [0, 0, 1], i.e. the first two
    leaves come from row 0 of the batch and the last one from row 1.
    """
    nested_column = pa.array([
        [{"sf": [{"ssf": [1]}, {"ssf": [2]}]}],
        [{"sf": [{"ssf": [3, 4]}]}],
    ])
    word_column = pa.array([["one"], ["two"]])
    record_batch = pa.RecordBatch.from_arrays(
        [nested_column, word_column], ["f", "w"])
    feature = types.FeaturePath(["f", "sf", "ssf"])

    actual_arr, actual_indices = arrow_util.get_array(
        record_batch, feature, return_example_indices=True)

    expected_arr = pa.array([[1], [2], [3, 4]])
    expected_indices = np.array([0, 0, 1])

    arrays_match = actual_arr.equals(expected_arr)
    failure_message = "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
        feature, expected_arr, actual_arr)
    self.assertTrue(arrays_match, failure_message)
    np.testing.assert_array_equal(expected_indices, actual_indices)

Source File: slicing_util_test.py From data-validation with Apache License 2.0

6 votes

def test_get_feature_value_slicer_bytes_feature_valid_utf8(self):
    """Checks slicing on a bytes feature whose values are valid UTF-8.

    Slicing on feature 'b' is expected to yield one (slice key, record batch)
    pair per row, keyed u'b_dog' and u'b_cat'.
    """
    features = {'b': None}
    column_names = ['a', 'b']
    input_record_batch = pa.RecordBatch.from_arrays(
        [pa.array([[1], [2, 1]]), pa.array([[b'dog'], [b'cat']])],
        column_names)

    dog_slice = pa.RecordBatch.from_arrays(
        [pa.array([[1]]), pa.array([[b'dog']])], column_names)
    cat_slice = pa.RecordBatch.from_arrays(
        [pa.array([[2, 1]]), pa.array([[b'cat']])], column_names)
    expected_result = [(u'b_dog', dog_slice), (u'b_cat', cat_slice)]

    slicer = slicing_util.get_feature_value_slicer(features)
    self._check_results(slicer(input_record_batch), expected_result)

Source File: time_stats_generator_test.py From data-validation with Apache License 2.0

6 votes

def test_time_stats_generator_integer_formats(self):
    """Tests that the generator handles integer formats."""
    # Three of the values are within the valid range for Unix seconds, one is
    # within the valid range for Unix milliseconds, and the remaining two are
    # outside the valid range of every integer time format.
    raw_batches = (
        [631152001, 631152002],
        [631152003, 631152000001],
        [1, 2],
    )
    input_batches = [pa.array([values]) for values in raw_batches]
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.1, values_threshold=1)
    # The expected domain_info text hard-codes `integer_format: 1`, so pin the
    # enum value it is meant to stand for.
    assert schema_pb2.TimeDomain.UNIX_SECONDS == 1
    domain_info = statistics_pb2.CustomStatistic(
        name='domain_info',
        str=('time_domain {integer_format: 1}'))
    match_ratio = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=0.5)
    expected = statistics_pb2.FeatureNameStatistics(
        custom_stats=[domain_info, match_ratio])
    self.assertCombinerOutputEqual(input_batches, generator, expected)

Source File: time_stats_generator_test.py From data-validation with Apache License 2.0

6 votes

def test_time_stats_generator_combined_string_formats(self):
    """Tests that the generator handles combined string formats."""
    # The combined date+time format should win: the generator is expected to
    # count such a value only as the combined format, not as its date and time
    # component parts.
    raw_batches = (
        ['2018/11/30 23:59', '2018/12/01 23:59'],
        ['2018/11/30 23:59', '23:59'],
        ['2018/11/30', '2018/11/30'],
    )
    input_batches = [pa.array([values]) for values in raw_batches]
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.1, values_threshold=1)
    domain_info = statistics_pb2.CustomStatistic(
        name='domain_info',
        str="time_domain {string_format: '%Y/%m/%d %H:%M'}")
    match_ratio = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=0.5)
    expected = statistics_pb2.FeatureNameStatistics(
        custom_stats=[domain_info, match_ratio])
    self.assertCombinerOutputEqual(input_batches, generator, expected)

Source File: stats_api_test.py From data-validation with Apache License 2.0

6 votes

def test_stats_pipeline_with_sample_rate(self):
    """Runs GenerateStatistics with sample_rate=1.0 over a single batch.

    The pipeline output is compared against
    `self._sampling_test_expected_result`.
    """
    values = np.linspace(1, 3000, 3000, dtype=np.int32)
    record_batches = [
        pa.RecordBatch.from_arrays([pa.array([values])], ['c']),
    ]

    with beam.Pipeline() as pipeline:
        options = stats_options.StatsOptions(
            sample_rate=1.0,
            num_top_values=2,
            num_rank_histogram_buckets=2,
            num_values_histogram_buckets=2,
            num_histogram_buckets=2,
            num_quantiles_histogram_buckets=2,
            epsilon=0.001)
        batches_pcoll = pipeline | beam.Create(record_batches)
        result = batches_pcoll | stats_api.GenerateStatistics(options)
        matcher = test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, self._sampling_test_expected_result)
        util.assert_that(result, matcher)

Source File: basic_stats_generator_test.py From data-validation with Apache License 2.0

6 votes

def test_basic_stats_generator_no_runtime_warnings_close_to_max_int(self):
    """Checks that accumulating near-max int64 values triggers no overflow.

    NumPy's overflow handling is switched to 'raise' while the accumulators
    are built and merged, so any overflow NumPy reports surfaces as an
    exception and fails the test.
    """
    # input has batches with values that are slightly smaller than the maximum
    # integer value.
    less_than_max_int_value = np.iinfo(np.int64).max - 1
    batches = ([
        pa.RecordBatch.from_arrays([pa.array([[less_than_max_int_value]])],
                                   ['a'])
    ] * 2)
    generator = basic_stats_generator.BasicStatsGenerator()
    # np.errstate restores the previous error settings on exit even when the
    # body raises, so a failure here cannot leak over='raise' into other tests.
    with np.errstate(over='raise'):
        accumulators = [
            generator.add_input(generator.create_accumulator(), batch)
            for batch in batches
        ]
        generator.merge_accumulators(accumulators)

Source File: time_stats_generator_test.py From data-validation with Apache License 2.0

6 votes

def test_time_stats_generator_match_ratio_with_same_valid_format(self):
    """Tests match ratio where all valid values have the same format."""
    valid = '2018-11-30'
    invalid = 'not-valid'
    # Five valid and five invalid values in total.
    input_batches = [
        pa.array([[valid] * 3, [valid] * 2]),
        pa.array([[invalid] * 3, [invalid] * 2]),
    ]

    # A match_ratio of 0.51 is above the observed ratio: no stats expected.
    strict_generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.51, values_threshold=5)
    no_stats = statistics_pb2.FeatureNameStatistics()
    self.assertCombinerOutputEqual(input_batches, strict_generator, no_stats)

    # A match_ratio of 0.49 is below the observed ratio: stats expected.
    lenient_generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.49, values_threshold=5)
    domain_info = statistics_pb2.CustomStatistic(
        name='domain_info',
        str="time_domain {string_format: '%Y-%m-%d'}")
    match_ratio = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=0.50)
    with_stats = statistics_pb2.FeatureNameStatistics(
        custom_stats=[domain_info, match_ratio])
    self.assertCombinerOutputEqual(
        input_batches, lenient_generator, with_stats)

Source File: time_stats_generator_test.py From data-validation with Apache License 2.0

6 votes

def test_time_stats_generator_no_values_exits_add_input_early(
        self, mock_update):
    """Checks that `add_input` skips the update for batches without values.

    `mock_update` is a mock supplied by the caller (presumably via a patch
    decorator outside this block); only its `called` flag is inspected.
    """
    generator = time_stats_generator.TimeStatsGenerator()
    accumulator = generator.create_accumulator()

    def feed(values):
        # Build the batch first, then hand it over under an empty-step path.
        batch = pa.array(values)
        generator.add_input(accumulator, types.FeaturePath(['']), batch)

    # A batch whose only values list is None must not update the accumulator.
    feed([None])
    self.assertFalse(mock_update.called)

    # Neither must an empty batch.
    feed([])
    self.assertFalse(mock_update.called)

    # A non-empty batch does update the accumulator.
    feed([['2018-11-30']])
    self.assertTrue(mock_update.called)

Source File: time_stats_generator_test.py From data-validation with Apache License 2.0

6 votes

def test_time_stats_generator_invalidated_exits_add_input_early(
        self, mock_update):
    """Checks that `add_input` skips the update for invalidated accumulators.

    `mock_update` is a mock supplied by the caller (presumably via a patch
    decorator outside this block); only its `called` flag is inspected.
    """
    input_batch = pa.array([['2018-11-30']])
    generator = time_stats_generator.TimeStatsGenerator()
    accumulator = generator.create_accumulator()

    def add_batch():
        generator.add_input(
            accumulator, types.FeaturePath(['']), input_batch)

    # While the accumulator is flagged as invalidated, adding a batch must not
    # trigger an update.
    accumulator.invalidated = True
    add_batch()
    self.assertFalse(mock_update.called)

    # Once the flag is cleared, adding a batch triggers the update.
    accumulator.invalidated = False
    add_batch()
    self.assertTrue(mock_update.called)

Source File: time_stats_generator_test.py From data-validation with Apache License 2.0

6 votes

def test_time_stats_generator_values_threshold_check(self):
    """Tests generator values threshold."""
    date = '2018-11-30'
    # Six values in total, all matching the same format; the last batch holds
    # only nulls.
    input_batches = [
        pa.array([[date, date, date], [date]]),
        pa.array([[date, date]]),
        pa.array([None, None]),
    ]

    # values_threshold=7 exceeds the six matches: no stats expected.
    above_threshold = time_stats_generator.TimeStatsGenerator(
        values_threshold=7)
    no_stats = statistics_pb2.FeatureNameStatistics()
    self.assertCombinerOutputEqual(input_batches, above_threshold, no_stats)

    # values_threshold=6 is met by the six matches: stats expected.
    at_threshold = time_stats_generator.TimeStatsGenerator(values_threshold=6)
    domain_info = statistics_pb2.CustomStatistic(
        name='domain_info',
        str="time_domain {string_format: '%Y-%m-%d'}")
    match_ratio = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=1.0)
    with_stats = statistics_pb2.FeatureNameStatistics(
        custom_stats=[domain_info, match_ratio])
    self.assertCombinerOutputEqual(input_batches, at_threshold, with_stats)

Source File: image_stats_generator_test.py From data-validation with Apache License 2.0

5 votes

def test_image_stats_generator_with_missing_feature(self):
    """Test with missing values for a batch."""
    # First batch is empty; the second carries a single fake-encoded image.
    # NOTE(review): the expected stats (max height 10, max width 1) suggest the
    # encode_image_metadata arguments are (format, height, width) - confirm.
    encoded_image = FakeImageDecoder.encode_image_metadata('JPEG', 10, 1)
    batches = [pa.array([]), pa.array([[encoded_image]])]
    expected_text = """
            custom_stats {
              name: 'domain_info'
              str: 'image_domain {}'
            }
            custom_stats {
              name: 'image_format_histogram'
              rank_histogram {
                buckets {
                  label: 'JPEG'
                  sample_count: 1
                }
              }
            }
            custom_stats {
              name: 'image_max_width'
              num: 1.0
            }
            custom_stats {
              name: 'image_max_height'
              num: 10.0
            }"""
    expected_result = text_format.Parse(
        expected_text, statistics_pb2.FeatureNameStatistics())
    generator = image_stats_generator.ImageStatsGenerator(
        image_decoder=FakeImageDecoder(),
        values_threshold=1,
        enable_size_stats=True)
    self.assertCombinerOutputEqual(batches, generator, expected_result)

Source File: cross_feature_stats_generator_test.py From data-validation with Apache License 2.0

5 votes

def test_cross_feature_stats_generator_string_feature(self):
    """Checks that a string feature paired with a float feature yields {}.

    Both expected-result arguments passed to the combiner assertion are empty
    dicts.
    """
    generator = cross_feature_stats_generator.CrossFeatureStatsGenerator(
        sample_rate=1.0)
    names = ['a', 'b']
    first_batch = pa.RecordBatch.from_arrays(
        [pa.array([['x'], ['y']]), pa.array([[2.0], [4.0]])], names)
    second_batch = pa.RecordBatch.from_arrays(
        [pa.array([['a'], ['b']]), pa.array([[14.0], [16.0]])], names)
    self.assertCombinerOutputEqual(
        [first_batch, second_batch], generator, {}, {})

Source File: sklearn_mutual_information_test.py From data-validation with Apache License 2.0

5 votes

def test_mi_with_missing_label_key(self):
    """Checks that a label path absent from the schema raises ValueError.

    The schema declares features "fa" and "label", while the requested label
    path is "label_key".
    """
    batch = pa.RecordBatch.from_arrays(
        [pa.array([[1]]), pa.array([[1]])], ["label", "fa"])

    schema = text_format.Parse(
        """
          feature {
            name: "fa"
            type: FLOAT
              shape {
              dim {
                size: 1
              }
            }
          }
          feature {
            name: "label"
            type: FLOAT
            shape {
              dim {
                size: 1
              }
            }
          }
          """, schema_pb2.Schema())

    # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
    # assertRaisesRegex is the supported spelling.
    with self.assertRaisesRegex(ValueError,
                                "Feature label_key not found in the schema."):
        sklearn_mutual_information.SkLearnMutualInformation(
            types.FeaturePath(["label_key"]), schema, TEST_SEED).compute(batch)

Source File: count_missing_generator_test.py From data-validation with Apache License 2.0

5 votes

def test_count_missing_generator_required_path(self):
    """Checks CountMissingGenerator with a required path reports 0 here.

    Both columns hold the same rows ([1], None, []).
    NOTE(review): presumably rows where the required path is missing too are
    not counted - confirm against the generator.
    """
    rows = [[1], None, []]
    record_batch = pa.RecordBatch.from_arrays(
        [pa.array(rows), pa.array(rows)], ['index', 'value'])
    batch = input_batch.InputBatch(record_batch)
    path = types.FeaturePath(['index'])
    required_path = types.FeaturePath(['value'])
    generator = count_missing_generator.CountMissingGenerator(
        path, [required_path])
    accumulator = generator.add_input(generator.create_accumulator(), batch)
    self.assertEqual(0, generator.extract_output(accumulator))

Source File: count_missing_generator_test.py From data-validation with Apache License 2.0

5 votes

def test_count_missing_generator_single_batch(self):
    """Checks that one batch with rows ([1], None, []) yields a count of 1."""
    feature_column = pa.array([[1], None, []])
    record_batch = pa.RecordBatch.from_arrays([feature_column], ['feature'])
    batch = input_batch.InputBatch(record_batch)
    generator = count_missing_generator.CountMissingGenerator(
        types.FeaturePath(['feature']))
    accumulator = generator.add_input(generator.create_accumulator(), batch)
    self.assertEqual(1, generator.extract_output(accumulator))

Source File: time_stats_generator_test.py From data-validation with Apache License 2.0

5 votes

def test_time_stats_generator_non_time_integers(self):
    """Tests that the generator handles integers that are not times."""
    # Neither 1 nor 2 is a valid time, so empty statistics are expected.
    input_batches = [pa.array([[1, 2]])]
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.1, values_threshold=1)
    no_stats = statistics_pb2.FeatureNameStatistics()
    self.assertCombinerOutputEqual(input_batches, generator, no_stats)

Source File: time_stats_generator_test.py From data-validation with Apache License 2.0

5 votes

def test_time_stats_generator_match_ratio_with_different_valid_formats(self):
    """Tests match ratio where valid values have different formats."""
    first_row = ['2018-11-30', '2018/11/30', '20181130', '18-11-30',
                 '18/11/30']
    second_row = ['11-30-2018', '11/30/2018', '11302018', '11/30/18',
                  '11/30/18']
    input_batches = [pa.array([first_row, second_row])]

    # Every single format would satisfy a match_ratio of 0.05, but only the
    # most common one should be reported as the time format.
    lenient_generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.05, values_threshold=1)
    domain_info = statistics_pb2.CustomStatistic(
        name='domain_info',
        str="time_domain {string_format: '%m/%d/%y'}")
    match_ratio = statistics_pb2.CustomStatistic(
        name='time_match_ratio', num=0.2)
    with_stats = statistics_pb2.FeatureNameStatistics(
        custom_stats=[domain_info, match_ratio])
    self.assertCombinerOutputEqual(
        input_batches, lenient_generator, with_stats)

    # With match_ratio=0.3 no single valid format qualifies, so no stats
    # should be created.
    strict_generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.3, values_threshold=1)
    no_stats = statistics_pb2.FeatureNameStatistics()
    self.assertCombinerOutputEqual(input_batches, strict_generator, no_stats)

Source File: time_stats_generator_test.py From data-validation with Apache License 2.0

5 votes

def test_time_stats_generator_inconsistent_type_invalidation_check(self):
    """Tests that generator invalidates stats if inconsistent types are used."""
    date = '2018-11-30'
    # Without invalidation the string batches would give 6 matches.
    input_batches = [
        pa.array([[date, date, date], [date]]),
        pa.array([[date, date]]),
        pa.array([[1.0]]),
    ]
    # The float value 1.0 has the wrong type and should invalidate the stats,
    # so no domain_info is expected even though the string examples alone
    # would satisfy the match_ratio and values_threshold used here.
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.5, values_threshold=1)
    no_stats = statistics_pb2.FeatureNameStatistics()
    self.assertCombinerOutputEqual(input_batches, generator, no_stats)

Source File: sklearn_mutual_information_test.py From data-validation with Apache License 2.0

5 votes

def test_mi_with_multivalent_label(self):
    """Checks that a label column holding two values in a row raises ValueError.

    The "label_key" column contains the row [1, 2] and its schema entry allows
    a value count between 1 and 2.
    """
    batch = pa.RecordBatch.from_arrays(
        [pa.array([[1, 2]]), pa.array([[1]])], ["label_key", "fa"])
    schema = text_format.Parse(
        """
          feature {
            name: "fa"
            type: FLOAT
            shape {
              dim {
                size: 1
              }
            }
          }
          feature {
            name: "label_key"
            type: FLOAT
            value_count: {
              min: 1
              max: 2
            }
          }
          """, schema_pb2.Schema())

    # assertRaisesRegexp is a deprecated alias that was removed in Python 3.12;
    # assertRaisesRegex is the supported spelling.
    with self.assertRaisesRegex(ValueError,
                                "Label column contains unsupported data."):
        sklearn_mutual_information.SkLearnMutualInformation(
            types.FeaturePath(["label_key"]), schema, TEST_SEED).compute(batch)

Source File: basic_stats_generator_test.py From data-validation with Apache License 2.0

5 votes

def test_basic_stats_generator_invalid_value_numpy_dtype(self):
    """Checks that a list<date32> column is rejected with a TypeError."""
    date_column = pa.array([[]], type=pa.list_(pa.date32()))
    batches = [pa.RecordBatch.from_arrays([date_column], ['a'])]
    generator = basic_stats_generator.BasicStatsGenerator()
    expected_error = 'Feature a has unsupported arrow type'
    with self.assertRaisesRegex(  # pylint: disable=g-error-prone-assert-raises
            TypeError, expected_error):
        self.assertCombinerOutputEqual(batches, generator, None)

Source File: image_stats_generator_test.py From data-validation with Apache License 2.0

5 votes

def get_sizes(self, value_list):
    """Returns an array of [height, width] pairs decoded from JSON values.

    Args:
      value_list: Iterable of JSON strings, each an object with 'height' and
        'width' keys.

    Returns:
      A numpy array with one [height, width] row per input value.
    """
    # Decode everything first so that a malformed value is reported before any
    # key lookup happens.
    decoded = list(map(json.loads, value_list))
    rows = []
    for metadata in decoded:
        rows.append([metadata['height'], metadata['width']])
    return np.array(rows)

Source File: image_stats_generator_test.py From data-validation with Apache License 2.0

5 votes

def get_formats(self, value_list):
    """Returns an object array with the 'format' entry of each JSON value.

    Args:
      value_list: Iterable of JSON strings, each an object with a 'format'
        key.

    Returns:
      A numpy array of dtype object holding one format per input value.
    """
    # The `np.object` alias was removed in NumPy 1.24; the builtin `object`
    # selects the same dtype on every NumPy version.
    return np.array([json.loads(value)['format'] for value in value_list],
                    dtype=object)

Source File: basic_stats_generator_test.py From data-validation with Apache License 2.0

5 votes

def test_basic_stats_generator_different_nest_levels(self):
    """Checks that batches with differing nest levels fail with ValueError."""
    singly_nested = pa.RecordBatch.from_arrays([pa.array([[1]])], ['a'])
    doubly_nested = pa.RecordBatch.from_arrays([pa.array([[[1]]])], ['a'])
    batches = [singly_nested, doubly_nested]
    generator = basic_stats_generator.BasicStatsGenerator()
    expected_error = 'Unable to merge common stats with different nest levels'
    with self.assertRaisesRegex(ValueError, expected_error):
        self.assertCombinerOutputEqual(batches, generator, None)

Source File: cross_feature_stats_generator_test.py From data-validation with Apache License 2.0

5 votes

def test_cross_feature_stats_generator_single_feature(self):
    """Checks that a batch with a single feature yields empty results."""
    generator = cross_feature_stats_generator.CrossFeatureStatsGenerator(
        sample_rate=1.0)
    only_column = pa.array([[1.0], [3.0]])
    batches = [pa.RecordBatch.from_arrays([only_column], ['a'])]
    self.assertCombinerOutputEqual(batches, generator, {}, {})

Source File: cross_feature_stats_generator_test.py From data-validation with Apache License 2.0

5 votes

def test_cross_feature_stats_generator_with_crosses_specified(self):
    """Checks cross stats for the explicitly requested ('a','c'), ('b','c')."""
    generator = cross_feature_stats_generator.CrossFeatureStatsGenerator(
        feature_crosses=[('a', 'c'), ('b', 'c')], sample_rate=1.0)
    names = ['a', 'b', 'c']
    first_batch = pa.RecordBatch.from_arrays([
        pa.array([[1.0], [3.0], [5.0]]),
        pa.array([[2.0], [4.0], [6.0]]),
        pa.array([[5.0], [3.0], [7.0]]),
    ], names)
    second_batch = pa.RecordBatch.from_arrays([
        pa.array([[6.0], [10.0]]),
        pa.array([[14.0], [16.0]]),
        pa.array([[-1.0], [0]]),
    ], names)
    batches = [first_batch, second_batch]

    a_c_text = """
            path_x { step: "a" }
            path_y { step: "c" }
            count: 5
            num_cross_stats {
              correlation: -0.59476602
              covariance: -5.4000001
            }
            """
    b_c_text = """
            path_x { step: "b" }
            path_y { step: "c" }
            count: 5
            num_cross_stats {
              correlation: -0.81070298
              covariance: -13.52
            }
            """
    expected_result = {}
    expected_result[('a', 'c')] = text_format.Parse(
        a_c_text, statistics_pb2.CrossFeatureStatistics())
    expected_result[('b', 'c')] = text_format.Parse(
        b_c_text, statistics_pb2.CrossFeatureStatistics())
    self.assertCombinerOutputEqual(batches, generator, {}, expected_result)

Source File: input_batch_test.py From data-validation with Apache License 2.0

5 votes

def test_list_lengths_non_list(self):
    """Checks that `list_lengths` rejects a non-list column with ValueError."""
    flat_column = pa.array([1, None, 1])
    record_batch = pa.RecordBatch.from_arrays([flat_column], ['f1'])
    batch = input_batch.InputBatch(record_batch)
    expected_error = r'Can only compute list lengths on list arrays, found.*'
    with self.assertRaisesRegex(ValueError, expected_error):
        batch.list_lengths(types.FeaturePath(['f1']))

Source File: input_batch_test.py From data-validation with Apache License 2.0

5 votes

def test_all_null_mask_unequal_lengths(self):
    """Checks that `all_null_mask` rejects paths whose masks differ in size.

    The expected error message reports the mismatch as (1 != 2) for the path
    f2.sf1.
    """
    flat_column = pa.array([[1]])
    struct_column = pa.array([[{'sf1': [[1]]}, {'sf1': [[1]]}]])
    record_batch = pa.RecordBatch.from_arrays(
        [flat_column, struct_column], ['f1', 'f2'])
    batch = input_batch.InputBatch(record_batch)
    expected_error = r'.*null_mask\(f2.sf1\).size.*\(1 != 2\).*'
    with self.assertRaisesRegex(ValueError, expected_error):
        batch.all_null_mask(
            types.FeaturePath(['f1']), types.FeaturePath(['f2', 'sf1']))

Source File: input_batch_test.py From data-validation with Apache License 2.0

5 votes

def test_list_lengths_empty_array(self):
    """Checks that `list_lengths` of an empty column is an empty array."""
    record_batch = pa.RecordBatch.from_arrays([pa.array([])], ['f1'])
    batch = input_batch.InputBatch(record_batch)
    lengths = batch.list_lengths(types.FeaturePath(['f1']))
    np.testing.assert_array_equal(lengths, [])

Source File: input_batch_test.py From data-validation with Apache License 2.0

5 votes

def test_list_lengths(self):
    """Checks per-row list lengths; the null row is expected to count as 0."""
    list_column = pa.array([[1], None, [1, 2]])
    record_batch = pa.RecordBatch.from_arrays([list_column], ['f1'])
    batch = input_batch.InputBatch(record_batch)
    lengths = batch.list_lengths(types.FeaturePath(['f1']))
    np.testing.assert_array_equal(lengths, [1, 0, 2])

Source File: input_batch_test.py From data-validation with Apache License 2.0

5 votes

def test_all_null_mask_no_paths(self):
    """Checks that `all_null_mask` called without paths raises ValueError."""
    null_column = pa.array([None, None], type=pa.null())
    record_batch = pa.RecordBatch.from_arrays([null_column], ['f3'])
    batch = input_batch.InputBatch(record_batch)
    with self.assertRaisesRegex(ValueError, r'Paths cannot be empty.*'):
        batch.all_null_mask()

Python pyarrow.array() Examples