Python pyarrow.array() Examples

The following are 30 code examples of pyarrow.array(). You can go to the original project or source file by following the references above each example. You may also want to check out all available functions and classes of the pyarrow module.
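As a quick orientation before the project examples, here is a minimal, self-contained sketch of pyarrow.array() itself (the values are illustrative only): it converts a Python sequence or NumPy array into an Arrow array, inferring the Arrow type unless one is passed explicitly.

import pyarrow as pa

# Type inference: a list of lists (with None for a missing row) becomes a ListArray.
arr = pa.array([[1, 2], None, [3]])
print(arr.type)  # e.g. list<item: int64>

# The type can also be given explicitly instead of being inferred.
arr = pa.array([1.5, 2.5], type=pa.float64())
print(arr.type)  # double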
Example #1
Source File: arrow_util_test.py    From data-validation with Apache License 2.0
def testGetArrayReturnExampleIndices(self):
    record_batch = pa.RecordBatch.from_arrays([
        pa.array([[{
            "sf": [{
                "ssf": [1]
            }, {
                "ssf": [2]
            }]
        }], [{
            "sf": [{
                "ssf": [3, 4]
            }]
        }]]),
        pa.array([["one"], ["two"]])
    ], ["f", "w"])
    feature = types.FeaturePath(["f", "sf", "ssf"])
    actual_arr, actual_indices = arrow_util.get_array(
        record_batch, feature, return_example_indices=True)
    expected_arr = pa.array([[1], [2], [3, 4]])
    expected_indices = np.array([0, 0, 1])
    self.assertTrue(
        actual_arr.equals(expected_arr),
        "\nfeature: {};\nexpected:\n{};\nactual:\n{}".format(
            feature, expected_arr, actual_arr))
    np.testing.assert_array_equal(expected_indices, actual_indices) 
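Example #1 leans on pyarrow.array() inferring a list-of-struct type from nested Python lists and dicts. A rough standalone sketch of that inference (the field names here mirror the test but are otherwise arbitrary):

import pyarrow as pa

# A list of lists of dicts is inferred as a ListArray of StructArrays.
nested = pa.array([[{"sf": [{"ssf": [1]}]}]])
print(nested.type)
# Expected to be roughly: list<item: struct<sf: list<item: struct<ssf: list<item: int64>>>>>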
Example #2
Source File: slicing_util_test.py    From data-validation with Apache License 2.0
def test_get_feature_value_slicer_bytes_feature_valid_utf8(self):
    features = {'b': None}
    input_record_batch = pa.RecordBatch.from_arrays([
        pa.array([[1], [2, 1]]),
        pa.array([[b'dog'], [b'cat']]),
    ], ['a', 'b'])
    expected_result = [
        (u'b_dog',
         pa.RecordBatch.from_arrays(
             [pa.array([[1]]), pa.array([[b'dog']])], ['a', 'b'])
        ),
        (u'b_cat',
         pa.RecordBatch.from_arrays(
             [pa.array([[2, 1]]), pa.array([[b'cat']])], ['a', 'b'])
        ),
    ]
    self._check_results(
        slicing_util.get_feature_value_slicer(features)(input_record_batch),
        expected_result) 
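Many of these tests pair pyarrow.array() with pa.RecordBatch.from_arrays(), which zips a list of arrays with a matching list of column names. A small standalone sketch (column names are arbitrary):

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([[1], [2, 1]]), pa.array([[b'dog'], [b'cat']])], ['a', 'b'])
print(batch.num_rows, batch.schema.names)  # 2 ['a', 'b']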
Example #3
Source File: time_stats_generator_test.py    From data-validation with Apache License 2.0
def test_time_stats_generator_integer_formats(self):
    """Tests that the generator handles integer formats."""
    # Three of the values are within the valid range for Unix seconds, one is
    # within the valid range for Unix milliseconds, and the other two are not
    # within the valid range for any integer time format.
    input_batches = [
        pa.array([[631152001, 631152002]]),
        pa.array([[631152003, 631152000001]]),
        pa.array([[1, 2]])
    ]
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.1, values_threshold=1)
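    # The expected domain_info below hard-codes integer_format: 1, so check that
    # the UNIX_SECONDS enum value has not drifted from 1.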
    assert schema_pb2.TimeDomain.UNIX_SECONDS == 1
    self.assertCombinerOutputEqual(
        input_batches, generator,
        statistics_pb2.FeatureNameStatistics(custom_stats=[
            statistics_pb2.CustomStatistic(
                name='domain_info',
                str=('time_domain {integer_format: 1}')
            ),
            statistics_pb2.CustomStatistic(name='time_match_ratio', num=0.5),
        ])) 
Example #4
Source File: time_stats_generator_test.py    From data-validation with Apache License 2.0
def test_time_stats_generator_combined_string_formats(self):
    """Tests that the generator handles combined string formats."""
    # The combined format is the most common here, since the generator should
    # count it only as the combined format and not as its component parts.
    input_batches = [
        pa.array([['2018/11/30 23:59', '2018/12/01 23:59']]),
        pa.array([['2018/11/30 23:59', '23:59']]),
        pa.array([['2018/11/30', '2018/11/30']]),
    ]
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.1, values_threshold=1)
    self.assertCombinerOutputEqual(
        input_batches, generator,
        statistics_pb2.FeatureNameStatistics(custom_stats=[
            statistics_pb2.CustomStatistic(
                name='domain_info',
                str="time_domain {string_format: '%Y/%m/%d %H:%M'}"),
            statistics_pb2.CustomStatistic(name='time_match_ratio', num=0.5),
        ])) 
Example #5
Source File: stats_api_test.py    From data-validation with Apache License 2.0
def test_stats_pipeline_with_sample_rate(self):
    record_batches = [
        pa.RecordBatch.from_arrays(
            [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
    ]

    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(
          sample_rate=1.0,
          num_top_values=2,
          num_rank_histogram_buckets=2,
          num_values_histogram_buckets=2,
          num_histogram_buckets=2,
          num_quantiles_histogram_buckets=2,
          epsilon=0.001)
      result = (
          p | beam.Create(record_batches)
          | stats_api.GenerateStatistics(options))
      util.assert_that(
          result,
          test_util.make_dataset_feature_stats_list_proto_equal_fn(
              self, self._sampling_test_expected_result)) 
Example #6
Source File: basic_stats_generator_test.py    From data-validation with Apache License 2.0
def test_basic_stats_generator_no_runtime_warnings_close_to_max_int(self):
    # The input has batches with values that are slightly smaller than the
    # maximum int64 value.
    less_than_max_int_value = np.iinfo(np.int64).max - 1
    batches = ([
        pa.RecordBatch.from_arrays([pa.array([[less_than_max_int_value]])],
                                   ['a'])
    ] * 2)
    generator = basic_stats_generator.BasicStatsGenerator()
    old_nperr = np.geterr()
    np.seterr(over='raise')
    accumulators = [
        generator.add_input(generator.create_accumulator(), batch)
        for batch in batches
    ]
    generator.merge_accumulators(accumulators)
    np.seterr(**old_nperr) 
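The geterr/seterr dance in Example #6 makes NumPy raise on integer overflow instead of warning silently, then restores the previous error state. A minimal sketch of the same idea using the np.errstate context manager (illustrative values, not part of the test above):

import numpy as np

# Inside this block, an overflow raises FloatingPointError instead of warning.
with np.errstate(over='raise'):
    x = np.int64(np.iinfo(np.int64).max - 1)
    _ = x + np.int64(1)  # still within the int64 range, so nothing is raised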
Example #7
Source File: time_stats_generator_test.py    From data-validation with Apache License 2.0
def test_time_stats_generator_match_ratio_with_same_valid_format(self):
    """Tests match ratio where all valid values have the same format."""
    input_batches = [
        pa.array([['2018-11-30', '2018-11-30', '2018-11-30'],
                  ['2018-11-30', '2018-11-30']]),
        pa.array([['not-valid', 'not-valid', 'not-valid'],
                  ['not-valid', 'not-valid']]),
    ]
    # Try generator with match_ratio 0.51 (should not create stats).
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.51, values_threshold=5)
    self.assertCombinerOutputEqual(input_batches, generator,
                                   statistics_pb2.FeatureNameStatistics())
    # Try generator with match_ratio 0.49 (should create stats).
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.49, values_threshold=5)
    self.assertCombinerOutputEqual(
        input_batches, generator,
        statistics_pb2.FeatureNameStatistics(custom_stats=[
            statistics_pb2.CustomStatistic(
                name='domain_info',
                str="time_domain {string_format: '%Y-%m-%d'}"),
            statistics_pb2.CustomStatistic(name='time_match_ratio', num=0.50),
        ])) 
Example #8
Source File: time_stats_generator_test.py    From data-validation with Apache License 2.0
def test_time_stats_generator_no_values_exits_add_input_early(
      self, mock_update):
    generator = time_stats_generator.TimeStatsGenerator()
    accumulator = generator.create_accumulator()

    # The accumulator is not updated when the values list in an input batch is
    # None.
    input_batch = pa.array([None])
    generator.add_input(accumulator, types.FeaturePath(['']), input_batch)
    self.assertFalse(mock_update.called)

    # The accumulator is not updated when the values list in an input batch is
    # empty.
    input_batch = pa.array([])
    generator.add_input(accumulator, types.FeaturePath(['']), input_batch)
    self.assertFalse(mock_update.called)

    # The accumulator is updated when a non-empty input_batch is added.
    input_batch = pa.array([['2018-11-30']])
    generator.add_input(accumulator, types.FeaturePath(['']), input_batch)
    self.assertTrue(mock_update.called) 
Example #9
Source File: time_stats_generator_test.py    From data-validation with Apache License 2.0
def test_time_stats_generator_invalidated_exits_add_input_early(
      self, mock_update):
    input_batch = pa.array([['2018-11-30']])
    generator = time_stats_generator.TimeStatsGenerator()
    accumulator = generator.create_accumulator()

    # When an accumulator's invalidated flag is True, the accumulator is not
    # updated when an input batch is added.
    accumulator.invalidated = True
    generator.add_input(accumulator, types.FeaturePath(['']), input_batch)
    self.assertFalse(mock_update.called)

    # When an accumulator is not invalidated, it is updated when an input batch
    # is added.
    accumulator.invalidated = False
    generator.add_input(accumulator, types.FeaturePath(['']), input_batch)
    self.assertTrue(mock_update.called) 
Example #10
Source File: time_stats_generator_test.py    From data-validation with Apache License 2.0
def test_time_stats_generator_values_threshold_check(self):
    """Tests generator values threshold."""
    # Expected to give 6 matches with the same format.
    input_batches = [
        pa.array([['2018-11-30', '2018-11-30', '2018-11-30'], ['2018-11-30']]),
        pa.array([['2018-11-30', '2018-11-30']]),
        pa.array([None, None]),
    ]
    # Try generator with values_threshold=7 (should not create stats).
    generator = time_stats_generator.TimeStatsGenerator(values_threshold=7)
    self.assertCombinerOutputEqual(input_batches, generator,
                                   statistics_pb2.FeatureNameStatistics())

    # Try generator with values_threshold=6 (should create stats).
    generator = time_stats_generator.TimeStatsGenerator(values_threshold=6)
    self.assertCombinerOutputEqual(
        input_batches, generator,
        statistics_pb2.FeatureNameStatistics(custom_stats=[
            statistics_pb2.CustomStatistic(
                name='domain_info',
                str="time_domain {string_format: '%Y-%m-%d'}"),
            statistics_pb2.CustomStatistic(name='time_match_ratio', num=1.0),
        ])) 
Example #11
Source File: image_stats_generator_test.py    From data-validation with Apache License 2.0
def test_image_stats_generator_with_missing_feature(self):
    """Test with missing values for a batch."""
    batches = [
        pa.array([]),
        pa.array([[
            FakeImageDecoder.encode_image_metadata('JPEG', 10, 1),
        ]]),
    ]
    expected_result = text_format.Parse(
        """
            custom_stats {
              name: 'domain_info'
              str: 'image_domain {}'
            }
            custom_stats {
              name: 'image_format_histogram'
              rank_histogram {
                buckets {
                  label: 'JPEG'
                  sample_count: 1
                }
              }
            }
            custom_stats {
              name: 'image_max_width'
              num: 1.0
            }
            custom_stats {
              name: 'image_max_height'
              num: 10.0
            }""", statistics_pb2.FeatureNameStatistics())
    image_decoder = FakeImageDecoder()
    generator = image_stats_generator.ImageStatsGenerator(
        image_decoder=image_decoder,
        values_threshold=1,
        enable_size_stats=True)
    self.assertCombinerOutputEqual(batches, generator, expected_result) 
Example #12
Source File: cross_feature_stats_generator_test.py    From data-validation with Apache License 2.0
def test_cross_feature_stats_generator_string_feature(self):
    generator = cross_feature_stats_generator.CrossFeatureStatsGenerator(
        sample_rate=1.0)
    b1 = pa.RecordBatch.from_arrays(
        [pa.array([['x'], ['y']]),
         pa.array([[2.0], [4.0]])], ['a', 'b'])
    b2 = pa.RecordBatch.from_arrays(
        [pa.array([['a'], ['b']]),
         pa.array([[14.0], [16.0]])], ['a', 'b'])
    batches = [b1, b2]
    self.assertCombinerOutputEqual(batches, generator, {}, {}) 
Example #13
Source File: sklearn_mutual_information_test.py    From data-validation with Apache License 2.0
def test_mi_with_missing_label_key(self):
    batch = pa.RecordBatch.from_arrays(
        [pa.array([[1]]), pa.array([[1]])], ["label", "fa"])

    schema = text_format.Parse(
        """
          feature {
            name: "fa"
            type: FLOAT
            shape {
              dim {
                size: 1
              }
            }
          }
          feature {
            name: "label"
            type: FLOAT
            shape {
              dim {
                size: 1
              }
            }
          }
          """, schema_pb2.Schema())

    with self.assertRaisesRegexp(ValueError,
                                 "Feature label_key not found in the schema."):
      sklearn_mutual_information.SkLearnMutualInformation(
          types.FeaturePath(["label_key"]), schema, TEST_SEED).compute(batch) 
Example #14
Source File: count_missing_generator_test.py    From data-validation with Apache License 2.0
def test_count_missing_generator_required_path(self):
    batch = input_batch.InputBatch(
        pa.RecordBatch.from_arrays(
            [pa.array([[1], None, []]),
             pa.array([[1], None, []])], ['index', 'value']))
    path = types.FeaturePath(['index'])
    required_path = types.FeaturePath(['value'])
    generator = count_missing_generator.CountMissingGenerator(
        path, [required_path])
    accumulator = generator.create_accumulator()
    accumulator = generator.add_input(accumulator, batch)
    self.assertEqual(0, generator.extract_output(accumulator)) 
Example #15
Source File: count_missing_generator_test.py    From data-validation with Apache License 2.0
def test_count_missing_generator_single_batch(self):
    batch = input_batch.InputBatch(
        pa.RecordBatch.from_arrays([pa.array([[1], None, []])], ['feature']))
    path = types.FeaturePath(['feature'])
    generator = count_missing_generator.CountMissingGenerator(path)
    accumulator = generator.create_accumulator()
    accumulator = generator.add_input(accumulator, batch)
    self.assertEqual(1, generator.extract_output(accumulator)) 
Example #16
Source File: time_stats_generator_test.py    From data-validation with Apache License 2.0
def test_time_stats_generator_non_time_integers(self):
    """Tests that the generator handles integers that are not times."""
    # None of these numbers are valid times.
    input_batches = [
        pa.array([[1, 2]]),
    ]
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.1, values_threshold=1)
    self.assertCombinerOutputEqual(
        input_batches, generator, statistics_pb2.FeatureNameStatistics()) 
Example #17
Source File: time_stats_generator_test.py    From data-validation with Apache License 2.0
def test_time_stats_generator_match_ratio_with_different_valid_formats(self):
    """Tests match ratio where valid values have different formats."""
    input_batches = [
        pa.array(
            [['2018-11-30', '2018/11/30', '20181130', '18-11-30', '18/11/30'],
             ['11-30-2018', '11/30/2018', '11302018', '11/30/18', '11/30/18']]),
    ]
    # Any single format could satisfy the match_ratio, but this should identify
    # only the most common as the time format.
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.05, values_threshold=1)
    self.assertCombinerOutputEqual(
        input_batches, generator,
        statistics_pb2.FeatureNameStatistics(custom_stats=[
            statistics_pb2.CustomStatistic(
                name='domain_info',
                str="time_domain {string_format: '%m/%d/%y'}"),
            statistics_pb2.CustomStatistic(name='time_match_ratio', num=0.2),
        ]))

    # No single valid format satisfies the specified match_ratio, so this should
    # not create stats.
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.3, values_threshold=1)
    self.assertCombinerOutputEqual(input_batches, generator,
                                   statistics_pb2.FeatureNameStatistics()) 
Example #18
Source File: time_stats_generator_test.py    From data-validation with Apache License 2.0
def test_time_stats_generator_inconsistent_type_invalidation_check(self):
    """Tests that generator invalidates stats if inconsistent types are used."""
    # Absent invalidation, this is expected to give 6 matches.
    input_batches = [
        pa.array([['2018-11-30', '2018-11-30', '2018-11-30'], ['2018-11-30']]),
        pa.array([['2018-11-30', '2018-11-30']]),
        pa.array([[1.0]]),
    ]
    # No domain_info should be generated as the incorrect type of the 1.0 value
    # should invalidate the stats. Absent this type issue, these examples would
    # satisfy the specified match_ratio and values_threshold.
    generator = time_stats_generator.TimeStatsGenerator(
        match_ratio=0.5, values_threshold=1)
    self.assertCombinerOutputEqual(input_batches, generator,
                                   statistics_pb2.FeatureNameStatistics()) 
Example #19
Source File: sklearn_mutual_information_test.py    From data-validation with Apache License 2.0
def test_mi_with_multivalent_label(self):
    batch = pa.RecordBatch.from_arrays(
        [pa.array([[1, 2]]), pa.array([[1]])], ["label_key", "fa"])
    schema = text_format.Parse(
        """
          feature {
            name: "fa"
            type: FLOAT
            shape {
              dim {
                size: 1
              }
            }
          }
          feature {
            name: "label_key"
            type: FLOAT
            value_count: {
              min: 1
              max: 2
            }
          }
          """, schema_pb2.Schema())

    with self.assertRaisesRegexp(ValueError,
                                 "Label column contains unsupported data."):
      sklearn_mutual_information.SkLearnMutualInformation(
          types.FeaturePath(["label_key"]), schema, TEST_SEED).compute(batch) 
Example #20
Source File: basic_stats_generator_test.py    From data-validation with Apache License 2.0
def test_basic_stats_generator_invalid_value_numpy_dtype(self):
    batches = [pa.RecordBatch.from_arrays(
        [pa.array([[]], type=pa.list_(pa.date32()))], ['a'])]
    generator = basic_stats_generator.BasicStatsGenerator()
    with self.assertRaisesRegex(  # pylint: disable=g-error-prone-assert-raises
        TypeError, 'Feature a has unsupported arrow type'):
      self.assertCombinerOutputEqual(batches, generator, None) 
Example #21
Source File: image_stats_generator_test.py    From data-validation with Apache License 2.0
def get_sizes(self, value_list):
    loaded_metadata = [json.loads(value) for value in value_list]
    return np.array([[meta['height'], meta['width']]
                     for meta in loaded_metadata]) 
Example #22
Source File: image_stats_generator_test.py    From data-validation with Apache License 2.0
def get_formats(self, value_list):
    return np.array([json.loads(value)['format'] for value in value_list],
                    dtype=object)
Example #23
Source File: basic_stats_generator_test.py    From data-validation with Apache License 2.0
def test_basic_stats_generator_different_nest_levels(self):
    batches = [
        pa.RecordBatch.from_arrays([pa.array([[1]])], ['a']),
        pa.RecordBatch.from_arrays([pa.array([[[1]]])], ['a']),
    ]
    generator = basic_stats_generator.BasicStatsGenerator()
    with self.assertRaisesRegex(
        ValueError, 'Unable to merge common stats with different nest levels'):
      self.assertCombinerOutputEqual(batches, generator, None) 
Example #24
Source File: cross_feature_stats_generator_test.py    From data-validation with Apache License 2.0
def test_cross_feature_stats_generator_single_feature(self):
    generator = cross_feature_stats_generator.CrossFeatureStatsGenerator(
        sample_rate=1.0)
    b1 = pa.RecordBatch.from_arrays([pa.array([[1.0], [3.0]])], ['a'])
    self.assertCombinerOutputEqual([b1], generator, {}, {}) 
Example #25
Source File: cross_feature_stats_generator_test.py    From data-validation with Apache License 2.0
def test_cross_feature_stats_generator_with_crosses_specified(self):
    generator = cross_feature_stats_generator.CrossFeatureStatsGenerator(
        feature_crosses=[('a', 'c'), ('b', 'c')], sample_rate=1.0)
    b1 = pa.RecordBatch.from_arrays([
        pa.array([[1.0], [3.0], [5.0]]),
        pa.array([[2.0], [4.0], [6.0]]),
        pa.array([[5.0], [3.0], [7.0]]),
    ], ['a', 'b', 'c'])
    b2 = pa.RecordBatch.from_arrays([
        pa.array([[6.0], [10.0]]),
        pa.array([[14.0], [16.0]]),
        pa.array([[-1.0], [0]]),
    ], ['a', 'b', 'c'])
    batches = [b1, b2]
    expected_result = {
        ('a', 'c'): text_format.Parse(
            """
            path_x { step: "a" }
            path_y { step: "c" }
            count: 5
            num_cross_stats {
              correlation: -0.59476602
              covariance: -5.4000001
            }
            """, statistics_pb2.CrossFeatureStatistics()),
        ('b', 'c'): text_format.Parse(
            """
            path_x { step: "b" }
            path_y { step: "c" }
            count: 5
            num_cross_stats {
              correlation: -0.81070298
              covariance: -13.52
            }
            """, statistics_pb2.CrossFeatureStatistics())}
    self.assertCombinerOutputEqual(batches, generator, {}, expected_result) 
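The expected correlation and covariance above can be reproduced with plain NumPy from the five rows of columns 'a' and 'c' across both batches; this is only a sanity check of the numbers, not part of the generator API:

import numpy as np

a = np.array([1.0, 3.0, 5.0, 6.0, 10.0])
c = np.array([5.0, 3.0, 7.0, -1.0, 0.0])
print(np.cov(a, c, bias=True)[0, 1])  # ~ -5.4 (population covariance)
print(np.corrcoef(a, c)[0, 1])        # ~ -0.5948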
Example #26
Source File: input_batch_test.py    From data-validation with Apache License 2.0
def test_list_lengths_non_list(self):
    batch = input_batch.InputBatch(
        pa.RecordBatch.from_arrays([
            pa.array([1, None, 1]),
        ], ['f1']))
    with self.assertRaisesRegex(
        ValueError, r'Can only compute list lengths on list arrays, found.*'):
      batch.list_lengths(types.FeaturePath(['f1'])) 
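The error in Example #26 arises because pa.array([1, None, 1]) produces a flat int64 array, not a list array, so per-row list lengths are undefined. A short sketch of the distinction (list_value_length assumes a reasonably recent pyarrow.compute):

import pyarrow as pa
import pyarrow.compute as pc

flat = pa.array([1, None, 1])           # int64: no per-row list lengths
nested = pa.array([[1], None, [1, 2]])  # list<int64>
print(flat.type, nested.type)
# Lengths are only defined for the list array; the null row stays null.
print(pc.list_value_length(nested))     # roughly [1, null, 2]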
Example #27
Source File: input_batch_test.py    From data-validation with Apache License 2.0
def test_all_null_mask_unequal_lengths(self):
    batch = input_batch.InputBatch(
        pa.RecordBatch.from_arrays([
            pa.array([[1]]),
            pa.array([[{
                'sf1': [[1]]
            }, {
                'sf1': [[1]]
            }]]),
        ], ['f1', 'f2']))
    with self.assertRaisesRegex(ValueError,
                                r'.*null_mask\(f2.sf1\).size.*\(1 != 2\).*'):
      batch.all_null_mask(
          types.FeaturePath(['f1']), types.FeaturePath(['f2', 'sf1'])) 
Example #28
Source File: input_batch_test.py    From data-validation with Apache License 2.0
def test_list_lengths_empty_array(self):
    batch = input_batch.InputBatch(
        pa.RecordBatch.from_arrays([pa.array([])], ['f1']))
    np.testing.assert_array_equal(
        batch.list_lengths(types.FeaturePath(['f1'])), []) 
Example #29
Source File: input_batch_test.py    From data-validation with Apache License 2.0
def test_list_lengths(self):
    batch = input_batch.InputBatch(
        pa.RecordBatch.from_arrays([
            pa.array([[1], None, [1, 2]]),
        ], ['f1']))
    np.testing.assert_array_equal(
        batch.list_lengths(types.FeaturePath(['f1'])), [1, 0, 2]) 
Example #30
Source File: input_batch_test.py    From data-validation with Apache License 2.0
def test_all_null_mask_no_paths(self):
    batch = input_batch.InputBatch(
        pa.RecordBatch.from_arrays([pa.array([None, None], type=pa.null())],
                                   ['f3']))
    with self.assertRaisesRegex(ValueError, r'Paths cannot be empty.*'):
      batch.all_null_mask()