Python pyarrow.RecordBatch() Examples

The following are 30 code examples of pyarrow.RecordBatch(), drawn from open-source projects; the source project and file are noted above each example. You may also want to check out the other available functions and classes of the pyarrow module.
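Before looking at the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of how a pyarrow.RecordBatch is typically constructed and inspected:

import pyarrow as pa

# Build a RecordBatch from equal-length Arrow arrays.
batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2, 3]), pa.array(["a", "b", "c"])],
    names=["id", "label"])

print(batch.num_rows)   # 3
print(batch.schema)     # id: int64, label: string
print(batch.column(0))  # the "id" column as a pyarrow.Array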
Example #1
Source File: tensor_adapter.py    From tfx-bsl with Apache License 2.0
def GetTensor(self, record_batch: pa.RecordBatch,
                produce_eager_tensors: bool) -> Any:
    """Converts the RecordBatch to Tensor or CompositeTensor.

    The result must be of the same (not only compatible) TypeSpec as
    self.type_spec.

    Args:
      record_batch: a RecordBatch that is of the same Schema as what was
        passed at initialization time.
      produce_eager_tensors: if True, returns Eager Tensors, otherwise returns
        ndarrays or Tensor value objects.

    Returns:
      A Tensor or a CompositeTensor. Note that their types may vary depending
      on whether the TF eager mode is on.
    """ 
Example #2
Source File: anomalies_util.py    From data-validation with Apache License 2.0
def anomalies_slicer(
    unused_example: pa.RecordBatch,
    anomalies: anomalies_pb2.Anomalies) -> types.SliceKeysList:
  """Returns slice keys for an example based on the given Anomalies proto.

  This slicer will generate a slice key for each anomaly reason in the proto.

  Args:
    unused_example: The example for which to generate slice keys.
    anomalies: An Anomalies proto from which to generate the list of slice
      keys.

  Returns:
    A list of slice keys.
  """
  slice_keys = []
  for feature_name, anomaly_info in anomalies.anomaly_info.items():
    for anomaly_reason in anomaly_info.reason:
      slice_keys.append(
          feature_name + '_' +
          anomalies_pb2.AnomalyInfo.Type.Name(anomaly_reason.type))
  return slice_keys 
Example #3
Source File: example_coder_test.py    From tfx-bsl with Apache License 2.0
def test_decode(self, schema_text_proto, examples_text_proto,
                  create_expected):
    serialized_examples = [
        text_format.Parse(pbtxt, tf.train.Example()).SerializeToString()
        for pbtxt in examples_text_proto
    ]
    serialized_schema = None
    if schema_text_proto is not None:
      serialized_schema = text_format.Parse(
          schema_text_proto, schema_pb2.Schema()).SerializeToString()

    if serialized_schema:
      coder = example_coder.ExamplesToRecordBatchDecoder(serialized_schema)
    else:
      coder = example_coder.ExamplesToRecordBatchDecoder()

    result = coder.DecodeBatch(serialized_examples)
    self.assertIsInstance(result, pa.RecordBatch)
    expected = create_expected(pa.list_, pa.binary())
    self.assertTrue(
        result.equals(expected),
        "actual: {}\n expected:{}".format(result, expected))
    if serialized_schema:
      self.assertTrue(expected.schema.equals(coder.ArrowSchema())) 
Example #4
Source File: example_coder_test.py    From tfx-bsl with Apache License 2.0
def test_decode_large_types(self, schema_text_proto, examples_text_proto,
                              create_expected):
    serialized_examples = [
        text_format.Parse(pbtxt, tf.train.Example()).SerializeToString()
        for pbtxt in examples_text_proto
    ]
    serialized_schema = None
    if schema_text_proto is not None:
      serialized_schema = text_format.Parse(
          schema_text_proto, schema_pb2.Schema()).SerializeToString()

    if serialized_schema:
      coder = example_coder.ExamplesToRecordBatchDecoder(
          serialized_schema=serialized_schema, use_large_types=True)
    else:
      coder = example_coder.ExamplesToRecordBatchDecoder(use_large_types=True)

    result = coder.DecodeBatch(serialized_examples)
    self.assertIsInstance(result, pa.RecordBatch)
    expected = create_expected(pa.large_list, pa.large_binary())
    self.assertTrue(
        result.equals(expected),
        "actual: {}\n expected:{}".format(result, expected))
    if serialized_schema:
      self.assertTrue(expected.schema.equals(coder.ArrowSchema())) 
Example #5
Source File: stats_impl.py    From data-validation with Apache License 2.0
def _filter_features(
    record_batch: pa.RecordBatch,
    feature_whitelist: List[types.FeatureName]) -> pa.RecordBatch:
  """Removes features that are not whitelisted.

  Args:
    record_batch: Input Arrow RecordBatch.
    feature_whitelist: A set of feature names to whitelist.

  Returns:
    An Arrow RecordBatch containing only the whitelisted features of the input.
  """
  schema = record_batch.schema
  column_names = set(schema.names)
  columns_to_select = []
  column_names_to_select = []
  for feature_name in feature_whitelist:
    if feature_name in column_names:
      columns_to_select.append(
          record_batch.column(schema.get_field_index(feature_name)))
      column_names_to_select.append(feature_name)
  return pa.RecordBatch.from_arrays(columns_to_select, column_names_to_select) 
Example #6
Source File: tf_example_record_test.py    From tfx-bsl with Apache License 2.0
def _ValidateRecordBatch(
      self, tfxio, record_batch, raw_record_column_name=None):
    self.assertIsInstance(record_batch, pa.RecordBatch)
    self.assertEqual(record_batch.num_rows, 3)
    expected_column_values = GetExpectedColumnValues(tfxio)
    for i, field in enumerate(record_batch.schema):
      if field.name == raw_record_column_name:
        continue
      self.assertTrue(record_batch.column(i).equals(
          expected_column_values[field.name]),
                      "Column {} did not match ({} vs {})."
                      .format(field.name, record_batch.column(i),
                              expected_column_values[field.name]))

    if raw_record_column_name is not None:
      if tfxio._can_produce_large_types:
        raw_record_column_type = pa.large_list(pa.large_binary())
      else:
        raw_record_column_type = pa.list_(pa.binary())
      self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
      self.assertTrue(
          record_batch.columns[-1].type.equals(raw_record_column_type))
      self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                       _SERIALIZED_EXAMPLES) 
Example #7
Source File: tensor_adapter.py    From tfx-bsl with Apache License 2.0
def GetTensor(self, record_batch: pa.RecordBatch,
                produce_eager_tensors: bool) -> Any:
    array = record_batch.column(self._column_index)
    coo_array, dense_shape_array = array_util.CooFromListArray(array)
    dense_shape_np = dense_shape_array.to_numpy()
    values_array = array.flatten()
    if self._convert_to_binary_fn is not None:
      values_array = self._convert_to_binary_fn(values_array)
    values_np = np.asarray(values_array)
    coo_np = coo_array.to_numpy().reshape(values_np.size, 2)

    if produce_eager_tensors:
      return tf.sparse.SparseTensor(
          indices=tf.convert_to_tensor(coo_np),
          dense_shape=tf.convert_to_tensor(dense_shape_np),
          values=tf.convert_to_tensor(values_np))
    return tf.compat.v1.SparseTensorValue(
        indices=coo_np, dense_shape=dense_shape_np, values=values_np) 
Example #8
Source File: stats_impl.py    From data-validation with Apache License 2.0
def generate_statistics_in_memory(
    record_batch: pa.RecordBatch,
    options: stats_options.StatsOptions = stats_options.StatsOptions()
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Generates statistics for an in-memory list of examples.

  Args:
    record_batch: Arrow RecordBatch.
    options: Options for generating data statistics.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  stats_generators = get_generators(options, in_memory=True)  # type: List[stats_generator.CombinerStatsGenerator]
  partial_stats = generate_partial_statistics_in_memory(record_batch, options,
                                                        stats_generators)
  return extract_statistics_output(partial_stats, stats_generators) 
Example #9
Source File: tfxio.py    From tfx-bsl with Apache License 2.0
def BeamSource(self, batch_size: Optional[int] = None) -> beam.PTransform:
    """Returns a beam `PTransform` that produces `PCollection[pa.RecordBatch]`.

    May NOT raise an error if the TFMD schema was not provided at construction
    time.

    If a TFMD schema was provided at construction time, all the
    `pa.RecordBatch`es in the result `PCollection` must be of the same schema
    returned by `self.ArrowSchema`. If a TFMD schema was not provided, the
    `pa.RecordBatch`es might not be of the same schema (they may contain
    different numbers of columns).

    Args:
      batch_size: if not None, the `pa.RecordBatch` produced will be of the
        specified size. Otherwise it's automatically tuned by Beam.
    """ 
Example #10
Source File: tensor_to_arrow.py    From tfx-bsl with Apache License 2.0
def convert(self, tensors: Dict[Text, TensorAlike]) -> pa.RecordBatch:
    """Converts a dict of tensors to a RecordBatch.

    Args:
      tensors: must contain the same keys as the dict passed to the
        initializer, and each TensorAlike must be compatible with the
        corresponding TypeSpec.

    Returns:
      a RecordBatch whose schema equals self.arrow_schema().
    """
    assert len(self._handlers) == len(tensors)
    arrays = []
    for tensor_name, handler in self._handlers:
      arrays.extend(handler.convert(tensors[tensor_name]))

    return pa.record_batch(arrays, schema=self._arrow_schema) 
Example #11
Source File: _pandas_loaders.py    From pymapd with Apache License 2.0
def _serialize_arrow_payload(data, table_metadata, preserve_index=True):

    if isinstance(data, pd.DataFrame):

        # detect if there are categorical columns in dataframe
        cols = data.select_dtypes(include=['category']).columns

        # if there are categorical columns, make a copy before casting
        # to avoid mutating input data
        # https://github.com/omnisci/pymapd/issues/169
        if cols.size > 0:
            data_ = data.copy()
            data_[cols] = data_[cols].astype('object')
        else:
            data_ = data

        data = pa.RecordBatch.from_pandas(data_, preserve_index=preserve_index)

    stream = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(stream, data.schema)

    if isinstance(data, pa.RecordBatch):
        writer.write_batch(data)
    elif isinstance(data, pa.Table):
        writer.write_table(data)

    writer.close()
    return stream.getvalue() 
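The payload returned above is a serialized Arrow IPC stream. As a rough round-trip sketch (not part of pymapd; the small DataFrame is made up for illustration, and table_metadata is unused by the snippet shown above), the stream can be read back with pyarrow's IPC reader:

import pandas as pd
import pyarrow as pa

# Serialize a toy DataFrame with the function above, then deserialize it.
payload = _serialize_arrow_payload(
    pd.DataFrame({"x": [1, 2, 3]}), table_metadata=None, preserve_index=False)
table = pa.ipc.open_stream(payload).read_all()  # back to a pyarrow.Table
print(table.to_pandas())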
Example #12
Source File: tft_unit.py    From transform with Apache License 2.0
def convert_to_tfxio_api_inputs(
      self, legacy_input_data, legacy_input_metadata, label='input_data'):
    """Converts from the legacy TFT API inputs to TFXIO-based inputs.

    Args:
      legacy_input_data: a PCollection of instance dicts.
      legacy_input_metadata: a tft.DatasetMetadata.
      label: label for the PTransform that translates `legacy_input_data` into
        the TFXIO input data. Set to different values if this method is called
        multiple times in a beam Pipeline.
    Returns:
      A tuple of a PCollection of `pyarrow.RecordBatch` and a
      `tensor_adapter.TensorAdapterConfig`. This tuple can be fed directly to
      TFT's `{Analyze,Transform,AnalyzeAndTransform}Dataset` APIs.
    """
    tfxio_impl = _LegacyCompatibilityTFXIO(legacy_input_metadata.schema)
    input_data = (
        legacy_input_data |
        ('LegacyFormatToTfxio[%s]' % label >> tfxio_impl.BeamSource(
            beam_impl.Context.get_desired_batch_size())))
    return input_data, tfxio_impl.TensorAdapterConfig() 
Example #13
Source File: tf_sequence_example_record.py    From tfx-bsl with Apache License 2.0
def _RawRecordToRecordBatchInternal(self,
                                      batch_size: Optional[int] = None
                                     ) -> beam.PTransform:

    @beam.typehints.with_input_types(bytes)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
      return (raw_records_pcoll
              | "Batch" >> beam.BatchElements(
                  **batch_util.GetBatchElementsKwargs(batch_size))
              | "Decode" >> beam.ParDo(
                  _DecodeBatchExamplesDoFn(self._schema,
                                           self.raw_record_column_name,
                                           self._can_produce_large_types)))

    return beam.ptransform_fn(_PTransformFn)() 
Example #14
Source File: _pandas_helpers.py    From python-bigquery with Apache License 2.0
def download_arrow_tabledata_list(pages, bq_schema):
    """Use tabledata.list to construct an iterable of RecordBatches.

    Args:
        pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
            An iterator over the result pages.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            A description of the fields in result pages.
    Yields:
        :class:`pyarrow.RecordBatch`
        The next page of records as a ``pyarrow`` record batch.
    """
    bq_schema = schema._to_schema_fields(bq_schema)
    column_names = bq_to_arrow_schema(bq_schema) or [field.name for field in bq_schema]
    arrow_types = [bq_to_arrow_data_type(field) for field in bq_schema]

    for page in pages:
        yield _tabledata_list_page_to_arrow(page, column_names, arrow_types) 
Example #15
Source File: table.py    From python-bigquery with Apache License 2.0
def _tabledata_list_page_columns(schema, response):
    """Make a generator of all the columns in a page from tabledata.list.

    This enables creating a :class:`pandas.DataFrame` and other
    column-oriented data structures such as :class:`pyarrow.RecordBatch`
    """
    columns = []
    rows = response.get("rows", [])

    def get_column_data(field_index, field):
        for row in rows:
            yield _helpers._field_from_json(row["f"][field_index]["v"], field)

    for field_index, field in enumerate(schema):
        columns.append(get_column_data(field_index, field))

    return columns


# pylint: disable=unused-argument 
Example #16
Source File: table_util.py    From tfx-bsl with Apache License 2.0
def CanonicalizeRecordBatch(
    record_batch_with_primitive_arrays: pa.RecordBatch,) -> pa.RecordBatch:
  """Converts primitive arrays in a pyarrow.RecordBatch to SingletonListArrays.

  Args:
    record_batch_with_primitive_arrays: A pyarrow.RecordBatch where values are
      stored in primitive arrays or singleton list arrays.

  Returns:
    A pyarrow.RecordBatch in SingletonListArray format.
  """
  arrays = []
  for column_array in record_batch_with_primitive_arrays.columns:
    arr_type = column_array.type
    if not (pa.types.is_list(arr_type) or pa.types.is_large_list(arr_type)):
      arrays.append(array_util.ToSingletonListArray(column_array))
    else:
      arrays.append(column_array)
  # TODO(pachristopher): Consider using a list of record batches instead of a
  # single record batch to avoid having list arrays larger than 2^31 elements.
  return pa.RecordBatch.from_arrays(
      arrays, record_batch_with_primitive_arrays.schema.names) 
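array_util.ToSingletonListArray is a tfx-bsl helper; as a rough illustration of the idea in plain pyarrow (a sketch that ignores null handling), a primitive array can be wrapped so that every row becomes a one-element list:

import numpy as np
import pyarrow as pa

def _to_singleton_list_array_sketch(arr: pa.Array) -> pa.ListArray:
  # Offsets 0, 1, ..., len(arr): each resulting row holds exactly one value.
  offsets = pa.array(np.arange(len(arr) + 1, dtype=np.int32))
  return pa.ListArray.from_arrays(offsets, arr)

print(_to_singleton_list_array_sketch(pa.array([10, 20, 30])))  # [[10], [20], [30]]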
Example #17
Source File: table_util.py    From tfx-bsl with Apache License 2.0
def DataFrameToRecordBatch(
    dataframe: pd.DataFrame) -> pa.RecordBatch:
  """Convert pandas.DataFrame to a pyarrow.RecordBatch with primitive arrays.

  Args:
    dataframe: A pandas.DataFrame, where rows correspond to examples and columns
      correspond to features.

  Returns:
    A pa.RecordBatch containing the same values as the input data in primitive
    array format.
  """

  arrow_fields = []
  for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
    arrow_type = NumpyKindToArrowType(col_type.kind)
    if not arrow_type:
      logging.warning("Ignoring feature %s of type %s", col_name, col_type)
      continue
    arrow_fields.append(pa.field(col_name, arrow_type))
  return pa.RecordBatch.from_pandas(dataframe, schema=pa.schema(arrow_fields)) 
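A hedged usage sketch of the underlying conversion (the DataFrame and column names are invented for illustration); pa.RecordBatch.from_pandas keeps the columns as primitive, non-list arrays:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"age": [21, 35], "name": ["a", "b"]})
batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
print(batch.schema)  # age: int64, name: string  (one primitive value per row)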
Example #18
Source File: table_util.py    From tfx-bsl with Apache License 2.0
def MergeRecordBatches(record_batches: List[pa.RecordBatch]) -> pa.RecordBatch:
  """Merges a list of arrow RecordBatches into one. Similar to MergeTables."""
  if not record_batches:
    return _EMPTY_RECORD_BATCH
  first_schema = record_batches[0].schema
  assert any([r.num_rows > 0 for r in record_batches]), (
      "Unable to merge empty RecordBatches.")
  if all([r.schema.equals(first_schema) for r in record_batches[1:]]):
    one_chunk_table = pa.Table.from_batches(record_batches).combine_chunks()
    batches = one_chunk_table.to_batches(max_chunksize=None)
    assert len(batches) == 1
    return batches[0]
  else:
    # TODO(zhuo, b/158335158): switch to pa.Table.concat_tables(promote=True)
    # once the upstream bug is fixed:
    # https://jira.apache.org/jira/browse/ARROW-9071
    return _MergeRecordBatches(record_batches) 
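The equal-schema branch above uses only public pyarrow APIs; a minimal standalone illustration with made-up batches that share a schema:

import pyarrow as pa

b1 = pa.RecordBatch.from_arrays([pa.array([1, 2])], names=["f"])
b2 = pa.RecordBatch.from_arrays([pa.array([3])], names=["f"])

# Concatenate, defragment into one chunk, then take the single batch back out.
merged = pa.Table.from_batches([b1, b2]).combine_chunks().to_batches(
    max_chunksize=None)[0]
print(merged.num_rows)  # 3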
Example #19
Source File: record_based_tfxio.py    From tfx-bsl with Apache License 2.0
def RawRecordBeamSource(self) -> beam.PTransform:
    """Returns a PTransform that produces a PCollection[bytes].

    Used together with RawRecordToRecordBatch(), it allows getting both the
    PCollection of the raw records and the PCollection of the RecordBatch from
    the same source. For example:

    record_batch = pipeline | tfxio.BeamSource()
    raw_record = pipeline | tfxio.RawRecordBeamSource()

    would result in the files being read twice, while the following would only
    read once:

    raw_record = pipeline | tfxio.RawRecordBeamSource()
    record_batch = raw_record | tfxio.RawRecordToRecordBatch()
    """

    @beam.typehints.with_input_types(beam.Pipeline)
    @beam.typehints.with_output_types(bytes)
    def _PTransformFn(pipeline: beam.Pipeline):
      return (pipeline | "ReadRawRecords" >> self._RawRecordBeamSourceInternal()
              | "CollectRawRecordTelemetry" >> telemetry.ProfileRawRecords(
                  self._telemetry_descriptors, self._logical_format,
                  self._physical_format))

    return beam.ptransform_fn(_PTransformFn)() 
Example #20
Source File: tensor_adapter.py    From tfx-bsl with Apache License 2.0
def GetTensor(self, record_batch: pa.RecordBatch,
                produce_eager_tensors: bool) -> Any:
    values_array = record_batch.column(self._value_column_index)
    values_parent_indices = array_util.GetFlattenedArrayParentIndices(
        values_array)
    indices_arrays = [np.asarray(values_parent_indices)]
    for index_column_index in self._index_column_indices:
      indices_arrays.append(
          np.asarray(record_batch.column(index_column_index).flatten()))
    flat_values_array = values_array.flatten()
    if self._convert_to_binary_fn is not None:
      flat_values_array = self._convert_to_binary_fn(flat_values_array)
    values_np = np.asarray(flat_values_array)
    coo_np = np.empty(shape=(len(values_np), self._coo_size), dtype=np.int64)
    try:
      np.stack(indices_arrays, axis=1, out=coo_np)
    except ValueError as e:
      raise ValueError("Error constructing the COO for SparseTensor. "
                       "number of values: {}; "
                       "size of each index array: {}; "
                       "original error {}.".format(
                           len(values_np), [len(i) for i in indices_arrays], e))

    dense_shape = [len(record_batch)] + self._shape

    if produce_eager_tensors:
      return tf.sparse.SparseTensor(
          indices=tf.convert_to_tensor(coo_np),
          dense_shape=tf.convert_to_tensor(dense_shape, dtype=tf.int64),
          values=tf.convert_to_tensor(values_np))
    return tf.compat.v1.SparseTensorValue(
        indices=coo_np, dense_shape=dense_shape, values=values_np) 
Example #21
Source File: record_based_tfxio.py    From tfx-bsl with Apache License 2.0
def BeamSource(self, batch_size: Optional[int] = None) -> beam.PTransform:

    @beam.typehints.with_input_types(beam.Pipeline)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(pipeline: beam.pvalue.PCollection):
      """Converts raw records to RecordBatches."""
      return (
          pipeline
          | "RawRecordBeamSource" >> self.RawRecordBeamSource()
          | "RawRecordToRecordBatch" >> self.RawRecordToRecordBatch(batch_size))

    return beam.ptransform_fn(_PTransformFn)() 
Example #22
Source File: raw_tf_record.py    From tfx-bsl with Apache License 2.0
def _BatchedRecordsToArrow(records: List[bytes],
                           raw_record_column_name: Text,
                           should_produce_large_types: bool) -> pa.RecordBatch:
  raw_record_column = record_based_tfxio.CreateRawRecordColumn(
      records, should_produce_large_types)
  return pa.RecordBatch.from_arrays(
      [raw_record_column], [raw_record_column_name]) 
Example #23
Source File: telemetry.py    From tfx-bsl with Apache License 2.0
def process(self, record_batch: pa.RecordBatch) -> Iterable[pa.RecordBatch]:
    num_rows = record_batch.num_rows
    self._num_rows.inc(num_rows)
    self._UpdateNumCellsCounters(record_batch)
    total_byte_size = table_util.TotalByteSize(
        record_batch, ignore_unsupported=True)
    self._byte_size_dist.update(total_byte_size)
    # These distributions are per-row and therefore expensive to update because
    # dist.update() needs to be called num_rows * k times.
    if np.random.rand() < self._dist_update_prob:
      self._UpdateNumColumnsDist(record_batch)
      self._UpdateNumValuesDist(record_batch)
    yield record_batch 
Example #24
Source File: raw_tf_record.py    From tfx-bsl with Apache License 2.0
def _RawRecordToRecordBatchInternal(self,
                                      batch_size: Optional[int] = None
                                     ) -> beam.PTransform:

    @beam.typehints.with_input_types(beam.Pipeline)
    @beam.typehints.with_output_types(pa.RecordBatch)
    def _PTransformFn(raw_record_pcoll: beam.pvalue.PCollection):
      return (raw_record_pcoll
              | "Batch" >> beam.BatchElements(
                  **batch_util.GetBatchElementsKwargs(batch_size))
              | "ToRecordBatch" >>
              beam.Map(_BatchedRecordsToArrow, self.raw_record_column_name,
                       self._can_produce_large_types))

    return beam.ptransform_fn(_PTransformFn)() 
Example #25
Source File: tf_sequence_example_record_test.py    From tfx-bsl with Apache License 2.0
def _ValidateRecordBatch(
      self, tfxio, record_batch, raw_record_column_name=None):
    self.assertIsInstance(record_batch, pa.RecordBatch)
    self.assertEqual(record_batch.num_rows, 3)
    expected_column_values = _GetExpectedColumnValues(tfxio)
    for i, field in enumerate(record_batch.schema):
      if field.name == raw_record_column_name:
        continue
      if field.name == _SEQUENCE_COLUMN_NAME:
        self.assertTrue(pa.types.is_struct(field.type))
        for seq_column, seq_field in zip(
            record_batch.column(i).flatten(), list(field.type)):
          expected_array = expected_column_values[path.ColumnPath(
              [_SEQUENCE_COLUMN_NAME, seq_field.name])]
          self.assertTrue(
              seq_column.equals(expected_array),
              "Sequence column {} did not match ({} vs {})".format(
                  seq_field.name, seq_column, expected_array))
        continue
      self.assertTrue(
          record_batch.column(i).equals(expected_column_values[path.ColumnPath(
              [field.name])]), "Column {} did not match ({} vs {}).".format(
                  field.name, record_batch.column(i),
                  expected_column_values[path.ColumnPath([field.name])]))

    if raw_record_column_name is not None:
      if tfxio._can_produce_large_types:
        raw_record_column_type = pa.large_list(pa.large_binary())
      else:
        raw_record_column_type = pa.list_(pa.binary())
      self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
      self.assertTrue(
          record_batch.columns[-1].type.equals(raw_record_column_type))
      self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                       _SERIALIZED_EXAMPLES) 
Example #26
Source File: csv_decoder.py    From tfx-bsl with Apache License 2.0
def process(self, batch_of_tuple: List[Tuple[List[CSVCell], CSVLine]],
              column_infos: List[ColumnInfo]) -> Iterable[pa.RecordBatch]:
    if self._column_names is None:
      self._process_column_infos(column_infos)

    raw_records = []
    values_list_by_column = [[] for _ in self._column_handlers]
    for (csv_row, raw_record) in batch_of_tuple:
      if not csv_row:
        if not self._skip_blank_lines:
          for l in values_list_by_column:
            l.append(None)
        continue
      if len(csv_row) != len(self._column_handlers):
        raise ValueError("Encountered a row of unexpected number of columns")
      for value, handler, values_list in (zip(csv_row, self._column_handlers,
                                              values_list_by_column)):
        values_list.append(handler(value) if value else None)
      if self._raw_record_column_name is not None:
        raw_records.append([raw_record])

    arrow_arrays = [
        pa.array(l, type=t)
        for l, t in zip(values_list_by_column, self._column_arrow_types)
    ]

    if self._raw_record_column_name is not None:
      arrow_arrays.append(
          pa.array(raw_records, type=self._raw_record_column_type))
      self._column_names.append(self._raw_record_column_name)
    yield pa.RecordBatch.from_arrays(arrow_arrays, self._column_names) 
Example #27
Source File: model_eval_lib.py    From model-analysis with Apache License 2.0
def BatchedInputsToExtracts(  # pylint: disable=invalid-name
    batched_inputs: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
  """Converts Arrow RecordBatch inputs to Extracts."""

  def to_extracts(x: Union[bytes, pa.RecordBatch]) -> types.Extracts:
    result = {}
    if isinstance(x, dict):
      result.update(x)
    else:
      result[constants.ARROW_RECORD_BATCH_KEY] = x
    return result

  return batched_inputs | 'AddArrowRecordBatchKey' >> beam.Map(to_extracts) 
Example #28
Source File: batched_input_extractor.py    From model-analysis with Apache License 2.0
def BatchedInputExtractor(
    eval_config: config.EvalConfig) -> extractor.Extractor:
  """Creates an extractor for extracting features, labels, and example weights.

  The extractor's PTransform extracts features, labels, and example weights from
  the batched features (i.e., Arrow RecordBatch) stored under
  tfma.ARROW_RECORD_BATCH_KEY in the incoming extract and adds it to the output
  extract under the keys tfma.FEATURES_KEY, tfma.LABELS_KEY, and
  tfma.EXAMPLE_WEIGHTS_KEY. If the eval_config contains a
  prediction_key and a corresponding key is found in the parse example, then
  predictions will also be extracted and stored under the
  tfma.PREDICTIONS_KEY. Any extracts that already exist will be merged
  with the values parsed by this extractor with this extractor's values taking
  precedence when duplicate keys are detected.

  Note that the use of a prediction_key in an eval_config serves two use cases:
    (1) as a key into the dict of predictions output by predict extractor
    (2) as the key for a pre-computed prediction stored as a feature.
  The InputExtractor can be used to handle case (2). These cases are meant to be
  exclusive (i.e. if approach (2) is used then a predict extractor would not be
  configured and if (1) is used then a key matching the predictions would not be
  stored in the features). However, if a feature key happens to match the same
  name as the prediction output key then both paths may be executed. In this
  case, the value stored here will be replaced by the predict extractor (though
  it will still be popped from the features).

  Args:
    eval_config: Eval config.

  Returns:
    Batched extractor for extracting features, labels, and example weights.
  """
  # pylint: disable=no-value-for-parameter
  return extractor.Extractor(
      stage_name=INPUT_EXTRACTOR_STAGE_NAME,
      ptransform=_ExtractBatchedInputs(eval_config=eval_config)) 
Example #29
Source File: batched_input_extractor.py    From model-analysis with Apache License 2.0
def _DropUnsupportedColumnsAndFetchRawDataColumn(
    record_batch: pa.RecordBatch
) -> Tuple[pa.RecordBatch, Optional[np.ndarray]]:
  """Drops unsupported columns and fetches the raw data column.

  Currently, types that are not binary_like or ListArray[primitive types] are
  dropped.

  Args:
    record_batch: An Arrow RecordBatch.

  Returns:
    A tuple of the Arrow RecordBatch containing only the supported columns
    and, if the raw input column is present, an ndarray of the serialized raw
    examples (None otherwise).
  """
  column_names, column_arrays = [], []
  serialized_examples = None
  for column_name, column_array in zip(record_batch.schema.names,
                                       record_batch.columns):
    column_type = column_array.type
    if column_name == constants.ARROW_INPUT_COLUMN:
      assert (_IsListLike(column_type) and
              _IsBinaryLike(column_type.value_type)), (
                  'Invalid type for batched input key: {}. '
                  'Expected binary like.'.format(column_type))
      serialized_examples = np.asarray(column_array.flatten())
    # Currently we only handle columns of type list<primitive|binary_like>.
    # We ignore other columns as we cannot efficiently convert them into an
    # instance dict format.
    elif (_IsListLike(column_type) and
          _IsSupportedArrowValueType(column_type.value_type)):
      column_names.append(column_name)
      column_arrays.append(column_array)
  return (pa.RecordBatch.from_arrays(column_arrays,
                                     column_names), serialized_examples) 
Example #30
Source File: telemetry.py    From tfx-bsl with Apache License 2.0
def _UpdateNumCellsCounters(self, record_batch: pa.RecordBatch) -> None:
    num_rows = record_batch.num_rows
    for column in record_batch:
      column_type = column.type
      if pa.types.is_null(column_type):
        self._num_cells_by_type[_ValueType.NULL].inc(num_rows)
        continue

      if _IsListLike(column_type):
        value_type = _GetValueType(column_type.value_type)
      else:
        value_type = _ValueType.OTHER
      self._num_cells_by_type[value_type].inc(num_rows - column.null_count)