Python pyarrow.RecordBatch() Examples
The following are 30 code examples of pyarrow.RecordBatch(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pyarrow, or try the search function.
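For context, a RecordBatch is Arrow's unit of columnar data: a set of equal-length arrays bound to a schema. A minimal standalone sketch of constructing one directly (the column names and values here are purely illustrative):

import pyarrow as pa

# Build a RecordBatch from equal-length column arrays plus column names.
batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2, 3]), pa.array(["a", "b", "c"])],
    names=["ints", "strings"])
print(batch.num_rows)      # 3
print(batch.schema.names)  # ['ints', 'strings']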
Example #1
Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0 | 6 votes |
def GetTensor(self, record_batch: pa.RecordBatch,
              produce_eager_tensors: bool) -> Any:
  """Converts the RecordBatch to Tensor or CompositeTensor.

  The result must be of the same (not only compatible) TypeSpec as
  self.type_spec.

  Args:
    record_batch: a RecordBatch that is of the same Schema as what was
      passed at initialization time.
    produce_eager_tensors: if True, returns Eager Tensors, otherwise returns
      ndarrays or Tensor value objects.

  Returns:
    A Tensor or a CompositeTensor. Note that their types may vary depending
    on whether the TF eager mode is on.
  """
Example #2
Source File: anomalies_util.py From data-validation with Apache License 2.0 | 6 votes |
def anomalies_slicer(
    unused_example: pa.RecordBatch,
    anomalies: anomalies_pb2.Anomalies) -> types.SliceKeysList:
  """Returns slice keys for an example based on the given Anomalies proto.

  This slicer will generate a slice key for each anomaly reason in the proto.

  Args:
    unused_example: The example for which to generate slice keys.
    anomalies: An Anomalies proto from which to generate the list of slice
      keys.

  Returns:
    A list of slice keys.
  """
  slice_keys = []
  for feature_name, anomaly_info in anomalies.anomaly_info.items():
    for anomaly_reason in anomaly_info.reason:
      slice_keys.append(
          feature_name + '_' +
          anomalies_pb2.AnomalyInfo.Type.Name(anomaly_reason.type))
  return slice_keys
Example #3
Source File: example_coder_test.py From tfx-bsl with Apache License 2.0 | 6 votes |
def test_decode(self, schema_text_proto, examples_text_proto,
                create_expected):
  serialized_examples = [
      text_format.Parse(pbtxt, tf.train.Example()).SerializeToString()
      for pbtxt in examples_text_proto
  ]
  serialized_schema = None
  if schema_text_proto is not None:
    serialized_schema = text_format.Parse(
        schema_text_proto, schema_pb2.Schema()).SerializeToString()

  if serialized_schema:
    coder = example_coder.ExamplesToRecordBatchDecoder(serialized_schema)
  else:
    coder = example_coder.ExamplesToRecordBatchDecoder()

  result = coder.DecodeBatch(serialized_examples)
  self.assertIsInstance(result, pa.RecordBatch)
  expected = create_expected(pa.list_, pa.binary())
  self.assertTrue(
      result.equals(expected),
      "actual: {}\n expected:{}".format(result, expected))
  if serialized_schema:
    self.assertTrue(expected.schema.equals(coder.ArrowSchema()))
Example #4
Source File: example_coder_test.py From tfx-bsl with Apache License 2.0 | 6 votes |
def test_decode_large_types(self, schema_text_proto, examples_text_proto,
                            create_expected):
  serialized_examples = [
      text_format.Parse(pbtxt, tf.train.Example()).SerializeToString()
      for pbtxt in examples_text_proto
  ]
  serialized_schema = None
  if schema_text_proto is not None:
    serialized_schema = text_format.Parse(
        schema_text_proto, schema_pb2.Schema()).SerializeToString()

  if serialized_schema:
    coder = example_coder.ExamplesToRecordBatchDecoder(
        serialized_schema=serialized_schema, use_large_types=True)
  else:
    coder = example_coder.ExamplesToRecordBatchDecoder(use_large_types=True)

  result = coder.DecodeBatch(serialized_examples)
  self.assertIsInstance(result, pa.RecordBatch)
  expected = create_expected(pa.large_list, pa.large_binary())
  self.assertTrue(
      result.equals(expected),
      "actual: {}\n expected:{}".format(result, expected))
  if serialized_schema:
    self.assertTrue(expected.schema.equals(coder.ArrowSchema()))
Example #5
Source File: stats_impl.py From data-validation with Apache License 2.0 | 6 votes |
def _filter_features(
    record_batch: pa.RecordBatch,
    feature_whitelist: List[types.FeatureName]) -> pa.RecordBatch:
  """Removes features that are not whitelisted.

  Args:
    record_batch: Input Arrow RecordBatch.
    feature_whitelist: A set of feature names to whitelist.

  Returns:
    An Arrow RecordBatch containing only the whitelisted features of the
    input.
  """
  schema = record_batch.schema
  column_names = set(schema.names)
  columns_to_select = []
  column_names_to_select = []
  for feature_name in feature_whitelist:
    if feature_name in column_names:
      columns_to_select.append(
          record_batch.column(schema.get_field_index(feature_name)))
      column_names_to_select.append(feature_name)
  return pa.RecordBatch.from_arrays(columns_to_select,
                                    column_names_to_select)
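The helper above is internal to data-validation, but the column-selection pattern it uses is plain pyarrow. A standalone sketch of the same idea, with made-up column names and a hypothetical whitelist:

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([[1], [2]]), pa.array([["x"], ["y"]]), pa.array([[0.5], [1.5]])],
    names=["f1", "f2", "f3"])

whitelist = ["f1", "f3"]  # hypothetical feature whitelist
schema = batch.schema
kept = [name for name in whitelist if name in schema.names]
filtered = pa.RecordBatch.from_arrays(
    [batch.column(schema.get_field_index(name)) for name in kept],
    names=kept)
print(filtered.schema.names)  # ['f1', 'f3']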
Example #6
Source File: tf_example_record_test.py From tfx-bsl with Apache License 2.0 | 6 votes |
def _ValidateRecordBatch(
    self, tfxio, record_batch, raw_record_column_name=None):
  self.assertIsInstance(record_batch, pa.RecordBatch)
  self.assertEqual(record_batch.num_rows, 3)
  expected_column_values = GetExpectedColumnValues(tfxio)
  for i, field in enumerate(record_batch.schema):
    if field.name == raw_record_column_name:
      continue
    self.assertTrue(
        record_batch.column(i).equals(expected_column_values[field.name]),
        "Column {} did not match ({} vs {}).".format(
            field.name, record_batch.column(i),
            expected_column_values[field.name]))

  if raw_record_column_name is not None:
    if tfxio._can_produce_large_types:
      raw_record_column_type = pa.large_list(pa.large_binary())
    else:
      raw_record_column_type = pa.list_(pa.binary())
    self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
    self.assertTrue(
        record_batch.columns[-1].type.equals(raw_record_column_type))
    self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                     _SERIALIZED_EXAMPLES)
Example #7
Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0 | 6 votes |
def GetTensor(self, record_batch: pa.RecordBatch,
              produce_eager_tensors: bool) -> Any:
  array = record_batch.column(self._column_index)
  coo_array, dense_shape_array = array_util.CooFromListArray(array)
  dense_shape_np = dense_shape_array.to_numpy()
  values_array = array.flatten()
  if self._convert_to_binary_fn is not None:
    values_array = self._convert_to_binary_fn(values_array)
  values_np = np.asarray(values_array)
  coo_np = coo_array.to_numpy().reshape(values_np.size, 2)

  if produce_eager_tensors:
    return tf.sparse.SparseTensor(
        indices=tf.convert_to_tensor(coo_np),
        dense_shape=tf.convert_to_tensor(dense_shape_np),
        values=tf.convert_to_tensor(values_np))
  return tf.compat.v1.SparseTensorValue(
      indices=coo_np, dense_shape=dense_shape_np, values=values_np)
Example #8
Source File: stats_impl.py From data-validation with Apache License 2.0 | 6 votes |
def generate_statistics_in_memory(
    record_batch: pa.RecordBatch,
    options: stats_options.StatsOptions = stats_options.StatsOptions()
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Generates statistics for an in-memory list of examples.

  Args:
    record_batch: Arrow RecordBatch.
    options: Options for generating data statistics.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  stats_generators = get_generators(options, in_memory=True)  # type: List[stats_generator.CombinerStatsGenerator]
  partial_stats = generate_partial_statistics_in_memory(record_batch, options,
                                                        stats_generators)
  return extract_statistics_output(partial_stats, stats_generators)
Example #9
Source File: tfxio.py From tfx-bsl with Apache License 2.0 | 6 votes |
def BeamSource(self, batch_size: Optional[int] = None) -> beam.PTransform:
  """Returns a beam `PTransform` that produces `PCollection[pa.RecordBatch]`.

  May NOT raise an error if the TFMD schema was not provided at construction
  time.

  If a TFMD schema was provided at construction time, all the
  `pa.RecordBatch`es in the result `PCollection` must be of the same schema
  returned by `self.ArrowSchema`. If a TFMD schema was not provided, the
  `pa.RecordBatch`es might not be of the same schema (they may contain
  different numbers of columns).

  Args:
    batch_size: if not None, the `pa.RecordBatch` produced will be of the
      specified size. Otherwise it's automatically tuned by Beam.
  """
Example #10
Source File: tensor_to_arrow.py From tfx-bsl with Apache License 2.0 | 6 votes |
def convert(self, tensors: Dict[Text, TensorAlike]) -> pa.RecordBatch:
  """Converts a dict of tensors to a RecordBatch.

  Args:
    tensors: must contain the same keys as the dict passed to the initializer,
      and each TensorAlike must be compatible with the corresponding TypeSpec.

  Returns:
    a RecordBatch, whose schema equals to self.arrow_schema().
  """
  assert len(self._handlers) == len(tensors)
  arrays = []
  for tensor_name, handler in self._handlers:
    arrays.extend(handler.convert(tensors[tensor_name]))

  return pa.record_batch(arrays, schema=self._arrow_schema)
Example #11
Source File: _pandas_loaders.py From pymapd with Apache License 2.0 | 6 votes |
def _serialize_arrow_payload(data, table_metadata, preserve_index=True):

    if isinstance(data, pd.DataFrame):

        # detect if there are categorical columns in dataframe
        cols = data.select_dtypes(include=['category']).columns

        # if there are categorical columns, make a copy before casting
        # to avoid mutating input data
        # https://github.com/omnisci/pymapd/issues/169
        if cols.size > 0:
            data_ = data.copy()
            data_[cols] = data_[cols].astype('object')
        else:
            data_ = data

        data = pa.RecordBatch.from_pandas(data_, preserve_index=preserve_index)

    stream = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(stream, data.schema)

    if isinstance(data, pa.RecordBatch):
        writer.write_batch(data)
    elif isinstance(data, pa.Table):
        writer.write_table(data)

    writer.close()

    return stream.getvalue()
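The function above writes the batch into an in-memory Arrow IPC stream; that payload can be read back with pyarrow's IPC reader. A small round-trip sketch (not part of pymapd) that assumes only pandas and pyarrow:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
batch = pa.RecordBatch.from_pandas(df, preserve_index=False)

# Serialize the batch to an in-memory IPC stream, as the helper above does.
sink = pa.BufferOutputStream()
writer = pa.RecordBatchStreamWriter(sink, batch.schema)
writer.write_batch(batch)
writer.close()
payload = sink.getvalue()

# Deserialize to verify the round trip.
reader = pa.ipc.open_stream(payload)
restored = reader.read_all()  # returns a pa.Table
print(restored.to_pandas())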
Example #12
Source File: tft_unit.py From transform with Apache License 2.0 | 6 votes |
def convert_to_tfxio_api_inputs(
    self, legacy_input_data, legacy_input_metadata, label='input_data'):
  """Converts from the legacy TFT API inputs to TFXIO-based inputs.

  Args:
    legacy_input_data: a PCollection of instance dicts.
    legacy_input_metadata: a tft.DatasetMetadata.
    label: label for the PTransform that translates `legacy_input_data` into
      the TFXIO input data. Set to different values if this method is called
      multiple times in a beam Pipeline.

  Returns:
    A tuple of a PCollection of `pyarrow.RecordBatch` and a
    `tensor_adapter.TensorAdapterConfig`. This tuple can be fed directly to
    TFT's `{Analyze,Transform,AnalyzeAndTransform}Dataset` APIs.
  """
  tfxio_impl = _LegacyCompatibilityTFXIO(legacy_input_metadata.schema)
  input_data = (
      legacy_input_data
      | ('LegacyFormatToTfxio[%s]' % label >> tfxio_impl.BeamSource(
          beam_impl.Context.get_desired_batch_size())))
  return input_data, tfxio_impl.TensorAdapterConfig()
Example #13
Source File: tf_sequence_example_record.py From tfx-bsl with Apache License 2.0 | 6 votes |
def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                   ) -> beam.PTransform:

  @beam.typehints.with_input_types(bytes)
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
    return (raw_records_pcoll
            | "Batch" >> beam.BatchElements(
                **batch_util.GetBatchElementsKwargs(batch_size))
            | "Decode" >> beam.ParDo(
                _DecodeBatchExamplesDoFn(self._schema,
                                         self.raw_record_column_name,
                                         self._can_produce_large_types)))

  return beam.ptransform_fn(_PTransformFn)()
Example #14
Source File: _pandas_helpers.py From python-bigquery with Apache License 2.0 | 6 votes |
def download_arrow_tabledata_list(pages, bq_schema):
    """Use tabledata.list to construct an iterable of RecordBatches.

    Args:
        pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
            An iterator over the result pages.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            A description of the fields in result pages.

    Yields:
        :class:`pyarrow.RecordBatch`
            The next page of records as a ``pyarrow`` record batch.
    """
    bq_schema = schema._to_schema_fields(bq_schema)
    column_names = bq_to_arrow_schema(bq_schema) or [
        field.name for field in bq_schema
    ]
    arrow_types = [bq_to_arrow_data_type(field) for field in bq_schema]

    for page in pages:
        yield _tabledata_list_page_to_arrow(page, column_names, arrow_types)
Example #15
Source File: table.py From python-bigquery with Apache License 2.0 | 6 votes |
def _tabledata_list_page_columns(schema, response):
    """Make a generator of all the columns in a page from tabledata.list.

    This enables creating a :class:`pandas.DataFrame` and other
    column-oriented data structures such as :class:`pyarrow.RecordBatch`
    """
    columns = []
    rows = response.get("rows", [])

    def get_column_data(field_index, field):
        for row in rows:
            yield _helpers._field_from_json(row["f"][field_index]["v"], field)

    for field_index, field in enumerate(schema):
        columns.append(get_column_data(field_index, field))

    return columns


# pylint: disable=unused-argument
Example #16
Source File: table_util.py From tfx-bsl with Apache License 2.0 | 6 votes |
def CanonicalizeRecordBatch(
    record_batch_with_primitive_arrays: pa.RecordBatch,) -> pa.RecordBatch:
  """Converts primitive arrays in a pyarrow.RecordBatch to SingletonListArrays.

  Args:
    record_batch_with_primitive_arrays: A pyarrow.RecordBatch where values are
      stored in primitive arrays or singleton list arrays.

  Returns:
    A pyarrow.RecordBatch in SingletonListArray format.
  """
  arrays = []
  for column_array in record_batch_with_primitive_arrays.columns:
    arr_type = column_array.type
    if not (pa.types.is_list(arr_type) or pa.types.is_large_list(arr_type)):
      arrays.append(array_util.ToSingletonListArray(column_array))
    else:
      arrays.append(column_array)
  # TODO(pachristopher): Consider using a list of record batches instead of a
  # single record batch to avoid having list arrays larger than 2^31 elements.
  return pa.RecordBatch.from_arrays(
      arrays, record_batch_with_primitive_arrays.schema.names)
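array_util.ToSingletonListArray is a tfx_bsl helper; a rough pyarrow-only equivalent of the wrapping step, assuming the primitive column contains no nulls, could look like this:

import numpy as np
import pyarrow as pa

primitive = pa.array([10, 20, 30])

# Offsets [0, 1, 2, 3] place each value into its own length-1 list.
offsets = pa.array(np.arange(len(primitive) + 1, dtype=np.int32))
singleton_lists = pa.ListArray.from_arrays(offsets, primitive)
print(singleton_lists.to_pylist())  # [[10], [20], [30]]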
Example #17
Source File: table_util.py From tfx-bsl with Apache License 2.0 | 6 votes |
def DataFrameToRecordBatch(
    dataframe: pd.DataFrame) -> pa.RecordBatch:
  """Convert pandas.DataFrame to a pyarrow.RecordBatch with primitive arrays.

  Args:
    dataframe: A pandas.DataFrame, where rows correspond to examples and
      columns correspond to features.

  Returns:
    A pa.RecordBatch containing the same values as the input data in primitive
    array format.
  """
  arrow_fields = []
  for col_name, col_type in zip(dataframe.columns, dataframe.dtypes):
    arrow_type = NumpyKindToArrowType(col_type.kind)
    if not arrow_type:
      logging.warning("Ignoring feature %s of type %s", col_name, col_type)
      continue
    arrow_fields.append(pa.field(col_name, arrow_type))
  return pa.RecordBatch.from_pandas(dataframe, schema=pa.schema(arrow_fields))
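A quick usage sketch of the underlying pyarrow call, passing an explicit schema that selects and types the columns (the column names here are illustrative):

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"ints": [1, 2, 3], "floats": [0.1, 0.2, 0.3]})
schema = pa.schema([("ints", pa.int64()), ("floats", pa.float64())])
batch = pa.RecordBatch.from_pandas(df, schema=schema, preserve_index=False)
print(batch.schema)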
Example #18
Source File: table_util.py From tfx-bsl with Apache License 2.0 | 6 votes |
def MergeRecordBatches(record_batches: List[pa.RecordBatch]) -> pa.RecordBatch:
  """Merges a list of arrow RecordBatches into one. Similar to MergeTables."""
  if not record_batches:
    return _EMPTY_RECORD_BATCH
  first_schema = record_batches[0].schema
  assert any([r.num_rows > 0 for r in record_batches]), (
      "Unable to merge empty RecordBatches.")
  if all([r.schema.equals(first_schema) for r in record_batches[1:]]):
    one_chunk_table = pa.Table.from_batches(record_batches).combine_chunks()
    batches = one_chunk_table.to_batches(max_chunksize=None)
    assert len(batches) == 1
    return batches[0]
  else:
    # TODO(zhuo, b/158335158): switch to pa.Table.concat_tables(promote=True)
    # once the upstream bug is fixed:
    # https://jira.apache.org/jira/browse/ARROW-9071
    return _MergeRecordBatches(record_batches)
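For same-schema batches, the merge path above relies only on public pyarrow APIs; a standalone sketch of that path with toy data:

import pyarrow as pa

b1 = pa.RecordBatch.from_arrays([pa.array([1, 2])], names=["x"])
b2 = pa.RecordBatch.from_arrays([pa.array([3])], names=["x"])

# Stack the batches into a Table, fuse the chunks, and take the single batch.
table = pa.Table.from_batches([b1, b2]).combine_chunks()
merged = table.to_batches(max_chunksize=None)
assert len(merged) == 1
print(merged[0].column(0).to_pylist())  # [1, 2, 3]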
Example #19
Source File: record_based_tfxio.py From tfx-bsl with Apache License 2.0 | 5 votes |
def RawRecordBeamSource(self) -> beam.PTransform:
  """Returns a PTransform that produces a PCollection[bytes].

  Used together with RawRecordToRecordBatch(), it allows getting both the
  PCollection of the raw records and the PCollection of the RecordBatch from
  the same source. For example:

  record_batch = pipeline | tfxio.BeamSource()
  raw_record = pipeline | tfxio.RawRecordBeamSource()

  would result in the files being read twice, while the following would only
  read once:

  raw_record = pipeline | tfxio.RawRecordBeamSource()
  record_batch = raw_record | tfxio.RawRecordToRecordBatch()
  """

  @beam.typehints.with_input_types(beam.Pipeline)
  @beam.typehints.with_output_types(bytes)
  def _PTransformFn(pipeline: beam.Pipeline):
    return (pipeline
            | "ReadRawRecords" >> self._RawRecordBeamSourceInternal()
            | "CollectRawRecordTelemetry" >> telemetry.ProfileRawRecords(
                self._telemetry_descriptors, self._logical_format,
                self._physical_format))

  return beam.ptransform_fn(_PTransformFn)()
Example #20
Source File: tensor_adapter.py From tfx-bsl with Apache License 2.0 | 5 votes |
def GetTensor(self, record_batch: pa.RecordBatch,
              produce_eager_tensors: bool) -> Any:
  values_array = record_batch.column(self._value_column_index)
  values_parent_indices = array_util.GetFlattenedArrayParentIndices(
      values_array)
  indices_arrays = [np.asarray(values_parent_indices)]
  for index_column_index in self._index_column_indices:
    indices_arrays.append(
        np.asarray(record_batch.column(index_column_index).flatten()))
  flat_values_array = values_array.flatten()
  if self._convert_to_binary_fn is not None:
    flat_values_array = self._convert_to_binary_fn(flat_values_array)
  values_np = np.asarray(flat_values_array)
  coo_np = np.empty(shape=(len(values_np), self._coo_size), dtype=np.int64)
  try:
    np.stack(indices_arrays, axis=1, out=coo_np)
  except ValueError as e:
    raise ValueError("Error constructing the COO for SparseTensor. "
                     "number of values: {}; "
                     "size of each index array: {}; "
                     "original error {}.".format(
                         len(values_np), [len(i) for i in indices_arrays], e))
  dense_shape = [len(record_batch)] + self._shape

  if produce_eager_tensors:
    return tf.sparse.SparseTensor(
        indices=tf.convert_to_tensor(coo_np),
        dense_shape=tf.convert_to_tensor(dense_shape, dtype=tf.int64),
        values=tf.convert_to_tensor(values_np))
  return tf.compat.v1.SparseTensorValue(
      indices=coo_np, dense_shape=dense_shape, values=values_np)
Example #21
Source File: record_based_tfxio.py From tfx-bsl with Apache License 2.0 | 5 votes |
def BeamSource(self, batch_size: Optional[int] = None) -> beam.PTransform:

  @beam.typehints.with_input_types(beam.Pipeline)
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _PTransformFn(pipeline: beam.pvalue.PCollection):
    """Converts raw records to RecordBatches."""
    return (
        pipeline
        | "RawRecordBeamSource" >> self.RawRecordBeamSource()
        | "RawRecordToRecordBatch" >> self.RawRecordToRecordBatch(batch_size))

  return beam.ptransform_fn(_PTransformFn)()
Example #22
Source File: raw_tf_record.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _BatchedRecordsToArrow(records: List[bytes],
                           raw_record_column_name: Text,
                           should_produce_large_types: bool) -> pa.RecordBatch:
  raw_record_column = record_based_tfxio.CreateRawRecordColumn(
      records, should_produce_large_types)
  return pa.RecordBatch.from_arrays(
      [raw_record_column], [raw_record_column_name])
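CreateRawRecordColumn is a tfx_bsl helper that wraps each serialized record in a length-1 list; a rough pyarrow-only sketch of the same shape (the column name below is just illustrative, not the name tfx_bsl uses):

import pyarrow as pa

records = [b"rec1", b"rec2"]
raw_record_column = pa.array([[r] for r in records], type=pa.list_(pa.binary()))
batch = pa.RecordBatch.from_arrays([raw_record_column], ["__raw_record__"])
print(batch.to_pydict())  # {'__raw_record__': [[b'rec1'], [b'rec2']]}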
Example #23
Source File: telemetry.py From tfx-bsl with Apache License 2.0 | 5 votes |
def process(self, record_batch: pa.RecordBatch) -> Iterable[pa.RecordBatch]:
  num_rows = record_batch.num_rows
  self._num_rows.inc(num_rows)
  self._UpdateNumCellsCounters(record_batch)
  total_byte_size = table_util.TotalByteSize(
      record_batch, ignore_unsupported=True)
  self._byte_size_dist.update(total_byte_size)
  # These distributions are per-row therefore expensive to update because
  # dist.update() needs to be called num_rows * k times.
  if np.random.rand() < self._dist_update_prob:
    self._UpdateNumColumnsDist(record_batch)
    self._UpdateNumValuesDist(record_batch)
  yield record_batch
Example #24
Source File: raw_tf_record.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _RawRecordToRecordBatchInternal(self,
                                    batch_size: Optional[int] = None
                                   ) -> beam.PTransform:

  @beam.typehints.with_input_types(beam.Pipeline)
  @beam.typehints.with_output_types(pa.RecordBatch)
  def _PTransformFn(raw_record_pcoll: beam.pvalue.PCollection):
    return (raw_record_pcoll
            | "Batch" >> beam.BatchElements(
                **batch_util.GetBatchElementsKwargs(batch_size))
            | "ToRecordBatch" >> beam.Map(
                _BatchedRecordsToArrow, self.raw_record_column_name,
                self._can_produce_large_types))

  return beam.ptransform_fn(_PTransformFn)()
Example #25
Source File: tf_sequence_example_record_test.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _ValidateRecordBatch(
    self, tfxio, record_batch, raw_record_column_name=None):
  self.assertIsInstance(record_batch, pa.RecordBatch)
  self.assertEqual(record_batch.num_rows, 3)
  expected_column_values = _GetExpectedColumnValues(tfxio)
  for i, field in enumerate(record_batch.schema):
    if field.name == raw_record_column_name:
      continue
    if field.name == _SEQUENCE_COLUMN_NAME:
      self.assertTrue(pa.types.is_struct(field.type))
      for seq_column, seq_field in zip(
          record_batch.column(i).flatten(), list(field.type)):
        expected_array = expected_column_values[path.ColumnPath(
            [_SEQUENCE_COLUMN_NAME, seq_field.name])]
        self.assertTrue(
            seq_column.equals(expected_array),
            "Sequence column {} did not match ({} vs {})".format(
                seq_field.name, seq_column, expected_array))
      continue
    self.assertTrue(
        record_batch.column(i).equals(expected_column_values[path.ColumnPath(
            [field.name])]),
        "Column {} did not match ({} vs {}).".format(
            field.name, record_batch.column(i),
            expected_column_values[path.ColumnPath([field.name])]))

  if raw_record_column_name is not None:
    if tfxio._can_produce_large_types:
      raw_record_column_type = pa.large_list(pa.large_binary())
    else:
      raw_record_column_type = pa.list_(pa.binary())
    self.assertEqual(record_batch.schema.names[-1], raw_record_column_name)
    self.assertTrue(
        record_batch.columns[-1].type.equals(raw_record_column_type))
    self.assertEqual(record_batch.columns[-1].flatten().to_pylist(),
                     _SERIALIZED_EXAMPLES)
Example #26
Source File: csv_decoder.py From tfx-bsl with Apache License 2.0 | 5 votes |
def process(self, batch_of_tuple: List[Tuple[List[CSVCell], CSVLine]],
            column_infos: List[ColumnInfo]) -> Iterable[pa.RecordBatch]:
  if self._column_names is None:
    self._process_column_infos(column_infos)

  raw_records = []
  values_list_by_column = [[] for _ in self._column_handlers]
  for (csv_row, raw_record) in batch_of_tuple:
    if not csv_row:
      if not self._skip_blank_lines:
        for l in values_list_by_column:
          l.append(None)
      continue
    if len(csv_row) != len(self._column_handlers):
      raise ValueError("Encountered a row of unexpected number of columns")
    for value, handler, values_list in (
        zip(csv_row, self._column_handlers, values_list_by_column)):
      values_list.append(handler(value) if value else None)
    if self._raw_record_column_name is not None:
      raw_records.append([raw_record])

  arrow_arrays = [
      pa.array(l, type=t)
      for l, t in zip(values_list_by_column, self._column_arrow_types)
  ]
  if self._raw_record_column_name is not None:
    arrow_arrays.append(
        pa.array(raw_records, type=self._raw_record_column_type))
    self._column_names.append(self._raw_record_column_name)
  yield pa.RecordBatch.from_arrays(arrow_arrays, self._column_names)
Example #27
Source File: model_eval_lib.py From model-analysis with Apache License 2.0 | 5 votes |
def BatchedInputsToExtracts(  # pylint: disable=invalid-name
    batched_inputs: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
  """Converts Arrow RecordBatch inputs to Extracts."""

  def to_extracts(x: Union[bytes, pa.RecordBatch]) -> types.Extracts:
    result = {}
    if isinstance(x, dict):
      result.update(x)
    else:
      result[constants.ARROW_RECORD_BATCH_KEY] = x
    return result

  return batched_inputs | 'AddArrowRecordBatchKey' >> beam.Map(to_extracts)
Example #28
Source File: batched_input_extractor.py From model-analysis with Apache License 2.0 | 5 votes |
def BatchedInputExtractor(
    eval_config: config.EvalConfig) -> extractor.Extractor:
  """Creates an extractor for extracting features, labels, and example weights.

  The extractor's PTransform extracts features, labels, and example weights
  from the batched features (i.e., Arrow RecordBatch) stored under
  tfma.ARROW_RECORD_BATCH_KEY in the incoming extract and adds them to the
  output extract under the keys tfma.FEATURES_KEY, tfma.LABELS_KEY, and
  tfma.EXAMPLE_WEIGHTS_KEY. If the eval_config contains a prediction_key and a
  corresponding key is found in the parsed example, then predictions will also
  be extracted and stored under the tfma.PREDICTIONS_KEY. Any extracts that
  already exist will be merged with the values parsed by this extractor, with
  this extractor's values taking precedence when duplicate keys are detected.

  Note that the use of a prediction_key in an eval_config serves two use
  cases: (1) as a key into the dict of predictions output by the predict
  extractor, and (2) as the key for a pre-computed prediction stored as a
  feature. The InputExtractor can be used to handle case (2). These cases are
  meant to be exclusive (i.e. if approach (2) is used then a predict extractor
  would not be configured, and if (1) is used then a key matching the
  predictions would not be stored in the features). However, if a feature key
  happens to match the same name as the prediction output key then both paths
  may be executed. In this case, the value stored here will be replaced by the
  predict extractor (though it will still be popped from the features).

  Args:
    eval_config: Eval config.

  Returns:
    Batched extractor for extracting features, labels, and example weights.
  """
  # pylint: disable=no-value-for-parameter
  return extractor.Extractor(
      stage_name=INPUT_EXTRACTOR_STAGE_NAME,
      ptransform=_ExtractBatchedInputs(eval_config=eval_config))
Example #29
Source File: batched_input_extractor.py From model-analysis with Apache License 2.0 | 5 votes |
def _DropUnsupportedColumnsAndFetchRawDataColumn(
    record_batch: pa.RecordBatch
) -> Tuple[pa.RecordBatch, Optional[np.ndarray]]:
  """Drops unsupported columns and fetches the raw data column.

  Currently, types that are not binary_like or ListArray[primitive types] are
  dropped.

  Args:
    record_batch: An Arrow RecordBatch.

  Returns:
    Arrow RecordBatch with only supported columns.
  """
  column_names, column_arrays = [], []
  serialized_examples = None
  for column_name, column_array in zip(record_batch.schema.names,
                                       record_batch.columns):
    column_type = column_array.type
    if column_name == constants.ARROW_INPUT_COLUMN:
      assert (_IsListLike(column_type) and
              _IsBinaryLike(column_type.value_type)), (
                  'Invalid type for batched input key: {}. '
                  'Expected binary like.'.format(column_type))
      serialized_examples = np.asarray(column_array.flatten())
    # Currently we only handle columns of type list<primitive|binary_like>.
    # We ignore other columns as we cannot efficiently convert them into an
    # instance dict format.
    elif (_IsListLike(column_type) and
          _IsSupportedArrowValueType(column_type.value_type)):
      column_names.append(column_name)
      column_arrays.append(column_array)
  return (pa.RecordBatch.from_arrays(column_arrays, column_names),
          serialized_examples)
Example #30
Source File: telemetry.py From tfx-bsl with Apache License 2.0 | 5 votes |
def _UpdateNumCellsCounters(self, record_batch: pa.RecordBatch) -> None:
  num_rows = record_batch.num_rows
  for column in record_batch:
    column_type = column.type
    if pa.types.is_null(column_type):
      self._num_cells_by_type[_ValueType.NULL].inc(num_rows)
      continue

    if _IsListLike(column_type):
      value_type = _GetValueType(column_type.value_type)
    else:
      value_type = _ValueType.OTHER
    self._num_cells_by_type[value_type].inc(num_rows - column.null_count)
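The counter update above iterates over the batch's columns and branches on their Arrow types; a standalone sketch of the same type-inspection pattern using only pyarrow (column names and data are made up):

import pyarrow as pa

batch = pa.RecordBatch.from_arrays(
    [pa.array([[1], [2]]), pa.array([None, None], type=pa.null())],
    names=["values", "empty"])

for name, column in zip(batch.schema.names, batch.columns):
    if pa.types.is_null(column.type):
        kind = "null"
    elif pa.types.is_list(column.type) or pa.types.is_large_list(column.type):
        kind = "list<{}>".format(column.type.value_type)
    else:
        kind = "other"
    print(name, kind, column.null_count)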