Python pyarrow.schema() Examples
The following are 30 code examples of pyarrow.schema(). You can go to the original project or source file by following the reference above each example.
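Before the project-specific examples, here is a minimal, self-contained sketch of what pyarrow.schema() accepts: a list of (name, type) tuples or pyarrow.field() objects, optionally with metadata. The field names, types, and metadata key below are illustrative only and are not taken from any of the projects listed.

import pyarrow as pa

# A schema can be built from (name, type) tuples ...
schema = pa.schema([
    ("id", pa.int64()),
    ("name", pa.string()),
    ("scores", pa.list_(pa.float64())),
])

# ... or from pa.field() objects, which also allow per-field nullability
# and schema-level metadata.
schema_with_fields = pa.schema(
    [
        pa.field("id", pa.int64(), nullable=False),
        pa.field("name", pa.string()),
    ],
    metadata={"source": "example"},
)

print(schema)
print(schema_with_fields.field("id").nullable)  # False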
Example #1
Source File: test_table.py From python-bigquery with Apache License 2.0 | 6 votes |
def test__row_from_mapping_w_invalid_schema(self):
    from google.cloud.bigquery.schema import SchemaField
    from google.cloud.bigquery.table import Table

    MAPPING = {
        "full_name": "Phred Phlyntstone",
        "age": 32,
        "colors": ["red", "green"],
        "bogus": "WHATEVER",
    }
    dataset = DatasetReference(self.PROJECT, self.DS_ID)
    table_ref = dataset.table(self.TABLE_NAME)
    full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
    age = SchemaField("age", "INTEGER", mode="REQUIRED")
    colors = SchemaField("colors", "DATETIME", mode="REPEATED")
    bogus = SchemaField("joined", "STRING", mode="BOGUS")
    table = Table(table_ref, schema=[full_name, age, colors, bogus])

    with self.assertRaises(ValueError) as exc:
        self._call_fut(MAPPING, table.schema)

    self.assertIn("Unknown field mode: BOGUS", str(exc.exception))
Example #2
Source File: test_table.py From python-bigquery with Apache License 2.0 | 6 votes |
def test_to_dataframe_w_bqstorage_partition(self):
    from google.cloud.bigquery import schema
    from google.cloud.bigquery import table as mut

    bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient)

    row_iterator = mut.RowIterator(
        _mock_client(),
        None,  # api_request: ignored
        None,  # path: ignored
        [schema.SchemaField("colA", "IGNORED")],
        table=mut.TableReference.from_string("proj.dset.tbl$20181225"),
    )

    with pytest.raises(ValueError):
        row_iterator.to_dataframe(bqstorage_client)
Example #3
Source File: test_table.py From python-bigquery with Apache License 2.0 | 6 votes |
def test_to_dataframe_w_bqstorage_snapshot(self):
    from google.cloud.bigquery import schema
    from google.cloud.bigquery import table as mut

    bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient)

    row_iterator = mut.RowIterator(
        _mock_client(),
        None,  # api_request: ignored
        None,  # path: ignored
        [schema.SchemaField("colA", "IGNORED")],
        table=mut.TableReference.from_string("proj.dset.tbl@1234567890000"),
    )

    with pytest.raises(ValueError):
        row_iterator.to_dataframe(bqstorage_client)
Example #4
Source File: _pandas_helpers.py From python-bigquery with Apache License 2.0 | 6 votes |
def bq_to_arrow_data_type(field):
    """Return the Arrow data type, corresponding to a given BigQuery column.

    Returns:
        None: if default Arrow type inspection should be used.
    """
    if field.mode is not None and field.mode.upper() == "REPEATED":
        inner_type = bq_to_arrow_data_type(
            schema.SchemaField(field.name, field.field_type, fields=field.fields)
        )
        if inner_type:
            return pyarrow.list_(inner_type)
        return None

    field_type_upper = field.field_type.upper() if field.field_type else ""
    if field_type_upper in schema._STRUCT_TYPES:
        return bq_to_arrow_struct_data_type(field)

    data_type_constructor = BQ_TO_ARROW_SCALARS.get(field_type_upper)
    if data_type_constructor is None:
        return None
    return data_type_constructor()
Example #5
Source File: test_table.py From python-bigquery with Apache License 2.0 | 6 votes |
def test__row_from_mapping_w_schema(self):
    from google.cloud.bigquery.schema import SchemaField
    from google.cloud.bigquery.table import Table

    MAPPING = {
        "full_name": "Phred Phlyntstone",
        "age": 32,
        "colors": ["red", "green"],
        "extra": "IGNORED",
    }
    dataset = DatasetReference(self.PROJECT, self.DS_ID)
    table_ref = dataset.table(self.TABLE_NAME)
    full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
    age = SchemaField("age", "INTEGER", mode="REQUIRED")
    colors = SchemaField("colors", "DATETIME", mode="REPEATED")
    joined = SchemaField("joined", "STRING", mode="NULLABLE")
    table = Table(table_ref, schema=[full_name, age, colors, joined])

    self.assertEqual(
        self._call_fut(MAPPING, table.schema),
        ("Phred Phlyntstone", 32, ["red", "green"], None),
    )
Example #6
Source File: types.py From LearningApacheSpark with MIT License | 6 votes |
def _infer_schema(row, names=None):
    """Infer the schema from dict/namedtuple/object"""
    if isinstance(row, dict):
        items = sorted(row.items())

    elif isinstance(row, (tuple, list)):
        if hasattr(row, "__fields__"):  # Row
            items = zip(row.__fields__, tuple(row))
        elif hasattr(row, "_fields"):  # namedtuple
            items = zip(row._fields, tuple(row))
        else:
            if names is None:
                names = ['_%d' % i for i in range(1, len(row) + 1)]
            elif len(names) < len(row):
                names.extend('_%d' % i for i in range(len(names) + 1, len(row) + 1))
            items = zip(names, row)

    elif hasattr(row, "__dict__"):  # object
        items = sorted(row.__dict__.items())

    else:
        raise TypeError("Can not infer schema for type: %s" % type(row))

    fields = [StructField(k, _infer_type(v), True) for k, v in items]
    return StructType(fields)
Example #7
Source File: types.py From LearningApacheSpark with MIT License | 6 votes |
def jsonValue(self):
    if self.scalaUDT():
        assert self.module() != '__main__', 'UDT in __main__ cannot work with ScalaUDT'
        schema = {
            "type": "udt",
            "class": self.scalaUDT(),
            "pyClass": "%s.%s" % (self.module(), type(self).__name__),
            "sqlType": self.sqlType().jsonValue()
        }
    else:
        ser = CloudPickleSerializer()
        b = ser.dumps(type(self))
        schema = {
            "type": "udt",
            "pyClass": "%s.%s" % (self.module(), type(self).__name__),
            "serializedClass": base64.b64encode(b).decode('utf8'),
            "sqlType": self.sqlType().jsonValue()
        }
    return schema
Example #8
Source File: test_table.py From python-bigquery with Apache License 2.0 | 6 votes |
def test_to_dataframe_w_empty_results_wo_pyarrow(self):
    from google.cloud.bigquery.schema import SchemaField

    with mock.patch("google.cloud.bigquery.table.pyarrow", None):
        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        api_request = mock.Mock(return_value={"rows": []})
        row_iterator = self._make_one(_mock_client(), api_request, schema=schema)

        df = row_iterator.to_dataframe()

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 0)  # verify the number of rows
    self.assertEqual(list(df), ["name", "age"])  # verify the column names
Example #9
Source File: test_table.py From python-bigquery with Apache License 2.0 | 6 votes |
def test_to_dataframe_error_if_pandas_is_none(self):
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    with self.assertRaises(ValueError):
        row_iterator.to_dataframe()
Example #10
Source File: _pandas_helpers.py From python-bigquery with Apache License 2.0 | 6 votes |
def download_dataframe_tabledata_list(pages, bq_schema, dtypes):
    """Use (slower, but free) tabledata.list to construct a DataFrame.

    Args:
        pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
            An iterator over the result pages.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            A description of the fields in result pages.
        dtypes (Mapping[str, numpy.dtype]):
            The types of columns in result data to hint construction of the
            resulting DataFrame. Not all column types have to be specified.

    Yields:
        :class:`pandas.DataFrame`
            The next page of records as a ``pandas.DataFrame`` record batch.
    """
    bq_schema = schema._to_schema_fields(bq_schema)
    column_names = [field.name for field in bq_schema]
    for page in pages:
        yield _tabledata_list_page_to_dataframe(page, column_names, dtypes)
Example #11
Source File: parquet_util.py From professional-services with Apache License 2.0 | 6 votes |
def get_pa_translated_schema(self):
    """Translates a BigQuery schema to a parquet schema.

    Returns:
        Translated parquet schema in pyarrow.Schema format.
    """
    type_conversions = {
        'STRING': pa.string(),
        'NUMERIC': pa.int64(),
    }

    # TODO(annarudy@google.com): add support for nested fields
    pa_schema_list = [
        pa.field(
            bq_field.name,
            type_conversions[bq_field.field_type],
        )
        for bq_field in self.bq_schema
    ]

    return pa.schema(pa_schema_list)
Example #12
Source File: test_table.py From python-bigquery with Apache License 2.0 | 6 votes |
def test_to_dataframe_no_tqdm_no_progress_bar(self):
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
        {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    with warnings.catch_warnings(record=True) as warned:
        df = row_iterator.to_dataframe(create_bqstorage_client=False)

    self.assertEqual(len(warned), 0)
    self.assertEqual(len(df), 4)
Example #13
Source File: test_table.py From python-bigquery with Apache License 2.0 | 6 votes |
def test_to_dataframe_w_no_results_wo_pyarrow(self):
    from google.cloud.bigquery.schema import SchemaField

    with mock.patch("google.cloud.bigquery.table.pyarrow", None):
        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        api_request = mock.Mock(return_value={"rows": []})
        row_iterator = self._make_one(_mock_client(), api_request, schema=schema)

        def empty_iterable(dtypes=None):
            return []

        row_iterator.to_dataframe_iterable = empty_iterable

        df = row_iterator.to_dataframe()

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 0)  # verify the number of rows
    self.assertEqual(list(df), ["name", "age"])  # verify the column names
Example #14
Source File: parquet.py From boxball with Apache License 2.0 | 6 votes |
def chunked_write(df_iterator: TextFileReader, parquet_writer: pq.ParquetWriter, date_cols: List[str]):
    """
    Writes Parquet version of the chunked dataframe input.

    Arrow table creation and Parquet-writes take up around 25% of the time on this function.
    The CSV read takes around 75%.
    """
    rows_processed = 0
    for df in df_iterator:
        rows_processed += min(BUFFER_SIZE_ROWS, len(df))
        for col_name in date_cols:
            df[col_name] = pd.to_datetime(df[col_name], unit="ms")

        pa_table = pa.Table.from_pandas(df=df, schema=parquet_writer.schema)
        parquet_writer.write_table(pa_table)

        print("Rows processed: {}".format(rows_processed), end="\r", flush=True)
    print()
Example #15
Source File: test_unischema.py From petastorm with Apache License 2.0 | 6 votes |
def _mock_parquet_dataset(partitions, arrow_schema):
    """Creates a pyarrow.ParquetDataset mock capable of returning:
    parquet_dataset.pieces[0].get_metadata(parquet_dataset.fs.open).schema.to_arrow_schema() == schema
    parquet_dataset.partitions = partitions

    :param partitions: expected to be a list of pa.parquet.PartitionSet
    :param arrow_schema: an instance of pa.arrow_schema to be assumed by the mock parquet dataset object.
    :return:
    """
    piece_mock = mock.Mock()
    piece_mock.get_metadata().schema.to_arrow_schema.return_value = arrow_schema

    dataset_mock = mock.Mock()
    type(dataset_mock).pieces = mock.PropertyMock(return_value=[piece_mock])
    type(dataset_mock).partitions = partitions

    return dataset_mock
Example #16
Source File: test_table.py From python-bigquery with Apache License 2.0 | 6 votes |
def test_to_dataframe_w_bqstorage_no_streams(self):
    from google.cloud.bigquery import schema
    from google.cloud.bigquery import table as mut

    bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient)
    session = bigquery_storage_v1.types.ReadSession()
    bqstorage_client.create_read_session.return_value = session

    row_iterator = mut.RowIterator(
        _mock_client(),
        api_request=None,
        path=None,
        schema=[
            schema.SchemaField("colA", "INTEGER"),
            schema.SchemaField("colC", "FLOAT"),
            schema.SchemaField("colB", "STRING"),
        ],
        table=mut.TableReference.from_string("proj.dset.tbl"),
    )

    got = row_iterator.to_dataframe(bqstorage_client)

    column_names = ["colA", "colC", "colB"]
    self.assertEqual(list(got), column_names)
    self.assertTrue(got.empty)
Example #17
Source File: test_table.py From python-bigquery with Apache License 2.0 | 6 votes |
def test_to_dataframe(self):
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
        {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    df = row_iterator.to_dataframe(create_bqstorage_client=False)

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 4)  # verify the number of rows
    self.assertEqual(list(df), ["name", "age"])  # verify the column names
    self.assertEqual(df.name.dtype.name, "object")
    self.assertEqual(df.age.dtype.name, "int64")
Example #18
Source File: test_table.py From python-bigquery with Apache License 2.0 | 6 votes |
def test_to_dataframe_iterable_error_if_pandas_is_none(self):
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    with pytest.raises(ValueError, match="pandas"):
        row_iterator.to_dataframe_iterable()
Example #19
Source File: test_table.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_to_dataframe_column_dtypes(self):
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("start_timestamp", "TIMESTAMP"),
        SchemaField("seconds", "INT64"),
        SchemaField("miles", "FLOAT64"),
        SchemaField("km", "FLOAT64"),
        SchemaField("payment_type", "STRING"),
        SchemaField("complete", "BOOL"),
        SchemaField("date", "DATE"),
    ]
    row_data = [
        ["1.4338368E9", "420", "1.1", "1.77", u"Cash", "true", "1999-12-01"],
        ["1.3878117E9", "2580", "17.7", "28.5", u"Cash", "false", "1953-06-14"],
        ["1.3855653E9", "2280", "4.4", "7.1", u"Credit", "true", "1981-11-04"],
    ]
    rows = [{"f": [{"v": field} for field in row]} for row in row_data]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    df = row_iterator.to_dataframe(
        dtypes={"km": "float16"},
        create_bqstorage_client=False,
    )

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 3)  # verify the number of rows
    exp_columns = [field.name for field in schema]
    self.assertEqual(list(df), exp_columns)  # verify the column names

    self.assertEqual(df.start_timestamp.dtype.name, "datetime64[ns, UTC]")
    self.assertEqual(df.seconds.dtype.name, "int64")
    self.assertEqual(df.miles.dtype.name, "float64")
    self.assertEqual(df.km.dtype.name, "float16")
    self.assertEqual(df.payment_type.dtype.name, "object")
    self.assertEqual(df.complete.dtype.name, "bool")
    self.assertEqual(df.date.dtype.name, "object")
Example #20
Source File: test_validate.py From cjworkbench with GNU Affero General Public License v3.0 | 5 votes |
def test_text_zero_chunks_valid(self):
    validate_table_metadata(
        pyarrow.Table.from_batches([], pyarrow.schema([("A", pyarrow.string())])),
        TableMetadata(0, [Text("A")]),
    )
Example #21
Source File: test_validate.py From cjworkbench with GNU Affero General Public License v3.0 | 5 votes |
def test_text_dictionary_zero_chunks_is_valid(self):
    validate_table_metadata(
        pyarrow.Table.from_batches(
            [],
            pyarrow.schema(
                [("A", pyarrow.dictionary(pyarrow.int32(), pyarrow.string()))]
            ),
        ),
        TableMetadata(0, [Text("A")]),
    )
Example #22
Source File: parquet.py From boxball with Apache License 2.0 | 5 votes |
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        pandas_fields = get_pandas_fields(table)
        arrow_fields = get_arrow_fields(table)
        arrow_schema = pa.schema(get_arrow_fields(table))
        column_names = [name for name, dtype in pandas_fields]
        date_cols = [name for name, dtype in arrow_fields if "timestamp" in dtype]

        # Using both Arrow and Pandas allows each library to cover the other's current shortcomings.
        # Pandas's read_csv can handle chunked/complex reads, while Arrow's WriteParquet can handle chunked writes.
        # Arrow's input streams are capable of handling zstd files, which Pandas hasn't implemented yet.
        in_buf = pa.OSFile(str(extract_file), mode="r")
        reader = pa.CompressedInputStream(in_buf, compression="zstd")

        # Have to use snappy codec for Parquet because Drill doesn't read zstd
        parquet_writer = pq.ParquetWriter(parquet_file, schema=arrow_schema, compression='snappy',
                                          version="2.0", use_dictionary=True)
        df_iterator: TextFileReader = pd.read_csv(reader,
                                                  header=None,
                                                  names=column_names,
                                                  dtype=dict(pandas_fields),
                                                  true_values=map_to_bytes('T'),
                                                  false_values=map_to_bytes('F'),
                                                  chunksize=BUFFER_SIZE_ROWS,
                                                  parse_dates=date_cols)

        chunked_write(df_iterator, parquet_writer, date_cols)
Example #23
Source File: common_metadata.py From kartothek with MIT License | 5 votes |
def __init__(self, schema, origin: Union[str, Set[str]]):
    if isinstance(origin, str):
        origin = {origin}
    elif isinstance(origin, set):
        origin = copy(origin)

    if not all(isinstance(s, str) for s in origin):
        raise TypeError("Schema origin elements must be strings.")

    self.__schema = schema
    self.__origin = origin
    self._schema_compat()
Example #24
Source File: test_table.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_to_dataframe_tabledata_list_w_multiple_pages_return_unique_index(self):
    from google.cloud.bigquery import schema
    from google.cloud.bigquery import table as mut

    iterator_schema = [schema.SchemaField("name", "STRING", mode="REQUIRED")]
    path = "/foo"
    api_request = mock.Mock(
        side_effect=[
            {"rows": [{"f": [{"v": "Bengt"}]}], "pageToken": "NEXTPAGE"},
            {"rows": [{"f": [{"v": "Sven"}]}]},
        ]
    )
    row_iterator = mut.RowIterator(
        _mock_client(),
        api_request,
        path,
        iterator_schema,
        table=mut.Table("proj.dset.tbl"),
    )

    df = row_iterator.to_dataframe(
        bqstorage_client=None,
        create_bqstorage_client=False,
    )

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 2)
    self.assertEqual(list(df), ["name"])
    self.assertEqual(df.name.dtype.name, "object")
    self.assertTrue(df.index.is_unique)
Example #25
Source File: test_table.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_to_dataframe_w_various_types_nullable(self):
    import datetime
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("start_timestamp", "TIMESTAMP"),
        SchemaField("seconds", "INT64"),
        SchemaField("miles", "FLOAT64"),
        SchemaField("payment_type", "STRING"),
        SchemaField("complete", "BOOL"),
        SchemaField("date", "DATE"),
    ]
    row_data = [
        [None, None, None, None, None, None],
        ["1.4338368E9", "420", "1.1", u"Cash", "true", "1999-12-01"],
        ["1.3878117E9", "2580", "17.7", u"Cash", "false", "1953-06-14"],
        ["1.3855653E9", "2280", "4.4", u"Credit", "true", "1981-11-04"],
    ]
    rows = [{"f": [{"v": field} for field in row]} for row in row_data]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    df = row_iterator.to_dataframe(create_bqstorage_client=False)

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 4)  # verify the number of rows
    exp_columns = [field.name for field in schema]
    self.assertEqual(list(df), exp_columns)  # verify the column names

    for index, row in df.iterrows():
        if index == 0:
            self.assertTrue(row.isnull().all())
        else:
            self.assertIsInstance(row.start_timestamp, pandas.Timestamp)
            self.assertIsInstance(row.seconds, float)
            self.assertIsInstance(row.payment_type, six.string_types)
            self.assertIsInstance(row.complete, bool)
            self.assertIsInstance(row.date, datetime.date)
Example #26
Source File: test_table.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_to_dataframe_tqdm_error(self):
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
        {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
    ]
    path = "/foo"

    for progress_bar_type in ("tqdm", "tqdm_notebook", "tqdm_gui"):
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        with warnings.catch_warnings(record=True) as warned:
            df = row_iterator.to_dataframe(
                progress_bar_type=progress_bar_type,
                create_bqstorage_client=False,
            )

        self.assertEqual(len(df), 4)  # all should be well

        # Warn that a progress bar was requested, but creating the tqdm
        # progress bar failed.
        for warning in warned:
            self.assertIs(warning.category, UserWarning)
Example #27
Source File: test_table.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_to_dataframe_no_tqdm(self):
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
        {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    with warnings.catch_warnings(record=True) as warned:
        df = row_iterator.to_dataframe(
            progress_bar_type="tqdm",
            create_bqstorage_client=False,
        )

    self.assertEqual(len(warned), 1)
    for warning in warned:
        self.assertIs(warning.category, UserWarning)

    # Even though the progress bar won't show, downloading the dataframe
    # should still work.
    self.assertEqual(len(df), 4)
Example #28
Source File: test_table.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_to_dataframe_progress_bar_wo_pyarrow(
    self, tqdm_mock, tqdm_notebook_mock, tqdm_gui_mock
):
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
        {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})

    progress_bars = (
        ("tqdm", tqdm_mock),
        ("tqdm_notebook", tqdm_notebook_mock),
        ("tqdm_gui", tqdm_gui_mock),
    )

    for progress_bar_type, progress_bar_mock in progress_bars:
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)
        with mock.patch("google.cloud.bigquery.table.pyarrow", None):
            df = row_iterator.to_dataframe(progress_bar_type=progress_bar_type)

        progress_bar_mock.assert_called()
        progress_bar_mock().update.assert_called()
        progress_bar_mock().close.assert_called_once()
        self.assertEqual(len(df), 4)
Example #29
Source File: test_table.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_to_dataframe_progress_bar(
    self, tqdm_mock, tqdm_notebook_mock, tqdm_gui_mock
):
    from google.cloud.bigquery.schema import SchemaField

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    rows = [
        {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
        {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
        {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})

    progress_bars = (
        ("tqdm", tqdm_mock),
        ("tqdm_notebook", tqdm_notebook_mock),
        ("tqdm_gui", tqdm_gui_mock),
    )

    for progress_bar_type, progress_bar_mock in progress_bars:
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)
        df = row_iterator.to_dataframe(
            progress_bar_type=progress_bar_type,
            create_bqstorage_client=False,
        )

        progress_bar_mock.assert_called()
        progress_bar_mock().update.assert_called()
        progress_bar_mock().close.assert_called_once()
        self.assertEqual(len(df), 4)
Example #30
Source File: test_table.py From python-bigquery with Apache License 2.0 | 5 votes |
def test_to_dataframe_iterable(self):
    from google.cloud.bigquery.schema import SchemaField
    import types

    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    path = "/foo"
    api_request = mock.Mock(
        side_effect=[
            {
                "rows": [{"f": [{"v": "Bengt"}, {"v": "32"}]}],
                "pageToken": "NEXTPAGE",
            },
            {"rows": [{"f": [{"v": "Sven"}, {"v": "33"}]}]},
        ]
    )

    row_iterator = self._make_one(
        _mock_client(), api_request, path, schema, page_size=1, max_results=5
    )
    dfs = row_iterator.to_dataframe_iterable()

    self.assertIsInstance(dfs, types.GeneratorType)

    df_1 = next(dfs)
    self.assertIsInstance(df_1, pandas.DataFrame)
    self.assertEqual(df_1.name.dtype.name, "object")
    self.assertEqual(df_1.age.dtype.name, "int64")
    self.assertEqual(len(df_1), 1)  # verify the number of rows
    self.assertEqual(df_1["name"][0], "Bengt")  # verify the first value of 'name' column
    self.assertEqual(df_1["age"][0], 32)  # verify the first value of 'age' column

    df_2 = next(dfs)
    self.assertEqual(len(df_2), 1)  # verify the number of rows
    self.assertEqual(df_2["name"][0], "Sven")
    self.assertEqual(df_2["age"][0], 33)