Python pyarrow.schema() Examples

The following are 30 code examples of pyarrow.schema(), drawn from open-source projects. The source file and originating project for each example are noted above it. You may also want to check out all available functions and classes of the pyarrow module.
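Before the project examples, here is a minimal, self-contained sketch (not taken from any project below) of what pyarrow.schema() accepts and returns: a pyarrow.Schema built from (name, type) pairs or pyarrow.field() objects.

import pyarrow as pa

# Build a schema from (name, type) pairs; pa.field() objects work as well.
schema = pa.schema([
    ("id", pa.int64()),
    ("name", pa.string()),
    ("scores", pa.list_(pa.float64())),
])

# Schemas are immutable; with_metadata() returns a copy carrying key/value metadata.
schema = schema.with_metadata({"source": "example"})
print(schema.names)                # ['id', 'name', 'scores']
print(schema.field("name").type)   # string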
Example #1
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test__row_from_mapping_w_invalid_schema(self):
        from google.cloud.bigquery.schema import SchemaField
        from google.cloud.bigquery.table import Table

        MAPPING = {
            "full_name": "Phred Phlyntstone",
            "age": 32,
            "colors": ["red", "green"],
            "bogus": "WHATEVER",
        }
        dataset = DatasetReference(self.PROJECT, self.DS_ID)
        table_ref = dataset.table(self.TABLE_NAME)
        full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
        age = SchemaField("age", "INTEGER", mode="REQUIRED")
        colors = SchemaField("colors", "DATETIME", mode="REPEATED")
        bogus = SchemaField("joined", "STRING", mode="BOGUS")
        table = Table(table_ref, schema=[full_name, age, colors, bogus])

        with self.assertRaises(ValueError) as exc:
            self._call_fut(MAPPING, table.schema)

        self.assertIn("Unknown field mode: BOGUS", str(exc.exception)) 
Example #2
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_w_bqstorage_partition(self):
        from google.cloud.bigquery import schema
        from google.cloud.bigquery import table as mut

        bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient)

        row_iterator = mut.RowIterator(
            _mock_client(),
            None,  # api_request: ignored
            None,  # path: ignored
            [schema.SchemaField("colA", "IGNORED")],
            table=mut.TableReference.from_string("proj.dset.tbl$20181225"),
        )

        with pytest.raises(ValueError):
            row_iterator.to_dataframe(bqstorage_client) 
Example #3
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_w_bqstorage_snapshot(self):
        from google.cloud.bigquery import schema
        from google.cloud.bigquery import table as mut

        bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient)

        row_iterator = mut.RowIterator(
            _mock_client(),
            None,  # api_request: ignored
            None,  # path: ignored
            [schema.SchemaField("colA", "IGNORED")],
            table=mut.TableReference.from_string("proj.dset.tbl@1234567890000"),
        )

        with pytest.raises(ValueError):
            row_iterator.to_dataframe(bqstorage_client) 
Example #4
Source File: _pandas_helpers.py    From python-bigquery with Apache License 2.0
def bq_to_arrow_data_type(field):
    """Return the Arrow data type, corresponding to a given BigQuery column.

    Returns:
        None: if default Arrow type inspection should be used.
    """
    if field.mode is not None and field.mode.upper() == "REPEATED":
        inner_type = bq_to_arrow_data_type(
            schema.SchemaField(field.name, field.field_type, fields=field.fields)
        )
        if inner_type:
            return pyarrow.list_(inner_type)
        return None

    field_type_upper = field.field_type.upper() if field.field_type else ""
    if field_type_upper in schema._STRUCT_TYPES:
        return bq_to_arrow_struct_data_type(field)

    data_type_constructor = BQ_TO_ARROW_SCALARS.get(field_type_upper)
    if data_type_constructor is None:
        return None
    return data_type_constructor() 
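The helper above returns None when no Arrow equivalent is known and wraps REPEATED columns in pyarrow.list_. A minimal sketch of that repeated-field mapping, using a hypothetical scalar-type table in place of BQ_TO_ARROW_SCALARS:

import pyarrow as pa

# Hypothetical stand-in for BQ_TO_ARROW_SCALARS, only to illustrate the shape of the mapping.
BQ_SCALARS = {"STRING": pa.string, "INTEGER": pa.int64, "FLOAT": pa.float64}

def repeated_to_arrow(bq_type):
    # A REPEATED BigQuery column becomes an Arrow list of the scalar type, or None if unknown.
    ctor = BQ_SCALARS.get(bq_type.upper())
    return pa.list_(ctor()) if ctor else None

print(repeated_to_arrow("integer"))  # list<item: int64>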
Example #5
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test__row_from_mapping_w_schema(self):
        from google.cloud.bigquery.schema import SchemaField
        from google.cloud.bigquery.table import Table

        MAPPING = {
            "full_name": "Phred Phlyntstone",
            "age": 32,
            "colors": ["red", "green"],
            "extra": "IGNORED",
        }
        dataset = DatasetReference(self.PROJECT, self.DS_ID)
        table_ref = dataset.table(self.TABLE_NAME)
        full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
        age = SchemaField("age", "INTEGER", mode="REQUIRED")
        colors = SchemaField("colors", "DATETIME", mode="REPEATED")
        joined = SchemaField("joined", "STRING", mode="NULLABLE")
        table = Table(table_ref, schema=[full_name, age, colors, joined])

        self.assertEqual(
            self._call_fut(MAPPING, table.schema),
            ("Phred Phlyntstone", 32, ["red", "green"], None),
        ) 
Example #6
Source File: types.py    From LearningApacheSpark with MIT License
def _infer_schema(row, names=None):
    """Infer the schema from dict/namedtuple/object"""
    if isinstance(row, dict):
        items = sorted(row.items())

    elif isinstance(row, (tuple, list)):
        if hasattr(row, "__fields__"):  # Row
            items = zip(row.__fields__, tuple(row))
        elif hasattr(row, "_fields"):  # namedtuple
            items = zip(row._fields, tuple(row))
        else:
            if names is None:
                names = ['_%d' % i for i in range(1, len(row) + 1)]
            elif len(names) < len(row):
                names.extend('_%d' % i for i in range(len(names) + 1, len(row) + 1))
            items = zip(names, row)

    elif hasattr(row, "__dict__"):  # object
        items = sorted(row.__dict__.items())

    else:
        raise TypeError("Can not infer schema for type: %s" % type(row))

    fields = [StructField(k, _infer_type(v), True) for k, v in items]
    return StructType(fields) 
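For comparison with the Spark inference above, pyarrow performs a similar inference when building a table from plain Python rows; the result is a pyarrow.Schema rather than a StructType. A minimal sketch (assumes pyarrow 7.0+ for Table.from_pylist):

import pyarrow as pa

# pyarrow infers field names and types from a list of dicts, much like _infer_schema above.
rows = [{"name": "Phred", "age": 32}, {"name": "Bharney", "age": 33}]
table = pa.Table.from_pylist(rows)
print(table.schema)  # name: string, age: int64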
Example #7
Source File: types.py    From LearningApacheSpark with MIT License
def jsonValue(self):
        if self.scalaUDT():
            assert self.module() != '__main__', 'UDT in __main__ cannot work with ScalaUDT'
            schema = {
                "type": "udt",
                "class": self.scalaUDT(),
                "pyClass": "%s.%s" % (self.module(), type(self).__name__),
                "sqlType": self.sqlType().jsonValue()
            }
        else:
            ser = CloudPickleSerializer()
            b = ser.dumps(type(self))
            schema = {
                "type": "udt",
                "pyClass": "%s.%s" % (self.module(), type(self).__name__),
                "serializedClass": base64.b64encode(b).decode('utf8'),
                "sqlType": self.sqlType().jsonValue()
            }
        return schema 
Example #8
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_w_empty_results_wo_pyarrow(self):
        from google.cloud.bigquery.schema import SchemaField

        with mock.patch("google.cloud.bigquery.table.pyarrow", None):
            schema = [
                SchemaField("name", "STRING", mode="REQUIRED"),
                SchemaField("age", "INTEGER", mode="REQUIRED"),
            ]
            api_request = mock.Mock(return_value={"rows": []})
            row_iterator = self._make_one(_mock_client(), api_request, schema=schema)

            df = row_iterator.to_dataframe()

            self.assertIsInstance(df, pandas.DataFrame)
            self.assertEqual(len(df), 0)  # verify the number of rows
            self.assertEqual(list(df), ["name", "age"])  # verify the column names 
Example #9
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_error_if_pandas_is_none(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        ]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        with self.assertRaises(ValueError):
            row_iterator.to_dataframe() 
Example #10
Source File: _pandas_helpers.py    From python-bigquery with Apache License 2.0
def download_dataframe_tabledata_list(pages, bq_schema, dtypes):
    """Use (slower, but free) tabledata.list to construct a DataFrame.

    Args:
        pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
            An iterator over the result pages.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            A description of the fields in result pages.
        dtypes(Mapping[str, numpy.dtype]):
            The types of columns in result data to hint construction of the
            resulting DataFrame. Not all column types have to be specified.
    Yields:
        :class:`pandas.DataFrame`
        The next page of records as a ``pandas.DataFrame`` record batch.
    """
    bq_schema = schema._to_schema_fields(bq_schema)
    column_names = [field.name for field in bq_schema]
    for page in pages:
        yield _tabledata_list_page_to_dataframe(page, column_names, dtypes) 
Example #11
Source File: parquet_util.py    From professional-services with Apache License 2.0
def get_pa_translated_schema(self):
        """Translates a BigQuery schema to an parquet schema.

        Returns: Translated parquet schema in pyarrow.Schema format.
        """

        type_conversions = {
            'STRING': pa.string(),
            'NUMERIC': pa.int64(),
        }

        # TODO(annarudy@google.com): add support for nested fields
        pa_schema_list = [
            pa.field(
                bq_field.name,
                type_conversions[bq_field.field_type],
            ) for bq_field in self.bq_schema
        ]

        return pa.schema(pa_schema_list) 
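A usage sketch for the translator above, with a hypothetical FakeField class standing in for the google.cloud.bigquery SchemaField objects held in self.bq_schema:

import pyarrow as pa

class FakeField:
    # Hypothetical stand-in exposing only the attributes the translation needs.
    def __init__(self, name, field_type):
        self.name = name
        self.field_type = field_type

type_conversions = {"STRING": pa.string(), "NUMERIC": pa.int64()}
bq_schema = [FakeField("id", "NUMERIC"), FakeField("label", "STRING")]
pa_schema = pa.schema(
    [pa.field(f.name, type_conversions[f.field_type]) for f in bq_schema]
)
print(pa_schema)  # id: int64, label: string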
Example #12
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_no_tqdm_no_progress_bar(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
            {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
            {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
        ]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        with warnings.catch_warnings(record=True) as warned:
            df = row_iterator.to_dataframe(create_bqstorage_client=False)

        self.assertEqual(len(warned), 0)
        self.assertEqual(len(df), 4) 
Example #13
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_w_no_results_wo_pyarrow(self):
        from google.cloud.bigquery.schema import SchemaField

        with mock.patch("google.cloud.bigquery.table.pyarrow", None):
            schema = [
                SchemaField("name", "STRING", mode="REQUIRED"),
                SchemaField("age", "INTEGER", mode="REQUIRED"),
            ]
            api_request = mock.Mock(return_value={"rows": []})
            row_iterator = self._make_one(_mock_client(), api_request, schema=schema)

            def empty_iterable(dtypes=None):
                return []

            row_iterator.to_dataframe_iterable = empty_iterable

            df = row_iterator.to_dataframe()

            self.assertIsInstance(df, pandas.DataFrame)
            self.assertEqual(len(df), 0)  # verify the number of rows
            self.assertEqual(list(df), ["name", "age"])  # verify the column names 
Example #14
Source File: parquet.py    From boxball with Apache License 2.0
def chunked_write(df_iterator: TextFileReader, parquet_writer: pq.ParquetWriter, date_cols: List[str]):
    """
    Writes a Parquet version of the chunked dataframe input.

    Arrow table creation and Parquet writes take up around 25% of the time in this function;
    the CSV read takes around 75%.
    """
    rows_processed = 0
    for df in df_iterator:
        rows_processed += min(BUFFER_SIZE_ROWS, len(df))
        for col_name in date_cols:
            df[col_name] = pd.to_datetime(df[col_name], unit="ms")
        pa_table = pa.Table.from_pandas(df=df, schema=parquet_writer.schema)
        parquet_writer.write_table(pa_table)

        print("Rows processed: {}".format(rows_processed), end="\r", flush=True)
    print() 
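A minimal sketch of the same chunked-write pattern outside the project: one ParquetWriter with a fixed pyarrow schema, and one write_table() call per chunk. The file name and rows are made up for illustration, and Table.from_pylist assumes pyarrow 7.0+.

import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([("name", pa.string()), ("age", pa.int64())])
chunks = ([{"name": "Phred", "age": 32}], [{"name": "Sven", "age": 33}])

# Every chunk must match the writer's schema, so pass it explicitly when building each table.
with pq.ParquetWriter("people.parquet", schema) as writer:
    for chunk in chunks:
        writer.write_table(pa.Table.from_pylist(chunk, schema=schema))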
Example #15
Source File: test_unischema.py    From petastorm with Apache License 2.0
def _mock_parquet_dataset(partitions, arrow_schema):
    """Creates a pyarrow.ParquetDataset mock capable of returning:

        parquet_dataset.pieces[0].get_metadata(parquet_dataset.fs.open).schema.to_arrow_schema() == schema
        parquet_dataset.partitions = partitions

    :param partitions: expected to be a list of pa.parquet.PartitionSet
    :param arrow_schema: an instance of pa.arrow_schema to be assumed by the mock parquet dataset object.
    :return:
    """
    piece_mock = mock.Mock()
    piece_mock.get_metadata().schema.to_arrow_schema.return_value = arrow_schema

    dataset_mock = mock.Mock()
    type(dataset_mock).pieces = mock.PropertyMock(return_value=[piece_mock])
    type(dataset_mock).partitions = partitions

    return dataset_mock 
Example #16
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_w_bqstorage_no_streams(self):
        from google.cloud.bigquery import schema
        from google.cloud.bigquery import table as mut

        bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient)
        session = bigquery_storage_v1.types.ReadSession()
        bqstorage_client.create_read_session.return_value = session

        row_iterator = mut.RowIterator(
            _mock_client(),
            api_request=None,
            path=None,
            schema=[
                schema.SchemaField("colA", "INTEGER"),
                schema.SchemaField("colC", "FLOAT"),
                schema.SchemaField("colB", "STRING"),
            ],
            table=mut.TableReference.from_string("proj.dset.tbl"),
        )

        got = row_iterator.to_dataframe(bqstorage_client)
        column_names = ["colA", "colC", "colB"]
        self.assertEqual(list(got), column_names)
        self.assertTrue(got.empty) 
Example #17
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
            {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
            {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
        ]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        df = row_iterator.to_dataframe(create_bqstorage_client=False)

        self.assertIsInstance(df, pandas.DataFrame)
        self.assertEqual(len(df), 4)  # verify the number of rows
        self.assertEqual(list(df), ["name", "age"])  # verify the column names
        self.assertEqual(df.name.dtype.name, "object")
        self.assertEqual(df.age.dtype.name, "int64") 
Example #18
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_iterable_error_if_pandas_is_none(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        ]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        with pytest.raises(ValueError, match="pandas"):
            row_iterator.to_dataframe_iterable() 
Example #19
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_column_dtypes(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("start_timestamp", "TIMESTAMP"),
            SchemaField("seconds", "INT64"),
            SchemaField("miles", "FLOAT64"),
            SchemaField("km", "FLOAT64"),
            SchemaField("payment_type", "STRING"),
            SchemaField("complete", "BOOL"),
            SchemaField("date", "DATE"),
        ]
        row_data = [
            ["1.4338368E9", "420", "1.1", "1.77", u"Cash", "true", "1999-12-01"],
            ["1.3878117E9", "2580", "17.7", "28.5", u"Cash", "false", "1953-06-14"],
            ["1.3855653E9", "2280", "4.4", "7.1", u"Credit", "true", "1981-11-04"],
        ]
        rows = [{"f": [{"v": field} for field in row]} for row in row_data]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        df = row_iterator.to_dataframe(
            dtypes={"km": "float16"}, create_bqstorage_client=False,
        )

        self.assertIsInstance(df, pandas.DataFrame)
        self.assertEqual(len(df), 3)  # verify the number of rows
        exp_columns = [field.name for field in schema]
        self.assertEqual(list(df), exp_columns)  # verify the column names

        self.assertEqual(df.start_timestamp.dtype.name, "datetime64[ns, UTC]")
        self.assertEqual(df.seconds.dtype.name, "int64")
        self.assertEqual(df.miles.dtype.name, "float64")
        self.assertEqual(df.km.dtype.name, "float16")
        self.assertEqual(df.payment_type.dtype.name, "object")
        self.assertEqual(df.complete.dtype.name, "bool")
        self.assertEqual(df.date.dtype.name, "object") 
Example #20
Source File: test_validate.py    From cjworkbench with GNU Affero General Public License v3.0
def test_text_zero_chunks_valid(self):
        validate_table_metadata(
            pyarrow.Table.from_batches([], pyarrow.schema([("A", pyarrow.string())])),
            TableMetadata(0, [Text("A")]),
        ) 
Example #21
Source File: test_validate.py    From cjworkbench with GNU Affero General Public License v3.0
def test_text_dictionary_zero_chunks_is_valid(self):
        validate_table_metadata(
            pyarrow.Table.from_batches(
                [],
                pyarrow.schema(
                    [("A", pyarrow.dictionary(pyarrow.int32(), pyarrow.string()))]
                ),
            ),
            TableMetadata(0, [Text("A")]),
        ) 
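Examples #20 and #21 pass an explicit schema because Table.from_batches() cannot infer one from an empty batch list. A minimal sketch of the same pattern with a dictionary-encoded column:

import pyarrow as pa

schema = pa.schema([("A", pa.dictionary(pa.int32(), pa.string()))])
empty = pa.Table.from_batches([], schema)  # zero rows, but the schema is preserved
print(empty.num_rows)           # 0
print(empty.schema.field("A"))  # A: dictionary<values=string, indices=int32, ordered=0>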
Example #22
Source File: parquet.py    From boxball with Apache License 2.0
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        pandas_fields = get_pandas_fields(table)
        arrow_fields = get_arrow_fields(table)
        arrow_schema = pa.schema(get_arrow_fields(table))
        column_names = [name for name, dtype in pandas_fields]
        date_cols = [name for name, dtype in arrow_fields if "timestamp" in dtype]

        # Using both Arrow and Pandas allows each library to cover the other's current shortcomings.
        # Pandas's read_csv can handle chunked/complex reads, while Arrow's WriteParquet can handle chunked writes.
        # Arrow's input streams are capable of handling zstd files, which Pandas hasn't implemented yet.
        in_buf = pa.OSFile(str(extract_file), mode="r")
        reader = pa.CompressedInputStream(in_buf, compression="zstd")

        # Have to use snappy codec for Parquet because Drill doesn't read zstd
        parquet_writer = pq.ParquetWriter(parquet_file, schema=arrow_schema, compression='snappy',
                                          version="2.0", use_dictionary=True)
        df_iterator: TextFileReader = pd.read_csv(reader, header=None, names=column_names, dtype=dict(pandas_fields),
                                                  true_values=map_to_bytes('T'), false_values=map_to_bytes('F'),
                                                  chunksize=BUFFER_SIZE_ROWS, parse_dates=date_cols)

        chunked_write(df_iterator, parquet_writer, date_cols) 
Example #23
Source File: common_metadata.py    From kartothek with MIT License
def __init__(self, schema, origin: Union[str, Set[str]]):
        if isinstance(origin, str):
            origin = {origin}
        elif isinstance(origin, set):
            origin = copy(origin)
        if not all(isinstance(s, str) for s in origin):
            raise TypeError("Schema origin elements must be strings.")

        self.__schema = schema
        self.__origin = origin
        self._schema_compat() 
Example #24
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_tabledata_list_w_multiple_pages_return_unique_index(self):
        from google.cloud.bigquery import schema
        from google.cloud.bigquery import table as mut

        iterator_schema = [schema.SchemaField("name", "STRING", mode="REQUIRED")]
        path = "/foo"
        api_request = mock.Mock(
            side_effect=[
                {"rows": [{"f": [{"v": "Bengt"}]}], "pageToken": "NEXTPAGE"},
                {"rows": [{"f": [{"v": "Sven"}]}]},
            ]
        )
        row_iterator = mut.RowIterator(
            _mock_client(),
            api_request,
            path,
            iterator_schema,
            table=mut.Table("proj.dset.tbl"),
        )

        df = row_iterator.to_dataframe(
            bqstorage_client=None, create_bqstorage_client=False,
        )

        self.assertIsInstance(df, pandas.DataFrame)
        self.assertEqual(len(df), 2)
        self.assertEqual(list(df), ["name"])
        self.assertEqual(df.name.dtype.name, "object")
        self.assertTrue(df.index.is_unique) 
Example #25
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_w_various_types_nullable(self):
        import datetime
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("start_timestamp", "TIMESTAMP"),
            SchemaField("seconds", "INT64"),
            SchemaField("miles", "FLOAT64"),
            SchemaField("payment_type", "STRING"),
            SchemaField("complete", "BOOL"),
            SchemaField("date", "DATE"),
        ]
        row_data = [
            [None, None, None, None, None, None],
            ["1.4338368E9", "420", "1.1", u"Cash", "true", "1999-12-01"],
            ["1.3878117E9", "2580", "17.7", u"Cash", "false", "1953-06-14"],
            ["1.3855653E9", "2280", "4.4", u"Credit", "true", "1981-11-04"],
        ]
        rows = [{"f": [{"v": field} for field in row]} for row in row_data]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        df = row_iterator.to_dataframe(create_bqstorage_client=False)

        self.assertIsInstance(df, pandas.DataFrame)
        self.assertEqual(len(df), 4)  # verify the number of rows
        exp_columns = [field.name for field in schema]
        self.assertEqual(list(df), exp_columns)  # verify the column names

        for index, row in df.iterrows():
            if index == 0:
                self.assertTrue(row.isnull().all())
            else:
                self.assertIsInstance(row.start_timestamp, pandas.Timestamp)
                self.assertIsInstance(row.seconds, float)
                self.assertIsInstance(row.payment_type, six.string_types)
                self.assertIsInstance(row.complete, bool)
                self.assertIsInstance(row.date, datetime.date) 
Example #26
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_tqdm_error(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
            {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
            {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
        ]
        path = "/foo"

        for progress_bar_type in ("tqdm", "tqdm_notebook", "tqdm_gui"):
            api_request = mock.Mock(return_value={"rows": rows})
            row_iterator = self._make_one(_mock_client(), api_request, path, schema)

            with warnings.catch_warnings(record=True) as warned:
                df = row_iterator.to_dataframe(
                    progress_bar_type=progress_bar_type, create_bqstorage_client=False,
                )

            self.assertEqual(len(df), 4)  # all should be well

            # Warn that a progress bar was requested, but creating the tqdm
            # progress bar failed.
            for warning in warned:
                self.assertIs(warning.category, UserWarning) 
Example #27
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_no_tqdm(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
            {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
            {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
        ]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        with warnings.catch_warnings(record=True) as warned:
            df = row_iterator.to_dataframe(
                progress_bar_type="tqdm", create_bqstorage_client=False,
            )

        self.assertEqual(len(warned), 1)
        for warning in warned:
            self.assertIs(warning.category, UserWarning)

        # Even though the progress bar won't show, downloading the dataframe
        # should still work.
        self.assertEqual(len(df), 4) 
Example #28
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_progress_bar_wo_pyarrow(
        self, tqdm_mock, tqdm_notebook_mock, tqdm_gui_mock
    ):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
            {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
            {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
        ]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})

        progress_bars = (
            ("tqdm", tqdm_mock),
            ("tqdm_notebook", tqdm_notebook_mock),
            ("tqdm_gui", tqdm_gui_mock),
        )

        for progress_bar_type, progress_bar_mock in progress_bars:
            row_iterator = self._make_one(_mock_client(), api_request, path, schema)
            with mock.patch("google.cloud.bigquery.table.pyarrow", None):
                df = row_iterator.to_dataframe(progress_bar_type=progress_bar_type)

            progress_bar_mock.assert_called()
            progress_bar_mock().update.assert_called()
            progress_bar_mock().close.assert_called_once()
            self.assertEqual(len(df), 4) 
Example #29
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_progress_bar(
        self, tqdm_mock, tqdm_notebook_mock, tqdm_gui_mock
    ):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
            {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
            {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
        ]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})

        progress_bars = (
            ("tqdm", tqdm_mock),
            ("tqdm_notebook", tqdm_notebook_mock),
            ("tqdm_gui", tqdm_gui_mock),
        )

        for progress_bar_type, progress_bar_mock in progress_bars:
            row_iterator = self._make_one(_mock_client(), api_request, path, schema)
            df = row_iterator.to_dataframe(
                progress_bar_type=progress_bar_type, create_bqstorage_client=False,
            )

            progress_bar_mock.assert_called()
            progress_bar_mock().update.assert_called()
            progress_bar_mock().close.assert_called_once()
            self.assertEqual(len(df), 4) 
Example #30
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_iterable(self):
        from google.cloud.bigquery.schema import SchemaField
        import types

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]

        path = "/foo"
        api_request = mock.Mock(
            side_effect=[
                {
                    "rows": [{"f": [{"v": "Bengt"}, {"v": "32"}]}],
                    "pageToken": "NEXTPAGE",
                },
                {"rows": [{"f": [{"v": "Sven"}, {"v": "33"}]}]},
            ]
        )

        row_iterator = self._make_one(
            _mock_client(), api_request, path, schema, page_size=1, max_results=5
        )
        dfs = row_iterator.to_dataframe_iterable()

        self.assertIsInstance(dfs, types.GeneratorType)

        df_1 = next(dfs)
        self.assertIsInstance(df_1, pandas.DataFrame)
        self.assertEqual(df_1.name.dtype.name, "object")
        self.assertEqual(df_1.age.dtype.name, "int64")
        self.assertEqual(len(df_1), 1)  # verify the number of rows
        self.assertEqual(
            df_1["name"][0], "Bengt"
        )  # verify the first value of 'name' column
        self.assertEqual(df_1["age"][0], 32)  # verify the first value of 'age' column

        df_2 = next(dfs)
        self.assertEqual(len(df_2), 1)  # verify the number of rows
        self.assertEqual(df_2["name"][0], "Sven")
        self.assertEqual(df_2["age"][0], 33)