Python pyarrow.schema() Examples

The following are 30 code examples of pyarrow.schema(), drawn from open-source projects. The source file and originating project for each example are noted above it. You may also want to check out all available functions and classes of the pyarrow module.
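Before the project examples, here is a minimal, self-contained sketch (not taken from any project below) of what pyarrow.schema() accepts and returns: a pyarrow.Schema built from (name, type) pairs or pyarrow.field() objects.

import pyarrow as pa

# Build a schema from (name, type) pairs; pa.field() objects work as well.
schema = pa.schema([
    ("id", pa.int64()),
    ("name", pa.string()),
    ("scores", pa.list_(pa.float64())),
])

# Schemas are immutable; with_metadata() returns a copy carrying key/value metadata.
schema = schema.with_metadata({"source": "example"})
print(schema.names)                # ['id', 'name', 'scores']
print(schema.field("name").type)   # string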
Example #1
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test__row_from_mapping_w_invalid_schema(self):
        from google.cloud.bigquery.schema import SchemaField
        from google.cloud.bigquery.table import Table

        MAPPING = {
            "full_name": "Phred Phlyntstone",
            "age": 32,
            "colors": ["red", "green"],
            "bogus": "WHATEVER",
        }
        dataset = DatasetReference(self.PROJECT, self.DS_ID)
        table_ref = dataset.table(self.TABLE_NAME)
        full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
        age = SchemaField("age", "INTEGER", mode="REQUIRED")
        colors = SchemaField("colors", "DATETIME", mode="REPEATED")
        bogus = SchemaField("joined", "STRING", mode="BOGUS")
        table = Table(table_ref, schema=[full_name, age, colors, bogus])

        with self.assertRaises(ValueError) as exc:
            self._call_fut(MAPPING, table.schema)

        self.assertIn("Unknown field mode: BOGUS", str(exc.exception)) 
Example #2
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_w_bqstorage_partition(self):
        from google.cloud.bigquery import schema
        from google.cloud.bigquery import table as mut

        bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient)

        row_iterator = mut.RowIterator(
            _mock_client(),
            None,  # api_request: ignored
            None,  # path: ignored
            [schema.SchemaField("colA", "IGNORED")],
            table=mut.TableReference.from_string("proj.dset.tbl$20181225"),
        )

        with pytest.raises(ValueError):
            row_iterator.to_dataframe(bqstorage_client) 
Example #3
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_w_bqstorage_snapshot(self):
        from google.cloud.bigquery import schema
        from google.cloud.bigquery import table as mut

        bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient)

        row_iterator = mut.RowIterator(
            _mock_client(),
            None,  # api_request: ignored
            None,  # path: ignored
            [schema.SchemaField("colA", "IGNORED")],
            table=mut.TableReference.from_string("proj.dset.tbl@1234567890000"),
        )

        with pytest.raises(ValueError):
            row_iterator.to_dataframe(bqstorage_client) 
Example #4
Source File: _pandas_helpers.py    From python-bigquery with Apache License 2.0
def bq_to_arrow_data_type(field):
    """Return the Arrow data type, corresponding to a given BigQuery column.

    Returns:
        None: if default Arrow type inspection should be used.
    """
    if field.mode is not None and field.mode.upper() == "REPEATED":
        inner_type = bq_to_arrow_data_type(
            schema.SchemaField(field.name, field.field_type, fields=field.fields)
        )
        if inner_type:
            return pyarrow.list_(inner_type)
        return None

    field_type_upper = field.field_type.upper() if field.field_type else ""
    if field_type_upper in schema._STRUCT_TYPES:
        return bq_to_arrow_struct_data_type(field)

    data_type_constructor = BQ_TO_ARROW_SCALARS.get(field_type_upper)
    if data_type_constructor is None:
        return None
    return data_type_constructor() 
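The helper above returns None when no Arrow equivalent is known and wraps REPEATED columns in pyarrow.list_. A minimal sketch of that repeated-field mapping, using a hypothetical scalar-type table in place of BQ_TO_ARROW_SCALARS:

import pyarrow as pa

# Hypothetical stand-in for BQ_TO_ARROW_SCALARS, only to illustrate the shape of the mapping.
BQ_SCALARS = {"STRING": pa.string, "INTEGER": pa.int64, "FLOAT": pa.float64}

def repeated_to_arrow(bq_type):
    # A REPEATED BigQuery column becomes an Arrow list of the scalar type, or None if unknown.
    ctor = BQ_SCALARS.get(bq_type.upper())
    return pa.list_(ctor()) if ctor else None

print(repeated_to_arrow("integer"))  # list<item: int64>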
Example #5
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test__row_from_mapping_w_schema(self):
        from google.cloud.bigquery.schema import SchemaField
        from google.cloud.bigquery.table import Table

        MAPPING = {
            "full_name": "Phred Phlyntstone",
            "age": 32,
            "colors": ["red", "green"],
            "extra": "IGNORED",
        }
        dataset = DatasetReference(self.PROJECT, self.DS_ID)
        table_ref = dataset.table(self.TABLE_NAME)
        full_name = SchemaField("full_name", "STRING", mode="REQUIRED")
        age = SchemaField("age", "INTEGER", mode="REQUIRED")
        colors = SchemaField("colors", "DATETIME", mode="REPEATED")
        joined = SchemaField("joined", "STRING", mode="NULLABLE")
        table = Table(table_ref, schema=[full_name, age, colors, joined])

        self.assertEqual(
            self._call_fut(MAPPING, table.schema),
            ("Phred Phlyntstone", 32, ["red", "green"], None),
        ) 
Example #6
Source File: types.py    From LearningApacheSpark with MIT License
def _infer_schema(row, names=None):
    """Infer the schema from dict/namedtuple/object"""
    if isinstance(row, dict):
        items = sorted(row.items())

    elif isinstance(row, (tuple, list)):
        if hasattr(row, "__fields__"):  # Row
            items = zip(row.__fields__, tuple(row))
        elif hasattr(row, "_fields"):  # namedtuple
            items = zip(row._fields, tuple(row))
        else:
            if names is None:
                names = ['_%d' % i for i in range(1, len(row) + 1)]
            elif len(names) < len(row):
                names.extend('_%d' % i for i in range(len(names) + 1, len(row) + 1))
            items = zip(names, row)

    elif hasattr(row, "__dict__"):  # object
        items = sorted(row.__dict__.items())

    else:
        raise TypeError("Can not infer schema for type: %s" % type(row))

    fields = [StructField(k, _infer_type(v), True) for k, v in items]
    return StructType(fields) 
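For comparison with the Spark inference above, pyarrow performs a similar inference when building a table from plain Python rows; the result is a pyarrow.Schema rather than a StructType. A minimal sketch (assumes pyarrow 7.0+ for Table.from_pylist):

import pyarrow as pa

# pyarrow infers field names and types from a list of dicts, much like _infer_schema above.
rows = [{"name": "Phred", "age": 32}, {"name": "Bharney", "age": 33}]
table = pa.Table.from_pylist(rows)
print(table.schema)  # name: string, age: int64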
Example #7
Source File: types.py    From LearningApacheSpark with MIT License
def jsonValue(self):
        if self.scalaUDT():
            assert self.module() != '__main__', 'UDT in __main__ cannot work with ScalaUDT'
            schema = {
                "type": "udt",
                "class": self.scalaUDT(),
                "pyClass": "%s.%s" % (self.module(), type(self).__name__),
                "sqlType": self.sqlType().jsonValue()
            }
        else:
            ser = CloudPickleSerializer()
            b = ser.dumps(type(self))
            schema = {
                "type": "udt",
                "pyClass": "%s.%s" % (self.module(), type(self).__name__),
                "serializedClass": base64.b64encode(b).decode('utf8'),
                "sqlType": self.sqlType().jsonValue()
            }
        return schema 
Example #8
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_w_empty_results_wo_pyarrow(self):
        from google.cloud.bigquery.schema import SchemaField

        with mock.patch("google.cloud.bigquery.table.pyarrow", None):
            schema = [
                SchemaField("name", "STRING", mode="REQUIRED"),
                SchemaField("age", "INTEGER", mode="REQUIRED"),
            ]
            api_request = mock.Mock(return_value={"rows": []})
            row_iterator = self._make_one(_mock_client(), api_request, schema=schema)

            df = row_iterator.to_dataframe()

            self.assertIsInstance(df, pandas.DataFrame)
            self.assertEqual(len(df), 0)  # verify the number of rows
            self.assertEqual(list(df), ["name", "age"])  # verify the column names 
Example #9
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_error_if_pandas_is_none(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        ]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        with self.assertRaises(ValueError):
            row_iterator.to_dataframe() 
Example #10
Source File: _pandas_helpers.py    From python-bigquery with Apache License 2.0
def download_dataframe_tabledata_list(pages, bq_schema, dtypes):
    """Use (slower, but free) tabledata.list to construct a DataFrame.

    Args:
        pages (Iterator[:class:`google.api_core.page_iterator.Page`]):
            An iterator over the result pages.
        bq_schema (Sequence[Union[ \
            :class:`~google.cloud.bigquery.schema.SchemaField`, \
            Mapping[str, Any] \
        ]]):
            A description of the fields in result pages.
        dtypes(Mapping[str, numpy.dtype]):
            The types of columns in result data to hint construction of the
            resulting DataFrame. Not all column types have to be specified.
    Yields:
        :class:`pandas.DataFrame`
        The next page of records as a ``pandas.DataFrame`` record batch.
    """
    bq_schema = schema._to_schema_fields(bq_schema)
    column_names = [field.name for field in bq_schema]
    for page in pages:
        yield _tabledata_list_page_to_dataframe(page, column_names, dtypes) 
Example #11
Source File: parquet_util.py    From professional-services with Apache License 2.0
def get_pa_translated_schema(self):
        """Translates a BigQuery schema to an parquet schema.

        Returns: Translated parquet schema in pyarrow.Schema format.
        """

        type_conversions = {
            'STRING': pa.string(),
            'NUMERIC': pa.int64(),
        }

        # TODO(annarudy@google.com): add support for nested fields
        pa_schema_list = [
            pa.field(
                bq_field.name,
                type_conversions[bq_field.field_type],
            ) for bq_field in self.bq_schema
        ]

        return pa.schema(pa_schema_list) 
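A usage sketch for the translator above, with a hypothetical FakeField class standing in for the google.cloud.bigquery SchemaField objects held in self.bq_schema:

import pyarrow as pa

class FakeField:
    # Hypothetical stand-in exposing only the attributes the translation needs.
    def __init__(self, name, field_type):
        self.name = name
        self.field_type = field_type

type_conversions = {"STRING": pa.string(), "NUMERIC": pa.int64()}
bq_schema = [FakeField("id", "NUMERIC"), FakeField("label", "STRING")]
pa_schema = pa.schema(
    [pa.field(f.name, type_conversions[f.field_type]) for f in bq_schema]
)
print(pa_schema)  # id: int64, label: string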
Example #12
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_no_tqdm_no_progress_bar(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
            {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
            {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
        ]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        with warnings.catch_warnings(record=True) as warned:
            df = row_iterator.to_dataframe(create_bqstorage_client=False)

        self.assertEqual(len(warned), 0)
        self.assertEqual(len(df), 4) 
Example #13
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_w_no_results_wo_pyarrow(self):
        from google.cloud.bigquery.schema import SchemaField

        with mock.patch("google.cloud.bigquery.table.pyarrow", None):
            schema = [
                SchemaField("name", "STRING", mode="REQUIRED"),
                SchemaField("age", "INTEGER", mode="REQUIRED"),
            ]
            api_request = mock.Mock(return_value={"rows": []})
            row_iterator = self._make_one(_mock_client(), api_request, schema=schema)

            def empty_iterable(dtypes=None):
                return []

            row_iterator.to_dataframe_iterable = empty_iterable

            df = row_iterator.to_dataframe()

            self.assertIsInstance(df, pandas.DataFrame)
            self.assertEqual(len(df), 0)  # verify the number of rows
            self.assertEqual(list(df), ["name", "age"])  # verify the column names 
Example #14
Source File: parquet.py    From boxball with Apache License 2.0
def chunked_write(df_iterator: TextFileReader, parquet_writer: pq.ParquetWriter, date_cols: List[str]):
    """
    Writes a Parquet version of the chunked dataframe input.

    Arrow table creation and Parquet writes take up around 25% of the time in this function;
    the CSV read takes around 75%.
    """
    rows_processed = 0
    for df in df_iterator:
        rows_processed += min(BUFFER_SIZE_ROWS, len(df))
        for col_name in date_cols:
            df[col_name] = pd.to_datetime(df[col_name], unit="ms")
        pa_table = pa.Table.from_pandas(df=df, schema=parquet_writer.schema)
        parquet_writer.write_table(pa_table)

        print("Rows processed: {}".format(rows_processed), end="\r", flush=True)
    print() 
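A minimal sketch of the same chunked-write pattern outside the project: one ParquetWriter with a fixed pyarrow schema, and one write_table() call per chunk. The file name and rows are made up for illustration, and Table.from_pylist assumes pyarrow 7.0+.

import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([("name", pa.string()), ("age", pa.int64())])
chunks = ([{"name": "Phred", "age": 32}], [{"name": "Sven", "age": 33}])

# Every chunk must match the writer's schema, so pass it explicitly when building each table.
with pq.ParquetWriter("people.parquet", schema) as writer:
    for chunk in chunks:
        writer.write_table(pa.Table.from_pylist(chunk, schema=schema))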
Example #15
Source File: test_unischema.py    From petastorm with Apache License 2.0
def _mock_parquet_dataset(partitions, arrow_schema):
    """Creates a pyarrow.ParquetDataset mock capable of returning:

        parquet_dataset.pieces[0].get_metadata(parquet_dataset.fs.open).schema.to_arrow_schema() == schema
        parquet_dataset.partitions = partitions

    :param partitions: expected to be a list of pa.parquet.PartitionSet
    :param arrow_schema: an instance of pa.arrow_schema to be assumed by the mock parquet dataset object.
    :return:
    """
    piece_mock = mock.Mock()
    piece_mock.get_metadata().schema.to_arrow_schema.return_value = arrow_schema

    dataset_mock = mock.Mock()
    type(dataset_mock).pieces = mock.PropertyMock(return_value=[piece_mock])
    type(dataset_mock).partitions = partitions

    return dataset_mock 
Example #16
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_w_bqstorage_no_streams(self):
        from google.cloud.bigquery import schema
        from google.cloud.bigquery import table as mut

        bqstorage_client = mock.create_autospec(bigquery_storage_v1.BigQueryReadClient)
        session = bigquery_storage_v1.types.ReadSession()
        bqstorage_client.create_read_session.return_value = session

        row_iterator = mut.RowIterator(
            _mock_client(),
            api_request=None,
            path=None,
            schema=[
                schema.SchemaField("colA", "INTEGER"),
                schema.SchemaField("colC", "FLOAT"),
                schema.SchemaField("colB", "STRING"),
            ],
            table=mut.TableReference.from_string("proj.dset.tbl"),
        )

        got = row_iterator.to_dataframe(bqstorage_client)
        column_names = ["colA", "colC", "colB"]
        self.assertEqual(list(got), column_names)
        self.assertTrue(got.empty) 
Example #17
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
            {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
            {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
        ]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        df = row_iterator.to_dataframe(create_bqstorage_client=False)

        self.assertIsInstance(df, pandas.DataFrame)
        self.assertEqual(len(df), 4)  # verify the number of rows
        self.assertEqual(list(df), ["name", "age"])  # verify the column names
        self.assertEqual(df.name.dtype.name, "object")
        self.assertEqual(df.age.dtype.name, "int64") 
Example #18
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_iterable_error_if_pandas_is_none(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
        ]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        with pytest.raises(ValueError, match="pandas"):
            row_iterator.to_dataframe_iterable() 
Example #19
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_column_dtypes(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("start_timestamp", "TIMESTAMP"),
            SchemaField("seconds", "INT64"),
            SchemaField("miles", "FLOAT64"),
            SchemaField("km", "FLOAT64"),
            SchemaField("payment_type", "STRING"),
            SchemaField("complete", "BOOL"),
            SchemaField("date", "DATE"),
        ]
        row_data = [
            ["1.4338368E9", "420", "1.1", "1.77", u"Cash", "true", "1999-12-01"],
            ["1.3878117E9", "2580", "17.7", "28.5", u"Cash", "false", "1953-06-14"],
            ["1.3855653E9", "2280", "4.4", "7.1", u"Credit", "true", "1981-11-04"],
        ]
        rows = [{"f": [{"v": field} for field in row]} for row in row_data]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        df = row_iterator.to_dataframe(
            dtypes={"km": "float16"}, create_bqstorage_client=False,
        )

        self.assertIsInstance(df, pandas.DataFrame)
        self.assertEqual(len(df), 3)  # verify the number of rows
        exp_columns = [field.name for field in schema]
        self.assertEqual(list(df), exp_columns)  # verify the column names

        self.assertEqual(df.start_timestamp.dtype.name, "datetime64[ns, UTC]")
        self.assertEqual(df.seconds.dtype.name, "int64")
        self.assertEqual(df.miles.dtype.name, "float64")
        self.assertEqual(df.km.dtype.name, "float16")
        self.assertEqual(df.payment_type.dtype.name, "object")
        self.assertEqual(df.complete.dtype.name, "bool")
        self.assertEqual(df.date.dtype.name, "object") 
Example #20
Source File: test_validate.py    From cjworkbench with GNU Affero General Public License v3.0
def test_text_zero_chunks_valid(self):
        validate_table_metadata(
            pyarrow.Table.from_batches([], pyarrow.schema([("A", pyarrow.string())])),
            TableMetadata(0, [Text("A")]),
        ) 
Example #21
Source File: test_validate.py    From cjworkbench with GNU Affero General Public License v3.0
def test_text_dictionary_zero_chunks_is_valid(self):
        validate_table_metadata(
            pyarrow.Table.from_batches(
                [],
                pyarrow.schema(
                    [("A", pyarrow.dictionary(pyarrow.int32(), pyarrow.string()))]
                ),
            ),
            TableMetadata(0, [Text("A")]),
        ) 
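Examples #20 and #21 pass an explicit schema because Table.from_batches() cannot infer one from an empty batch list. A minimal sketch of the same pattern with a dictionary-encoded column:

import pyarrow as pa

schema = pa.schema([("A", pa.dictionary(pa.int32(), pa.string()))])
empty = pa.Table.from_batches([], schema)  # zero rows, but the schema is preserved
print(empty.num_rows)           # 0
print(empty.schema.field("A"))  # A: dictionary<values=string, indices=int32, ordered=0>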
Example #22
Source File: parquet.py    From boxball with Apache License 2.0
def write_files(metadata: AlchemyMetadata) -> None:
    """
    Creates a Parquet file for each table in the schema.
    """
    tables: Iterator[AlchemyTable] = metadata.tables.values()
    for table in tables:
        name = table.name
        print(name)

        def get_path(prefix: Path, suffix: str):
            parent_dir = prefix.joinpath(metadata.schema)
            parent_dir.mkdir(exist_ok=True, parents=True)
            return parent_dir.joinpath(name).with_suffix(suffix)

        extract_file = get_path(EXTRACT_PATH_PREFIX, ".csv.zst")
        parquet_file = get_path(PARQUET_PREFIX, ".parquet")

        pandas_fields = get_pandas_fields(table)
        arrow_fields = get_arrow_fields(table)
        arrow_schema = pa.schema(get_arrow_fields(table))
        column_names = [name for name, dtype in pandas_fields]
        date_cols = [name for name, dtype in arrow_fields if "timestamp" in dtype]

        # Using both Arrow and Pandas allows each library to cover the other's current shortcomings.
        # Pandas's read_csv can handle chunked/complex reads, while Arrow's WriteParquet can handle chunked writes.
        # Arrow's input streams are capable of handling zstd files, which Pandas hasn't implemented yet.
        in_buf = pa.OSFile(str(extract_file), mode="r")
        reader = pa.CompressedInputStream(in_buf, compression="zstd")

        # Have to use snappy codec for Parquet because Drill doesn't read zstd
        parquet_writer = pq.ParquetWriter(parquet_file, schema=arrow_schema, compression='snappy',
                                          version="2.0", use_dictionary=True)
        df_iterator: TextFileReader = pd.read_csv(reader, header=None, names=column_names, dtype=dict(pandas_fields),
                                                  true_values=map_to_bytes('T'), false_values=map_to_bytes('F'),
                                                  chunksize=BUFFER_SIZE_ROWS, parse_dates=date_cols)

        chunked_write(df_iterator, parquet_writer, date_cols) 
Example #23
Source File: common_metadata.py    From kartothek with MIT License
def __init__(self, schema, origin: Union[str, Set[str]]):
        if isinstance(origin, str):
            origin = {origin}
        elif isinstance(origin, set):
            origin = copy(origin)
        if not all(isinstance(s, str) for s in origin):
            raise TypeError("Schema origin elements must be strings.")

        self.__schema = schema
        self.__origin = origin
        self._schema_compat() 
Example #24
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_tabledata_list_w_multiple_pages_return_unique_index(self):
        from google.cloud.bigquery import schema
        from google.cloud.bigquery import table as mut

        iterator_schema = [schema.SchemaField("name", "STRING", mode="REQUIRED")]
        path = "/foo"
        api_request = mock.Mock(
            side_effect=[
                {"rows": [{"f": [{"v": "Bengt"}]}], "pageToken": "NEXTPAGE"},
                {"rows": [{"f": [{"v": "Sven"}]}]},
            ]
        )
        row_iterator = mut.RowIterator(
            _mock_client(),
            api_request,
            path,
            iterator_schema,
            table=mut.Table("proj.dset.tbl"),
        )

        df = row_iterator.to_dataframe(
            bqstorage_client=None, create_bqstorage_client=False,
        )

        self.assertIsInstance(df, pandas.DataFrame)
        self.assertEqual(len(df), 2)
        self.assertEqual(list(df), ["name"])
        self.assertEqual(df.name.dtype.name, "object")
        self.assertTrue(df.index.is_unique) 
Example #25
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_w_various_types_nullable(self):
        import datetime
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("start_timestamp", "TIMESTAMP"),
            SchemaField("seconds", "INT64"),
            SchemaField("miles", "FLOAT64"),
            SchemaField("payment_type", "STRING"),
            SchemaField("complete", "BOOL"),
            SchemaField("date", "DATE"),
        ]
        row_data = [
            [None, None, None, None, None, None],
            ["1.4338368E9", "420", "1.1", u"Cash", "true", "1999-12-01"],
            ["1.3878117E9", "2580", "17.7", u"Cash", "false", "1953-06-14"],
            ["1.3855653E9", "2280", "4.4", u"Credit", "true", "1981-11-04"],
        ]
        rows = [{"f": [{"v": field} for field in row]} for row in row_data]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        df = row_iterator.to_dataframe(create_bqstorage_client=False)

        self.assertIsInstance(df, pandas.DataFrame)
        self.assertEqual(len(df), 4)  # verify the number of rows
        exp_columns = [field.name for field in schema]
        self.assertEqual(list(df), exp_columns)  # verify the column names

        for index, row in df.iterrows():
            if index == 0:
                self.assertTrue(row.isnull().all())
            else:
                self.assertIsInstance(row.start_timestamp, pandas.Timestamp)
                self.assertIsInstance(row.seconds, float)
                self.assertIsInstance(row.payment_type, six.string_types)
                self.assertIsInstance(row.complete, bool)
                self.assertIsInstance(row.date, datetime.date) 
Example #26
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_tqdm_error(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
            {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
            {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
        ]
        path = "/foo"

        for progress_bar_type in ("tqdm", "tqdm_notebook", "tqdm_gui"):
            api_request = mock.Mock(return_value={"rows": rows})
            row_iterator = self._make_one(_mock_client(), api_request, path, schema)

            with warnings.catch_warnings(record=True) as warned:
                df = row_iterator.to_dataframe(
                    progress_bar_type=progress_bar_type, create_bqstorage_client=False,
                )

            self.assertEqual(len(df), 4)  # all should be well

            # Warn that a progress bar was requested, but creating the tqdm
            # progress bar failed.
            for warning in warned:
                self.assertIs(warning.category, UserWarning) 
Example #27
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_no_tqdm(self):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
            {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
            {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
        ]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})
        row_iterator = self._make_one(_mock_client(), api_request, path, schema)

        with warnings.catch_warnings(record=True) as warned:
            df = row_iterator.to_dataframe(
                progress_bar_type="tqdm", create_bqstorage_client=False,
            )

        self.assertEqual(len(warned), 1)
        for warning in warned:
            self.assertIs(warning.category, UserWarning)

        # Even though the progress bar won't show, downloading the dataframe
        # should still work.
        self.assertEqual(len(df), 4) 
Example #28
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_progress_bar_wo_pyarrow(
        self, tqdm_mock, tqdm_notebook_mock, tqdm_gui_mock
    ):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
            {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
            {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
        ]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})

        progress_bars = (
            ("tqdm", tqdm_mock),
            ("tqdm_notebook", tqdm_notebook_mock),
            ("tqdm_gui", tqdm_gui_mock),
        )

        for progress_bar_type, progress_bar_mock in progress_bars:
            row_iterator = self._make_one(_mock_client(), api_request, path, schema)
            with mock.patch("google.cloud.bigquery.table.pyarrow", None):
                df = row_iterator.to_dataframe(progress_bar_type=progress_bar_type)

            progress_bar_mock.assert_called()
            progress_bar_mock().update.assert_called()
            progress_bar_mock().close.assert_called_once()
            self.assertEqual(len(df), 4) 
Example #29
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_progress_bar(
        self, tqdm_mock, tqdm_notebook_mock, tqdm_gui_mock
    ):
        from google.cloud.bigquery.schema import SchemaField

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]
        rows = [
            {"f": [{"v": "Phred Phlyntstone"}, {"v": "32"}]},
            {"f": [{"v": "Bharney Rhubble"}, {"v": "33"}]},
            {"f": [{"v": "Wylma Phlyntstone"}, {"v": "29"}]},
            {"f": [{"v": "Bhettye Rhubble"}, {"v": "27"}]},
        ]
        path = "/foo"
        api_request = mock.Mock(return_value={"rows": rows})

        progress_bars = (
            ("tqdm", tqdm_mock),
            ("tqdm_notebook", tqdm_notebook_mock),
            ("tqdm_gui", tqdm_gui_mock),
        )

        for progress_bar_type, progress_bar_mock in progress_bars:
            row_iterator = self._make_one(_mock_client(), api_request, path, schema)
            df = row_iterator.to_dataframe(
                progress_bar_type=progress_bar_type, create_bqstorage_client=False,
            )

            progress_bar_mock.assert_called()
            progress_bar_mock().update.assert_called()
            progress_bar_mock().close.assert_called_once()
            self.assertEqual(len(df), 4) 
Example #30
Source File: test_table.py    From python-bigquery with Apache License 2.0
def test_to_dataframe_iterable(self):
        from google.cloud.bigquery.schema import SchemaField
        import types

        schema = [
            SchemaField("name", "STRING", mode="REQUIRED"),
            SchemaField("age", "INTEGER", mode="REQUIRED"),
        ]

        path = "/foo"
        api_request = mock.Mock(
            side_effect=[
                {
                    "rows": [{"f": [{"v": "Bengt"}, {"v": "32"}]}],
                    "pageToken": "NEXTPAGE",
                },
                {"rows": [{"f": [{"v": "Sven"}, {"v": "33"}]}]},
            ]
        )

        row_iterator = self._make_one(
            _mock_client(), api_request, path, schema, page_size=1, max_results=5
        )
        dfs = row_iterator.to_dataframe_iterable()

        self.assertIsInstance(dfs, types.GeneratorType)

        df_1 = next(dfs)
        self.assertIsInstance(df_1, pandas.DataFrame)
        self.assertEqual(df_1.name.dtype.name, "object")
        self.assertEqual(df_1.age.dtype.name, "int64")
        self.assertEqual(len(df_1), 1)  # verify the number of rows
        self.assertEqual(
            df_1["name"][0], "Bengt"
        )  # verify the first value of 'name' column
        self.assertEqual(df_1["age"][0], 32)  # verify the first value of 'age' column

        df_2 = next(dfs)
        self.assertEqual(len(df_2), 1)  # verify the number of rows
        self.assertEqual(df_2["name"][0], "Sven")
        self.assertEqual(df_2["age"][0], 33)