Python pandas.read_gbq() Examples
The following are 25 code examples of pandas.read_gbq(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas, or try the search function.
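The examples below come from real projects, so they reference project-specific helpers, tables, and credentials. As a minimal, self-contained starting point, here is a sketch of a typical read_gbq() call against a BigQuery public dataset (the same usa_names table used in Examples #6 and #16). 'your-project-id' is a placeholder, and the call assumes application default credentials are already configured in the environment:

import pandas as pd

# Run a standard SQL query against a public dataset. The project ID passed
# here ('your-project-id' is a placeholder) is the project billed for the query.
sql = """
    SELECT name
    FROM `bigquery-public-data.usa_names.usa_1910_current`
    LIMIT 10
"""
df = pd.read_gbq(sql, project_id='your-project-id', dialect='standard')
print(df.head())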
Example #1
Source File: utils.py From cloudml-samples with Apache License 2.0 | 6 votes |
def read_df_from_bigquery(full_table_path, project_id=None, num_samples=None):
    """Read data from BigQuery.

    Args:
      full_table_path: (string) full path of the table containing training data
        in the format of [project_id.dataset_name.table_name].
      project_id: (string, Optional) Google BigQuery Account project ID.
      num_samples: (int, Optional) Number of data samples to read.

    Returns:
      pandas.DataFrame
    """
    query = metadata.BASE_QUERY.format(table=full_table_path)
    limit = ' LIMIT {}'.format(num_samples) if num_samples else ''
    query += limit

    # Use "application default credentials"
    # Use SQL syntax dialect
    data_df = pd.read_gbq(query, project_id=project_id, dialect='standard')

    return data_df
Example #2
Source File: test_gbq.py From vnpy_crypto with MIT License | 6 votes |
def test_roundtrip(self):
    destination_table = DESTINATION_TABLE + "1"

    test_size = 20001
    df = make_mixed_dataframe_v2(test_size)

    df.to_gbq(destination_table, _get_project_id(), chunksize=10000,
              private_key=_get_private_key_path())

    sleep(30)  # <- Curses Google!!!

    result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                         .format(destination_table),
                         project_id=_get_project_id(),
                         private_key=_get_private_key_path())
    assert result['num_rows'][0] == test_size
Example #3
Source File: weatherhistoryapp.py From spyre with MIT License | 6 votes |
def get_data(self, type, station_ids, n_years):
    query = """
        SELECT
            station_number, year, month, day,
            {type} as value,
            rain, snow
        FROM `publicdata.samples.gsod`
        WHERE station_number IN ({stns})
          AND year < 2010
          AND year >= {minyr}
    """.format(
        type=type,
        stns=','.join(station_ids),
        minyr=2010 - n_years,
    )
    df = pd.read_gbq(query, project_id=PROJECT_ID, dialect='standard')
    df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
    # Wrap the dict views in list() so the DataFrame constructor accepts them
    # (pandas rejects set-like views such as dict.keys()).
    stations_df = pd.DataFrame({
        'location': list(stations.keys()),
        'station_number': [int(v) for v in stations.values()],
    })
    df = pd.merge(df, stations_df, on='station_number')
    return df
Example #4
Source File: io.py From modin with Apache License 2.0 | 6 votes |
def read_gbq(
    cls,
    query: str,
    project_id=None,
    index_col=None,
    col_order=None,
    reauth=False,
    auth_local_webserver=False,
    dialect=None,
    location=None,
    configuration=None,
    credentials=None,
    use_bqstorage_api=None,
    private_key=None,
    verbose=None,
    progress_bar_type=None,
):
    ErrorMessage.default_to_pandas("`read_gbq`")
    return cls.from_pandas(
        pandas.read_gbq(
            query,
            project_id=project_id,
            index_col=index_col,
            col_order=col_order,
            reauth=reauth,
            auth_local_webserver=auth_local_webserver,
            dialect=dialect,
            location=location,
            configuration=configuration,
            credentials=credentials,
            use_bqstorage_api=use_bqstorage_api,
            private_key=private_key,
            verbose=verbose,
            progress_bar_type=progress_bar_type,
        )
    )
Example #5
Source File: test_gbq.py From elasticintel with GNU General Public License v3.0 | 6 votes |
def test_roundtrip(self):
    destination_table = DESTINATION_TABLE + "1"

    test_size = 20001
    df = make_mixed_dataframe_v2(test_size)

    df.to_gbq(destination_table, _get_project_id(), chunksize=10000,
              private_key=_get_private_key_path())

    sleep(30)  # <- Curses Google!!!

    result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                         .format(destination_table),
                         project_id=_get_project_id(),
                         private_key=_get_private_key_path())
    assert result['num_rows'][0] == test_size
Example #6
Source File: samples_test.py From python-docs-samples with Apache License 2.0 | 6 votes |
def test_pandas_gbq_query():
    # [START bigquery_migration_pandas_gbq_query]
    import pandas

    sql = """
        SELECT name
        FROM `bigquery-public-data.usa_names.usa_1910_current`
        WHERE state = 'TX'
        LIMIT 100
    """

    # Run a Standard SQL query using the environment's default project
    df = pandas.read_gbq(sql, dialect='standard')

    # Run a Standard SQL query with the project set explicitly
    project_id = 'your-project-id'
    # [END bigquery_migration_pandas_gbq_query]
    assert len(df) > 0
    project_id = os.environ['GOOGLE_CLOUD_PROJECT']
    # [START bigquery_migration_pandas_gbq_query]
    df = pandas.read_gbq(sql, project_id=project_id, dialect='standard')
    # [END bigquery_migration_pandas_gbq_query]
    assert len(df) > 0
Example #7
Source File: compute_vessel_metrics.py From vessel-classification with Apache License 2.0 | 6 votes |
def load_class_weights(inference_table):
    query = '''
        with core as (
            select * from `{}*`
            where max_label is not null
        ),
        count as (
            select count(*) as total from core
        )
        select max_label as label, count(*) / total as fraction
        from core
        cross join count
        group by label, total
        order by fraction desc
    '''.format(inference_table)
    df = pd.read_gbq(query, project_id='world-fishing-827', dialect='standard')
    wt_map = {x.label: x.fraction for x in df.itertuples()}
    return wt_map
Example #8
Source File: compute_vessel_metrics.py From vessel-classification with Apache License 2.0 | 6 votes |
def load_inferred(inference_table, label_table, extractors):
    """Load inferred data and generate comparison data."""
    query = """
        SELECT inference_table.* except (ssvid), ssvid as id
        FROM `{}` label_table
        JOIN `{}*` inference_table
        ON (cast(label_table.id as string) = inference_table.ssvid)
        where split = "Test"
    """.format(label_table, inference_table)
    print(query)
    df = pd.read_gbq(query, project_id='world-fishing-827', dialect='standard')
    for row in df.itertuples():
        for ext in extractors:
            ext.extract(row)
    for ext in extractors:
        ext.finalize()
Example #9
Source File: samples_test.py From python-docs-samples with Apache License 2.0 | 6 votes |
def test_pandas_gbq_query_with_parameters():
    # [START bigquery_migration_pandas_gbq_query_parameters]
    import pandas

    sql = """
        SELECT name
        FROM `bigquery-public-data.usa_names.usa_1910_current`
        WHERE state = @state
        LIMIT @limit
    """
    query_config = {
        'query': {
            'parameterMode': 'NAMED',
            'queryParameters': [
                {
                    'name': 'state',
                    'parameterType': {'type': 'STRING'},
                    'parameterValue': {'value': 'TX'}
                },
                {
                    'name': 'limit',
                    'parameterType': {'type': 'INTEGER'},
                    'parameterValue': {'value': 100}
                }
            ]
        }
    }
    df = pandas.read_gbq(sql, configuration=query_config)
    # [END bigquery_migration_pandas_gbq_query_parameters]
    assert len(df) > 0
Example #10
Source File: utils.py From professional-services with Apache License 2.0 | 6 votes |
def read_df_from_bigquery(full_table_path, project_id=None, num_samples=None):
    """Read data from BigQuery.

    Args:
      full_table_path: (string) full path of the table containing training data
        in the format of [project_id.dataset_name.table_name].
      project_id: (string, Optional) Google BigQuery Account project ID.
      num_samples: (int, Optional) Number of data samples to read.

    Returns:
      pandas.DataFrame
    """
    query = metadata.BASE_QUERY.format(table=full_table_path)
    limit = ' LIMIT {}'.format(num_samples) if num_samples else ''
    query += limit

    # Use "application default credentials"
    # Use SQL syntax dialect
    data_df = pd.read_gbq(query, project_id=project_id, dialect='standard')

    return data_df
Example #11
Source File: bigquery.py From openprescribing with MIT License | 5 votes |
def query_into_dataframe(self, sql, legacy=False):
    sql = interpolate_sql(sql)
    kwargs = {
        "project_id": self.project,
        "dialect": "legacy" if legacy else "standard",
    }
    with exception_sql_printer(sql):
        return pd.read_gbq(sql, **kwargs)
Example #12
Source File: test_gbq.py From recruit with Apache License 2.0 | 5 votes |
def test_read_gbq_without_dialect_warns_future_change(monkeypatch):
    # Default dialect is changing to standard SQL. See:
    # https://github.com/pydata/pandas-gbq/issues/195
    def mock_read_gbq(*args, **kwargs):
        return DataFrame([[1.0]])

    monkeypatch.setattr(pandas_gbq, 'read_gbq', mock_read_gbq)
    with tm.assert_produces_warning(FutureWarning):
        pd.read_gbq("SELECT 1")
Example #13
Source File: compute_fishing_metrics.py From vessel-classification with Apache License 2.0 | 5 votes |
def load_inferred_fishing(table, id_list, project_id, threshold=True):
    """Load inferred data and generate comparison data."""
    query_template = """
        SELECT vessel_id as id, start_time, end_time, nnet_score
        FROM TABLE_DATE_RANGE([{table}],
            TIMESTAMP('{year}-01-01'), TIMESTAMP('{year}-12-31'))
        WHERE vessel_id in ({ids})
    """
    ids = ','.join('"{}"'.format(x) for x in id_list)
    ranges = defaultdict(list)
    for year in range(2012, 2019):
        query = query_template.format(table=table, year=year, ids=ids)
        try:
            df = pd.read_gbq(query, project_id=project_id, dialect='legacy')
        except pandas_gbq.gbq.GenericGBQException as err:
            if 'matches no table' in err.args[0]:
                print('skipping', year)
                continue
            else:
                print(query)
                raise
        for x in df.itertuples():
            score = x.nnet_score
            if threshold:
                score = score > 0.5
            start = x.start_time.replace(tzinfo=pytz.utc)
            end = x.end_time.replace(tzinfo=pytz.utc)
            ranges[x.id].append(FishingRange(score, start, end))
    return ranges
Example #14
Source File: create_train_info.py From vessel-classification with Apache License 2.0 | 5 votes |
def read_vessel_database_for_char_mmsi(dbname, dataset):
    query = '''
        with multi_id as (
            select identity.ssvid as id
            from {dbname}
            group by id
            having count(identity.ssvid) > 1
        )
        select identity.ssvid as id,
               feature.length_m as length,
               feature.tonnage_gt as tonnage,
               feature.engine_power_kw as engine_power,
               feature.crew as crew_size,
               array_to_string(feature.geartype, '|') as label
        from {dbname} a
        where (feature.length_m is not null or feature.tonnage_gt is not null
               or feature.engine_power_kw is not null or feature.crew is not null
               or (feature.geartype is not null and array_length(feature.geartype) > 0))
          and identity.ssvid not in (select * from multi_id)
        order by id
    '''.format(**locals())
    try:
        return pd.read_gbq(query, dialect='standard',
                           project_id='world-fishing-827')
    except:
        print(query)
        raise
Example #15
Source File: query.py From subreddit-generator with MIT License | 5 votes |
def get_reddit_data(project_id, subreddits, start_month, end_month, max_posts):
    query = '''
        # standardSQL
        SELECT
          title,
          subreddit AS context_label
        FROM (
          SELECT
            title,
            subreddit,
            ROW_NUMBER() OVER (PARTITION BY subreddit ORDER BY score DESC) AS rank_num
          FROM `fh-bigquery.reddit_posts.*`
          WHERE _TABLE_SUFFIX BETWEEN "{}" AND "{}"
            AND LOWER(subreddit) IN ({})
        )
        WHERE rank_num <= {}
    '''
    query = query.format(start_month, end_month,
                         str([x.lower() for x in subreddits])[1:-1],
                         max_posts)

    df = pd.read_gbq(query, project_id, dialect='standard')
    return df
Example #16
Source File: samples_test.py From python-docs-samples with Apache License 2.0 | 5 votes |
def test_pandas_gbq_legacy_query():
    # [START bigquery_migration_pandas_gbq_query_legacy]
    import pandas

    sql = """
        SELECT name
        FROM [bigquery-public-data:usa_names.usa_1910_current]
        WHERE state = 'TX'
        LIMIT 100
    """

    df = pandas.read_gbq(sql, dialect='legacy')
    # [END bigquery_migration_pandas_gbq_query_legacy]
    assert len(df) > 0
Example #17
Source File: samples_test.py From python-docs-samples with Apache License 2.0 | 5 votes |
def test_pandas_gbq_query_bqstorage():
    # [START bigquery_migration_pandas_gbq_query_bqstorage]
    import pandas

    sql = "SELECT * FROM `bigquery-public-data.irs_990.irs_990_2012`"

    # Use the BigQuery Storage API to download results more quickly.
    df = pandas.read_gbq(sql, dialect='standard', use_bqstorage_api=True)
    # [END bigquery_migration_pandas_gbq_query_bqstorage]
    assert len(df) > 0
Example #18
Source File: test_gbq.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_roundtrip(self):
    destination_table = DESTINATION_TABLE + "1"

    test_size = 20001
    df = make_mixed_dataframe_v2(test_size)

    df.to_gbq(destination_table, _get_project_id(), chunksize=None,
              credentials=_get_credentials())

    result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                         .format(destination_table),
                         project_id=_get_project_id(),
                         credentials=_get_credentials(),
                         dialect="standard")
    assert result['num_rows'][0] == test_size
Example #19
Source File: test_gbq.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_read_gbq_without_dialect_warns_future_change(monkeypatch):
    # Default dialect is changing to standard SQL. See:
    # https://github.com/pydata/pandas-gbq/issues/195
    def mock_read_gbq(*args, **kwargs):
        return DataFrame([[1.0]])

    monkeypatch.setattr(pandas_gbq, 'read_gbq', mock_read_gbq)
    with tm.assert_produces_warning(FutureWarning):
        pd.read_gbq("SELECT 1")
Example #20
Source File: utils.py From bigquery-bokeh-dashboard with Apache License 2.0 | 5 votes |
def _run(query, dialect='legacy'):
    return pd.read_gbq(
        query,
        project_id=os.environ['GOOGLE_PROJECT_ID'],
        private_key=os.environ['GOOGLE_APPLICATION_CREDENTIALS'],
        dialect=dialect
    )
Example #21
Source File: test_gbq.py From recruit with Apache License 2.0 | 5 votes |
def test_roundtrip(self):
    destination_table = DESTINATION_TABLE + "1"

    test_size = 20001
    df = make_mixed_dataframe_v2(test_size)

    df.to_gbq(destination_table, _get_project_id(), chunksize=None,
              credentials=_get_credentials())

    result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                         .format(destination_table),
                         project_id=_get_project_id(),
                         credentials=_get_credentials(),
                         dialect="standard")
    assert result['num_rows'][0] == test_size
Example #22
Source File: create_train_info.py From vessel-classification with Apache License 2.0 | 4 votes |
def read_vessel_database_for_char_vessel_id(dbname, dataset):
    query = '''
        with
        core as (
            select vessel_id as id,
                   feature.length_m as length,
                   feature.tonnage_gt as tonnage,
                   feature.engine_power_kw as engine_power,
                   feature.geartype as label,
                   feature.crew as crew_size,
                   array_to_string(feature.geartype, '|') as lbl,
                   pos_count
            from (select * from {dbname} cross join unnest (activity)) a
            join `{dataset}.vessel_info` b
            on (a.identity.ssvid = b.ssvid)
            where (not ((a.last_timestamp is not null and
                         a.last_timestamp < b.first_timestamp) or
                        (a.first_timestamp is not null and
                         a.first_timestamp > b.last_timestamp)))
              and (feature.length_m is not null or
                   feature.tonnage_gt is not null or
                   feature.engine_power_kw is not null or
                   feature.geartype is not null
                   --or feature.crew is not null
                   )
        ),
        counted as (
            select * except(pos_count, label, lbl), lbl as label,
                   sum(pos_count) as count
            from core
            group by id, length, tonnage, engine_power, label, crew_size
        ),
        ordered as (
            select *,
                   row_number() over (partition by id
                       order by count desc, label,
                                length, tonnage, engine_power, crew_size) as rk
            from counted
        )
        select * except(rk, count)
        from ordered
        where rk = 1
    '''.format(**locals())
    try:
        return pd.read_gbq(query, dialect='standard',
                           project_id='world-fishing-827')
    except:
        print(query)
        raise
Example #23
Source File: create_train_info.py From vessel-classification with Apache License 2.0 | 4 votes |
def read_vessel_database_for_detect_vessel_id(dbname, fishdbname, dataset):
    fishing_range_query = fishing_range_vessel_id(fishdbname, dataset)
    query = '''
        with
        fishing_range_vessel_id as {fishing_range_query},
        core as (
            select vessel_id as id,
                   feature.length_m as length,
                   feature.tonnage_gt as tonnage,
                   feature.engine_power_kw as engine_power,
                   array_to_string(feature.geartype, '|') as label,
                   feature.crew as crew_size,
                   pos_count,
                   transit_only
            from (select * from {dbname} cross join unnest (activity)) a
            join `{dataset}.segment_info` b
            on (cast(a.identity.ssvid as string) = ssvid)
            join `fishing_range_vessel_id` c
            using (vessel_id)
            where
                -- valid times overlap with segments
                (not ((a.last_timestamp is not null and
                       a.last_timestamp < b.first_timestamp) or
                      (a.first_timestamp is not null and
                       a.first_timestamp > b.last_timestamp)))
                and
                -- valid times overlaps with fishing ranges
                (not ((a.last_timestamp is not null and
                       a.last_timestamp < c.first_timestamp) or
                      (a.first_timestamp is not null and
                       a.first_timestamp > c.last_timestamp)))
        ),
        counted as (
            select * except(pos_count, transit_only, length, tonnage,
                            engine_power, label, crew_size),
                   sum(pos_count) as count,
                   min(transit_only) as transit_only,
                   avg(length) as length,
                   avg(tonnage) as tonnage,
                   avg(engine_power) as engine_power,
                   any_value(label) as label,
                   avg(crew_size) as crew_size
            from core
            group by id
        ),
        ordered as (
            select *,
                   row_number() over (partition by id
                       order by count desc, label,
                                length, tonnage, engine_power, crew_size) as rk
            from counted
        )
        select * except(rk, count)
        from ordered
    '''.format(**locals())
    try:
        return pd.read_gbq(query, dialect='standard',
                           project_id='world-fishing-827')
    except:
        print(query)
        raise
Example #24
Source File: create_train_info.py From vessel-classification with Apache License 2.0 | 4 votes |
def read_vessel_database_for_detect_mmsi(dbname, fishdbname, dataset):
    fishing_range_query = fishing_range_mmsi(fishdbname, dataset)
    query = '''
        with
        fishing_range_mmsi as {fishing_range_query},
        core as (
            select identity.ssvid as id,
                   length_m as length,
                   tonnage_gt as tonnage,
                   engine_power_kw as engine_power,
                   geartype as label,
                   crew as crew_size,
                   confidence, -- 1==low, 2==typical, 3==high
                   (select sum(messages) from unnest(activity)) as pos_count,
                   transit_only
            from {dbname} a
            cross join unnest (registry)
            join `fishing_range_mmsi` c
            on (cast(mmsi as string) = identity.ssvid)
            where
                -- valid times overlaps with fishing ranges
                -- TODO: should do each activity period separately here.
                (not (((select min(last_timestamp) from unnest(activity))
                           < c.first_timestamp) or
                      ((select min(first_timestamp) from unnest(activity))
                           > c.last_timestamp)))
        ),
        counted as (
            select * except(pos_count, confidence, transit_only, length,
                            tonnage, engine_power, label, crew_size),
                   sum(pos_count) as count,
                   avg(confidence) as confidence,
                   min(transit_only) as transit_only,
                   avg(length) as length,
                   avg(tonnage) as tonnage,
                   avg(engine_power) as engine_power,
                   any_value(label) as label,
                   avg(crew_size) as crew_size
            from core
            group by id
        ),
        ordered as (
            select *,
                   row_number() over (partition by id
                       order by count desc, label, length, tonnage,
                                engine_power, crew_size, confidence) as rk
            from counted
        )
        select * except(rk, count)
        from ordered
    '''.format(**locals())
    try:
        return pd.read_gbq(query, dialect='standard',
                           project_id='world-fishing-827')
    except:
        print(query)
        raise
Example #25
Source File: create_train_info.py From vessel-classification with Apache License 2.0 | 4 votes |
def read_fishing_ranges_vessel_id(fishdbname, dataset):
    query = '''
        with
        fishing_ranges as {fishing_ranges},
        core as (
            select mmsi as ssvid, vessel_id as id,
                   start_time, end_time, is_fishing, pos_count
            from `{fishdbname}` a
            join `{dataset}.vessel_info` b
            on (a.mmsi = cast(ssvid as int64))
            join `fishing_ranges` c
            using (vessel_id)
            where (not (b.last_timestamp < c.first_timestamp or
                        b.first_timestamp > c.last_timestamp))
        ),
        counted as (
            select id, start_time, end_time, is_fishing,
                   sum(pos_count) as count
            from core
            group by id, start_time, end_time, is_fishing
        ),
        ordered as (
            select *,
                   row_number() over (partition by id, start_time, end_time
                                      order by count desc) as rk
            from counted
        )
        select * except(rk, count)
        from ordered
        where rk = 1
    '''.format(fishdbname=fishdbname, dataset=dataset,
               fishing_ranges=fishing_range_vessel_id(fishdbname, dataset))
    try:
        return pd.read_gbq(query, dialect='standard',
                           project_id='world-fishing-827')
    except:
        print(query)
        raise