Python pandas.read_gbq() Examples
The following are 25 code examples of pandas.read_gbq(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pandas, or try the search function.
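The examples below come from real projects, so they reference project-specific helpers, tables, and credentials. As a minimal, self-contained starting point, here is a sketch of a typical read_gbq() call against a BigQuery public dataset (the same usa_names table used in Examples #6 and #16). 'your-project-id' is a placeholder, and the call assumes application default credentials are already configured in the environment:

import pandas as pd

# Run a standard SQL query against a public dataset. The project ID passed
# here ('your-project-id' is a placeholder) is the project billed for the query.
sql = """
    SELECT name
    FROM `bigquery-public-data.usa_names.usa_1910_current`
    LIMIT 10
"""
df = pd.read_gbq(sql, project_id='your-project-id', dialect='standard')
print(df.head())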
Example #1
Source File: utils.py From cloudml-samples with Apache License 2.0 | 6 votes |
def read_df_from_bigquery(full_table_path, project_id=None, num_samples=None):
    """Read data from BigQuery.

    Args:
      full_table_path: (string) full path of the table containing training data
        in the format of [project_id.dataset_name.table_name].
      project_id: (string, Optional) Google BigQuery Account project ID.
      num_samples: (int, Optional) Number of data samples to read.

    Returns:
      pandas.DataFrame
    """
    query = metadata.BASE_QUERY.format(table=full_table_path)
    limit = ' LIMIT {}'.format(num_samples) if num_samples else ''
    query += limit

    # Use "application default credentials"
    # Use SQL syntax dialect
    data_df = pd.read_gbq(query, project_id=project_id, dialect='standard')

    return data_df
Example #2
Source File: test_gbq.py From vnpy_crypto with MIT License | 6 votes |
def test_roundtrip(self):
    destination_table = DESTINATION_TABLE + "1"

    test_size = 20001
    df = make_mixed_dataframe_v2(test_size)

    df.to_gbq(destination_table, _get_project_id(), chunksize=10000,
              private_key=_get_private_key_path())

    sleep(30)  # <- Curses Google!!!

    result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                         .format(destination_table),
                         project_id=_get_project_id(),
                         private_key=_get_private_key_path())
    assert result['num_rows'][0] == test_size
Example #3
Source File: weatherhistoryapp.py From spyre with MIT License | 6 votes |
def get_data(self, type, station_ids, n_years):
    query = """
        SELECT
            station_number, year, month, day,
            {type} as value,
            rain, snow
        FROM `publicdata.samples.gsod`
        WHERE station_number IN ({stns})
          AND year < 2010
          AND year >= {minyr}
    """.format(
        type=type,
        stns=','.join(station_ids),
        minyr=2010 - n_years,
    )
    df = pd.read_gbq(query, project_id=PROJECT_ID, dialect='standard')
    df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
    # Wrap the dict views in list() so the DataFrame constructor accepts them
    # (pandas rejects set-like views such as dict.keys()).
    stations_df = pd.DataFrame({
        'location': list(stations.keys()),
        'station_number': [int(v) for v in stations.values()],
    })
    df = pd.merge(df, stations_df, on='station_number')
    return df
Example #4
Source File: io.py From modin with Apache License 2.0 | 6 votes |
def read_gbq(
    cls,
    query: str,
    project_id=None,
    index_col=None,
    col_order=None,
    reauth=False,
    auth_local_webserver=False,
    dialect=None,
    location=None,
    configuration=None,
    credentials=None,
    use_bqstorage_api=None,
    private_key=None,
    verbose=None,
    progress_bar_type=None,
):
    ErrorMessage.default_to_pandas("`read_gbq`")
    return cls.from_pandas(
        pandas.read_gbq(
            query,
            project_id=project_id,
            index_col=index_col,
            col_order=col_order,
            reauth=reauth,
            auth_local_webserver=auth_local_webserver,
            dialect=dialect,
            location=location,
            configuration=configuration,
            credentials=credentials,
            use_bqstorage_api=use_bqstorage_api,
            private_key=private_key,
            verbose=verbose,
            progress_bar_type=progress_bar_type,
        )
    )
Example #5
Source File: test_gbq.py From elasticintel with GNU General Public License v3.0 | 6 votes |
def test_roundtrip(self):
    destination_table = DESTINATION_TABLE + "1"

    test_size = 20001
    df = make_mixed_dataframe_v2(test_size)

    df.to_gbq(destination_table, _get_project_id(), chunksize=10000,
              private_key=_get_private_key_path())

    sleep(30)  # <- Curses Google!!!

    result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                         .format(destination_table),
                         project_id=_get_project_id(),
                         private_key=_get_private_key_path())
    assert result['num_rows'][0] == test_size
Example #6
Source File: samples_test.py From python-docs-samples with Apache License 2.0 | 6 votes |
def test_pandas_gbq_query():
    # [START bigquery_migration_pandas_gbq_query]
    import pandas

    sql = """
        SELECT name
        FROM `bigquery-public-data.usa_names.usa_1910_current`
        WHERE state = 'TX'
        LIMIT 100
    """

    # Run a Standard SQL query using the environment's default project
    df = pandas.read_gbq(sql, dialect='standard')

    # Run a Standard SQL query with the project set explicitly
    project_id = 'your-project-id'
    # [END bigquery_migration_pandas_gbq_query]
    assert len(df) > 0
    project_id = os.environ['GOOGLE_CLOUD_PROJECT']
    # [START bigquery_migration_pandas_gbq_query]
    df = pandas.read_gbq(sql, project_id=project_id, dialect='standard')
    # [END bigquery_migration_pandas_gbq_query]
    assert len(df) > 0
Example #7
Source File: compute_vessel_metrics.py From vessel-classification with Apache License 2.0 | 6 votes |
def load_class_weights(inference_table):
    query = '''
        with core as (
            select * from `{}*`
            where max_label is not null
        ),
        count as (
            select count(*) as total from core
        )
        select max_label as label, count(*) / total as fraction
        from core
        cross join count
        group by label, total
        order by fraction desc
    '''.format(inference_table)
    df = pd.read_gbq(query, project_id='world-fishing-827', dialect='standard')
    wt_map = {x.label: x.fraction for x in df.itertuples()}
    return wt_map
Example #8
Source File: compute_vessel_metrics.py From vessel-classification with Apache License 2.0 | 6 votes |
def load_inferred(inference_table, label_table, extractors):
    """Load inferred data and generate comparison data."""
    query = """
        SELECT inference_table.* except (ssvid), ssvid as id
        FROM `{}` label_table
        JOIN `{}*` inference_table
        ON (cast(label_table.id as string) = inference_table.ssvid)
        where split = "Test"
    """.format(label_table, inference_table)
    print(query)
    df = pd.read_gbq(query, project_id='world-fishing-827', dialect='standard')
    for row in df.itertuples():
        for ext in extractors:
            ext.extract(row)
    for ext in extractors:
        ext.finalize()
Example #9
Source File: samples_test.py From python-docs-samples with Apache License 2.0 | 6 votes |
def test_pandas_gbq_query_with_parameters():
    # [START bigquery_migration_pandas_gbq_query_parameters]
    import pandas

    sql = """
        SELECT name
        FROM `bigquery-public-data.usa_names.usa_1910_current`
        WHERE state = @state
        LIMIT @limit
    """
    query_config = {
        'query': {
            'parameterMode': 'NAMED',
            'queryParameters': [
                {
                    'name': 'state',
                    'parameterType': {'type': 'STRING'},
                    'parameterValue': {'value': 'TX'}
                },
                {
                    'name': 'limit',
                    'parameterType': {'type': 'INTEGER'},
                    'parameterValue': {'value': 100}
                }
            ]
        }
    }
    df = pandas.read_gbq(sql, configuration=query_config)
    # [END bigquery_migration_pandas_gbq_query_parameters]
    assert len(df) > 0
Example #10
Source File: utils.py From professional-services with Apache License 2.0 | 6 votes |
def read_df_from_bigquery(full_table_path, project_id=None, num_samples=None):
    """Read data from BigQuery.

    Args:
      full_table_path: (string) full path of the table containing training data
        in the format of [project_id.dataset_name.table_name].
      project_id: (string, Optional) Google BigQuery Account project ID.
      num_samples: (int, Optional) Number of data samples to read.

    Returns:
      pandas.DataFrame
    """
    query = metadata.BASE_QUERY.format(table=full_table_path)
    limit = ' LIMIT {}'.format(num_samples) if num_samples else ''
    query += limit

    # Use "application default credentials"
    # Use SQL syntax dialect
    data_df = pd.read_gbq(query, project_id=project_id, dialect='standard')

    return data_df
Example #11
Source File: bigquery.py From openprescribing with MIT License | 5 votes |
def query_into_dataframe(self, sql, legacy=False):
    sql = interpolate_sql(sql)
    kwargs = {
        "project_id": self.project,
        "dialect": "legacy" if legacy else "standard",
    }
    with exception_sql_printer(sql):
        return pd.read_gbq(sql, **kwargs)
Example #12
Source File: test_gbq.py From recruit with Apache License 2.0 | 5 votes |
def test_read_gbq_without_dialect_warns_future_change(monkeypatch):
    # Default dialect is changing to standard SQL. See:
    # https://github.com/pydata/pandas-gbq/issues/195
    def mock_read_gbq(*args, **kwargs):
        return DataFrame([[1.0]])

    monkeypatch.setattr(pandas_gbq, 'read_gbq', mock_read_gbq)
    with tm.assert_produces_warning(FutureWarning):
        pd.read_gbq("SELECT 1")
Example #13
Source File: compute_fishing_metrics.py From vessel-classification with Apache License 2.0 | 5 votes |
def load_inferred_fishing(table, id_list, project_id, threshold=True):
    """Load inferred data and generate comparison data."""
    query_template = """
        SELECT vessel_id as id, start_time, end_time, nnet_score
        FROM TABLE_DATE_RANGE([{table}],
            TIMESTAMP('{year}-01-01'), TIMESTAMP('{year}-12-31'))
        WHERE vessel_id in ({ids})
    """
    ids = ','.join('"{}"'.format(x) for x in id_list)
    ranges = defaultdict(list)
    for year in range(2012, 2019):
        query = query_template.format(table=table, year=year, ids=ids)
        try:
            df = pd.read_gbq(query, project_id=project_id, dialect='legacy')
        except pandas_gbq.gbq.GenericGBQException as err:
            if 'matches no table' in err.args[0]:
                print('skipping', year)
                continue
            else:
                print(query)
                raise
        for x in df.itertuples():
            score = x.nnet_score
            if threshold:
                score = score > 0.5
            start = x.start_time.replace(tzinfo=pytz.utc)
            end = x.end_time.replace(tzinfo=pytz.utc)
            ranges[x.id].append(FishingRange(score, start, end))
    return ranges
Example #14
Source File: create_train_info.py From vessel-classification with Apache License 2.0 | 5 votes |
def read_vessel_database_for_char_mmsi(dbname, dataset):
    query = '''
        with multi_id as (
            select identity.ssvid as id
            from {dbname}
            group by id
            having count(identity.ssvid) > 1
        )
        select identity.ssvid as id,
               feature.length_m as length,
               feature.tonnage_gt as tonnage,
               feature.engine_power_kw as engine_power,
               feature.crew as crew_size,
               array_to_string(feature.geartype, '|') as label
        from {dbname} a
        where (feature.length_m is not null or feature.tonnage_gt is not null
               or feature.engine_power_kw is not null or feature.crew is not null
               or (feature.geartype is not null and array_length(feature.geartype) > 0))
          and identity.ssvid not in (select * from multi_id)
        order by id
    '''.format(**locals())
    try:
        return pd.read_gbq(query, dialect='standard',
                           project_id='world-fishing-827')
    except:
        print(query)
        raise
Example #15
Source File: query.py From subreddit-generator with MIT License | 5 votes |
def get_reddit_data(project_id, subreddits, start_month, end_month, max_posts):
    query = '''
        # standardSQL
        SELECT
          title,
          subreddit AS context_label
        FROM (
          SELECT
            title,
            subreddit,
            ROW_NUMBER() OVER (PARTITION BY subreddit ORDER BY score DESC) AS rank_num
          FROM `fh-bigquery.reddit_posts.*`
          WHERE _TABLE_SUFFIX BETWEEN "{}" AND "{}"
            AND LOWER(subreddit) IN ({})
        )
        WHERE rank_num <= {}
    '''
    query = query.format(start_month, end_month,
                         str([x.lower() for x in subreddits])[1:-1],
                         max_posts)

    df = pd.read_gbq(query, project_id, dialect='standard')
    return df
Example #16
Source File: samples_test.py From python-docs-samples with Apache License 2.0 | 5 votes |
def test_pandas_gbq_legacy_query():
    # [START bigquery_migration_pandas_gbq_query_legacy]
    import pandas

    sql = """
        SELECT name
        FROM [bigquery-public-data:usa_names.usa_1910_current]
        WHERE state = 'TX'
        LIMIT 100
    """

    df = pandas.read_gbq(sql, dialect='legacy')
    # [END bigquery_migration_pandas_gbq_query_legacy]
    assert len(df) > 0
Example #17
Source File: samples_test.py From python-docs-samples with Apache License 2.0 | 5 votes |
def test_pandas_gbq_query_bqstorage():
    # [START bigquery_migration_pandas_gbq_query_bqstorage]
    import pandas

    sql = "SELECT * FROM `bigquery-public-data.irs_990.irs_990_2012`"

    # Use the BigQuery Storage API to download results more quickly.
    df = pandas.read_gbq(sql, dialect='standard', use_bqstorage_api=True)
    # [END bigquery_migration_pandas_gbq_query_bqstorage]
    assert len(df) > 0
Example #18
Source File: test_gbq.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_roundtrip(self):
    destination_table = DESTINATION_TABLE + "1"

    test_size = 20001
    df = make_mixed_dataframe_v2(test_size)

    df.to_gbq(destination_table, _get_project_id(), chunksize=None,
              credentials=_get_credentials())

    result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                         .format(destination_table),
                         project_id=_get_project_id(),
                         credentials=_get_credentials(),
                         dialect="standard")
    assert result['num_rows'][0] == test_size
Example #19
Source File: test_gbq.py From predictive-maintenance-using-machine-learning with Apache License 2.0 | 5 votes |
def test_read_gbq_without_dialect_warns_future_change(monkeypatch):
    # Default dialect is changing to standard SQL. See:
    # https://github.com/pydata/pandas-gbq/issues/195
    def mock_read_gbq(*args, **kwargs):
        return DataFrame([[1.0]])

    monkeypatch.setattr(pandas_gbq, 'read_gbq', mock_read_gbq)
    with tm.assert_produces_warning(FutureWarning):
        pd.read_gbq("SELECT 1")
Example #20
Source File: utils.py From bigquery-bokeh-dashboard with Apache License 2.0 | 5 votes |
def _run(query, dialect='legacy'):
    return pd.read_gbq(
        query,
        project_id=os.environ['GOOGLE_PROJECT_ID'],
        private_key=os.environ['GOOGLE_APPLICATION_CREDENTIALS'],
        dialect=dialect
    )
Example #21
Source File: test_gbq.py From recruit with Apache License 2.0 | 5 votes |
def test_roundtrip(self):
    destination_table = DESTINATION_TABLE + "1"

    test_size = 20001
    df = make_mixed_dataframe_v2(test_size)

    df.to_gbq(destination_table, _get_project_id(), chunksize=None,
              credentials=_get_credentials())

    result = pd.read_gbq("SELECT COUNT(*) AS num_rows FROM {0}"
                         .format(destination_table),
                         project_id=_get_project_id(),
                         credentials=_get_credentials(),
                         dialect="standard")
    assert result['num_rows'][0] == test_size
Example #22
Source File: create_train_info.py From vessel-classification with Apache License 2.0 | 4 votes |
def read_vessel_database_for_char_vessel_id(dbname, dataset):
    query = '''
        with
        core as (
            select vessel_id as id,
                   feature.length_m as length,
                   feature.tonnage_gt as tonnage,
                   feature.engine_power_kw as engine_power,
                   feature.geartype as label,
                   feature.crew as crew_size,
                   array_to_string(feature.geartype, '|') as lbl,
                   pos_count
            from (select * from {dbname} cross join unnest (activity)) a
            join `{dataset}.vessel_info` b
            on (a.identity.ssvid = b.ssvid)
            where (not ((a.last_timestamp is not null and
                         a.last_timestamp < b.first_timestamp) or
                        (a.first_timestamp is not null and
                         a.first_timestamp > b.last_timestamp)))
              and (feature.length_m is not null or
                   feature.tonnage_gt is not null or
                   feature.engine_power_kw is not null or
                   feature.geartype is not null
                   --or feature.crew is not null
                   )
        ),
        counted as (
            select * except(pos_count, label, lbl), lbl as label,
                   sum(pos_count) as count
            from core
            group by id, length, tonnage, engine_power, label, crew_size
        ),
        ordered as (
            select *,
                   row_number() over (partition by id
                       order by count desc, label,
                                length, tonnage, engine_power, crew_size) as rk
            from counted
        )
        select * except(rk, count)
        from ordered
        where rk = 1
    '''.format(**locals())
    try:
        return pd.read_gbq(query, dialect='standard',
                           project_id='world-fishing-827')
    except:
        print(query)
        raise
Example #23
Source File: create_train_info.py From vessel-classification with Apache License 2.0 | 4 votes |
def read_vessel_database_for_detect_vessel_id(dbname, fishdbname, dataset):
    fishing_range_query = fishing_range_vessel_id(fishdbname, dataset)
    query = '''
        with
        fishing_range_vessel_id as {fishing_range_query},
        core as (
            select vessel_id as id,
                   feature.length_m as length,
                   feature.tonnage_gt as tonnage,
                   feature.engine_power_kw as engine_power,
                   array_to_string(feature.geartype, '|') as label,
                   feature.crew as crew_size,
                   pos_count,
                   transit_only
            from (select * from {dbname} cross join unnest (activity)) a
            join `{dataset}.segment_info` b
            on (cast(a.identity.ssvid as string) = ssvid)
            join `fishing_range_vessel_id` c
            using (vessel_id)
            where
                -- valid times overlap with segments
                (not ((a.last_timestamp is not null and
                       a.last_timestamp < b.first_timestamp) or
                      (a.first_timestamp is not null and
                       a.first_timestamp > b.last_timestamp)))
                and
                -- valid times overlaps with fishing ranges
                (not ((a.last_timestamp is not null and
                       a.last_timestamp < c.first_timestamp) or
                      (a.first_timestamp is not null and
                       a.first_timestamp > c.last_timestamp)))
        ),
        counted as (
            select * except(pos_count, transit_only, length, tonnage,
                            engine_power, label, crew_size),
                   sum(pos_count) as count,
                   min(transit_only) as transit_only,
                   avg(length) as length,
                   avg(tonnage) as tonnage,
                   avg(engine_power) as engine_power,
                   any_value(label) as label,
                   avg(crew_size) as crew_size
            from core
            group by id
        ),
        ordered as (
            select *,
                   row_number() over (partition by id
                       order by count desc, label,
                                length, tonnage, engine_power, crew_size) as rk
            from counted
        )
        select * except(rk, count)
        from ordered
    '''.format(**locals())
    try:
        return pd.read_gbq(query, dialect='standard',
                           project_id='world-fishing-827')
    except:
        print(query)
        raise
Example #24
Source File: create_train_info.py From vessel-classification with Apache License 2.0 | 4 votes |
def read_vessel_database_for_detect_mmsi(dbname, fishdbname, dataset):
    fishing_range_query = fishing_range_mmsi(fishdbname, dataset)
    query = '''
        with
        fishing_range_mmsi as {fishing_range_query},
        core as (
            select identity.ssvid as id,
                   length_m as length,
                   tonnage_gt as tonnage,
                   engine_power_kw as engine_power,
                   geartype as label,
                   crew as crew_size,
                   confidence, -- 1==low, 2==typical, 3==high
                   (select sum(messages) from unnest(activity)) as pos_count,
                   transit_only
            from {dbname} a
            cross join unnest (registry)
            join `fishing_range_mmsi` c
            on (cast(mmsi as string) = identity.ssvid)
            where
                -- valid times overlaps with fishing ranges
                -- TODO: should do each activity period separately here.
                (not (((select min(last_timestamp) from unnest(activity))
                           < c.first_timestamp) or
                      ((select min(first_timestamp) from unnest(activity))
                           > c.last_timestamp)))
        ),
        counted as (
            select * except(pos_count, confidence, transit_only, length,
                            tonnage, engine_power, label, crew_size),
                   sum(pos_count) as count,
                   avg(confidence) as confidence,
                   min(transit_only) as transit_only,
                   avg(length) as length,
                   avg(tonnage) as tonnage,
                   avg(engine_power) as engine_power,
                   any_value(label) as label,
                   avg(crew_size) as crew_size
            from core
            group by id
        ),
        ordered as (
            select *,
                   row_number() over (partition by id
                       order by count desc, label, length, tonnage,
                                engine_power, crew_size, confidence) as rk
            from counted
        )
        select * except(rk, count)
        from ordered
    '''.format(**locals())
    try:
        return pd.read_gbq(query, dialect='standard',
                           project_id='world-fishing-827')
    except:
        print(query)
        raise
Example #25
Source File: create_train_info.py From vessel-classification with Apache License 2.0 | 4 votes |
def read_fishing_ranges_vessel_id(fishdbname, dataset):
    query = '''
        with
        fishing_ranges as {fishing_ranges},
        core as (
            select mmsi as ssvid, vessel_id as id,
                   start_time, end_time, is_fishing, pos_count
            from `{fishdbname}` a
            join `{dataset}.vessel_info` b
            on (a.mmsi = cast(ssvid as int64))
            join `fishing_ranges` c
            using (vessel_id)
            where (not (b.last_timestamp < c.first_timestamp or
                        b.first_timestamp > c.last_timestamp))
        ),
        counted as (
            select id, start_time, end_time, is_fishing,
                   sum(pos_count) as count
            from core
            group by id, start_time, end_time, is_fishing
        ),
        ordered as (
            select *,
                   row_number() over (partition by id, start_time, end_time
                                      order by count desc) as rk
            from counted
        )
        select * except(rk, count)
        from ordered
        where rk = 1
    '''.format(fishdbname=fishdbname, dataset=dataset,
               fishing_ranges=fishing_range_vessel_id(fishdbname, dataset))
    try:
        return pd.read_gbq(query, dialect='standard',
                           project_id='world-fishing-827')
    except:
        print(query)
        raise