Python google.cloud.bigquery.LoadJobConfig() Examples

The following are 27 code examples of google.cloud.bigquery.LoadJobConfig(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module google.cloud.bigquery, or try the search function.
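Across the examples below, LoadJobConfig is configured in two equivalent styles: properties passed as keyword arguments to the constructor, or attributes assigned on an instance after construction. A minimal sketch combining the most common pieces (the bucket URI and table ID are placeholders, not taken from any example below):

from google.cloud import bigquery

client = bigquery.Client()

# Style 1: pass properties as constructor keyword arguments.
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Style 2: assign the same properties as attributes after construction.
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.CSV
job_config.skip_leading_rows = 1
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

# Placeholder URI and table ID.
load_job = client.load_table_from_uri(
    "gs://your-bucket/your-file.csv",
    "your-project.your_dataset.your_table",
    job_config=job_config,
)
load_job.result()  # Wait for the load job to finish.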
Example #1
Source File: gcp_utils.py    From recommender-tensorflow with MIT License
def df_to_bigquery(df, table_id, dataset_id, client):
    table = get_bigquery_table(table_id, dataset_id, client)


    # insert table
    job = client.load_table_from_dataframe(
        dataframe=df.compute().rename_axis("id"),
        destination=table,
        job_config=job_config
    )
    job.result()
    logger.info('%s rows loaded into %s.%s.%s.', job.output_rows, job.project, dataset_id, table_id)
    return table 
Example #2
Source File: load_table_uri_parquet.py    From python-bigquery with Apache License 2.0
def load_table_uri_parquet(table_id):
    # [START bigquery_load_table_gcs_parquet]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name"

    job_config = bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.PARQUET)
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet"

    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Waits for the job to complete.

    destination_table = client.get_table(table_id)
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_parquet] 
Example #3
Source File: bigquery.py    From openprescribing with MIT License
def run_job(self, method_name, args, config_opts, config_default_opts):
        job_config = {
            "copy_table": gcbq.CopyJobConfig,
            "extract_table": gcbq.ExtractJobConfig,
            "load_table_from_file": gcbq.LoadJobConfig,
            "load_table_from_uri": gcbq.LoadJobConfig,
            "query": gcbq.QueryJobConfig,
        }[method_name]()

        for k, v in config_default_opts.items():
            setattr(job_config, k, v)
        for k, v in config_opts.items():
            setattr(job_config, k, v)

        method = getattr(self.gcbq_client, method_name)

        job = method(*args, job_config=job_config)
        if getattr(job_config, "dry_run", False):
            return []
        else:
            return job.result() 
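A hypothetical invocation of run_job (the wrapper instance, URI, table name, and option values are illustrative assumptions, not from the openprescribing source):

    # Hypothetical usage; defaults are applied first, then per-call options
    # override them via setattr on the chosen job-config class.
    rows = wrapper.run_job(
        "load_table_from_uri",
        ("gs://your-bucket/data.csv", "your_dataset.your_table"),
        {"skip_leading_rows": 1},
        {"source_format": "CSV", "write_disposition": "WRITE_TRUNCATE"},
    )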
Example #4
Source File: load_table_uri_orc.py    From python-bigquery with Apache License 2.0
def load_table_uri_orc(table_id):

    # [START bigquery_load_table_gcs_orc]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name

    job_config = bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.ORC)
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.orc"

    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Waits for the job to complete.

    destination_table = client.get_table(table_id)
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_orc] 
Example #5
Source File: helper_function.py    From professional-services with Apache License 2.0
def csv_in_gcs_to_table(bucket_name: str, object_name: str, dataset_id: str,
                        table_id: str,
                        schema: List[bigquery.SchemaField]) -> None:
    """Upload CSV to BigQuery table.
        If the table already exists, it overwrites the table data.

    Args:
        bucket_name: Bucket name for holding the object
        object_name: Name of object to be uploaded
        dataset_id: Dataset id where the table is located.
        table_id: String holding the ID of the table.
        schema: Schema of the table_id
    """
    client = bigquery.Client()
    dataset_ref = client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.schema = schema
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    uri = "gs://{}/{}".format(bucket_name, object_name)
    load_job = client.load_table_from_uri(uri,
                                          dataset_ref.table(table_id),
                                          job_config=job_config)
    load_job.result() 
Example #6
Source File: load_table_uri_avro.py    From python-bigquery with Apache License 2.0
def load_table_uri_avro(table_id):

    # [START bigquery_load_table_gcs_avro]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name

    job_config = bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.AVRO)
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.avro"

    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Waits for the job to complete.

    destination_table = client.get_table(table_id)
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_avro] 
Example #7
Source File: main.py    From BigQuery-integrations with MIT License
def give_file_gbq(path_to_file, bq_configuration):
    """
        Upload the file at *path_to_file* to a BigQuery table using *bq_configuration* settings.
    """
    # construct Client object with the path to the table in which data will be stored
    client = bigquery.Client(project=bq_configuration["project_id"])
    dataset_ref = client.dataset(bq_configuration["dataset_id"])
    table_ref = dataset_ref.table(bq_configuration["table_id"])

    # determine uploading options
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = "NEWLINE_DELIMITED_JSON"
    job_config.write_disposition = bq_configuration["write_disposition"]
    job_config.autodetect = True

    # upload the file to the BigQuery table
    with open(path_to_file, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_ref, location=bq_configuration["location"], job_config=job_config)
    job.result()
    print("Job {} finished in status {} for table {}.{}.{}.".format(
        job.job_id, job.state,
        bq_configuration["project_id"], bq_configuration["dataset_id"], bq_configuration["table_id"]))
Example #8
Source File: main.py    From BigQuery-integrations with MIT License
def give_file_gbq(path_to_file, bq_configuration):
    """
        Upload the file at *path_to_file* to a BigQuery table using *bq_configuration* settings.
    """
    # construct Client object with the path to the table in which data will be stored
    client = bigquery.Client(project=bq_configuration["project_id"])
    dataset_ref = client.dataset(bq_configuration["dataset_id"])
    table_ref = dataset_ref.table(bq_configuration["table_id"])

    # determine uploading options
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bq_configuration["source_format"].upper()
    job_config.write_disposition = bq_configuration["write_disposition"]
    if bq_configuration["source_format"].upper() == "CSV":
        job_config.field_delimiter = bq_configuration["delimiter"]
        job_config.skip_leading_rows = 1
    job_config.autodetect = True

    # upload the file to the BigQuery table
    with open(path_to_file, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_ref, location=bq_configuration["location"], job_config=job_config)
    job.result()
    print("Job {} finished in status {} for table {}.{}.{}.".format(
        job.job_id, job.state,
        bq_configuration["project_id"], bq_configuration["dataset_id"], bq_configuration["table_id"]))
    os.remove(path_to_file)
Example #9
Source File: main.py    From BigQuery-integrations with MIT License
def load_to_gbq(filename, bq_configuration):
    """
        Load data to BigQuery using *bq_configuration* settings.
    """
    # construct Client object with the path to the table in which data will be stored
    client = bigquery.Client(project=bq_configuration["project_id"])
    dataset_ref = client.dataset(bq_configuration["dataset_id"])
    table_ref = dataset_ref.table(bq_configuration["table"])

    # determine uploading options
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.source_format = bq_configuration["source_format"]
    job_config.autodetect = True
    if bq_configuration["source_format"].upper() == "CSV":
        job_config.skip_leading_rows = 1

    # upload the file to the BigQuery table
    with open(filename, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_ref, location=bq_configuration["location"], job_config=job_config)
    job.result()
    print("Job {} finished in status {} for table {}.{}.{}.".format(
        job.job_id, job.state,
        bq_configuration["project_id"], bq_configuration["dataset_id"], bq_configuration["table"]))
    os.remove(filename)
Example #10
Source File: main.py    From BigQuery-integrations with MIT License
def load_to_gbq(client, data, bq_configuration):
    """
        Load data to BigQuery using *bq_configuration* settings.
    """
    # the BigQuery client is supplied by the caller
    dataset_ref = client.dataset(bq_configuration["dataset_id"])
    table_ref = dataset_ref.table(bq_configuration["table"])

    # determine uploading options
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = 'WRITE_TRUNCATE'
    job_config.source_format = "NEWLINE_DELIMITED_JSON"
    job_config.autodetect = True

    load_job = client.load_table_from_file(
        data,
        table_ref,
        job_config=job_config)  # API request
    print('Starting job {}'.format(load_job.job_id))

    load_job.result()  # Waits for table load to complete.
    print('Job finished.')
Example #11
Source File: load_table_uri_csv.py    From python-bigquery with Apache License 2.0
def load_table_uri_csv(table_id):

    # [START bigquery_load_table_gcs_csv]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name"

    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("post_abbr", "STRING"),
        ],
        skip_leading_rows=1,
        # The source format defaults to CSV, so the line below is optional.
        source_format=bigquery.SourceFormat.CSV,
    )
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"

    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Waits for the job to complete.

    destination_table = client.get_table(table_id)  # Make an API request.
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_csv] 
Example #12
Source File: load_table_uri_truncate_avro.py    From python-bigquery with Apache License 2.0
def load_table_uri_truncate_avro(table_id):

    # [START bigquery_load_table_gcs_avro_truncate]
    import six

    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name

    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("post_abbr", "STRING"),
        ],
    )

    body = six.BytesIO(b"Washington,WA")
    client.load_table_from_file(body, table_id, job_config=job_config).result()
    previous_rows = client.get_table(table_id).num_rows
    assert previous_rows > 0

    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        source_format=bigquery.SourceFormat.AVRO,
    )

    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.avro"
    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Waits for the job to complete.

    destination_table = client.get_table(table_id)
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_avro_truncate] 
Example #13
Source File: load_table_file.py    From python-bigquery with Apache License 2.0
def load_table_file(file_path, table_id):

    # [START bigquery_load_from_file]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name"

    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV, skip_leading_rows=1, autodetect=True,
    )

    with open(file_path, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_id, job_config=job_config)

    job.result()  # Waits for the job to complete.

    table = client.get_table(table_id)  # Make an API request.
    print(
        "Loaded {} rows and {} columns to {}".format(
            table.num_rows, len(table.schema), table_id
        )
    )
    # [END bigquery_load_from_file]
    return table 
Example #14
Source File: load_table_uri_truncate_parquet.py    From python-bigquery with Apache License 2.0
def load_table_uri_truncate_parquet(table_id):

    # [START bigquery_load_table_gcs_parquet_truncate]
    import six

    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name

    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("post_abbr", "STRING"),
        ],
    )

    body = six.BytesIO(b"Washington,WA")
    client.load_table_from_file(body, table_id, job_config=job_config).result()
    previous_rows = client.get_table(table_id).num_rows
    assert previous_rows > 0

    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        source_format=bigquery.SourceFormat.PARQUET,
    )

    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet"
    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Waits for the job to complete.

    destination_table = client.get_table(table_id)
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_parquet_truncate] 
Example #15
Source File: load_table_uri_truncate_json.py    From python-bigquery with Apache License 2.0
def load_table_uri_truncate_json(table_id):

    # [START bigquery_load_table_gcs_json_truncate]
    import six

    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name

    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("post_abbr", "STRING"),
        ],
    )

    body = six.BytesIO(b"Washington,WA")
    client.load_table_from_file(body, table_id, job_config=job_config).result()
    previous_rows = client.get_table(table_id).num_rows
    assert previous_rows > 0

    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    )

    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.json"
    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Waits for the job to complete.

    destination_table = client.get_table(table_id)
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_json_truncate] 
Example #16
Source File: test_copy_table_multiple_source.py    From python-bigquery with Apache License 2.0
def test_copy_table_multiple_source(capsys, random_table_id, random_dataset_id, client):

    dataset = bigquery.Dataset(random_dataset_id)
    dataset.location = "US"
    dataset = client.create_dataset(dataset)
    table_data = {"table1": b"Washington,WA", "table2": b"California,CA"}
    for table_id, data in table_data.items():
        table_ref = dataset.table(table_id)
        job_config = bigquery.LoadJobConfig(
            schema=[
                bigquery.SchemaField("name", "STRING"),
                bigquery.SchemaField("post_abbr", "STRING"),
            ]
        )
        body = six.BytesIO(data)
        client.load_table_from_file(
            body, table_ref, location="US", job_config=job_config
        ).result()

    table_ids = [
        "{}.table1".format(random_dataset_id),
        "{}.table2".format(random_dataset_id),
    ]

    copy_table_multiple_source.copy_table_multiple_source(random_table_id, table_ids)
    dest_table = client.get_table(random_table_id)
    out, err = capsys.readouterr()
    assert (
        "The tables {} have been appended to {}".format(table_ids, random_table_id)
        in out
    )
    assert dest_table.num_rows > 0 
Example #17
Source File: system.py    From python-bigquery with Apache License 2.0
def test_load_table_from_uri_then_dump_table(self):
        from google.cloud.bigquery.job import CreateDisposition
        from google.cloud.bigquery.job import SourceFormat
        from google.cloud.bigquery.job import WriteDisposition

        TABLE_ID = "test_table"
        GS_URL = self._write_csv_to_storage(
            "bq_load_test" + unique_resource_id(), "person_ages.csv", HEADER_ROW, ROWS
        )

        dataset = self.temp_dataset(_make_dataset_id("load_gcs_then_dump"))

        table_arg = Table(dataset.table(TABLE_ID), schema=SCHEMA)
        table = retry_403(Config.CLIENT.create_table)(table_arg)
        self.to_delete.insert(0, table)

        config = bigquery.LoadJobConfig()
        config.create_disposition = CreateDisposition.CREATE_NEVER
        config.skip_leading_rows = 1
        config.source_format = SourceFormat.CSV
        config.write_disposition = WriteDisposition.WRITE_EMPTY
        job = Config.CLIENT.load_table_from_uri(
            GS_URL, dataset.table(TABLE_ID), job_config=config
        )

        # Allow for 90 seconds of "warm up" before rows visible.  See
        # https://cloud.google.com/bigquery/streaming-data-into-bigquery#dataavailability
        # 8 tries -> 1 + 2 + 4 + 8 + 16 + 32 + 64 = 127 seconds
        retry = RetryInstanceState(_job_done, max_tries=8)
        retry(job.reload)()

        rows = self._fetch_single_page(table)
        row_tuples = [r.values() for r in rows]
        by_age = operator.itemgetter(1)
        self.assertEqual(sorted(row_tuples, key=by_age), sorted(ROWS, key=by_age)) 
Example #18
Source File: system.py    From python-bigquery with Apache License 2.0
def test_list_rows_page_size(self):
        from google.cloud.bigquery.job import SourceFormat
        from google.cloud.bigquery.job import WriteDisposition

        num_items = 7
        page_size = 3
        num_pages, num_last_page = divmod(num_items, page_size)

        SF = bigquery.SchemaField
        schema = [SF("string_col", "STRING", mode="NULLABLE")]
        to_insert = [{"string_col": "item%d" % i} for i in range(num_items)]
        rows = [json.dumps(row) for row in to_insert]
        body = six.BytesIO("{}\n".format("\n".join(rows)).encode("ascii"))

        table_id = "test_table"
        dataset = self.temp_dataset(_make_dataset_id("nested_df"))
        table = dataset.table(table_id)
        self.to_delete.insert(0, table)
        job_config = bigquery.LoadJobConfig()
        job_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
        job_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON
        job_config.schema = schema
        # Load a table using a local JSON file from memory.
        Config.CLIENT.load_table_from_file(body, table, job_config=job_config).result()

        row_iterator = Config.CLIENT.list_rows(table, selected_fields=schema, page_size=page_size)
        pages = row_iterator.pages

        for i in range(num_pages):
            page = next(pages)
            self.assertEqual(page.num_items, page_size)
        page = next(pages)
        self.assertEqual(page.num_items, num_last_page) 
Example #19
Source File: load_table_uri_json.py    From python-bigquery with Apache License 2.0
def load_table_uri_json(table_id):
    # [START bigquery_load_table_gcs_json]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name"

    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("post_abbr", "STRING"),
        ],
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    )
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.json"

    load_job = client.load_table_from_uri(
        uri,
        table_id,
        location="US",  # Must match the destination dataset location.
        job_config=job_config,
    )  # Make an API request.

    load_job.result()  # Waits for the job to complete.

    destination_table = client.get_table(table_id)
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_json] 
Example #20
Source File: client_load_partitioned_table.py    From python-bigquery with Apache License 2.0
def client_load_partitioned_table(table_id):

    # [START bigquery_load_table_partitioned]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name"

    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("post_abbr", "STRING"),
            bigquery.SchemaField("date", "DATE"),
        ],
        skip_leading_rows=1,
        time_partitioning=bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field="date",  # Name of the column to use for partitioning.
            expiration_ms=7776000000,  # 90 days.
        ),
    )
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states-by-date.csv"

    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Wait for the job to complete.

    table = client.get_table(table_id)
    print("Loaded {} rows to table {}".format(table.num_rows, table_id))
    # [END bigquery_load_table_partitioned] 
Example #21
Source File: load_table_uri_truncate_csv.py    From python-bigquery with Apache License 2.0
def load_table_uri_truncate_csv(table_id):

    # [START bigquery_load_table_gcs_csv_truncate]
    import six

    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name

    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("name", "STRING"),
            bigquery.SchemaField("post_abbr", "STRING"),
        ],
    )

    body = six.BytesIO(b"Washington,WA")
    client.load_table_from_file(body, table_id, job_config=job_config).result()
    previous_rows = client.get_table(table_id).num_rows
    assert previous_rows > 0

    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,
    )

    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.

    load_job.result()  # Waits for the job to complete.

    destination_table = client.get_table(table_id)
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_csv_truncate] 
Example #22
Source File: load_table_uri_autodetect_csv.py    From python-bigquery with Apache License 2.0
def load_table_uri_autodetect_csv(table_id):

    # [START bigquery_load_table_gcs_csv_autodetect]
    from google.cloud import bigquery

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to create.
    # table_id = "your-project.your_dataset.your_table_name

    job_config = bigquery.LoadJobConfig(
        autodetect=True,
        skip_leading_rows=1,
        # The source format defaults to CSV, so the line below is optional.
        source_format=bigquery.SourceFormat.CSV,
    )
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.
    load_job.result()  # Waits for the job to complete.
    destination_table = client.get_table(table_id)
    print("Loaded {} rows.".format(destination_table.num_rows))
    # [END bigquery_load_table_gcs_csv_autodetect] 
Example #23
Source File: test_table_util.py    From professional-services with Apache License 2.0
def test_set_table_properties(self):
        """Tests TableUtil.set_table_properties().

        Tests TableUtil's ability to set num_columns, num_rows, table_size,
        and column_types properties after a table has been created and data
        has been loaded into the table.
        """
        self.table_util.create_table()
        job_config = bigquery.LoadJobConfig()
        job_config.source_format = bigquery.SourceFormat.CSV
        job_config.skip_leading_rows = 1
        abs_path = os.path.abspath(os.path.dirname(__file__))
        data_file = os.path.join(abs_path, 'test_data/test.csv')
        with open(data_file, 'rb') as file_obj:
            load_job = self.bq_client.load_table_from_file(
                file_obj=file_obj,
                destination=self.table_util.table_ref,
                job_config=job_config,
            )
            load_job.result()
        self.table_util.set_table_properties()
        assert self.table_util.num_columns == 2
        assert self.table_util.num_rows == 3
        assert self.table_util.table_size == 75
        assert self.table_util.column_types == '50_STRING_50_NUMERIC' 
Example #24
Source File: group_sync.py    From professional-services with Apache License 2.0
def store_group_members(self, table_ref, data):
    """Stores the given group membership data into the given table.

    All data in the table will be replaced by the new data.

    Args:
      table_ref: a reference to the table that will receive the data. Needs to
        have the correct schema as created by `create_group_members_table`.
      data: bi-dimensional array representing the data to be inserted.

    Returns:
      The load job created by the BigQuery client; call its result() method
      to wait for completion and surface any errors.
    """

    # Use batch load from in-memory CSV file instead of streaming.
    csv_file = StringIO()
    csv_writer = csv.writer(csv_file)
    csv_writer.writerows(data)

    # Configure the job to replace the table data (the default is to append).
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = bigquery.job.WriteDisposition.WRITE_TRUNCATE

    load_job = self.bq_client.load_table_from_file(
        csv_file, table_ref, rewind=True, job_config=job_config)
    return load_job
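A hypothetical call (the syncer instance and the dataset/table names below are assumptions, not from the source) illustrates the batch-load path, which sidesteps the per-row quotas and eventual-consistency caveats of the streaming insert API:

    # Hypothetical usage; "syncer", "directory", and "group_members" are
    # illustrative names only.
    table_ref = syncer.bq_client.dataset("directory").table("group_members")
    rows = [
        ["group@example.com", "alice@example.com"],
        ["group@example.com", "bob@example.com"],
    ]
    load_job = syncer.store_group_members(table_ref, rows)
    load_job.result()  # Wait for the batch load to complete.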
Example #25
Source File: import_pipeline.py    From professional-services with Apache License 2.0
def process(self, element, schemas):
        """Element is a tuple of (key name, iterable of filesystem paths)."""

        dataset_ref = self.get_dataset_ref()
        sharded_key_name = element[0]
        key_name = AssignGroupByKey.remove_shard(element[0])
        object_paths = list(element[1])
        job_config = bigquery.LoadJobConfig()
        job_config.write_disposition = 'WRITE_APPEND'
        job_config.schema_update_options = [
            bigquery.job.SchemaUpdateOption.ALLOW_FIELD_ADDITION]

        table_ref = dataset_ref.table(self.asset_type_to_table_name(key_name))

        # partition the table on the 'timestamp' field (which holds the load time).
        job_config.time_partitioning = bigquery.table.TimePartitioning(
            field='timestamp')
        job_config.schema = self.to_bigquery_schema(schemas[sharded_key_name])
        job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
        try:
            load_job = self.bigquery_client.load_table_from_uri(
                object_paths,
                table_ref,
                location=self.dataset_location,
                job_config=job_config)
            self.load_jobs[key_name] = load_job
        except BadRequest as e:
            logging.error('error in load_job %s, %s, %s, %s',
                          str(object_paths), str(table_ref),
                          str(self.dataset_location),
                          str(job_config.to_api_repr()))
            raise e 
Example #26
Source File: test_user_info_updater.py    From professional-services with Apache License 2.0
def load_csv_to_bq(self, filename, table):
        job_config = bigquery.LoadJobConfig()
        job_config.source_format = bigquery.SourceFormat.CSV
        job_config.skip_leading_rows = 1

        abs_path = os.path.abspath(os.path.dirname(__file__))
        data_file = os.path.join(abs_path, filename)
        with open(data_file, 'rb') as file_obj:
            load_job = self.bq_client.load_table_from_file(
                file_obj=file_obj, destination=table, job_config=job_config)
        return load_job.result() 
Example #27
Source File: test_nested_user_info_updater.py    From professional-services with Apache License 2.0
def load_json_to_bq(self, filename, table):
        job_config = bigquery.LoadJobConfig()
        job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON

        abs_path = os.path.abspath(os.path.dirname(__file__))
        data_file = os.path.join(abs_path, filename)
        with open(data_file, 'rb') as file_obj:
            load_job = self.bq_client.load_table_from_file(
                file_obj=file_obj, destination=table, job_config=job_config)
        return load_job.result() 