Example #1
Source File:    From digdag with Apache License 2.0 6 votes vote down vote up
// Builds the configuration of an extract job from the operator's params.
// NOTE(review): this excerpt is incomplete — brace-only lines and the tails of some
// statements (the defaultDataset initializer, the final return) are not visible here.
protected JobConfiguration jobConfiguration(String projectId)
    JobConfigurationExtract cfg = new JobConfigurationExtract();

    // "destination" is first read as a list of URIs; when that raises ConfigException
    // it is read again as a single string URI.
    try {
        cfg.setDestinationUris(params.getList("destination", String.class));
    catch (ConfigException ignore) {
        cfg.setDestinationUri(params.get("destination", String.class));

    // Optional "dataset" param; the rest of this statement is cut off in this excerpt.
    Optional<DatasetReference> defaultDataset = params.getOptional("dataset", String.class)
    // The source table reference string comes from the "_command" param and is resolved
    // against projectId and the default dataset.
    String sourceTable = params.get("_command", String.class);
    cfg.setSourceTable(tableReference(projectId, defaultDataset, sourceTable));

    // transform(...) is used for its side effect: presumably each setter runs only when
    // the corresponding param is present — TODO confirm against the Optional type in use.
    params.getOptional("print_header", boolean.class).transform(cfg::setPrintHeader);
    params.getOptional("field_delimiter", String.class).transform(cfg::setFieldDelimiter);
    params.getOptional("destination_format", String.class).transform(cfg::setDestinationFormat);
    params.getOptional("compression", String.class).transform(cfg::setCompression);

    return new JobConfiguration()
Example #2
Source File:    From nomulus with Apache License 2.0 6 votes vote down vote up
 * Ensures the dataset exists by trying to create it. Note that it's not appreciably cheaper
 * to check for dataset existence than it is to try to create it and check for exceptions.
// Note that these are not static so they can be mocked for testing.
// Tries to create the dataset and tolerates the "duplicate" failure reason.
// NOTE(review): the call that wraps the Dataset construction inside the try block is not
// visible in this excerpt, and closing braces are missing.
private void ensureDataset(Bigquery bigquery, String projectId, String datasetId)
    throws IOException {
  try {
            new Dataset().setDatasetReference(
                new DatasetReference()
  } catch (IOException e) {
    // Swallow errors about a duplicate dataset, and throw any other ones.
    if (!BigqueryJobFailureException.create(e).getReason().equals("duplicate")) {
      throw e;
Example #3
Source File:    From digdag with Apache License 2.0 6 votes vote down vote up
// Builds a Table from a TableConfig, falling back to the default dataset's id when the
// config does not name a dataset.
// NOTE(review): most of the builder chain after setTableReference is cut off in this excerpt.
private Table table(String defaultProjectId, Optional<DatasetReference> defaultDataset, TableConfig config)
    // Dataset id from the config if set, otherwise from the default dataset (if any).
    Optional<String> datasetId = config.dataset().or(defaultDataset.transform(DatasetReference::getDatasetId));
    // Neither source supplied a dataset id: reject the configuration.
    if (!datasetId.isPresent()) {
        throw new ConfigException("Bad table reference or configuration: Missing 'dataset'");
    return new Table()
            .setTableReference(new TableReference()
                    // Converts a timestamp to epoch milliseconds in the request's time zone, or null
                    // when absent; the receiver of this call is not visible here.
                    .transform(p -> p.getTimestamp().toInstant(request.getTimeZone()).toEpochMilli()).orNull())
Example #4
Source File:    From digdag with Apache License 2.0 6 votes vote down vote up
private Table table(String defaultProjectId, Optional<DatasetReference> defaultDataset, JsonNode node)
    if (node.isTextual()) {
        return new Table()
                .setTableReference(Bq.tableReference(defaultProjectId, defaultDataset, node.asText()));
    else {
        TableConfig config;
        try {
            config = objectMapper.readValue(node.traverse(), TableConfig.class);
        catch (IOException e) {
            throw new ConfigException("Invalid table reference or configuration: " + node, e);
        return table(defaultProjectId, defaultDataset, config);
Example #5
Source File:    From deployment-examples with MIT License 5 votes vote down vote up
// Ensures the dataset and table exist, creating each one when the lookup returns null, and
// fails when an existing table's schema differs from the expected schema.
// NOTE(review): this excerpt is truncated — the client factory call, the Dataset/Table
// builder chains and the closing braces are not fully visible.
private void setupBigQueryTable(
    String projectId, String datasetId, String tableId, TableSchema schema) throws IOException {
  // Create the client on first use.
  if (bigQueryClient == null) {
    bigQueryClient = newBigQueryClient(;

  Datasets datasetService = bigQueryClient.datasets();
  // A null result from executeNullIfNotFound is treated as "dataset missing": insert it.
  if (executeNullIfNotFound(datasetService.get(projectId, datasetId)) == null) {
    Dataset newDataset =
        new Dataset()
                new DatasetReference().setProjectId(projectId).setDatasetId(datasetId));
    datasetService.insert(projectId, newDataset).execute();

  Tables tableService = bigQueryClient.tables();
  Table table = executeNullIfNotFound(tableService.get(projectId, datasetId, tableId));
  // Missing table: insert it. Existing table: its schema must equal the expected one.
  if (table == null) {
    Table newTable =
        new Table()
                new TableReference()
    tableService.insert(projectId, datasetId, newTable).execute();
  } else if (!table.getSchema().equals(schema)) {
    throw new RuntimeException(
        "Table exists and schemas do not match, expecting: "
            + schema.toPrettyString()
            + ", actual: "
            + table.getSchema().toPrettyString());
Example #6
Source File:    From digdag with Apache License 2.0 5 votes vote down vote up
// Test helper: builds a Dataset, creates it via createDataset(bq, projectId, Dataset),
// asserts that datasetExists(...) then reports true, and returns the created dataset.
// NOTE(review): the DatasetReference builder chain is cut off in this excerpt.
static Dataset createDataset(Bigquery bq, String projectId, String datasetId)
        throws IOException, RetryExecutor.RetryGiveupException
    Dataset dataset = new Dataset()
            .setDatasetReference(new DatasetReference()
    Dataset created = createDataset(bq, projectId, dataset);
    // Verify the dataset is visible before handing it back to the caller.
    assertThat(datasetExists(bq, projectId, datasetId), is(true));
    return created;
Example #7
Source File:    From digdag with Apache License 2.0 5 votes vote down vote up
// Acceptance test for the load workflow: puts a small CSV object in GCS, creates the output
// dataset, runs acceptance/bigquery/load.dig and checks the destination table's id.
// NOTE(review): the " -> " fragments and unterminated statements below are extraction
// damage; the wrapping calls around them are not visible in this excerpt.
public void testLoad()
        throws Exception
    // Assumption (not assertion): the test only proceeds when GCS_TEST_BUCKET is non-empty.
    assumeThat(GCS_TEST_BUCKET, not(isEmptyOrNullString()));

    // Create source data object
    String objectName = GCS_PREFIX + "test.csv";
    // Two CSV rows, "a,b" and "c,d", joined with a newline and encoded as UTF-8.
    byte[] data = Joiner.on('\n').join("a,b", "c,d").getBytes(UTF_8);
    InputStreamContent content = new InputStreamContent("text/csv", new ByteArrayInputStream(data))
    StorageObject metadata = new StorageObject().setName(objectName); -> gcs.objects()
            .insert(GCS_TEST_BUCKET, metadata, content)

    // Create output dataset
    String datasetId = BQ_TAG + "_load_test";
    Dataset dataset = new Dataset().setDatasetReference(new DatasetReference()
            .setDatasetId(datasetId)); -> bq.datasets().insert(gcpProjectId, dataset)

    // Run load
    String tableId = "data";
    addWorkflow(projectDir, "acceptance/bigquery/load.dig");
    // The map supplies the workflow parameters for the "load" workflow.
    Id attemptId = pushAndStart(server.endpoint(), projectDir, "load", ImmutableMap.of(
            "source_bucket", GCS_TEST_BUCKET,
            "source_object", objectName,
            "target_dataset", datasetId,
            "target_table", tableId,
            "outfile", outfile.toString()));
    // Wait up to five minutes for the attempt to succeed, then check the outfile exists.
    expect(Duration.ofMinutes(5), attemptSuccess(server.endpoint(), attemptId));
    assertThat(Files.exists(outfile), is(true));

    // Check that destination table was created
    Table destinationTable = -> bq.tables().get(gcpProjectId, datasetId, tableId).execute());
    assertThat(destinationTable.getTableReference().getTableId(), is(tableId));
Example #8
Source File:    From digdag with Apache License 2.0 5 votes vote down vote up
// Builds the configuration of a query job from the operator's params.
// NOTE(review): this excerpt is incomplete — several statements (the cfg initializer, the
// table_definitions / user_defined_function_resources handling, the defaultDataset
// initializer and the final return) are cut off, and brace-only lines are missing.
protected JobConfiguration jobConfiguration(String projectId)
    JobConfigurationQuery cfg = new JobConfigurationQuery()

    // "use_legacy_sql" defaults to false when the param is not set.
    cfg.setUseLegacySql(params.get("use_legacy_sql", boolean.class, false));

    // transform(...) is used for its side effect: presumably each setter runs only when
    // the corresponding param is present — TODO confirm against the Optional type in use.
    params.getOptional("allow_large_results", boolean.class).transform(cfg::setAllowLargeResults);
    params.getOptional("use_query_cache", Boolean.class).transform(cfg::setUseQueryCache);
    params.getOptional("create_disposition", String.class).transform(cfg::setCreateDisposition);
    params.getOptional("write_disposition", String.class).transform(cfg::setWriteDisposition);
    params.getOptional("flatten_results", Boolean.class).transform(cfg::setFlattenResults);
    params.getOptional("maximum_billing_tier", Integer.class).transform(cfg::setMaximumBillingTier);
    params.getOptional("priority", String.class).transform(cfg::setPriority);

    // What is done with these two optional values is not visible in this excerpt.
    params.getOptional("table_definitions", new TypeReference<Map<String, ExternalDataConfiguration>>() {})
    params.getOptional("user_defined_function_resources", new TypeReference<List<UserDefinedFunctionResource>>() {})

    Optional<DatasetReference> defaultDataset = params.getOptional("dataset", String.class)

    // The destination table string is resolved against projectId and the default dataset.
    params.getOptional("destination_table", String.class)
            .transform(s -> cfg.setDestinationTable(tableReference(projectId, defaultDataset, s)));

    return new JobConfiguration()
Example #9
Source File:    From digdag with Apache License 2.0 5 votes vote down vote up
// Parses a dataset reference string into a DatasetReference.
// NOTE(review): the builder chain of the return statement is cut off in this excerpt, so
// how defaultProjectId and the matcher groups are used is not visible here.
static DatasetReference datasetReference(Optional<String> defaultProjectId, String s)
    Matcher matcher = DATASET_REFERENCE_PATTERN.matcher(s);
    // The whole string must match the pattern; otherwise the reference is rejected.
    if (!matcher.matches()) {
        throw new IllegalArgumentException("Bad dataset reference: " + s);
    return new DatasetReference()
Example #10
Source File:    From digdag with Apache License 2.0 5 votes vote down vote up
// Parses a table reference string into a TableReference, filling in a project and dataset
// from the supplied defaults when the string does not provide them.
// NOTE(review): this excerpt is damaged — the right-hand sides of the project / dataset /
// table assignments are mangled (presumably named matcher groups — TODO confirm), and the
// return builder chain and brace-only lines are missing.
static TableReference tableReference(String defaultProjectId, Optional<DatasetReference> defaultDataset, String s)
    Matcher matcher = TABLE_REFERENCE_PATTERN.matcher(s);
    // The whole string must match the pattern; otherwise the reference is rejected.
    if (!matcher.matches()) {
        throw new IllegalArgumentException("Bad table reference: " + s);

    String project ="project");
    // No project in the reference: prefer the default dataset's project id when it has one,
    // else fall back to defaultProjectId.
    if (project == null) {
        if (defaultDataset.isPresent() && defaultDataset.get().getProjectId() != null) {
            project = defaultDataset.get().getProjectId();
        else {
            project = defaultProjectId;

    Optional<String> dataset = Optional.fromNullable("dataset"))

    String table ="table");

    // A dataset is mandatory; fail with a message telling the user how to supply one.
    if (!dataset.isPresent()) {
        throw new IllegalArgumentException("Bad table reference. Either configure 'dataset' or include dataset name in table reference: " + s);

    return new TableReference()
Example #11
Source File:    From digdag with Apache License 2.0 5 votes vote down vote up
// Builds a Dataset from a DatasetConfig.
// NOTE(review): most of the builder chain (including how defaultProjectId is used) is cut
// off in this excerpt.
private Dataset dataset(String defaultProjectId, DatasetConfig config)
    return new Dataset()
            .setDatasetReference(new DatasetReference()
            // Default table expiration is passed in milliseconds, or null when not configured.
            .setDefaultTableExpirationMs(config.default_table_expiration().transform(d -> d.getDuration().toMillis()).orNull())
Example #12
Source File:    From digdag with Apache License 2.0 5 votes vote down vote up
private BqOperation deleteDataset(JsonNode config)
    if (!config.isTextual()) {
        throw new ConfigException("Bad dataset reference: " + config);
    return (bq, projectId) -> {
        DatasetReference r = datasetReference(projectId, config.asText());
        bq.deleteDataset(r.getProjectId(), r.getDatasetId());
Example #13
Source File:    From digdag with Apache License 2.0 5 votes vote down vote up
// Builds the configuration of a load job from the operator's params.
// NOTE(review): this excerpt is incomplete — the cfg initializer, the body of the "schema"
// branch, the defaultDataset initializer and the final return are cut off, and brace-only
// lines are missing.
protected JobConfiguration jobConfiguration(String projectId)
    JobConfigurationLoad cfg = new JobConfigurationLoad()

    // Schema handling happens only when a "schema" param exists; its body is not visible here.
    if (params.has("schema")) {

    Optional<DatasetReference> defaultDataset = params.getOptional("dataset", String.class)

    // "destination_table" is resolved against projectId and the default dataset.
    String destinationTable = params.get("destination_table", String.class);
    cfg.setDestinationTable(tableReference(projectId, defaultDataset, destinationTable));

    // transform(...) is used for its side effect: presumably each setter runs only when
    // the corresponding param is present — TODO confirm against the Optional type in use.
    params.getOptional("create_disposition", String.class).transform(cfg::setCreateDisposition);
    params.getOptional("write_disposition", String.class).transform(cfg::setWriteDisposition);

    params.getOptional("source_format", String.class).transform(cfg::setSourceFormat);
    params.getOptional("field_delimiter", String.class).transform(cfg::setFieldDelimiter);
    params.getOptional("skip_leading_rows", int.class).transform(cfg::setSkipLeadingRows);
    params.getOptional("encoding", String.class).transform(cfg::setEncoding);
    params.getOptional("quote", String.class).transform(cfg::setQuote);
    params.getOptional("max_bad_records", int.class).transform(cfg::setMaxBadRecords);
    params.getOptional("allow_quoted_newlines", boolean.class).transform(cfg::setAllowQuotedNewlines);
    params.getOptional("allow_jagged_rows", boolean.class).transform(cfg::setAllowJaggedRows);
    params.getOptional("ignore_unknown_values", boolean.class).transform(cfg::setIgnoreUnknownValues);
    // List params are wrapped in Optional.of(...), so these setters always receive a list
    // (getListOrEmpty suggests an empty one when the param is unset).
    Optional.of(params.getListOrEmpty("projection_fields", String.class)).transform(cfg::setProjectionFields);
    params.getOptional("autodetect", boolean.class).transform(cfg::setAutodetect);
    Optional.of(params.getListOrEmpty("schema_update_options", String.class)).transform(cfg::setSchemaUpdateOptions);

    return new JobConfiguration()
Example #14
Source File:    From nomulus with Apache License 2.0 5 votes vote down vote up
 * Helper that creates a dataset with this name if it doesn't already exist, and returns true
 * if creation took place.
// Creates the named dataset when checkDatasetExists reports it missing; returns true when
// creation took place and false otherwise.
// NOTE(review): the receiver and tail of the insert call, and the closing braces, are not
// visible in this excerpt.
public boolean createDatasetIfNeeded(String datasetName) throws IOException {
  if (!checkDatasetExists(datasetName)) {
        .insert(getProjectId(), new Dataset().setDatasetReference(new DatasetReference()
    // Logged as "<project>:<dataset>".
    logger.atInfo().log("Created dataset: %s:%s\n", getProjectId(), datasetName);
    return true;
  return false;
Example #15
Source File:    From beam with Apache License 2.0 5 votes vote down vote up
// Looks up the (projectId, datasetId) entry in the `tables` structure and returns a Dataset
// whose reference carries the same ids.
// NOTE(review): the statement that reports the missing dataset and part of the return
// builder chain are cut off in this excerpt.
public Dataset getDataset(String projectId, String datasetId)
    throws IOException, InterruptedException {
  // The lookup runs while holding the monitor of `tables`.
  synchronized (tables) {
    Map<String, TableContainer> dataset = tables.get(projectId, datasetId);
    // No entry for this project/dataset pair: the visible message fragment below belongs to
    // the failure path (the call that uses it is not visible here).
    if (dataset == null) {
          "Tried to get a dataset %s:%s, but no such table was set", projectId, datasetId);
    return new Dataset()
            new DatasetReference().setDatasetId(datasetId).setProjectId(projectId));
Example #16
Source File:    From beam with Apache License 2.0 5 votes vote down vote up
// Ensures the dataset and table exist, creating each one when the lookup returns null, and
// fails when an existing table's schema differs from the expected schema.
// NOTE(review): this excerpt is truncated — the client factory call, the Dataset/Table
// builder chains and the closing braces are not fully visible.
private void setupBigQueryTable(
    String projectId, String datasetId, String tableId, TableSchema schema) throws IOException {
  // Create the client on first use.
  if (bigQueryClient == null) {
    bigQueryClient = newBigQueryClient(;

  Datasets datasetService = bigQueryClient.datasets();
  // A null result from executeNullIfNotFound is treated as "dataset missing": insert it.
  if (executeNullIfNotFound(datasetService.get(projectId, datasetId)) == null) {
    Dataset newDataset =
        new Dataset()
                new DatasetReference().setProjectId(projectId).setDatasetId(datasetId));
    datasetService.insert(projectId, newDataset).execute();

  Tables tableService = bigQueryClient.tables();
  Table table = executeNullIfNotFound(tableService.get(projectId, datasetId, tableId));
  // Missing table: insert it. Existing table: its schema must equal the expected one.
  if (table == null) {
    Table newTable =
        new Table()
                new TableReference()
    tableService.insert(projectId, datasetId, newTable).execute();
  } else if (!table.getSchema().equals(schema)) {
    throw new RuntimeException(
        "Table exists and schemas do not match, expecting: "
            + schema.toPrettyString()
            + ", actual: "
            + table.getSchema().toPrettyString());
Example #17
Source File:    From digdag with Apache License 2.0 4 votes vote down vote up
static DatasetReference datasetReference(String s)
    return datasetReference(Optional.absent(), s);
Example #18
Source File:    From digdag with Apache License 2.0 4 votes vote down vote up
static DatasetReference datasetReference(String defaultProjectId, String s)
    return datasetReference(Optional.of(defaultProjectId), s);
Example #19
Source File:    From nomulus with Apache License 2.0 4 votes vote down vote up
/** Returns dataset reference that can be used to avoid having to specify dataset in SQL code. */
// NOTE(review): the builder chain that populates the reference is cut off in this excerpt.
public DatasetReference getDataset() {
  return new DatasetReference()
Example #20
Source File:    From digdag with Apache License 2.0 4 votes vote down vote up
// Acceptance test for the extract workflow: creates a source dataset and table, inserts two
// rows, runs acceptance/bigquery/extract.dig and checks the destination GCS object.
// NOTE(review): the " -> " fragments and unterminated statements below are extraction
// damage; the wrapping calls around them, and the end of the method, are not visible here.
public void testExtract()
        throws Exception
    // Assumption (not assertion): the test only proceeds when GCS_TEST_BUCKET is non-empty.
    assumeThat(GCS_TEST_BUCKET, not(isEmptyOrNullString()));

    // Create source table
    String tableId = "data";
    String datasetId = BQ_TAG + "_extract_test";
    Dataset dataset = new Dataset().setDatasetReference(new DatasetReference()
            .setDatasetId(datasetId)); -> bq.datasets().insert(gcpProjectId, dataset)
    // The schema declares two STRING fields, "foo" and "bar".
    Table table = new Table().setTableReference(new TableReference()
            .setSchema(new TableSchema()
                            new TableFieldSchema().setName("foo").setType("STRING"),
                            new TableFieldSchema().setName("bar").setType("STRING")
                    ))); -> bq.tables().insert(gcpProjectId, datasetId, table)

    // Populate source table
    // Two rows: (foo=a, bar=b) and (foo=c, bar=d).
    TableDataInsertAllRequest content = new TableDataInsertAllRequest()
                    new TableDataInsertAllRequest.Rows().setJson(ImmutableMap.of(
                            "foo", "a",
                            "bar", "b")),
                    new TableDataInsertAllRequest.Rows().setJson(ImmutableMap.of(
                            "foo", "c",
                            "bar", "d")))); -> bq.tabledata().insertAll(gcpProjectId, datasetId, tableId, content)

    // Run extract
    String objectName = GCS_PREFIX + "test.csv";
    addWorkflow(projectDir, "acceptance/bigquery/extract.dig");
    // The map supplies the workflow parameters for the "extract" workflow.
    Id attemptId = pushAndStart(server.endpoint(), projectDir, "extract", ImmutableMap.of(
            "src_dataset", datasetId,
            "src_table", tableId,
            "dst_bucket", GCS_TEST_BUCKET,
            "dst_object", objectName,
            "outfile", outfile.toString()));
    // Wait up to five minutes for the attempt to succeed, then check the outfile exists.
    expect(Duration.ofMinutes(5), attemptSuccess(server.endpoint(), attemptId));
    assertThat(Files.exists(outfile), is(true));

    // Check that destination file was created
    StorageObject metadata = -> gcs.objects().get(GCS_TEST_BUCKET, objectName)
    assertThat(metadata.getName(), is(objectName));
    // The object is fetched again inside a try block; what is done with its content is not
    // visible in this excerpt. An IOException is rethrown via Throwables.propagate.
    ByteArrayOutputStream data = new ByteArrayOutputStream(); -> {
        try {
            gcs.objects().get(GCS_TEST_BUCKET, objectName)
        catch (IOException e) {
            throw Throwables.propagate(e);
Example #21
Source File:    From gcp-ingestion with Mozilla Public License 2.0 4 votes vote down vote up
 * Return the appropriate table destination instance for the given document type and other
 * attributes.
// Resolves the table destination for a message's attributes: normalizes the namespace and
// doc type, fills the table spec template, and rejects the element when the template is
// incomplete or the resolved dataset/table is not found in the cached table listing.
// NOTE(review): this excerpt is truncated — several builder chains (datasetRef, bqService,
// the cache), the body of the listing lambda and brace-only lines are not visible.
public TableDestination getTableDestination(Map<String, String> attributes) {
  // Work on a copy so the caller's map is not modified by the put(...) calls below.
  attributes = new HashMap<>(attributes);

  // We coerce all docType and namespace names to be snake_case and to remove invalid
  // characters; these transformations MUST match with the transformations applied by the
  // jsonschema-transpiler and mozilla-schema-generator when creating table schemas in BigQuery.
  final String namespace = attributes.get(Attribute.DOCUMENT_NAMESPACE);
  final String docType = attributes.get(Attribute.DOCUMENT_TYPE);
  if (namespace != null) {
    attributes.put(Attribute.DOCUMENT_NAMESPACE, getAndCacheNormalizedName(namespace));
  if (docType != null) {
    attributes.put(Attribute.DOCUMENT_TYPE, getAndCacheNormalizedName(docType));

  // Only letters, numbers, and underscores are allowed in BigQuery dataset and table names,
  // but some doc types and namespaces contain '-', so we convert to '_'; we don't pass all
  // values through getAndCacheBqName to avoid expensive regex operations and polluting the
  // cache of transformed field names.
  attributes = Maps.transformValues(attributes, v -> v.replaceAll("-", "_"));

  // Substitute the attribute values into the configured table spec template.
  final String tableSpec = StringSubstitutor.replace(tableSpecTemplate.get(), attributes);

  // Send to error collection if incomplete tableSpec; $ is not a valid char in tableSpecs.
  if (tableSpec.contains("$")) {
    throw new IllegalArgumentException("Element did not contain all the attributes needed to"
        + " fill out variables in the configured BigQuery output template: "
        + tableSpecTemplate.get());

  // The destination carries time partitioning and clustering taken from the configured
  // partitioningField and clusteringFields.
  final TableDestination tableDestination = new TableDestination(tableSpec, null,
      new TimePartitioning().setField(partitioningField.get()),
      new Clustering().setFields(clusteringFields.get()));
  final TableReference ref = BigQueryHelpers.parseTableSpec(tableSpec);
  final DatasetReference datasetRef = new DatasetReference().setProjectId(ref.getProjectId())

  // Create the BigQuery service on first use.
  if (bqService == null) {
    bqService = BigQueryOptions.newBuilder().setProjectId(ref.getProjectId())

  // Get and cache a listing of table names for this dataset.
  Set<String> tablesInDataset;
  if (tableListingCache == null) {
    // We need to be very careful about settings for the cache here. We have had significant
    // issues in the past due to exceeding limits on BigQuery API requests; see
    tableListingCache = CacheBuilder.newBuilder().expireAfterWrite(Duration.ofMinutes(10))
  try {
    // The loader runs on a cache miss for datasetRef; a null dataset yields an empty set.
    tablesInDataset = tableListingCache.get(datasetRef, () -> {
      Set<String> tableSet = new HashSet<>();
      Dataset dataset = bqService.getDataset(ref.getDatasetId());
      if (dataset != null) {
        dataset.list().iterateAll().forEach(t -> {
      return tableSet;
  } catch (ExecutionException e) {
    // Rethrow unchecked, keeping only the loader's underlying cause.
    throw new UncheckedExecutionException(e.getCause());

  // Send to error collection if dataset or table doesn't exist so BigQueryIO doesn't throw a
  // pipeline execution exception.
  if (tablesInDataset.isEmpty()) {
    throw new IllegalArgumentException("Resolved destination dataset does not exist or has no "
        + " tables for tableSpec " + tableSpec);
  } else if (!tablesInDataset.contains(ref.getTableId())) {
    throw new IllegalArgumentException("Resolved destination table does not exist: " + tableSpec);

  return tableDestination;
Example #22
Source File:    From hadoop-connectors with Apache License 2.0 4 votes vote down vote up
// Integration-test setup: derives unique dataset/bucket names, resolves the project id,
// creates the temporary output dataset and prepares task/job identifiers.
// NOTE(review): this excerpt is truncated — several statements (the precondition check,
// the population of outputDataset/datasetReference, the logging call, the mock setup) are
// missing or cut off, as are brace-only lines and the end of the method.
public void setUp()
    throws IOException, GeneralSecurityException {


  bucketHelper = new TestBucketHelper("bq_integration_test");
  // A unique per-setUp String to avoid collisions between test runs.
  String testId = bucketHelper.getUniqueBucketPrefix();

  // Project id comes from the test configuration, falling back to the environment variable
  // named by BIGQUERY_PROJECT_ID_ENVVARNAME when the configured value is null or empty.
  projectIdValue = TestConfiguration.getInstance().getProjectId();
  if (Strings.isNullOrEmpty(projectIdValue)) {
    projectIdValue = System.getenv(BIGQUERY_PROJECT_ID_ENVVARNAME);

      !Strings.isNullOrEmpty(projectIdValue), "Must provide %s", BIGQUERY_PROJECT_ID_ENVVARNAME);
  testDataset = testId + "_dataset";
  testBucket = testId + "_bucket";

  // We have to create the output dataset ourselves.
  // TODO(user): Extract dataset creation into a library which is also used by
  // BigQueryOutputCommitter.
  Dataset outputDataset = new Dataset();
  DatasetReference datasetReference = new DatasetReference();

  config = getConfigForGcsFromBigquerySettings(projectIdValue);
  BigQueryFactory factory = new BigQueryFactory();
  bigqueryInstance = factory.getBigQuery(config);

  Bigquery.Datasets datasets = bigqueryInstance.datasets();
      "Creating temporary dataset '%s' for project '%s'", testDataset, projectIdValue);
  // Issue the dataset creation request for the configured project.
  datasets.insert(projectIdValue, outputDataset).execute();

  // The bucket is addressed as gs://<testBucket>; the call that creates it is not visible
  // in this excerpt.
  Path toCreate = new Path(String.format("gs://%s", testBucket));
  FileSystem fs = toCreate.getFileSystem(config);
  logger.atInfo().log("Creating temporary test bucket '%s'", toCreate);

  // Since the TaskAttemptContext and JobContexts are mostly used just to access a
  // "Configuration" object, we'll mock the two contexts to just return our fake configuration
  // object with which we'll provide the settings we want to test.


  // Have a realistic-looking fake TaskAttemptID.
  int taskNumber = 3;
  int taskAttempt = 2;
  int jobNumber = 42;
  // The job id string embeds the current time in milliseconds.
  String jobIdString = "jobid" + System.currentTimeMillis();
  JobID jobId = new JobID(jobIdString, jobNumber);
  TaskAttemptID taskAttemptId =
      new TaskAttemptID(new TaskID(jobId, false, taskNumber), taskAttempt);

  testTable = testId + "_table_" + jobIdString;