com.google.api.services.bigquery.model.TimePartitioning Java Examples
The following examples show how to use
com.google.api.services.bigquery.model.TimePartitioning.
Each example is drawn from an open source project; the originating source file, project, and license are noted above the code.
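Before the project examples, here is a minimal orientation sketch of constructing a TimePartitioning value. It is illustrative only: the class name TimePartitioningSketch, the column name event_timestamp, and the 90-day expiration are placeholders, not taken from any project below.

import com.google.api.services.bigquery.model.TimePartitioning;

public class TimePartitioningSketch {
  public static TimePartitioning dailyPartitioning() {
    // Partition by day on a timestamp column; omit setField(...) to partition
    // by ingestion time instead of a column value.
    return new TimePartitioning()
        .setType("DAY")
        .setField("event_timestamp")
        // Optional: automatically drop partitions older than ~90 days.
        .setExpirationMs(90L * 24 * 60 * 60 * 1000);
  }
}

The examples below pass such a value to Beam's BigQueryIO, wrap it for the Hadoop connector's configuration, or serialize it to and from its JSON form.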
Example #1
Source File: BigQueryDeadletterSink.java From feast with Apache License 2.0
@Override
public PDone expand(PCollection<FailedElement> input) {
  TimePartitioning partition = new TimePartitioning().setType("DAY");
  partition.setField(TIMESTAMP_COLUMN);
  input
      .apply("FailedElementToTableRow", ParDo.of(new FailedElementToTableRowFn()))
      .apply(
          "WriteFailedElementsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getTableSpec())
              .withJsonSchema(getJsonSchema())
              .withTimePartitioning(partition)
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(input.getPipeline());
}
Example #2
Source File: BigQuerySinkHelpers.java From feast with Apache License 2.0
/**
 * Generates the BigQuery table destination from a dataset reference and the feature set's
 * project and name. If the project is undefined, "default" is selected.
 *
 * @param dataset {@link DatasetId} reference to the BQ project and dataset
 * @param featureSetKey feature set reference in the format <project>/<feature-set-name>
 * @return {@link TableDestination}
 */
public static TableDestination getTableDestination(DatasetId dataset, String featureSetKey) {
  String[] splitName = featureSetKey.split("/");
  String projectName, setName;
  if (splitName.length == 2) {
    projectName = splitName[0];
    setName = splitName[1];
  } else {
    projectName = DEFAULT_PROJECT_NAME;
    setName = splitName[0];
  }

  TimePartitioning timePartitioning =
      new TimePartitioning()
          .setType("DAY")
          .setField(FeatureRowToTableRow.getEventTimestampColumn());

  return new TableDestination(
      String.format(
          "%s:%s.%s_%s",
          dataset.getProject(),
          dataset.getDataset(),
          projectName.replaceAll("-", "_"),
          setName.replaceAll("-", "_")),
      String.format("Feast table for %s", featureSetKey),
      timePartitioning);
}
Example #3
Source File: TableDestination.java From beam with Apache License 2.0
public TableDestination(
    TableReference tableReference,
    @Nullable String tableDescription,
    TimePartitioning timePartitioning) {
  this(
      BigQueryHelpers.toTableSpec(tableReference),
      tableDescription,
      timePartitioning != null ? BigQueryHelpers.toJsonString(timePartitioning) : null,
      (String) null);
}
Example #4
Source File: TableDestination.java From beam with Apache License 2.0
public TableDestination(
    String tableSpec, @Nullable String tableDescription, TimePartitioning timePartitioning) {
  this(
      tableSpec,
      tableDescription,
      timePartitioning != null ? BigQueryHelpers.toJsonString(timePartitioning) : null,
      (String) null);
}
Example #5
Source File: TableDestination.java From beam with Apache License 2.0
public TableDestination(
    String tableSpec,
    @Nullable String tableDescription,
    TimePartitioning timePartitioning,
    Clustering clustering) {
  this(
      tableSpec,
      tableDescription,
      timePartitioning != null ? BigQueryHelpers.toJsonString(timePartitioning) : null,
      clustering != null ? BigQueryHelpers.toJsonString(clustering) : null);
}
Example #6
Source File: TableDestination.java From beam with Apache License 2.0
public TimePartitioning getTimePartitioning() {
  if (jsonTimePartitioning == null) {
    return null;
  } else {
    return BigQueryHelpers.fromJsonString(jsonTimePartitioning, TimePartitioning.class);
  }
}
Example #7
Source File: BigQueryOutputConfiguration.java From hadoop-connectors with Apache License 2.0
/**
 * Gets the output table time partitioning based on the given configuration.
 *
 * @param conf the configuration to reference the keys from.
 * @return the derived table time partitioning, absent value if no table time partitioning exists
 *     in the configuration.
 * @throws IOException if a table time partitioning was set in the configuration but couldn't be
 *     parsed.
 */
static Optional<BigQueryTimePartitioning> getTablePartitioning(Configuration conf)
    throws IOException {
  String fieldsJson = OUTPUT_TABLE_PARTITIONING.get(conf, conf::get);
  if (!Strings.isNullOrEmpty(fieldsJson)) {
    try {
      TimePartitioning tablePartitioning = BigQueryTimePartitioning.getFromJson(fieldsJson);
      return Optional.of(BigQueryTimePartitioning.wrap(tablePartitioning));
    } catch (IOException e) {
      throw new IOException(
          "Unable to parse key '" + OUTPUT_TABLE_PARTITIONING.getKey() + "'.", e);
    }
  }
  return Optional.empty();
}
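For orientation, a hedged sketch of how the partitioning JSON might be placed in the configuration this method reads. It assumes the OUTPUT_TABLE_PARTITIONING property referenced above lives on the connector's BigQueryConfiguration class; both the key lookup and the field name are illustrative.

import org.apache.hadoop.conf.Configuration;

// Hypothetical setup: store the partitioning JSON under the connector's
// configuration key; getTablePartitioning(conf) will then parse and wrap it.
Configuration conf = new Configuration();
conf.set(
    BigQueryConfiguration.OUTPUT_TABLE_PARTITIONING.getKey(), // assumed location of the property
    "{\"type\":\"DAY\",\"field\":\"event_timestamp\"}");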
Example #8
Source File: IndirectBigQueryOutputCommitterTest.java From hadoop-connectors with Apache License 2.0
/** Test to make sure an IOException is thrown on interrupt of the BigQuery import call. */
@SuppressWarnings("unchecked")
@Test
public void testCommitJobInterrupt() throws Exception {
  // Setup the sample directory.
  generateSampleFiles();

  // Setup the expected exception.
  InterruptedException helperInterruptedException = new InterruptedException("Test exception");

  // Configure special case mock.
  doThrow(helperInterruptedException)
      .when(mockBigQueryHelper)
      .importFromGcs(
          any(String.class),
          any(TableReference.class),
          any(TableSchema.class),
          any(TimePartitioning.class),
          anyString(),
          any(BigQueryFileFormat.class),
          any(String.class),
          any(String.class),
          any(List.class),
          eq(true));

  IOException thrown = assertThrows(IOException.class, () -> committer.commitJob(job));
  assertThat(thrown).hasCauseThat().isEqualTo(helperInterruptedException);

  // Verify we're making the BigQuery import call.
  verify(mockBigQueryHelper)
      .importFromGcs(
          eq(TEST_PROJECT_ID),
          eq(outputTableRef),
          eq(TEST_TABLE_SCHEMA.get()),
          eq(TEST_TIME_PARTITIONING.get()),
          eq(TEST_KMS_KEY_NAME),
          eq(TEST_FILE_FORMAT),
          eq(TEST_CREATE_DISPOSITION),
          eq(TEST_WRITE_DISPOSITION),
          any(List.class), // Tested, no need to capture.
          eq(true));

  // Verify the delegate is being called.
  verify(mockCommitter).commitJob(eq(job));
}
Example #9
Source File: KeyByBigQueryTableDestination.java From gcp-ingestion with Mozilla Public License 2.0
/**
 * Return the appropriate table destination instance for the given document type and other
 * attributes.
 */
public TableDestination getTableDestination(Map<String, String> attributes) {
  attributes = new HashMap<>(attributes);

  // We coerce all docType and namespace names to be snake_case and to remove invalid
  // characters; these transformations MUST match with the transformations applied by the
  // jsonschema-transpiler and mozilla-schema-generator when creating table schemas in BigQuery.
  final String namespace = attributes.get(Attribute.DOCUMENT_NAMESPACE);
  final String docType = attributes.get(Attribute.DOCUMENT_TYPE);
  if (namespace != null) {
    attributes.put(Attribute.DOCUMENT_NAMESPACE, getAndCacheNormalizedName(namespace));
  }
  if (docType != null) {
    attributes.put(Attribute.DOCUMENT_TYPE, getAndCacheNormalizedName(docType));
  }

  // Only letters, numbers, and underscores are allowed in BigQuery dataset and table names,
  // but some doc types and namespaces contain '-', so we convert to '_'; we don't pass all
  // values through getAndCacheBqName to avoid expensive regex operations and polluting the
  // cache of transformed field names.
  attributes = Maps.transformValues(attributes, v -> v.replaceAll("-", "_"));

  final String tableSpec = StringSubstitutor.replace(tableSpecTemplate.get(), attributes);

  // Send to error collection if incomplete tableSpec; $ is not a valid char in tableSpecs.
  if (tableSpec.contains("$")) {
    throw new IllegalArgumentException("Element did not contain all the attributes needed to"
        + " fill out variables in the configured BigQuery output template: "
        + tableSpecTemplate.get());
  }

  final TableDestination tableDestination = new TableDestination(tableSpec, null,
      new TimePartitioning().setField(partitioningField.get()),
      new Clustering().setFields(clusteringFields.get()));
  final TableReference ref = BigQueryHelpers.parseTableSpec(tableSpec);
  final DatasetReference datasetRef = new DatasetReference().setProjectId(ref.getProjectId())
      .setDatasetId(ref.getDatasetId());

  if (bqService == null) {
    bqService = BigQueryOptions.newBuilder().setProjectId(ref.getProjectId())
        .setRetrySettings(RETRY_SETTINGS).build().getService();
  }

  // Get and cache a listing of table names for this dataset.
  Set<String> tablesInDataset;
  if (tableListingCache == null) {
    // We need to be very careful about settings for the cache here. We have had significant
    // issues in the past due to exceeding limits on BigQuery API requests; see
    // https://bugzilla.mozilla.org/show_bug.cgi?id=1623000
    tableListingCache = CacheBuilder.newBuilder().expireAfterWrite(Duration.ofMinutes(10))
        .build();
  }
  try {
    tablesInDataset = tableListingCache.get(datasetRef, () -> {
      Set<String> tableSet = new HashSet<>();
      Dataset dataset = bqService.getDataset(ref.getDatasetId());
      if (dataset != null) {
        dataset.list().iterateAll().forEach(t -> {
          tableSet.add(t.getTableId().getTable());
        });
      }
      return tableSet;
    });
  } catch (ExecutionException e) {
    throw new UncheckedExecutionException(e.getCause());
  }

  // Send to error collection if dataset or table doesn't exist so BigQueryIO doesn't throw a
  // pipeline execution exception.
  if (tablesInDataset.isEmpty()) {
    throw new IllegalArgumentException("Resolved destination dataset does not exist or has no "
        + " tables for tableSpec " + tableSpec);
  } else if (!tablesInDataset.contains(ref.getTableId())) {
    throw new IllegalArgumentException("Resolved destination table does not exist: " + tableSpec);
  }

  return tableDestination;
}
Example #10
Source File: BigQueryTimePartitioning.java From hadoop-connectors with Apache License 2.0
static BigQueryTimePartitioning wrap(TimePartitioning tableTimePartitioning) {
  return new BigQueryTimePartitioning(tableTimePartitioning);
}
Example #11
Source File: BigQueryTimePartitioning.java From hadoop-connectors with Apache License 2.0
static TimePartitioning getFromJson(String json) throws IOException {
  JsonParser parser = JacksonFactory.getDefaultInstance().createJsonParser(json);
  return parser.parseAndClose(TimePartitioning.class);
}
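A small usage sketch of the parser above; the JSON value is illustrative. Since TimePartitioning extends GenericJson, this is the same representation the BigQuery REST API uses.

// Hypothetical round trip from a JSON string to a TimePartitioning instance.
String json = "{\"type\":\"DAY\",\"field\":\"event_timestamp\"}";
TimePartitioning parsed = BigQueryTimePartitioning.getFromJson(json);
System.out.println(parsed.getType());  // prints "DAY"
System.out.println(parsed.getField()); // prints "event_timestamp"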
Example #12
Source File: BigQueryTimePartitioning.java From hadoop-connectors with Apache License 2.0
TimePartitioning get() {
  return timePartitioning;
}
Example #13
Source File: BigQueryTimePartitioning.java From hadoop-connectors with Apache License 2.0
public BigQueryTimePartitioning(TimePartitioning timePartitioning) {
  this.timePartitioning = timePartitioning;
}
Example #14
Source File: BigQueryTimePartitioning.java From hadoop-connectors with Apache License 2.0
public BigQueryTimePartitioning() {
  this.timePartitioning = new TimePartitioning();
}
Example #15
Source File: BigQueryHelper.java From hadoop-connectors with Apache License 2.0
/**
 * Imports data from GCS into BigQuery via a load job. Optionally polls for completion before
 * returning.
 *
 * @param projectId the project on whose behalf to perform the load.
 * @param tableRef the reference to the destination table.
 * @param schema the schema of the source data to populate the destination table by.
 * @param timePartitioning time partitioning to populate the destination table.
 * @param kmsKeyName the Cloud KMS encryption key used to protect the output table.
 * @param sourceFormat the file format of the source data.
 * @param createDisposition the create disposition of the output table.
 * @param writeDisposition the write disposition of the output table.
 * @param gcsPaths the location of the source data in GCS.
 * @param awaitCompletion if true, block and poll until job completes, otherwise return as soon as
 *     the job has been successfully dispatched.
 * @throws IOException
 * @throws InterruptedException if interrupted while waiting for job completion.
 */
public void importFromGcs(
    String projectId,
    TableReference tableRef,
    @Nullable TableSchema schema,
    @Nullable TimePartitioning timePartitioning,
    @Nullable String kmsKeyName,
    BigQueryFileFormat sourceFormat,
    String createDisposition,
    String writeDisposition,
    List<String> gcsPaths,
    boolean awaitCompletion)
    throws IOException, InterruptedException {
  logger.atInfo().log(
      "Importing into table '%s' from %s paths; path[0] is '%s'; awaitCompletion: %s;"
          + " timePartitioning: %s",
      lazy(() -> BigQueryStrings.toString(tableRef)),
      gcsPaths.size(),
      gcsPaths.isEmpty() ? "(empty)" : gcsPaths.get(0),
      awaitCompletion,
      timePartitioning);

  // Create load conf with minimal requirements.
  JobConfigurationLoad loadConfig = new JobConfigurationLoad();
  loadConfig.setSchema(schema);
  loadConfig.setSourceFormat(sourceFormat.getFormatIdentifier());
  loadConfig.setSourceUris(gcsPaths);
  loadConfig.setDestinationTable(tableRef);
  loadConfig.setTimePartitioning(timePartitioning);
  loadConfig.setCreateDisposition(createDisposition);
  loadConfig.setWriteDisposition(writeDisposition);
  if (!Strings.isNullOrEmpty(kmsKeyName)) {
    loadConfig.setDestinationEncryptionConfiguration(
        new EncryptionConfiguration().setKmsKeyName(kmsKeyName));
  }

  // Auto detect the schema if we're not given one, otherwise use the passed schema.
  if (schema == null) {
    logger.atInfo().log("No import schema provided, auto detecting schema.");
    loadConfig.setAutodetect(true);
  } else {
    logger.atInfo().log("Using provided import schema '%s'.", schema);
  }

  JobConfiguration config = new JobConfiguration();
  config.setLoad(loadConfig);

  // Get the dataset to determine the location.
  Dataset dataset;
  try {
    dataset = service.datasets().get(tableRef.getProjectId(), tableRef.getDatasetId()).execute();
  } catch (IOException ioe) {
    throw new IOException(
        String.format(
            "Failed to get dataset '%s' in project '%s' for table '%s'",
            tableRef.getDatasetId(), tableRef.getProjectId(), tableRef),
        ioe);
  }

  JobReference jobReference =
      createJobReference(projectId, "direct-bigqueryhelper-import", dataset.getLocation());
  Job job = new Job();
  job.setConfiguration(config);
  job.setJobReference(jobReference);

  // Insert and run job.
  insertJobOrFetchDuplicate(projectId, job);

  if (awaitCompletion) {
    // Poll until job is complete.
    BigQueryUtils.waitForJobCompletion(getRawBigquery(), projectId, jobReference, () -> {});
  }
}
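A hedged sketch of invoking this helper; the helper instance, project, table, GCS paths, and file format constant are all placeholders and assume a fully configured BigQueryHelper.

// Hypothetical invocation; `helper` is an already-configured BigQueryHelper.
TimePartitioning partitioning =
    new TimePartitioning().setType("DAY").setField("event_timestamp");
helper.importFromGcs(
    "my-project",
    new TableReference()
        .setProjectId("my-project")
        .setDatasetId("my_dataset")
        .setTableId("events"),
    /* schema= */ null, // null triggers schema autodetection, as described above
    partitioning,
    /* kmsKeyName= */ null,
    BigQueryFileFormat.NEWLINE_DELIMITED_JSON, // assumed format constant
    "CREATE_IF_NEEDED",
    "WRITE_APPEND",
    ImmutableList.of("gs://my-bucket/events/*.json"),
    /* awaitCompletion= */ true);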
Example #16
Source File: BigQueryIOWriteTest.java From beam with Apache License 2.0
@Test
public void testClusteringTableFunction() throws Exception {
  TableRow row1 = new TableRow().set("date", "2018-01-01").set("number", "1");
  TableRow row2 = new TableRow().set("date", "2018-01-02").set("number", "2");

  TimePartitioning timePartitioning = new TimePartitioning().setType("DAY").setField("date");
  Clustering clustering = new Clustering().setFields(ImmutableList.of("date"));
  TableSchema schema =
      new TableSchema()
          .setFields(
              ImmutableList.of(
                  new TableFieldSchema().setName("date").setType("DATE"),
                  new TableFieldSchema().setName("number").setType("INTEGER")));
  p.apply(Create.of(row1, row2))
      .apply(
          BigQueryIO.writeTableRows()
              .to(
                  (ValueInSingleWindow<TableRow> vsw) -> {
                    String tableSpec =
                        "project-id:dataset-id.table-" + vsw.getValue().get("number");
                    return new TableDestination(
                        tableSpec,
                        null,
                        new TimePartitioning().setType("DAY").setField("date"),
                        new Clustering().setFields(ImmutableList.of("date")));
                  })
              .withTestServices(fakeBqServices)
              .withMethod(BigQueryIO.Write.Method.FILE_LOADS)
              .withSchema(schema)
              .withClustering()
              .withoutValidation());
  p.run();

  Table table =
      fakeDatasetService.getTable(
          BigQueryHelpers.parseTableSpec("project-id:dataset-id.table-1"));
  assertEquals(schema, table.getSchema());
  assertEquals(timePartitioning, table.getTimePartitioning());
  assertEquals(clustering, table.getClustering());
}
Example #17
Source File: BigQueryIOWriteTest.java From beam with Apache License 2.0
void testTimePartitioningClustering(
    BigQueryIO.Write.Method insertMethod, boolean enablePartitioning, boolean enableClustering)
    throws Exception {
  TableRow row1 = new TableRow().set("date", "2018-01-01").set("number", "1");
  TableRow row2 = new TableRow().set("date", "2018-01-02").set("number", "2");

  TimePartitioning timePartitioning = new TimePartitioning().setType("DAY").setField("date");
  Clustering clustering = new Clustering().setFields(ImmutableList.of("date"));
  TableSchema schema =
      new TableSchema()
          .setFields(
              ImmutableList.of(
                  new TableFieldSchema().setName("date").setType("DATE"),
                  new TableFieldSchema().setName("number").setType("INTEGER")));

  Write<TableRow> writeTransform =
      BigQueryIO.writeTableRows()
          .to("project-id:dataset-id.table-id")
          .withTestServices(fakeBqServices)
          .withMethod(insertMethod)
          .withSchema(schema)
          .withoutValidation();

  if (enablePartitioning) {
    writeTransform = writeTransform.withTimePartitioning(timePartitioning);
  }
  if (enableClustering) {
    writeTransform = writeTransform.withClustering(clustering);
  }

  p.apply(Create.of(row1, row2)).apply(writeTransform);
  p.run();

  Table table =
      fakeDatasetService.getTable(
          BigQueryHelpers.parseTableSpec("project-id:dataset-id.table-id"));
  assertEquals(schema, table.getSchema());
  if (enablePartitioning) {
    assertEquals(timePartitioning, table.getTimePartitioning());
  }
  if (enableClustering) {
    assertEquals(clustering, table.getClustering());
  }
}
Example #18
Source File: FakeJobService.java From beam with Apache License 2.0
private JobStatus runCopyJob(JobConfigurationTableCopy copy)
    throws InterruptedException, IOException {
  List<TableReference> sources = copy.getSourceTables();
  TableReference destination = copy.getDestinationTable();
  WriteDisposition writeDisposition = WriteDisposition.valueOf(copy.getWriteDisposition());
  CreateDisposition createDisposition = CreateDisposition.valueOf(copy.getCreateDisposition());
  Table existingTable = datasetService.getTable(destination);
  if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
    return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
  }

  TimePartitioning partitioning = null;
  Clustering clustering = null;
  TableSchema schema = null;
  boolean first = true;
  List<TableRow> allRows = Lists.newArrayList();
  for (TableReference source : sources) {
    Table table = checkNotNull(datasetService.getTable(source));
    if (!first) {
      if (!Objects.equals(partitioning, table.getTimePartitioning())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
      if (!Objects.equals(clustering, table.getClustering())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
      if (!Objects.equals(schema, table.getSchema())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
    }
    partitioning = table.getTimePartitioning();
    clustering = table.getClustering();
    schema = table.getSchema();
    first = false;
    allRows.addAll(
        datasetService.getAllRows(
            source.getProjectId(), source.getDatasetId(), source.getTableId()));
  }

  datasetService.createTable(
      new Table()
          .setTableReference(destination)
          .setSchema(schema)
          .setTimePartitioning(partitioning)
          .setClustering(clustering)
          .setEncryptionConfiguration(copy.getDestinationEncryptionConfiguration()));
  datasetService.insertAll(destination, allRows, null);
  return new JobStatus().setState("DONE");
}
Example #19
Source File: BigQueryIO.java From beam with Apache License 2.0
/**
 * Like {@link #withTimePartitioning(TimePartitioning)} but using a deferred {@link
 * ValueProvider}.
 */
public Write<T> withTimePartitioning(ValueProvider<TimePartitioning> partitioning) {
  checkArgument(partitioning != null, "partitioning can not be null");
  return withJsonTimePartitioning(
      NestedValueProvider.of(partitioning, new TimePartitioningToJson()));
}
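A brief sketch of the deferred form; `rows` stands for an assumed PCollection<TableRow>, and the table spec and field name are placeholders. StaticValueProvider wraps an eagerly known value; a templated pipeline would more typically obtain the provider from ValueProvider-typed pipeline options.

// Hypothetical wiring of a deferred TimePartitioning into a write transform.
ValueProvider<TimePartitioning> partitioning =
    StaticValueProvider.of(new TimePartitioning().setType("DAY").setField("event_timestamp"));
rows.apply(
    BigQueryIO.writeTableRows()
        .to("my-project:my_dataset.my_table")
        .withTimePartitioning(partitioning)
        .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
        .withWriteDisposition(WriteDisposition.WRITE_APPEND));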
Example #20
Source File: BigQueryHelpers.java From beam with Apache License 2.0
@Override
public String apply(TimePartitioning partitioning) {
  return toJsonString(partitioning);
}
Example #21
Source File: BigQueryIO.java From beam with Apache License 2.0
/**
 * Allows newly created tables to include a {@link TimePartitioning} class. Can only be used
 * when writing to a single table. If {@link #to(SerializableFunction)} or {@link
 * #to(DynamicDestinations)} is used to write dynamic tables, time partitioning can be directly
 * set in the returned {@link TableDestination}.
 */
public Write<T> withTimePartitioning(TimePartitioning partitioning) {
  checkArgument(partitioning != null, "partitioning can not be null");
  return withJsonTimePartitioning(
      StaticValueProvider.of(BigQueryHelpers.toJsonString(partitioning)));
}
Example #22
Source File: BqDdlOperatorFactory.java From digdag with Apache License 2.0
Optional<TimePartitioning> time_partitioning();