org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition Java Examples
The following examples show how to use
org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition.
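Before the project examples, here is a minimal sketch of how CreateDisposition is usually paired with a WriteDisposition on a BigQueryIO sink. The table spec, schema, and field name below are placeholders rather than values taken from any example; with CREATE_IF_NEEDED the connector creates the destination table from the supplied schema when it is missing, whereas CREATE_NEVER would make the write fail instead.

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import java.util.Collections;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.TypeDescriptor;

public class CreateDispositionSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    pipeline
        .apply("CreateWords", Create.of("alpha", "beta"))
        .apply("ToTableRow",
            MapElements.into(TypeDescriptor.of(TableRow.class))
                .via(word -> new TableRow().set("word", word)))
        .apply("WriteToBigQuery",
            BigQueryIO.writeTableRows()
                .to("my-project:my_dataset.my_table") // placeholder table spec
                .withSchema(new TableSchema().setFields(Collections.singletonList(
                    new TableFieldSchema().setName("word").setType("STRING"))))
                // Create the table from the schema if it does not exist; append rows on each run.
                .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(WriteDisposition.WRITE_APPEND));
    pipeline.run();
  }
}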
Example #1
Source File: BigQueryDeadletterSink.java From feast with Apache License 2.0 | 6 votes |
@Override
public PDone expand(PCollection<FailedElement> input) {
  TimePartitioning partition = new TimePartitioning().setType("DAY");
  partition.setField(TIMESTAMP_COLUMN);
  input
      .apply("FailedElementToTableRow", ParDo.of(new FailedElementToTableRowFn()))
      .apply(
          "WriteFailedElementsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getTableSpec())
              .withJsonSchema(getJsonSchema())
              .withTimePartitioning(partition)
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(input.getPipeline());
}
Example #2
Source File: FakeJobService.java From beam with Apache License 2.0 | 6 votes |
private boolean validateDispositions(
    Table table, CreateDisposition createDisposition, WriteDisposition writeDisposition)
    throws InterruptedException, IOException {
  if (table == null) {
    if (createDisposition == CreateDisposition.CREATE_NEVER) {
      return false;
    }
  } else if (writeDisposition == WriteDisposition.WRITE_TRUNCATE) {
    datasetService.deleteTable(table.getTableReference());
  } else if (writeDisposition == WriteDisposition.WRITE_EMPTY) {
    List<TableRow> allRows =
        datasetService.getAllRows(
            table.getTableReference().getProjectId(),
            table.getTableReference().getDatasetId(),
            table.getTableReference().getTableId());
    if (!allRows.isEmpty()) {
      return false;
    }
  }
  return true;
}
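The checks above encode the disposition semantics the fake service enforces: CREATE_NEVER fails when the destination table does not exist, WRITE_TRUNCATE drops an existing table before it is rewritten, and WRITE_EMPTY fails when the existing table already contains rows. The following standalone sketch restates that decision logic; the dispositionsAllowWrite helper and its boolean flags are illustrative stand-ins for the dataset lookups, not part of the Beam API.

import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;

public class DispositionRules {
  // Illustrative only: table state is reduced to two flags instead of dataset lookups.
  static boolean dispositionsAllowWrite(
      boolean tableExists,
      boolean tableHasRows,
      CreateDisposition createDisposition,
      WriteDisposition writeDisposition) {
    if (!tableExists) {
      // Only CREATE_NEVER refuses to proceed when the table is missing.
      return createDisposition != CreateDisposition.CREATE_NEVER;
    }
    if (writeDisposition == WriteDisposition.WRITE_EMPTY) {
      // WRITE_EMPTY only succeeds against a table with no rows.
      return !tableHasRows;
    }
    // WRITE_TRUNCATE and WRITE_APPEND both accept an existing table.
    return true;
  }

  public static void main(String[] args) {
    System.out.println(dispositionsAllowWrite(
        false, false, CreateDisposition.CREATE_NEVER, WriteDisposition.WRITE_APPEND));   // false
    System.out.println(dispositionsAllowWrite(
        true, true, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY));  // false
    System.out.println(dispositionsAllowWrite(
        true, true, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_APPEND)); // true
  }
}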
Example #3
Source File: StreamingInserts.java From beam with Apache License 2.0 | 6 votes |
/** Constructor. */
private StreamingInserts(
    CreateDisposition createDisposition,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    BigQueryServices bigQueryServices,
    InsertRetryPolicy retryPolicy,
    boolean extendedErrorInfo,
    boolean skipInvalidRows,
    boolean ignoreUnknownValues,
    boolean ignoreInsertIds,
    Coder<ElementT> elementCoder,
    SerializableFunction<ElementT, TableRow> toTableRow,
    String kmsKey) {
  this.createDisposition = createDisposition;
  this.dynamicDestinations = dynamicDestinations;
  this.bigQueryServices = bigQueryServices;
  this.retryPolicy = retryPolicy;
  this.extendedErrorInfo = extendedErrorInfo;
  this.skipInvalidRows = skipInvalidRows;
  this.ignoreUnknownValues = ignoreUnknownValues;
  this.ignoreInsertIds = ignoreInsertIds;
  this.elementCoder = elementCoder;
  this.toTableRow = toTableRow;
  this.kmsKey = kmsKey;
}
Example #4
Source File: StreamingInserts.java From beam with Apache License 2.0 | 6 votes |
/** Constructor. */
public StreamingInserts(
    CreateDisposition createDisposition,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    Coder<ElementT> elementCoder,
    SerializableFunction<ElementT, TableRow> toTableRow) {
  this(
      createDisposition,
      dynamicDestinations,
      new BigQueryServicesImpl(),
      InsertRetryPolicy.alwaysRetry(),
      false,
      false,
      false,
      false,
      elementCoder,
      toTableRow,
      null);
}
Example #5
Source File: WriteWindowedToBigQuery.java From beam with Apache License 2.0 | 5 votes |
@Override
public PDone expand(PCollection<T> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
Example #6
Source File: CreateTables.java From beam with Apache License 2.0 | 5 votes |
private CreateTables(
    CreateDisposition createDisposition,
    BigQueryServices bqServices,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    String kmsKey) {
  this.createDisposition = createDisposition;
  this.bqServices = bqServices;
  this.dynamicDestinations = dynamicDestinations;
  this.kmsKey = kmsKey;
}
Example #7
Source File: WriteTables.java From beam with Apache License 2.0 | 5 votes |
public WriteTables(
    boolean tempTable,
    BigQueryServices bqServices,
    PCollectionView<String> loadJobIdPrefixView,
    WriteDisposition writeDisposition,
    CreateDisposition createDisposition,
    List<PCollectionView<?>> sideInputs,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    @Nullable ValueProvider<String> loadJobProjectId,
    int maxRetryJobs,
    boolean ignoreUnknownValues,
    String kmsKey,
    String sourceFormat,
    boolean useAvroLogicalTypes,
    Set<SchemaUpdateOption> schemaUpdateOptions) {
  this.tempTable = tempTable;
  this.bqServices = bqServices;
  this.loadJobIdPrefixView = loadJobIdPrefixView;
  this.firstPaneWriteDisposition = writeDisposition;
  this.firstPaneCreateDisposition = createDisposition;
  this.sideInputs = sideInputs;
  this.dynamicDestinations = dynamicDestinations;
  this.mainOutputTag = new TupleTag<>("WriteTablesMainOutput");
  this.temporaryFilesTag = new TupleTag<>("TemporaryFiles");
  this.loadJobProjectId = loadJobProjectId;
  this.maxRetryJobs = maxRetryJobs;
  this.ignoreUnknownValues = ignoreUnknownValues;
  this.kmsKey = kmsKey;
  this.sourceFormat = sourceFormat;
  this.useAvroLogicalTypes = useAvroLogicalTypes;
  this.schemaUpdateOptions = schemaUpdateOptions;
}
Example #8
Source File: BatchLoads.java From beam with Apache License 2.0 | 5 votes |
BatchLoads(
    WriteDisposition writeDisposition,
    CreateDisposition createDisposition,
    boolean singletonTable,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    Coder<DestinationT> destinationCoder,
    ValueProvider<String> customGcsTempLocation,
    @Nullable ValueProvider<String> loadJobProjectId,
    boolean ignoreUnknownValues,
    Coder<ElementT> elementCoder,
    RowWriterFactory<ElementT, DestinationT> rowWriterFactory,
    @Nullable String kmsKey,
    boolean clusteringEnabled,
    boolean useAvroLogicalTypes) {
  bigQueryServices = new BigQueryServicesImpl();
  this.writeDisposition = writeDisposition;
  this.createDisposition = createDisposition;
  this.singletonTable = singletonTable;
  this.dynamicDestinations = dynamicDestinations;
  this.destinationCoder = destinationCoder;
  this.maxNumWritersPerBundle = DEFAULT_MAX_NUM_WRITERS_PER_BUNDLE;
  this.maxFileSize = DEFAULT_MAX_FILE_SIZE;
  this.numFileShards = DEFAULT_NUM_FILE_SHARDS;
  this.maxFilesPerPartition = DEFAULT_MAX_FILES_PER_PARTITION;
  this.maxBytesPerPartition = DEFAULT_MAX_BYTES_PER_PARTITION;
  this.triggeringFrequency = null;
  this.customGcsTempLocation = customGcsTempLocation;
  this.loadJobProjectId = loadJobProjectId;
  this.ignoreUnknownValues = ignoreUnknownValues;
  this.useAvroLogicalTypes = useAvroLogicalTypes;
  this.elementCoder = elementCoder;
  this.kmsKey = kmsKey;
  this.rowWriterFactory = rowWriterFactory;
  this.clusteringEnabled = clusteringEnabled;
  schemaUpdateOptions = Collections.emptySet();
}
Example #9
Source File: WriteRename.java From beam with Apache License 2.0 | 5 votes |
private PendingJobData startWriteRename(
    TableDestination finalTableDestination, Iterable<String> tempTableNames, ProcessContext c)
    throws Exception {
  WriteDisposition writeDisposition =
      (c.pane().getIndex() == 0) ? firstPaneWriteDisposition : WriteDisposition.WRITE_APPEND;
  CreateDisposition createDisposition =
      (c.pane().getIndex() == 0) ? firstPaneCreateDisposition : CreateDisposition.CREATE_NEVER;
  List<TableReference> tempTables =
      StreamSupport.stream(tempTableNames.spliterator(), false)
          .map(table -> BigQueryHelpers.fromJsonString(table, TableReference.class))
          .collect(Collectors.toList());

  // Make sure each destination table gets a unique job id.
  String jobIdPrefix =
      BigQueryHelpers.createJobId(
          c.sideInput(jobIdToken), finalTableDestination, -1, c.pane().getIndex());

  BigQueryHelpers.PendingJob retryJob =
      startCopy(
          bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
          bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class)),
          jobIdPrefix,
          finalTableDestination.getTableReference(),
          tempTables,
          writeDisposition,
          createDisposition,
          kmsKey);

  return new PendingJobData(retryJob, finalTableDestination, tempTables);
}
Example #10
Source File: WriteRename.java From beam with Apache License 2.0 | 5 votes |
public WriteRename(
    BigQueryServices bqServices,
    PCollectionView<String> jobIdToken,
    WriteDisposition writeDisposition,
    CreateDisposition createDisposition,
    int maxRetryJobs,
    String kmsKey) {
  this.bqServices = bqServices;
  this.jobIdToken = jobIdToken;
  this.firstPaneWriteDisposition = writeDisposition;
  this.firstPaneCreateDisposition = createDisposition;
  this.maxRetryJobs = maxRetryJobs;
  this.kmsKey = kmsKey;
}
Example #11
Source File: WriteFailedElementToBigQuery.java From feast with Apache License 2.0 | 5 votes |
@Override
public WriteResult expand(PCollection<FailedElement> failedElements) {
  return failedElements
      .apply("FailedElementToTableRow", ParDo.of(new FailedElementToTableRowFn()))
      .apply(
          "WriteFailedElementsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getTableSpec())
              .withJsonSchema(getJsonSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #12
Source File: WriteToBigQuery.java From beam with Apache License 2.0 | 5 votes |
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
Example #13
Source File: DatastoreToBigQuery.java From DataflowTemplates with Apache License 2.0 | 5 votes |
/**
 * Runs a pipeline which reads in Entities from Datastore, passes in the JSON encoded Entities
 * to a Javascript UDF that returns JSON that conforms to the BigQuery TableRow spec and writes
 * the TableRows to BigQuery.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToBigQueryOptions options =
      PipelineOptionsFactory.fromArgs(args)
          .withValidation()
          .as(DatastoreToBigQueryOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(
          ReadJsonEntities.newBuilder()
              .setGqlQuery(options.getDatastoreReadGqlQuery())
              .setProjectId(options.getDatastoreReadProjectId())
              .setNamespace(options.getDatastoreReadNamespace())
              .build())
      .apply(
          TransformTextViaJavascript.newBuilder()
              .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setFunctionName(options.getJavascriptTextTransformFunctionName())
              .build())
      .apply(BigQueryConverters.jsonToTableRow())
      .apply(
          "WriteBigQuery",
          BigQueryIO.writeTableRows()
              .withoutValidation()
              // Note: this CREATE_NEVER setting is overridden by the later
              // withCreateDisposition(CREATE_IF_NEEDED) call; the last call in the chain wins.
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_NEVER)
              .to(options.getOutputTableSpec())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE)
              .withCustomGcsTempLocation(options.getBigQueryLoadingTemporaryDirectory()));

  pipeline.run();
}
Example #14
Source File: ErrorConverters.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@Override
public WriteResult expand(PCollection<FailsafeElement<PubsubMessage, String>> failedRecords) {
  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedPubsubMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #15
Source File: WriteToBigQuery.java From deployment-examples with MIT License | 5 votes |
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
Example #16
Source File: WriteWindowedToBigQuery.java From deployment-examples with MIT License | 5 votes |
@Override
public PDone expand(PCollection<T> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
Example #17
Source File: ErrorConverters.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@Override
public WriteResult expand(PCollection<FailsafeElement<String, String>> failedRecords) {
  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedStringToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #18
Source File: ErrorConverters.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@Override
public WriteResult expand(
    PCollection<FailsafeElement<KV<String, String>, String>> failedRecords) {
  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #19
Source File: ErrorConverters.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@Override
public WriteResult expand(
    PCollection<FailsafeElement<PubsubMessage, String>> failedRecords) {
  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedPubsubMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #20
Source File: KafkaToBigQuery.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@Override
public WriteResult expand(
    PCollection<FailsafeElement<KV<String, String>, String>> failedRecords) {
  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #21
Source File: ErrorConverters.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@Override
public WriteResult expand(PCollection<FailsafeElement<String, String>> failedRecords) {
  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedStringToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #22
Source File: CreateTables.java From beam with Apache License 2.0 | 4 votes |
private TableDestination getTableDestination(ProcessContext context, DestinationT destination) {
  TableDestination tableDestination = dynamicDestinations.getTable(destination);
  checkArgument(
      tableDestination != null,
      "DynamicDestinations.getTable() may not return null, "
          + "but %s returned null for destination %s",
      dynamicDestinations,
      destination);
  checkArgument(
      tableDestination.getTableSpec() != null,
      "DynamicDestinations.getTable() must return a TableDestination "
          + "with a non-null table spec, but %s returned %s for destination %s,"
          + "which has a null table spec",
      dynamicDestinations,
      tableDestination,
      destination);
  boolean destinationCoderSupportsClustering =
      !(dynamicDestinations.getDestinationCoder() instanceof TableDestinationCoderV2);
  checkArgument(
      tableDestination.getClustering() == null || destinationCoderSupportsClustering,
      "DynamicDestinations.getTable() may only return destinations with clustering configured"
          + " if a destination coder is supplied that supports clustering, but %s is configured"
          + " to use TableDestinationCoderV2. Set withClustering() on BigQueryIO.write() and, "
          + " if you provided a custom DynamicDestinations instance, override"
          + " getDestinationCoder() to return TableDestinationCoderV3.",
      dynamicDestinations);
  TableReference tableReference = tableDestination.getTableReference().clone();
  if (Strings.isNullOrEmpty(tableReference.getProjectId())) {
    tableReference.setProjectId(
        context.getPipelineOptions().as(BigQueryOptions.class).getProject());
    tableDestination = tableDestination.withTableReference(tableReference);
  }
  if (createDisposition == CreateDisposition.CREATE_NEVER) {
    return tableDestination;
  }

  String tableSpec = BigQueryHelpers.stripPartitionDecorator(tableDestination.getTableSpec());
  if (!createdTables.contains(tableSpec)) {
    // Another thread may have succeeded in creating the table in the meanwhile, so
    // check again. This check isn't needed for correctness, but we add it to prevent
    // every thread from attempting a create and overwhelming our BigQuery quota.
    synchronized (createdTables) {
      if (!createdTables.contains(tableSpec)) {
        tryCreateTable(context, destination, tableDestination, tableSpec, kmsKey);
      }
    }
  }
  return tableDestination;
}
Example #23
Source File: KafkaToBigQuery.java From java-docs-samples with Apache License 2.0 | 4 votes |
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var pipeline = Pipeline.create(options);
  pipeline
      .apply("Read messages from Kafka",
          KafkaIO.<String, String>read()
              .withBootstrapServers(options.getBootstrapServer())
              .withTopic(options.getInputTopic())
              .withKeyDeserializer(StringDeserializer.class)
              .withValueDeserializer(StringDeserializer.class)
              .withoutMetadata())
      .apply("Get message contents", Values.<String>create())
      .apply("Log messages", MapElements.into(TypeDescriptor.of(String.class))
          .via(message -> {
            LOG.info("Received: {}", message);
            return message;
          }))
      .apply("Parse JSON", MapElements.into(TypeDescriptor.of(PageRating.class))
          .via(message -> GSON.fromJson(message, PageRating.class)))
      .apply("Add processing time",
          WithTimestamps.of((pageRating) -> new Instant(pageRating.processingTime)))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))
      .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class))
          .via(pageRating -> new TableRow()
              .set("processing_time", pageRating.processingTime.toString())
              .set("url", pageRating.url)
              .set("rating", pageRating.rating)))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"),
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("rating").setType("STRING"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}
Example #24
Source File: StreamingBeamSQL.java From java-docs-samples with Apache License 2.0 | 4 votes |
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var project = options.as(GcpOptions.class).getProject();
  var subscription =
      ProjectSubscriptionName.of(project, options.getInputSubscription()).toString();

  var schema = Schema.builder()
      .addStringField("url")
      .addDoubleField("page_score")
      .addDateTimeField("processing_time")
      .build();

  var pipeline = Pipeline.create(options);
  pipeline
      // Read, parse, and validate messages from Pub/Sub.
      .apply("Read messages from Pub/Sub",
          PubsubIO.readStrings().fromSubscription(subscription))
      .apply("Parse JSON into SQL rows",
          MapElements.into(TypeDescriptor.of(Row.class)).via(message -> {
            // This is a good place to add error handling.
            // The first transform should act as a validation layer to make sure
            // that any data coming to the processing pipeline must be valid.
            // See `MapElements.MapWithFailures` for more details.
            LOG.info("message: {}", message);
            var msg = GSON.fromJson(message, PageReviewMessage.class);
            return Row.withSchema(schema).addValues(
                msg.url,                                   // row url
                msg.review.equals("positive") ? 1.0 : 0.0, // row page_score
                new Instant()                              // row processing_time
            ).build();
          })).setRowSchema(schema) // make sure to set the row schema for the PCollection

      // Add timestamps and bundle elements into windows.
      .apply("Add processing time",
          WithTimestamps.of((row) -> row.getDateTime("processing_time").toInstant()))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))

      // Apply a SQL query for every window of elements.
      .apply("Run Beam SQL query", SqlTransform.query(
          "SELECT "
              + " url, "
              + " COUNT(page_score) AS num_reviews, "
              + " AVG(page_score) AS score, "
              + " MIN(processing_time) AS first_date, "
              + " MAX(processing_time) AS last_date "
              + "FROM PCOLLECTION "
              + "GROUP BY url"))

      // Convert the SQL Rows into BigQuery TableRows and write them to BigQuery.
      .apply("Convert to BigQuery TableRow",
          MapElements.into(TypeDescriptor.of(TableRow.class)).via(row -> {
            LOG.info("rating summary: {} {} ({} reviews)", row.getDouble("score"),
                row.getString("url"), row.getInt64("num_reviews"));
            return new TableRow()
                .set("url", row.getString("url"))
                .set("num_reviews", row.getInt64("num_reviews"))
                .set("score", row.getDouble("score"))
                .set("first_date", row.getDateTime("first_date").toInstant().toString())
                .set("last_date", row.getDateTime("last_date").toInstant().toString());
          }))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              // To learn more about the valid BigQuery types:
              // https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("num_reviews").setType("INTEGER"),
              new TableFieldSchema().setName("score").setType("FLOAT64"),
              new TableFieldSchema().setName("first_date").setType("TIMESTAMP"),
              new TableFieldSchema().setName("last_date").setType("TIMESTAMP"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}
Example #25
Source File: FakeJobService.java From beam with Apache License 2.0 | 4 votes |
private JobStatus runCopyJob(JobConfigurationTableCopy copy)
    throws InterruptedException, IOException {
  List<TableReference> sources = copy.getSourceTables();
  TableReference destination = copy.getDestinationTable();
  WriteDisposition writeDisposition = WriteDisposition.valueOf(copy.getWriteDisposition());
  CreateDisposition createDisposition = CreateDisposition.valueOf(copy.getCreateDisposition());
  Table existingTable = datasetService.getTable(destination);
  if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
    return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
  }
  TimePartitioning partitioning = null;
  Clustering clustering = null;
  TableSchema schema = null;
  boolean first = true;
  List<TableRow> allRows = Lists.newArrayList();
  for (TableReference source : sources) {
    Table table = checkNotNull(datasetService.getTable(source));
    if (!first) {
      if (!Objects.equals(partitioning, table.getTimePartitioning())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
      if (!Objects.equals(clustering, table.getClustering())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
      if (!Objects.equals(schema, table.getSchema())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
    }
    partitioning = table.getTimePartitioning();
    clustering = table.getClustering();
    schema = table.getSchema();
    first = false;
    allRows.addAll(
        datasetService.getAllRows(
            source.getProjectId(), source.getDatasetId(), source.getTableId()));
  }
  datasetService.createTable(
      new Table()
          .setTableReference(destination)
          .setSchema(schema)
          .setTimePartitioning(partitioning)
          .setClustering(clustering)
          .setEncryptionConfiguration(copy.getDestinationEncryptionConfiguration()));
  datasetService.insertAll(destination, allRows, null);
  return new JobStatus().setState("DONE");
}
Example #26
Source File: FakeJobService.java From beam with Apache License 2.0 | 4 votes |
private JobStatus runLoadJob(JobReference jobRef, JobConfigurationLoad load)
    throws InterruptedException, IOException {
  TableReference destination = load.getDestinationTable();
  TableSchema schema = load.getSchema();
  checkArgument(schema != null, "No schema specified");
  List<ResourceId> sourceFiles = filesForLoadJobs.get(jobRef.getProjectId(), jobRef.getJobId());
  WriteDisposition writeDisposition = WriteDisposition.valueOf(load.getWriteDisposition());
  CreateDisposition createDisposition = CreateDisposition.valueOf(load.getCreateDisposition());

  Table existingTable = datasetService.getTable(destination);
  if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
    return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
  }
  if (existingTable == null) {
    TableReference strippedDestination =
        destination
            .clone()
            .setTableId(BigQueryHelpers.stripPartitionDecorator(destination.getTableId()));
    existingTable = new Table().setTableReference(strippedDestination).setSchema(schema);
    if (load.getTimePartitioning() != null) {
      existingTable = existingTable.setTimePartitioning(load.getTimePartitioning());
    }
    if (load.getClustering() != null) {
      existingTable = existingTable.setClustering(load.getClustering());
    }
    datasetService.createTable(existingTable);
  }

  List<TableRow> rows = Lists.newArrayList();
  for (ResourceId filename : sourceFiles) {
    if (load.getSourceFormat().equals("NEWLINE_DELIMITED_JSON")) {
      rows.addAll(readJsonTableRows(filename.toString()));
    } else if (load.getSourceFormat().equals("AVRO")) {
      rows.addAll(readAvroTableRows(filename.toString(), schema));
    }
  }

  datasetService.insertAll(destination, rows, null);
  FileSystems.delete(sourceFiles);
  return new JobStatus().setState("DONE");
}
Example #27
Source File: OpinionAnalysisPipeline.java From dataflow-opinion-analysis with Apache License 2.0 | 4 votes |
/**
 * This function creates the DAG graph of transforms. It can be called from main()
 * as well as from the ControlPipeline.
 * @param options
 * @return
 * @throws Exception
 */
public static Pipeline createNLPPipeline(IndexerPipelineOptions options) throws Exception {

  IndexerPipelineUtils.validateIndexerPipelineOptions(options);
  Pipeline pipeline = Pipeline.create(options);

  PCollection<InputContent> readContent;
  PCollection<String> rawInput;

  if (options.isStreaming()) {
    // Continuously read from a Pub/Sub topic
    rawInput = pipeline.apply("Read from PubSub",
        PubsubIO.readStrings().fromTopic(options.getPubsubTopic()));
  } else {
    // Read from GCS files
    rawInput = pipeline.apply("Read from GCS files",
        Read.from(new RecordFileSource<String>(
            ValueProvider.StaticValueProvider.of(options.getInputFile()),
            StringUtf8Coder.of(),
            RecordFileSource.DEFAULT_RECORD_SEPARATOR)));
  }
  readContent = rawInput.apply(ParDo.of(new ParseRawInput()));

  // Extract opinions from online opinions
  PCollection<ContentIndexSummary> indexes = readContent
      .apply(ParDo.of(new IndexDocument()))
      .setCoder(AvroCoder.of(ContentIndexSummary.class));

  // Write into BigQuery
  PCollectionTuple bqrows = indexes
      .apply(ParDo.of(new CreateTableRowsFromIndexSummaryFn())
          .withOutputTags(webresourceTag, // main output collection
              TupleTagList.of(documentTag).and(sentimentTag))); // 2 side output collections

  PCollection<TableRow> webresourceRows = bqrows.get(webresourceTag);
  PCollection<TableRow> documentRows = bqrows.get(documentTag);
  PCollection<TableRow> sentimentRows = bqrows.get(sentimentTag);

  // Append or Overwrite
  WriteDisposition dispo = options.getWriteTruncate()
      ? WriteDisposition.WRITE_TRUNCATE
      : WriteDisposition.WRITE_APPEND;

  webresourceRows
      .apply("Write to webresource",
          BigQueryIO.writeTableRows()
              .to(getWebResourceTableReference(options))
              .withSchema(getWebResourceSchema())
              .withCreateDisposition(CreateDisposition.CREATE_NEVER)
              .withWriteDisposition(dispo));

  documentRows
      .apply("Write to document",
          BigQueryIO.writeTableRows()
              .to(getDocumentTableReference(options))
              .withSchema(getDocumentTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_NEVER)
              .withWriteDisposition(dispo));

  sentimentRows
      .apply("Write to sentiment",
          BigQueryIO.writeTableRows()
              .to(getSentimentTableReference(options))
              .withSchema(getSentimentSchema())
              .withCreateDisposition(CreateDisposition.CREATE_NEVER)
              .withWriteDisposition(dispo));

  return pipeline;
}
Example #28
Source File: PubsubAvroToBigQuery.java From DataflowTemplates with Apache License 2.0 | 4 votes |
/**
 * Runs the pipeline with the supplied options.
 *
 * @param options execution parameters to the pipeline
 * @return result of the pipeline execution as a {@link PipelineResult}
 */
private static PipelineResult run(PubsubAvroToBigQueryOptions options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  Schema schema = SchemaUtils.getAvroSchema(options.getSchemaPath());

  WriteResult writeResults =
      pipeline
          .apply(
              "Read Avro records",
              PubsubIO
                  .readAvroGenericRecords(schema)
                  .fromSubscription(options.getInputSubscription()))
          .apply(
              "Write to BigQuery",
              BigQueryIO.<GenericRecord>write()
                  .to(options.getOutputTableSpec())
                  .useBeamSchema()
                  .withMethod(Method.STREAMING_INSERTS)
                  .withWriteDisposition(WriteDisposition.valueOf(options.getWriteDisposition()))
                  .withCreateDisposition(
                      CreateDisposition.valueOf(options.getCreateDisposition()))
                  .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
                  .withExtendedErrorInfo());

  writeResults
      .getFailedInsertsWithErr()
      .apply(
          "Create error payload",
          ErrorConverters.BigQueryInsertErrorToPubsubMessage.<GenericRecord>newBuilder()
              .setPayloadCoder(AvroCoder.of(schema))
              .setTranslateFunction(
                  BigQueryConverters.TableRowToGenericRecordFn.of(schema))
              .build())
      .apply(
          "Write failed records",
          PubsubIO.writeMessages().to(options.getOutputTopic()));

  // Execute the pipeline and return the result.
  return pipeline.run();
}
Example #29
Source File: CreateTables.java From beam with Apache License 2.0 | 4 votes |
private void tryCreateTable(
    ProcessContext context,
    DestinationT destination,
    TableDestination tableDestination,
    String tableSpec,
    String kmsKey) {
  DatasetService datasetService =
      bqServices.getDatasetService(context.getPipelineOptions().as(BigQueryOptions.class));
  TableReference tableReference = tableDestination.getTableReference().clone();
  tableReference.setTableId(
      BigQueryHelpers.stripPartitionDecorator(tableReference.getTableId()));
  try {
    if (datasetService.getTable(tableReference) == null) {
      TableSchema tableSchema = dynamicDestinations.getSchema(destination);
      checkArgument(
          tableSchema != null,
          "Unless create disposition is %s, a schema must be specified, i.e. "
              + "DynamicDestinations.getSchema() may not return null. "
              + "However, create disposition is %s, and "
              + " %s returned null for destination %s",
          CreateDisposition.CREATE_NEVER,
          createDisposition,
          dynamicDestinations,
          destination);
      Table table =
          new Table()
              .setTableReference(tableReference)
              .setSchema(tableSchema)
              .setDescription(tableDestination.getTableDescription());
      if (tableDestination.getTimePartitioning() != null) {
        table.setTimePartitioning(tableDestination.getTimePartitioning());
        if (tableDestination.getClustering() != null) {
          table.setClustering(tableDestination.getClustering());
        }
      }
      if (kmsKey != null) {
        table.setEncryptionConfiguration(new EncryptionConfiguration().setKmsKeyName(kmsKey));
      }
      datasetService.createTable(table);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
  createdTables.add(tableSpec);
}
Example #30
Source File: Snippets.java From beam with Apache License 2.0 | 4 votes |
public static void main(String[] args) {
  // [START BigQueryIODeadLetter]

  PipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigQueryOptions.class);

  Pipeline p = Pipeline.create(options);

  // Create a bug by writing the 2nd value as null. The API will correctly
  // throw an error when trying to insert a null value into a REQUIRED field.
  WriteResult result =
      p.apply(Create.of(1, 2))
          .apply(
              BigQueryIO.<Integer>write()
                  .withSchema(
                      new TableSchema()
                          .setFields(
                              ImmutableList.of(
                                  new TableFieldSchema()
                                      .setName("num")
                                      .setType("INTEGER")
                                      .setMode("REQUIRED"))))
                  .to("Test.dummyTable")
                  .withFormatFunction(x -> new TableRow().set("num", (x == 2) ? null : x))
                  .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
                  // Forcing the bounded pipeline to use streaming inserts
                  .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
                  // set the withExtendedErrorInfo property.
                  .withExtendedErrorInfo()
                  .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                  .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));

  result
      .getFailedInsertsWithErr()
      .apply(
          MapElements.into(TypeDescriptors.strings())
              .via(
                  x -> {
                    System.out.println(" The table was " + x.getTable());
                    System.out.println(" The row was " + x.getRow());
                    System.out.println(" The error was " + x.getError());
                    return "";
                  }));
  p.run();

  /* Sample Output From the pipeline:
   <p>The table was GenericData{classInfo=[datasetId, projectId, tableId], {datasetId=Test,projectId=<>, tableId=dummyTable}}
   <p>The row was GenericData{classInfo=[f], {num=null}}
   <p>The error was GenericData{classInfo=[errors, index],{errors=[GenericData{classInfo=[debugInfo, location, message, reason], {debugInfo=,location=, message=Missing required field: Msg_0_CLOUD_QUERY_TABLE.num., reason=invalid}}],index=0}}
  */
}