org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition Java Examples
The following examples show how to use
org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition.
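Before the project examples, here is a minimal sketch of how CreateDisposition is usually paired with a WriteDisposition on a BigQueryIO sink. The table spec, schema, and field name below are placeholders rather than values taken from any example; with CREATE_IF_NEEDED the connector creates the destination table from the supplied schema when it is missing, whereas CREATE_NEVER would make the write fail instead.

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import java.util.Collections;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.TypeDescriptor;

public class CreateDispositionSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    pipeline
        .apply("CreateWords", Create.of("alpha", "beta"))
        .apply("ToTableRow",
            MapElements.into(TypeDescriptor.of(TableRow.class))
                .via(word -> new TableRow().set("word", word)))
        .apply("WriteToBigQuery",
            BigQueryIO.writeTableRows()
                .to("my-project:my_dataset.my_table") // placeholder table spec
                .withSchema(new TableSchema().setFields(Collections.singletonList(
                    new TableFieldSchema().setName("word").setType("STRING"))))
                // Create the table from the schema if it does not exist; append rows on each run.
                .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(WriteDisposition.WRITE_APPEND));
    pipeline.run();
  }
}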
Example #1
Source File: BigQueryDeadletterSink.java From feast with Apache License 2.0 | 6 votes |
@Override
public PDone expand(PCollection<FailedElement> input) {
  TimePartitioning partition = new TimePartitioning().setType("DAY");
  partition.setField(TIMESTAMP_COLUMN);
  input
      .apply("FailedElementToTableRow", ParDo.of(new FailedElementToTableRowFn()))
      .apply(
          "WriteFailedElementsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getTableSpec())
              .withJsonSchema(getJsonSchema())
              .withTimePartitioning(partition)
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(input.getPipeline());
}
Example #2
Source File: FakeJobService.java From beam with Apache License 2.0 | 6 votes |
private boolean validateDispositions(
    Table table, CreateDisposition createDisposition, WriteDisposition writeDisposition)
    throws InterruptedException, IOException {
  if (table == null) {
    if (createDisposition == CreateDisposition.CREATE_NEVER) {
      return false;
    }
  } else if (writeDisposition == WriteDisposition.WRITE_TRUNCATE) {
    datasetService.deleteTable(table.getTableReference());
  } else if (writeDisposition == WriteDisposition.WRITE_EMPTY) {
    List<TableRow> allRows =
        datasetService.getAllRows(
            table.getTableReference().getProjectId(),
            table.getTableReference().getDatasetId(),
            table.getTableReference().getTableId());
    if (!allRows.isEmpty()) {
      return false;
    }
  }
  return true;
}
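The checks above encode the disposition semantics the fake service enforces: CREATE_NEVER fails when the destination table does not exist, WRITE_TRUNCATE drops an existing table before it is rewritten, and WRITE_EMPTY fails when the existing table already contains rows. The following standalone sketch restates that decision logic; the dispositionsAllowWrite helper and its boolean flags are illustrative stand-ins for the dataset lookups, not part of the Beam API.

import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;

public class DispositionRules {
  // Illustrative only: table state is reduced to two flags instead of dataset lookups.
  static boolean dispositionsAllowWrite(
      boolean tableExists,
      boolean tableHasRows,
      CreateDisposition createDisposition,
      WriteDisposition writeDisposition) {
    if (!tableExists) {
      // Only CREATE_NEVER refuses to proceed when the table is missing.
      return createDisposition != CreateDisposition.CREATE_NEVER;
    }
    if (writeDisposition == WriteDisposition.WRITE_EMPTY) {
      // WRITE_EMPTY only succeeds against a table with no rows.
      return !tableHasRows;
    }
    // WRITE_TRUNCATE and WRITE_APPEND both accept an existing table.
    return true;
  }

  public static void main(String[] args) {
    System.out.println(dispositionsAllowWrite(
        false, false, CreateDisposition.CREATE_NEVER, WriteDisposition.WRITE_APPEND));   // false
    System.out.println(dispositionsAllowWrite(
        true, true, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_EMPTY));  // false
    System.out.println(dispositionsAllowWrite(
        true, true, CreateDisposition.CREATE_IF_NEEDED, WriteDisposition.WRITE_APPEND)); // true
  }
}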
Example #3
Source File: StreamingInserts.java From beam with Apache License 2.0 | 6 votes |
/** Constructor. */
private StreamingInserts(
    CreateDisposition createDisposition,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    BigQueryServices bigQueryServices,
    InsertRetryPolicy retryPolicy,
    boolean extendedErrorInfo,
    boolean skipInvalidRows,
    boolean ignoreUnknownValues,
    boolean ignoreInsertIds,
    Coder<ElementT> elementCoder,
    SerializableFunction<ElementT, TableRow> toTableRow,
    String kmsKey) {
  this.createDisposition = createDisposition;
  this.dynamicDestinations = dynamicDestinations;
  this.bigQueryServices = bigQueryServices;
  this.retryPolicy = retryPolicy;
  this.extendedErrorInfo = extendedErrorInfo;
  this.skipInvalidRows = skipInvalidRows;
  this.ignoreUnknownValues = ignoreUnknownValues;
  this.ignoreInsertIds = ignoreInsertIds;
  this.elementCoder = elementCoder;
  this.toTableRow = toTableRow;
  this.kmsKey = kmsKey;
}
Example #4
Source File: StreamingInserts.java From beam with Apache License 2.0 | 6 votes |
/** Constructor. */
public StreamingInserts(
    CreateDisposition createDisposition,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    Coder<ElementT> elementCoder,
    SerializableFunction<ElementT, TableRow> toTableRow) {
  this(
      createDisposition,
      dynamicDestinations,
      new BigQueryServicesImpl(),
      InsertRetryPolicy.alwaysRetry(),
      false,
      false,
      false,
      false,
      elementCoder,
      toTableRow,
      null);
}
Example #5
Source File: WriteWindowedToBigQuery.java From beam with Apache License 2.0 | 5 votes |
@Override
public PDone expand(PCollection<T> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
Example #6
Source File: CreateTables.java From beam with Apache License 2.0 | 5 votes |
private CreateTables(
    CreateDisposition createDisposition,
    BigQueryServices bqServices,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    String kmsKey) {
  this.createDisposition = createDisposition;
  this.bqServices = bqServices;
  this.dynamicDestinations = dynamicDestinations;
  this.kmsKey = kmsKey;
}
Example #7
Source File: WriteTables.java From beam with Apache License 2.0 | 5 votes |
public WriteTables(
    boolean tempTable,
    BigQueryServices bqServices,
    PCollectionView<String> loadJobIdPrefixView,
    WriteDisposition writeDisposition,
    CreateDisposition createDisposition,
    List<PCollectionView<?>> sideInputs,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    @Nullable ValueProvider<String> loadJobProjectId,
    int maxRetryJobs,
    boolean ignoreUnknownValues,
    String kmsKey,
    String sourceFormat,
    boolean useAvroLogicalTypes,
    Set<SchemaUpdateOption> schemaUpdateOptions) {
  this.tempTable = tempTable;
  this.bqServices = bqServices;
  this.loadJobIdPrefixView = loadJobIdPrefixView;
  this.firstPaneWriteDisposition = writeDisposition;
  this.firstPaneCreateDisposition = createDisposition;
  this.sideInputs = sideInputs;
  this.dynamicDestinations = dynamicDestinations;
  this.mainOutputTag = new TupleTag<>("WriteTablesMainOutput");
  this.temporaryFilesTag = new TupleTag<>("TemporaryFiles");
  this.loadJobProjectId = loadJobProjectId;
  this.maxRetryJobs = maxRetryJobs;
  this.ignoreUnknownValues = ignoreUnknownValues;
  this.kmsKey = kmsKey;
  this.sourceFormat = sourceFormat;
  this.useAvroLogicalTypes = useAvroLogicalTypes;
  this.schemaUpdateOptions = schemaUpdateOptions;
}
Example #8
Source File: BatchLoads.java From beam with Apache License 2.0 | 5 votes |
BatchLoads(
    WriteDisposition writeDisposition,
    CreateDisposition createDisposition,
    boolean singletonTable,
    DynamicDestinations<?, DestinationT> dynamicDestinations,
    Coder<DestinationT> destinationCoder,
    ValueProvider<String> customGcsTempLocation,
    @Nullable ValueProvider<String> loadJobProjectId,
    boolean ignoreUnknownValues,
    Coder<ElementT> elementCoder,
    RowWriterFactory<ElementT, DestinationT> rowWriterFactory,
    @Nullable String kmsKey,
    boolean clusteringEnabled,
    boolean useAvroLogicalTypes) {
  bigQueryServices = new BigQueryServicesImpl();
  this.writeDisposition = writeDisposition;
  this.createDisposition = createDisposition;
  this.singletonTable = singletonTable;
  this.dynamicDestinations = dynamicDestinations;
  this.destinationCoder = destinationCoder;
  this.maxNumWritersPerBundle = DEFAULT_MAX_NUM_WRITERS_PER_BUNDLE;
  this.maxFileSize = DEFAULT_MAX_FILE_SIZE;
  this.numFileShards = DEFAULT_NUM_FILE_SHARDS;
  this.maxFilesPerPartition = DEFAULT_MAX_FILES_PER_PARTITION;
  this.maxBytesPerPartition = DEFAULT_MAX_BYTES_PER_PARTITION;
  this.triggeringFrequency = null;
  this.customGcsTempLocation = customGcsTempLocation;
  this.loadJobProjectId = loadJobProjectId;
  this.ignoreUnknownValues = ignoreUnknownValues;
  this.useAvroLogicalTypes = useAvroLogicalTypes;
  this.elementCoder = elementCoder;
  this.kmsKey = kmsKey;
  this.rowWriterFactory = rowWriterFactory;
  this.clusteringEnabled = clusteringEnabled;
  schemaUpdateOptions = Collections.emptySet();
}
Example #9
Source File: WriteRename.java From beam with Apache License 2.0 | 5 votes |
private PendingJobData startWriteRename(
    TableDestination finalTableDestination, Iterable<String> tempTableNames, ProcessContext c)
    throws Exception {
  WriteDisposition writeDisposition =
      (c.pane().getIndex() == 0) ? firstPaneWriteDisposition : WriteDisposition.WRITE_APPEND;
  CreateDisposition createDisposition =
      (c.pane().getIndex() == 0) ? firstPaneCreateDisposition : CreateDisposition.CREATE_NEVER;
  List<TableReference> tempTables =
      StreamSupport.stream(tempTableNames.spliterator(), false)
          .map(table -> BigQueryHelpers.fromJsonString(table, TableReference.class))
          .collect(Collectors.toList());

  // Make sure each destination table gets a unique job id.
  String jobIdPrefix =
      BigQueryHelpers.createJobId(
          c.sideInput(jobIdToken), finalTableDestination, -1, c.pane().getIndex());

  BigQueryHelpers.PendingJob retryJob =
      startCopy(
          bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
          bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class)),
          jobIdPrefix,
          finalTableDestination.getTableReference(),
          tempTables,
          writeDisposition,
          createDisposition,
          kmsKey);

  return new PendingJobData(retryJob, finalTableDestination, tempTables);
}
Example #10
Source File: WriteRename.java From beam with Apache License 2.0 | 5 votes |
public WriteRename(
    BigQueryServices bqServices,
    PCollectionView<String> jobIdToken,
    WriteDisposition writeDisposition,
    CreateDisposition createDisposition,
    int maxRetryJobs,
    String kmsKey) {
  this.bqServices = bqServices;
  this.jobIdToken = jobIdToken;
  this.firstPaneWriteDisposition = writeDisposition;
  this.firstPaneCreateDisposition = createDisposition;
  this.maxRetryJobs = maxRetryJobs;
  this.kmsKey = kmsKey;
}
Example #11
Source File: WriteFailedElementToBigQuery.java From feast with Apache License 2.0 | 5 votes |
@Override
public WriteResult expand(PCollection<FailedElement> failedElements) {
  return failedElements
      .apply("FailedElementToTableRow", ParDo.of(new FailedElementToTableRowFn()))
      .apply(
          "WriteFailedElementsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getTableSpec())
              .withJsonSchema(getJsonSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #12
Source File: WriteToBigQuery.java From beam with Apache License 2.0 | 5 votes |
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
Example #13
Source File: DatastoreToBigQuery.java From DataflowTemplates with Apache License 2.0 | 5 votes |
/**
 * Runs a pipeline which reads in Entities from Datastore, passes in the JSON encoded Entities
 * to a Javascript UDF that returns JSON that conforms to the BigQuery TableRow spec and writes
 * the TableRows to BigQuery.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToBigQueryOptions options =
      PipelineOptionsFactory.fromArgs(args)
          .withValidation()
          .as(DatastoreToBigQueryOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply(
          ReadJsonEntities.newBuilder()
              .setGqlQuery(options.getDatastoreReadGqlQuery())
              .setProjectId(options.getDatastoreReadProjectId())
              .setNamespace(options.getDatastoreReadNamespace())
              .build())
      .apply(
          TransformTextViaJavascript.newBuilder()
              .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setFunctionName(options.getJavascriptTextTransformFunctionName())
              .build())
      .apply(BigQueryConverters.jsonToTableRow())
      .apply(
          "WriteBigQuery",
          BigQueryIO.writeTableRows()
              .withoutValidation()
              // Note: this CREATE_NEVER setting is overridden by the later
              // withCreateDisposition(CREATE_IF_NEEDED) call; the last call in the chain wins.
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_NEVER)
              .to(options.getOutputTableSpec())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_TRUNCATE)
              .withCustomGcsTempLocation(options.getBigQueryLoadingTemporaryDirectory()));

  pipeline.run();
}
Example #14
Source File: ErrorConverters.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@Override
public WriteResult expand(PCollection<FailsafeElement<PubsubMessage, String>> failedRecords) {
  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedPubsubMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #15
Source File: WriteToBigQuery.java From deployment-examples with MIT License | 5 votes |
@Override
public PDone expand(PCollection<InputT> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
Example #16
Source File: WriteWindowedToBigQuery.java From deployment-examples with MIT License | 5 votes |
@Override
public PDone expand(PCollection<T> teamAndScore) {
  teamAndScore
      .apply("ConvertToRow", ParDo.of(new BuildRowFn()))
      .apply(
          BigQueryIO.writeTableRows()
              .to(getTable(projectId, datasetId, tableName))
              .withSchema(getSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(teamAndScore.getPipeline());
}
Example #17
Source File: ErrorConverters.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@Override
public WriteResult expand(PCollection<FailsafeElement<String, String>> failedRecords) {
  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedStringToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #18
Source File: ErrorConverters.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@Override
public WriteResult expand(
    PCollection<FailsafeElement<KV<String, String>, String>> failedRecords) {
  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #19
Source File: ErrorConverters.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@Override
public WriteResult expand(
    PCollection<FailsafeElement<PubsubMessage, String>> failedRecords) {
  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedPubsubMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #20
Source File: KafkaToBigQuery.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@Override
public WriteResult expand(
    PCollection<FailsafeElement<KV<String, String>, String>> failedRecords) {
  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedMessageToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #21
Source File: ErrorConverters.java From DataflowTemplates with Apache License 2.0 | 5 votes |
@Override
public WriteResult expand(PCollection<FailsafeElement<String, String>> failedRecords) {
  return failedRecords
      .apply("FailedRecordToTableRow", ParDo.of(new FailedStringToTableRowFn()))
      .apply(
          "WriteFailedRecordsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getErrorRecordsTable())
              .withJsonSchema(getErrorRecordsTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
}
Example #22
Source File: CreateTables.java From beam with Apache License 2.0 | 4 votes |
private TableDestination getTableDestination(ProcessContext context, DestinationT destination) {
  TableDestination tableDestination = dynamicDestinations.getTable(destination);
  checkArgument(
      tableDestination != null,
      "DynamicDestinations.getTable() may not return null, "
          + "but %s returned null for destination %s",
      dynamicDestinations,
      destination);
  checkArgument(
      tableDestination.getTableSpec() != null,
      "DynamicDestinations.getTable() must return a TableDestination "
          + "with a non-null table spec, but %s returned %s for destination %s,"
          + "which has a null table spec",
      dynamicDestinations,
      tableDestination,
      destination);
  boolean destinationCoderSupportsClustering =
      !(dynamicDestinations.getDestinationCoder() instanceof TableDestinationCoderV2);
  checkArgument(
      tableDestination.getClustering() == null || destinationCoderSupportsClustering,
      "DynamicDestinations.getTable() may only return destinations with clustering configured"
          + " if a destination coder is supplied that supports clustering, but %s is configured"
          + " to use TableDestinationCoderV2. Set withClustering() on BigQueryIO.write() and, "
          + " if you provided a custom DynamicDestinations instance, override"
          + " getDestinationCoder() to return TableDestinationCoderV3.",
      dynamicDestinations);
  TableReference tableReference = tableDestination.getTableReference().clone();
  if (Strings.isNullOrEmpty(tableReference.getProjectId())) {
    tableReference.setProjectId(
        context.getPipelineOptions().as(BigQueryOptions.class).getProject());
    tableDestination = tableDestination.withTableReference(tableReference);
  }
  if (createDisposition == CreateDisposition.CREATE_NEVER) {
    return tableDestination;
  }

  String tableSpec = BigQueryHelpers.stripPartitionDecorator(tableDestination.getTableSpec());
  if (!createdTables.contains(tableSpec)) {
    // Another thread may have succeeded in creating the table in the meanwhile, so
    // check again. This check isn't needed for correctness, but we add it to prevent
    // every thread from attempting a create and overwhelming our BigQuery quota.
    synchronized (createdTables) {
      if (!createdTables.contains(tableSpec)) {
        tryCreateTable(context, destination, tableDestination, tableSpec, kmsKey);
      }
    }
  }
  return tableDestination;
}
Example #23
Source File: KafkaToBigQuery.java From java-docs-samples with Apache License 2.0 | 4 votes |
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var pipeline = Pipeline.create(options);
  pipeline
      .apply("Read messages from Kafka",
          KafkaIO.<String, String>read()
              .withBootstrapServers(options.getBootstrapServer())
              .withTopic(options.getInputTopic())
              .withKeyDeserializer(StringDeserializer.class)
              .withValueDeserializer(StringDeserializer.class)
              .withoutMetadata())
      .apply("Get message contents", Values.<String>create())
      .apply("Log messages", MapElements.into(TypeDescriptor.of(String.class))
          .via(message -> {
            LOG.info("Received: {}", message);
            return message;
          }))
      .apply("Parse JSON", MapElements.into(TypeDescriptor.of(PageRating.class))
          .via(message -> GSON.fromJson(message, PageRating.class)))
      .apply("Add processing time",
          WithTimestamps.of((pageRating) -> new Instant(pageRating.processingTime)))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))
      .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class))
          .via(pageRating -> new TableRow()
              .set("processing_time", pageRating.processingTime.toString())
              .set("url", pageRating.url)
              .set("rating", pageRating.rating)))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              new TableFieldSchema().setName("processing_time").setType("TIMESTAMP"),
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("rating").setType("STRING"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}
Example #24
Source File: StreamingBeamSQL.java From java-docs-samples with Apache License 2.0 | 4 votes |
public static void main(final String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);

  var project = options.as(GcpOptions.class).getProject();
  var subscription =
      ProjectSubscriptionName.of(project, options.getInputSubscription()).toString();

  var schema = Schema.builder()
      .addStringField("url")
      .addDoubleField("page_score")
      .addDateTimeField("processing_time")
      .build();

  var pipeline = Pipeline.create(options);
  pipeline
      // Read, parse, and validate messages from Pub/Sub.
      .apply("Read messages from Pub/Sub",
          PubsubIO.readStrings().fromSubscription(subscription))
      .apply("Parse JSON into SQL rows",
          MapElements.into(TypeDescriptor.of(Row.class)).via(message -> {
            // This is a good place to add error handling.
            // The first transform should act as a validation layer to make sure
            // that any data coming to the processing pipeline must be valid.
            // See `MapElements.MapWithFailures` for more details.
            LOG.info("message: {}", message);
            var msg = GSON.fromJson(message, PageReviewMessage.class);
            return Row.withSchema(schema).addValues(
                msg.url,                                   // row url
                msg.review.equals("positive") ? 1.0 : 0.0, // row page_score
                new Instant()                              // row processing_time
            ).build();
          })).setRowSchema(schema) // make sure to set the row schema for the PCollection

      // Add timestamps and bundle elements into windows.
      .apply("Add processing time",
          WithTimestamps.of((row) -> row.getDateTime("processing_time").toInstant()))
      .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))

      // Apply a SQL query for every window of elements.
      .apply("Run Beam SQL query", SqlTransform.query(
          "SELECT "
              + " url, "
              + " COUNT(page_score) AS num_reviews, "
              + " AVG(page_score) AS score, "
              + " MIN(processing_time) AS first_date, "
              + " MAX(processing_time) AS last_date "
              + "FROM PCOLLECTION "
              + "GROUP BY url"))

      // Convert the SQL Rows into BigQuery TableRows and write them to BigQuery.
      .apply("Convert to BigQuery TableRow",
          MapElements.into(TypeDescriptor.of(TableRow.class)).via(row -> {
            LOG.info("rating summary: {} {} ({} reviews)", row.getDouble("score"),
                row.getString("url"), row.getInt64("num_reviews"));
            return new TableRow()
                .set("url", row.getString("url"))
                .set("num_reviews", row.getInt64("num_reviews"))
                .set("score", row.getDouble("score"))
                .set("first_date", row.getDateTime("first_date").toInstant().toString())
                .set("last_date", row.getDateTime("last_date").toInstant().toString());
          }))
      .apply("Write to BigQuery", BigQueryIO.writeTableRows()
          .to(options.getOutputTable())
          .withSchema(new TableSchema().setFields(Arrays.asList(
              // To learn more about the valid BigQuery types:
              // https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
              new TableFieldSchema().setName("url").setType("STRING"),
              new TableFieldSchema().setName("num_reviews").setType("INTEGER"),
              new TableFieldSchema().setName("score").setType("FLOAT64"),
              new TableFieldSchema().setName("first_date").setType("TIMESTAMP"),
              new TableFieldSchema().setName("last_date").setType("TIMESTAMP"))))
          .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
          .withWriteDisposition(WriteDisposition.WRITE_APPEND));

  // For a Dataflow Flex Template, do NOT waitUntilFinish().
  pipeline.run();
}
Example #25
Source File: FakeJobService.java From beam with Apache License 2.0 | 4 votes |
private JobStatus runCopyJob(JobConfigurationTableCopy copy)
    throws InterruptedException, IOException {
  List<TableReference> sources = copy.getSourceTables();
  TableReference destination = copy.getDestinationTable();
  WriteDisposition writeDisposition = WriteDisposition.valueOf(copy.getWriteDisposition());
  CreateDisposition createDisposition = CreateDisposition.valueOf(copy.getCreateDisposition());
  Table existingTable = datasetService.getTable(destination);
  if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
    return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
  }
  TimePartitioning partitioning = null;
  Clustering clustering = null;
  TableSchema schema = null;
  boolean first = true;
  List<TableRow> allRows = Lists.newArrayList();
  for (TableReference source : sources) {
    Table table = checkNotNull(datasetService.getTable(source));
    if (!first) {
      if (!Objects.equals(partitioning, table.getTimePartitioning())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
      if (!Objects.equals(clustering, table.getClustering())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
      if (!Objects.equals(schema, table.getSchema())) {
        return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
      }
    }
    partitioning = table.getTimePartitioning();
    clustering = table.getClustering();
    schema = table.getSchema();
    first = false;
    allRows.addAll(
        datasetService.getAllRows(
            source.getProjectId(), source.getDatasetId(), source.getTableId()));
  }
  datasetService.createTable(
      new Table()
          .setTableReference(destination)
          .setSchema(schema)
          .setTimePartitioning(partitioning)
          .setClustering(clustering)
          .setEncryptionConfiguration(copy.getDestinationEncryptionConfiguration()));
  datasetService.insertAll(destination, allRows, null);
  return new JobStatus().setState("DONE");
}
Example #26
Source File: FakeJobService.java From beam with Apache License 2.0 | 4 votes |
private JobStatus runLoadJob(JobReference jobRef, JobConfigurationLoad load)
    throws InterruptedException, IOException {
  TableReference destination = load.getDestinationTable();
  TableSchema schema = load.getSchema();
  checkArgument(schema != null, "No schema specified");
  List<ResourceId> sourceFiles = filesForLoadJobs.get(jobRef.getProjectId(), jobRef.getJobId());
  WriteDisposition writeDisposition = WriteDisposition.valueOf(load.getWriteDisposition());
  CreateDisposition createDisposition = CreateDisposition.valueOf(load.getCreateDisposition());

  Table existingTable = datasetService.getTable(destination);
  if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
    return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
  }
  if (existingTable == null) {
    TableReference strippedDestination =
        destination
            .clone()
            .setTableId(BigQueryHelpers.stripPartitionDecorator(destination.getTableId()));
    existingTable = new Table().setTableReference(strippedDestination).setSchema(schema);
    if (load.getTimePartitioning() != null) {
      existingTable = existingTable.setTimePartitioning(load.getTimePartitioning());
    }
    if (load.getClustering() != null) {
      existingTable = existingTable.setClustering(load.getClustering());
    }
    datasetService.createTable(existingTable);
  }

  List<TableRow> rows = Lists.newArrayList();
  for (ResourceId filename : sourceFiles) {
    if (load.getSourceFormat().equals("NEWLINE_DELIMITED_JSON")) {
      rows.addAll(readJsonTableRows(filename.toString()));
    } else if (load.getSourceFormat().equals("AVRO")) {
      rows.addAll(readAvroTableRows(filename.toString(), schema));
    }
  }

  datasetService.insertAll(destination, rows, null);
  FileSystems.delete(sourceFiles);
  return new JobStatus().setState("DONE");
}
Example #27
Source File: OpinionAnalysisPipeline.java From dataflow-opinion-analysis with Apache License 2.0 | 4 votes |
/**
 * This function creates the DAG graph of transforms. It can be called from main()
 * as well as from the ControlPipeline.
 * @param options
 * @return
 * @throws Exception
 */
public static Pipeline createNLPPipeline(IndexerPipelineOptions options) throws Exception {

  IndexerPipelineUtils.validateIndexerPipelineOptions(options);
  Pipeline pipeline = Pipeline.create(options);

  PCollection<InputContent> readContent;
  PCollection<String> rawInput;

  if (options.isStreaming()) {
    // Continuously read from a Pub/Sub topic
    rawInput = pipeline.apply("Read from PubSub",
        PubsubIO.readStrings().fromTopic(options.getPubsubTopic()));
  } else {
    // Read from GCS files
    rawInput = pipeline.apply("Read from GCS files",
        Read.from(new RecordFileSource<String>(
            ValueProvider.StaticValueProvider.of(options.getInputFile()),
            StringUtf8Coder.of(),
            RecordFileSource.DEFAULT_RECORD_SEPARATOR)));
  }
  readContent = rawInput.apply(ParDo.of(new ParseRawInput()));

  // Extract opinions from online opinions
  PCollection<ContentIndexSummary> indexes = readContent
      .apply(ParDo.of(new IndexDocument()))
      .setCoder(AvroCoder.of(ContentIndexSummary.class));

  // Write into BigQuery
  PCollectionTuple bqrows = indexes
      .apply(ParDo.of(new CreateTableRowsFromIndexSummaryFn())
          .withOutputTags(webresourceTag, // main output collection
              TupleTagList.of(documentTag).and(sentimentTag))); // 2 side output collections

  PCollection<TableRow> webresourceRows = bqrows.get(webresourceTag);
  PCollection<TableRow> documentRows = bqrows.get(documentTag);
  PCollection<TableRow> sentimentRows = bqrows.get(sentimentTag);

  // Append or Overwrite
  WriteDisposition dispo = options.getWriteTruncate()
      ? WriteDisposition.WRITE_TRUNCATE
      : WriteDisposition.WRITE_APPEND;

  webresourceRows
      .apply("Write to webresource",
          BigQueryIO.writeTableRows()
              .to(getWebResourceTableReference(options))
              .withSchema(getWebResourceSchema())
              .withCreateDisposition(CreateDisposition.CREATE_NEVER)
              .withWriteDisposition(dispo));

  documentRows
      .apply("Write to document",
          BigQueryIO.writeTableRows()
              .to(getDocumentTableReference(options))
              .withSchema(getDocumentTableSchema())
              .withCreateDisposition(CreateDisposition.CREATE_NEVER)
              .withWriteDisposition(dispo));

  sentimentRows
      .apply("Write to sentiment",
          BigQueryIO.writeTableRows()
              .to(getSentimentTableReference(options))
              .withSchema(getSentimentSchema())
              .withCreateDisposition(CreateDisposition.CREATE_NEVER)
              .withWriteDisposition(dispo));

  return pipeline;
}
Example #28
Source File: PubsubAvroToBigQuery.java From DataflowTemplates with Apache License 2.0 | 4 votes |
/**
 * Runs the pipeline with the supplied options.
 *
 * @param options execution parameters to the pipeline
 * @return result of the pipeline execution as a {@link PipelineResult}
 */
private static PipelineResult run(PubsubAvroToBigQueryOptions options) {
  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  Schema schema = SchemaUtils.getAvroSchema(options.getSchemaPath());

  WriteResult writeResults =
      pipeline
          .apply(
              "Read Avro records",
              PubsubIO
                  .readAvroGenericRecords(schema)
                  .fromSubscription(options.getInputSubscription()))
          .apply(
              "Write to BigQuery",
              BigQueryIO.<GenericRecord>write()
                  .to(options.getOutputTableSpec())
                  .useBeamSchema()
                  .withMethod(Method.STREAMING_INSERTS)
                  .withWriteDisposition(WriteDisposition.valueOf(options.getWriteDisposition()))
                  .withCreateDisposition(
                      CreateDisposition.valueOf(options.getCreateDisposition()))
                  .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
                  .withExtendedErrorInfo());

  writeResults
      .getFailedInsertsWithErr()
      .apply(
          "Create error payload",
          ErrorConverters.BigQueryInsertErrorToPubsubMessage.<GenericRecord>newBuilder()
              .setPayloadCoder(AvroCoder.of(schema))
              .setTranslateFunction(
                  BigQueryConverters.TableRowToGenericRecordFn.of(schema))
              .build())
      .apply(
          "Write failed records",
          PubsubIO.writeMessages().to(options.getOutputTopic()));

  // Execute the pipeline and return the result.
  return pipeline.run();
}
Example #29
Source File: CreateTables.java From beam with Apache License 2.0 | 4 votes |
private void tryCreateTable(
    ProcessContext context,
    DestinationT destination,
    TableDestination tableDestination,
    String tableSpec,
    String kmsKey) {
  DatasetService datasetService =
      bqServices.getDatasetService(context.getPipelineOptions().as(BigQueryOptions.class));
  TableReference tableReference = tableDestination.getTableReference().clone();
  tableReference.setTableId(
      BigQueryHelpers.stripPartitionDecorator(tableReference.getTableId()));
  try {
    if (datasetService.getTable(tableReference) == null) {
      TableSchema tableSchema = dynamicDestinations.getSchema(destination);
      checkArgument(
          tableSchema != null,
          "Unless create disposition is %s, a schema must be specified, i.e. "
              + "DynamicDestinations.getSchema() may not return null. "
              + "However, create disposition is %s, and "
              + " %s returned null for destination %s",
          CreateDisposition.CREATE_NEVER,
          createDisposition,
          dynamicDestinations,
          destination);
      Table table =
          new Table()
              .setTableReference(tableReference)
              .setSchema(tableSchema)
              .setDescription(tableDestination.getTableDescription());
      if (tableDestination.getTimePartitioning() != null) {
        table.setTimePartitioning(tableDestination.getTimePartitioning());
        if (tableDestination.getClustering() != null) {
          table.setClustering(tableDestination.getClustering());
        }
      }
      if (kmsKey != null) {
        table.setEncryptionConfiguration(new EncryptionConfiguration().setKmsKeyName(kmsKey));
      }
      datasetService.createTable(table);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
  createdTables.add(tableSpec);
}
Example #30
Source File: Snippets.java From beam with Apache License 2.0 | 4 votes |
public static void main(String[] args) {
  // [START BigQueryIODeadLetter]

  PipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigQueryOptions.class);

  Pipeline p = Pipeline.create(options);

  // Create a bug by writing the 2nd value as null. The API will correctly
  // throw an error when trying to insert a null value into a REQUIRED field.
  WriteResult result =
      p.apply(Create.of(1, 2))
          .apply(
              BigQueryIO.<Integer>write()
                  .withSchema(
                      new TableSchema()
                          .setFields(
                              ImmutableList.of(
                                  new TableFieldSchema()
                                      .setName("num")
                                      .setType("INTEGER")
                                      .setMode("REQUIRED"))))
                  .to("Test.dummyTable")
                  .withFormatFunction(x -> new TableRow().set("num", (x == 2) ? null : x))
                  .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
                  // Forcing the bounded pipeline to use streaming inserts
                  .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
                  // set the withExtendedErrorInfo property.
                  .withExtendedErrorInfo()
                  .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                  .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));

  result
      .getFailedInsertsWithErr()
      .apply(
          MapElements.into(TypeDescriptors.strings())
              .via(
                  x -> {
                    System.out.println(" The table was " + x.getTable());
                    System.out.println(" The row was " + x.getRow());
                    System.out.println(" The error was " + x.getError());
                    return "";
                  }));
  p.run();

  /* Sample Output From the pipeline:
   <p>The table was GenericData{classInfo=[datasetId, projectId, tableId], {datasetId=Test,projectId=<>, tableId=dummyTable}}
   <p>The row was GenericData{classInfo=[f], {num=null}}
   <p>The error was GenericData{classInfo=[errors, index],{errors=[GenericData{classInfo=[debugInfo, location, message, reason], {debugInfo=,location=, message=Missing required field: Msg_0_CLOUD_QUERY_TABLE.num., reason=invalid}}],index=0}}
  */
}