org.apache.beam.sdk.io.gcp.bigquery.TableDestination Java Examples

The following examples show how to use org.apache.beam.sdk.io.gcp.bigquery.TableDestination. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: KeyByBigQueryTableDestination.java    From gcp-ingestion with Mozilla Public License 2.0 6 votes vote down vote up
/**
 * Pairs every message with the table destination computed from its attribute map.
 *
 * <p>An {@link IllegalArgumentException} raised while mapping an element is turned into a failure
 * message via {@code FailureMessage.of}; any other exception is rethrown unchanged.
 */
@Override
public Result<PCollection<KV<TableDestination, PubsubMessage>>, PubsubMessage> expand(
    PCollection<PubsubMessage> messages) {
  return messages.apply(MapElements
      .into(TypeDescriptors.kvs(TypeDescriptor.of(TableDestination.class),
          TypeDescriptor.of(PubsubMessage.class)))
      .via((PubsubMessage msg) -> {
        final PubsubMessage checked = PubsubConstraints.ensureNonNull(msg);
        return KV.of(getTableDestination(checked.getAttributeMap()), checked);
      })
      .exceptionsInto(TypeDescriptor.of(PubsubMessage.class))
      .exceptionsVia((WithFailures.ExceptionElement<PubsubMessage> ee) -> {
        final Exception cause = ee.exception();
        if (cause instanceof IllegalArgumentException) {
          return FailureMessage.of(KeyByBigQueryTableDestination.class.getSimpleName(),
              ee.element(), cause);
        }
        // Anything that is not an IllegalArgumentException propagates to the caller.
        throw cause;
      }));
}
 
Example #2
Source File: PartitionedTableRef.java    From dataflow-opinion-analysis with Apache License 2.0 6 votes vote down vote up
/**
 * Computes the destination table for a single element.
 *
 * @param input a tuple that contains the data element (TableRow), the window, the timestamp,
 *     and the pane
 * @return a destination whose table id is the partition prefix followed by the partition value
 *     read from the configured field; the description is {@code null}
 */
@Override
public TableDestination apply(ValueInSingleWindow<TableRow> input) {
    final Object fieldValue = input.getValue().get(this.fieldName);

    // Time fields are parsed and rendered with the partition formatter; any other field is
    // expected to hold an Integer and is used as-is.
    final String partition = this.isTimeField
        ? Instant.parse((String) fieldValue).toString(partitionFormatter)
        : ((Integer) fieldValue).toString();

    final TableReference reference = new TableReference()
        .setProjectId(this.projectId)
        .setDatasetId(this.datasetId)
        .setTableId(this.partitionPrefix + partition);
    return new TableDestination(reference, null);
}
 
Example #3
Source File: S3Import.java    From dlp-dataflow-deidentification with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the destination for the table spec held in the key of {@code destination}.
 *
 * @param destination pair whose key is the table spec string
 * @return the destination for that spec, with a fixed description
 */
@Override
public TableDestination getTable(KV<String, TableRow> destination) {
  final String tableSpec = destination.getKey();
  final TableDestination tableDestination =
      new TableDestination(tableSpec, "pii-tokenized output data from dataflow");
  LOG.debug("Table Destination {}", tableDestination.getTableSpec());
  return tableDestination;
}
 
Example #4
Source File: DLPTextToBigQueryStreaming.java    From dlp-dataflow-deidentification with Apache License 2.0 5 votes vote down vote up
/**
 * Maps a keyed row to its table destination; the key is used as the table spec.
 *
 * @param destination pair whose key names the target table
 * @return the destination built from the key and a fixed description
 */
@Override
public TableDestination getTable(KV<String, TableRow> destination) {
  final String description = "pii-tokenized output data from dataflow";
  final TableDestination target = new TableDestination(destination.getKey(), description);
  // Log the resolved spec of the destination that was just built.
  LOG.debug("Table Destination {}", target.getTableSpec());
  return target;
}
 
Example #5
Source File: BQDestination.java    From dlp-dataflow-deidentification with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the table destination named by the key of {@code destination}.
 *
 * @param destination pair whose key is the table spec string
 * @return the destination for that spec, with a fixed description
 */
@Override
public TableDestination getTable(KV<String, List<String>> destination) {
  final String tableSpec = destination.getKey();
  final TableDestination target =
      new TableDestination(tableSpec, "pii-tokenized output data from dataflow");
  final String rendered = target.toString();
  LOG.debug("Table Destination {}", rendered);
  return target;
}
 
Example #6
Source File: PubsubMessageToTableRow.java    From gcp-ingestion with Mozilla Public License 2.0 5 votes vote down vote up
/**
 * Converts a destination/message pair into a {@link TableRow} that can be handed to
 * {@link org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO}.
 *
 * @param kv pair of the destination table and the message to convert
 * @return the formatted message content as a table row
 */
public TableRow kvToTableRow(KV<TableDestination, PubsubMessage> kv) {
  // The formatter is created the first time it is needed.
  if (format == null) {
    format = createFormat();
  }

  final TableReference tableRef = kv.getKey().getTableReference();
  final String project = tableRef.getProjectId();
  final String dataset = tableRef.getDatasetId();
  final String table = tableRef.getTableId();
  final TableId destinationId = TableId.of(project, dataset, table);

  final PubsubMessage msg = kv.getValue();
  return Json.asTableRow(format.apply(destinationId, msg.getAttributeMap(), msg.getPayload()));
}
 
Example #7
Source File: BigQuerySinkHelpers.java    From feast with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the BigQuery table destination for a feature set.
 *
 * <p>The table id is {@code <project>_<feature-set-name>} with every {@code -} replaced by
 * {@code _}. When the key does not split into exactly two parts on {@code /}, the default
 * project name is used and the first part is taken as the feature set name.
 *
 * @param dataset {@link DatasetId} reference to bq project and dataset
 * @param featureSetKey Feature Set reference with format &lt;project&gt;/&lt;feature-set-name&gt;
 * @return {@link TableDestination} partitioned by day on the event timestamp column
 */
public static TableDestination getTableDestination(DatasetId dataset, String featureSetKey) {
  final String[] keyParts = featureSetKey.split("/");
  final boolean hasProject = keyParts.length == 2;
  final String projectName = hasProject ? keyParts[0] : DEFAULT_PROJECT_NAME;
  final String setName = hasProject ? keyParts[1] : keyParts[0];

  final TimePartitioning dailyPartitioning = new TimePartitioning();
  dailyPartitioning.setType("DAY");
  dailyPartitioning.setField(FeatureRowToTableRow.getEventTimestampColumn());

  final String tableSpec =
      String.format(
          "%s:%s.%s_%s",
          dataset.getProject(),
          dataset.getDataset(),
          projectName.replaceAll("-", "_"),
          setName.replaceAll("-", "_"));
  final String description = String.format("Feast table for %s", featureSetKey);

  return new TableDestination(tableSpec, description, dailyPartitioning);
}
 
Example #8
Source File: ChangelogTableDynamicDestinations.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the destination of the changelog table that corresponds to {@code targetTable}.
 *
 * @param targetTable name passed to {@code getBigQueryTableName} to derive the changelog table id
 * @return a destination in the changelog dataset and GCP project held by this instance, described
 *     as "Changelog Table for &lt;targetTable&gt;"
 */
@Override
public TableDestination getTable(String targetTable) {
  String changelogTableName = getBigQueryTableName(targetTable, true);

  TableReference tableRef = new TableReference()
      .setTableId(changelogTableName)
      .setDatasetId(changeLogDataset)
      .setProjectId(gcpProjectId);
  // String.format only understands %-conversions; an SLF4J-style "{}" placeholder is never
  // substituted and would leave a literal "{}" in the table description.
  String description = String.format("Changelog Table for %s", targetTable);

  return new TableDestination(tableRef, description);
}
 
Example #9
Source File: BigQueryDynamicConverters.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Derives the table destination from the {@link TableId} stored in the key.
 *
 * @param destination pair whose key identifies the dataset and table
 * @return a destination for the spec {@code <dataset>.<table>}
 */
@Override
public TableDestination getTable(KV<TableId, TableRow> destination) {
  final TableId id = destination.getKey();
  final String tableSpec = String.format("%s.%s", id.getDataset(), id.getTable());
  return new TableDestination(tableSpec, "Name of table pulled from datafields");
}
 
Example #10
Source File: DLPTextToBigQueryStreaming.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the destination whose table spec is the key of {@code destination}.
 *
 * @param destination pair whose key is the table spec string
 * @return the destination for that spec, with a fixed description
 */
@Override
public TableDestination getTable(KV<String, TableRow> destination) {
  final String spec = destination.getKey();
  final TableDestination resolved =
      new TableDestination(spec, "pii-tokenized output data from dataflow");
  LOG.debug("Table Destination {}", resolved.getTableSpec());
  return resolved;
}
 
Example #11
Source File: BigQueryDynamicConverters.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Resolves the table destination for the {@link TableId} carried in the key.
 *
 * @param destination pair whose key identifies the dataset and table
 * @return a destination for the spec {@code <dataset>.<table>} (no project id is included)
 */
@Override
public TableDestination getTable(KV<TableId, TableRow> destination) {
  final TableId key = destination.getKey();
  final String dataset = key.getDataset();
  final String table = key.getTable();
  // TODO String.format("%s:%s.%s", projectId.get(), datasetName.get(), key) if project id is req
  final String qualifiedName = String.format("%s.%s", dataset, table);
  return new TableDestination(qualifiedName, "Name of table pulled from datafields");
}
 
Example #12
Source File: KeyByBigQueryTableDestination.java    From gcp-ingestion with Mozilla Public License 2.0 4 votes vote down vote up
/**
 * Return the appropriate table destination instance for the given document type and other
 * attributes.
 *
 * <p>The attributes are copied, the document namespace and document type values are normalized,
 * and the result is substituted into the configured table spec template. The resolved table must
 * then appear in the cached listing of table names for its dataset.
 *
 * @param attributes message attributes used to fill the table spec template; the passed map is
 *     copied before any value is replaced
 * @return the destination for the resolved table spec, carrying the configured time partitioning
 *     field and clustering fields
 * @throws IllegalArgumentException if the resolved table spec still contains {@code $}, if the
 *     table listing for the dataset is empty, or if the listing does not contain the table
 * @throws UncheckedExecutionException if the cache lookup raises an {@code ExecutionException};
 *     the original cause is wrapped
 */
public TableDestination getTableDestination(Map<String, String> attributes) {
  // Work on a copy so the put() calls below do not modify the caller's map.
  attributes = new HashMap<>(attributes);

  // We coerce all docType and namespace names to be snake_case and to remove invalid
  // characters; these transformations MUST match with the transformations applied by the
  // jsonschema-transpiler and mozilla-schema-generator when creating table schemas in BigQuery.
  final String namespace = attributes.get(Attribute.DOCUMENT_NAMESPACE);
  final String docType = attributes.get(Attribute.DOCUMENT_TYPE);
  if (namespace != null) {
    attributes.put(Attribute.DOCUMENT_NAMESPACE, getAndCacheNormalizedName(namespace));
  }
  if (docType != null) {
    attributes.put(Attribute.DOCUMENT_TYPE, getAndCacheNormalizedName(docType));
  }

  // Only letters, numbers, and underscores are allowed in BigQuery dataset and table names,
  // but some doc types and namespaces contain '-', so we convert to '_'; we don't pass all
  // values through getAndCacheBqName to avoid expensive regex operations and polluting the
  // cache of transformed field names.
  attributes = Maps.transformValues(attributes, v -> v.replaceAll("-", "_"));

  // Fill the template's variables from the (transformed) attribute values.
  final String tableSpec = StringSubstitutor.replace(tableSpecTemplate.get(), attributes);

  // Send to error collection if incomplete tableSpec; $ is not a valid char in tableSpecs.
  if (tableSpec.contains("$")) {
    throw new IllegalArgumentException("Element did not contain all the attributes needed to"
        + " fill out variables in the configured BigQuery output template: "
        + tableSpecTemplate.get());
  }

  // The destination is built up front but only returned once the existence checks below pass.
  final TableDestination tableDestination = new TableDestination(tableSpec, null,
      new TimePartitioning().setField(partitioningField.get()),
      new Clustering().setFields(clusteringFields.get()));
  final TableReference ref = BigQueryHelpers.parseTableSpec(tableSpec);
  // Project + dataset pair used as the key into the table listing cache.
  final DatasetReference datasetRef = new DatasetReference().setProjectId(ref.getProjectId())
      .setDatasetId(ref.getDatasetId());

  // Create the BigQuery client on first use.
  // NOTE(review): the client is configured with the project of the first resolved spec and is
  // later queried by dataset id only — presumably every spec resolves to one project; confirm.
  if (bqService == null) {
    bqService = BigQueryOptions.newBuilder().setProjectId(ref.getProjectId())
        .setRetrySettings(RETRY_SETTINGS).build().getService();
  }

  // Get and cache a listing of table names for this dataset.
  Set<String> tablesInDataset;
  if (tableListingCache == null) {
    // We need to be very careful about settings for the cache here. We have had significant
    // issues in the past due to exceeding limits on BigQuery API requests; see
    // https://bugzilla.mozilla.org/show_bug.cgi?id=1623000
    tableListingCache = CacheBuilder.newBuilder().expireAfterWrite(Duration.ofMinutes(10))
        .build();
  }
  try {
    // On a cache miss the loader lists every table of the dataset; a dataset that cannot be
    // found (null) yields an empty set, which is cached like any other result.
    tablesInDataset = tableListingCache.get(datasetRef, () -> {
      Set<String> tableSet = new HashSet<>();
      Dataset dataset = bqService.getDataset(ref.getDatasetId());
      if (dataset != null) {
        dataset.list().iterateAll().forEach(t -> {
          tableSet.add(t.getTableId().getTable());
        });
      }
      return tableSet;
    });
  } catch (ExecutionException e) {
    // Surface the loader's failure as an unchecked exception, keeping the original cause.
    throw new UncheckedExecutionException(e.getCause());
  }

  // Send to error collection if dataset or table doesn't exist so BigQueryIO doesn't throw a
  // pipeline execution exception.
  if (tablesInDataset.isEmpty()) {
    // NOTE(review): the two concatenated literals produce a double space ("has no  tables").
    throw new IllegalArgumentException("Resolved destination dataset does not exist or has no "
        + " tables for tableSpec " + tableSpec);
  } else if (!tablesInDataset.contains(ref.getTableId())) {
    throw new IllegalArgumentException("Resolved destination table does not exist: " + tableSpec);
  }

  return tableDestination;
}
 
Example #13
Source File: FeatureSetSpecToTableSchema.java    From feast with Apache License 2.0 4 votes vote down vote up
/**
 * Resolves the {@link TableId} of the table that stores the feature set identified by
 * {@code specKey}, by parsing the table spec of its {@link TableDestination}.
 */
private TableId generateTableId(String specKey) {
  final String tableSpec =
      BigQuerySinkHelpers.getTableDestination(dataset, specKey).getTableSpec();
  final TableReference ref = BigQueryHelpers.parseTableSpec(tableSpec);

  final String project = ref.getProjectId();
  final String datasetId = ref.getDatasetId();
  final String table = ref.getTableId();
  return TableId.of(project, datasetId, table);
}