org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers Java Examples

The following examples show how to use org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers. Each example is taken from an open-source project; the source file and license are noted above its code.
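Most of the examples below center on a handful of static helpers: parseTableSpec, which turns a "project:dataset.table" string into a TableReference, and toJsonString/fromJsonString, which serialize the Google API model classes to and from JSON. As a quick orientation, here is a minimal, self-contained sketch; the project, dataset, and table names are illustrative rather than taken from any project below.

import com.google.api.services.bigquery.model.TableReference;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers;

public class BigQueryHelpersQuickLook {
  public static void main(String[] args) {
    // Parse a "project:dataset.table" spec into a TableReference.
    TableReference ref = BigQueryHelpers.parseTableSpec("my-project:my_dataset.my_table");

    // Round-trip the reference through its JSON representation.
    String json = BigQueryHelpers.toJsonString(ref);
    TableReference parsed = BigQueryHelpers.fromJsonString(json, TableReference.class);

    System.out.println(parsed.getDatasetId()); // prints "my_dataset"
  }
}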
Example #1
Source File: BigQueryToParquet.java    From DataflowTemplates with Apache License 2.0
/**
 * Creates ReadSession for schema extraction.
 *
 * @param client BigQueryStorage client used to create ReadSession.
 * @param tableString String that represents the table to export from.
 * @param tableReadOptions TableReadOptions that specify any fields in the table to filter on.
 * @return session ReadSession object that contains the schema for the export.
 */
static ReadSession create(
    BigQueryStorageClient client, String tableString, TableReadOptions tableReadOptions) {
  TableReference tableReference = BigQueryHelpers.parseTableSpec(tableString);
  String parentProjectId = "projects/" + tableReference.getProjectId();

  TableReferenceProto.TableReference storageTableRef =
      TableReferenceProto.TableReference.newBuilder()
          .setProjectId(tableReference.getProjectId())
          .setDatasetId(tableReference.getDatasetId())
          .setTableId(tableReference.getTableId())
          .build();

  CreateReadSessionRequest.Builder builder =
      CreateReadSessionRequest.newBuilder()
          .setParent(parentProjectId)
          .setReadOptions(tableReadOptions)
          .setTableReference(storageTableRef);
  try {
    return client.createReadSession(builder.build());
  } catch (InvalidArgumentException iae) {
    LOG.error("Error creating ReadSession: " + iae.getMessage());
    throw new RuntimeException(iae);
  }
}
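For context, a hypothetical caller of the helper above might look like the sketch below. The table spec and selected field are illustrative, and the BigQuery Storage v1beta1 client classes (BigQueryStorageClient, TableReadOptions, ReadSession) are assumed to be on the classpath, as they are in the original template.

static String fetchAvroSchema() throws IOException {
  try (BigQueryStorageClient client = BigQueryStorageClient.create()) {
    // Restrict the session to the columns of interest (field name is illustrative).
    TableReadOptions readOptions =
        TableReadOptions.newBuilder().addSelectedFields("user_id").build();
    ReadSession session = create(client, "my-project:my_dataset.my_table", readOptions);
    // The export schema is carried on the session as an Avro schema string.
    return session.getAvroSchema().getSchema();
  }
}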
 
Example #2
Source File: BigQueryTable.java    From beam with Apache License 2.0
private static BeamTableStatistics getRowCountFromBQ(PipelineOptions o, String bqLocation) {
  try {
    BigInteger rowCount =
        BigQueryHelpers.getNumRows(
            o.as(BigQueryOptions.class), BigQueryHelpers.parseTableSpec(bqLocation));

    if (rowCount == null) {
      return BeamTableStatistics.BOUNDED_UNKNOWN;
    }

    return BeamTableStatistics.createBoundedTableStatistics(rowCount.doubleValue());

  } catch (IOException | InterruptedException e) {
    LOG.warn("Could not get the row count for the table " + bqLocation, e);
  }

  return BeamTableStatistics.BOUNDED_UNKNOWN;
}
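The same row-count helper can be called outside of Beam SQL as well; a minimal sketch, assuming application default credentials and an illustrative project and table (checked exceptions are declared rather than handled).

static BigInteger countRows() throws IOException, InterruptedException {
  BigQueryOptions options = PipelineOptionsFactory.create().as(BigQueryOptions.class);
  options.setProject("my-project"); // illustrative project id
  // Returns null when the row count cannot be determined, mirroring the guard above.
  return BigQueryHelpers.getNumRows(
      options, BigQueryHelpers.parseTableSpec("my-project:my_dataset.my_table"));
}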
 
Example #3
Source File: FakeDatasetService.java    From beam with Apache License 2.0
Map<TableRow, List<TableDataInsertAllResponse.InsertErrors>> getInsertErrors() {
  Map<TableRow, List<TableDataInsertAllResponse.InsertErrors>> parsedInsertErrors =
      Maps.newHashMap();
  synchronized (tables) {
    for (Map.Entry<String, List<String>> entry : this.insertErrors.entrySet()) {
      TableRow tableRow = BigQueryHelpers.fromJsonString(entry.getKey(), TableRow.class);
      List<TableDataInsertAllResponse.InsertErrors> allErrors = Lists.newArrayList();
      for (String errorsString : entry.getValue()) {
        allErrors.add(
            BigQueryHelpers.fromJsonString(
                errorsString, TableDataInsertAllResponse.InsertErrors.class));
      }
      parsedInsertErrors.put(tableRow, allErrors);
    }
  }
  return parsedInsertErrors;
}
 
Example #4
Source File: FakeDatasetService.java    From beam with Apache License 2.0
/**
 * Cause a given {@link TableRow} object to fail when it's inserted. The errors in the list will
 * be returned on subsequent retries, and the insert will succeed when the errors run out.
 */
public void failOnInsert(
    Map<TableRow, List<TableDataInsertAllResponse.InsertErrors>> insertErrors) {
  synchronized (tables) {
    for (Map.Entry<TableRow, List<TableDataInsertAllResponse.InsertErrors>> entry :
        insertErrors.entrySet()) {
      List<String> errorStrings = Lists.newArrayList();
      for (TableDataInsertAllResponse.InsertErrors errors : entry.getValue()) {
        errorStrings.add(BigQueryHelpers.toJsonString(errors));
      }
      this.insertErrors.put(BigQueryHelpers.toJsonString(entry.getKey()), errorStrings);
    }
  }
}
 
Example #5
Source File: FakeBigQueryServices.java    From beam with Apache License 2.0
public static String encodeQueryResult(Table table, List<TableRow> rows) throws IOException {
  KvCoder<String, List<TableRow>> coder =
      KvCoder.of(StringUtf8Coder.of(), ListCoder.of(TableRowJsonCoder.of()));
  KV<String, List<TableRow>> kv = KV.of(BigQueryHelpers.toJsonString(table), rows);
  ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
  coder.encode(kv, outputStream);
  return Base64.encodeBase64String(outputStream.toByteArray());
}
 
Example #6
Source File: FakeBigQueryServices.java    From beam with Apache License 2.0
public static KV<Table, List<TableRow>> decodeQueryResult(String queryResult) throws IOException {
  KvCoder<String, List<TableRow>> coder =
      KvCoder.of(StringUtf8Coder.of(), ListCoder.of(TableRowJsonCoder.of()));
  ByteArrayInputStream inputStream = new ByteArrayInputStream(Base64.decodeBase64(queryResult));
  KV<String, List<TableRow>> kv = coder.decode(inputStream);
  Table table = BigQueryHelpers.fromJsonString(kv.getKey(), Table.class);
  List<TableRow> rows = kv.getValue();
  rows.forEach(FakeBigQueryServices::convertNumbers);
  return KV.of(table, rows);
}
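Examples #5 and #6 are inverses of each other; the round-trip sketch below shows them together. The table name and row contents are illustrative, and the checked IOException is declared rather than handled.

static void roundTripQueryResult() throws IOException {
  Table table =
      new Table().setTableReference(BigQueryHelpers.parseTableSpec("my-project:my_dataset.my_table"));
  List<TableRow> rows = Lists.newArrayList(new TableRow().set("name", "beam"));

  String encoded = FakeBigQueryServices.encodeQueryResult(table, rows);
  KV<Table, List<TableRow>> decoded = FakeBigQueryServices.decodeQueryResult(encoded);
  // decoded.getKey() carries the table metadata, decoded.getValue() the rows.
}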
 
Example #7
Source File: KeyByBigQueryTableDestination.java    From gcp-ingestion with Mozilla Public License 2.0
/**
 * Return the appropriate table destination instance for the given document type and other
 * attributes.
 */
public TableDestination getTableDestination(Map<String, String> attributes) {
  attributes = new HashMap<>(attributes);

  // We coerce all docType and namespace names to be snake_case and to remove invalid
  // characters; these transformations MUST match with the transformations applied by the
  // jsonschema-transpiler and mozilla-schema-generator when creating table schemas in BigQuery.
  final String namespace = attributes.get(Attribute.DOCUMENT_NAMESPACE);
  final String docType = attributes.get(Attribute.DOCUMENT_TYPE);
  if (namespace != null) {
    attributes.put(Attribute.DOCUMENT_NAMESPACE, getAndCacheNormalizedName(namespace));
  }
  if (docType != null) {
    attributes.put(Attribute.DOCUMENT_TYPE, getAndCacheNormalizedName(docType));
  }

  // Only letters, numbers, and underscores are allowed in BigQuery dataset and table names,
  // but some doc types and namespaces contain '-', so we convert to '_'; we don't pass all
  // values through getAndCacheBqName to avoid expensive regex operations and polluting the
  // cache of transformed field names.
  attributes = Maps.transformValues(attributes, v -> v.replaceAll("-", "_"));

  final String tableSpec = StringSubstitutor.replace(tableSpecTemplate.get(), attributes);

  // Send to error collection if incomplete tableSpec; $ is not a valid char in tableSpecs.
  if (tableSpec.contains("$")) {
    throw new IllegalArgumentException("Element did not contain all the attributes needed to"
        + " fill out variables in the configured BigQuery output template: "
        + tableSpecTemplate.get());
  }

  final TableDestination tableDestination = new TableDestination(tableSpec, null,
      new TimePartitioning().setField(partitioningField.get()),
      new Clustering().setFields(clusteringFields.get()));
  final TableReference ref = BigQueryHelpers.parseTableSpec(tableSpec);
  final DatasetReference datasetRef = new DatasetReference().setProjectId(ref.getProjectId())
      .setDatasetId(ref.getDatasetId());

  if (bqService == null) {
    bqService = BigQueryOptions.newBuilder().setProjectId(ref.getProjectId())
        .setRetrySettings(RETRY_SETTINGS).build().getService();
  }

  // Get and cache a listing of table names for this dataset.
  Set<String> tablesInDataset;
  if (tableListingCache == null) {
    // We need to be very careful about settings for the cache here. We have had significant
    // issues in the past due to exceeding limits on BigQuery API requests; see
    // https://bugzilla.mozilla.org/show_bug.cgi?id=1623000
    tableListingCache = CacheBuilder.newBuilder().expireAfterWrite(Duration.ofMinutes(10))
        .build();
  }
  try {
    tablesInDataset = tableListingCache.get(datasetRef, () -> {
      Set<String> tableSet = new HashSet<>();
      Dataset dataset = bqService.getDataset(ref.getDatasetId());
      if (dataset != null) {
        dataset.list().iterateAll().forEach(t -> {
          tableSet.add(t.getTableId().getTable());
        });
      }
      return tableSet;
    });
  } catch (ExecutionException e) {
    throw new UncheckedExecutionException(e.getCause());
  }

  // Send to error collection if dataset or table doesn't exist so BigQueryIO doesn't throw a
  // pipeline execution exception.
  if (tablesInDataset.isEmpty()) {
    throw new IllegalArgumentException("Resolved destination dataset does not exist or has no "
        + " tables for tableSpec " + tableSpec);
  } else if (!tablesInDataset.contains(ref.getTableId())) {
    throw new IllegalArgumentException("Resolved destination table does not exist: " + tableSpec);
  }

  return tableDestination;
}
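The '$' check above works because StringSubstitutor leaves unresolved ${...} placeholders in place. A small illustration with a hypothetical template and attribute map:

String template = "my-project:my_dataset.${document_type}_v1"; // hypothetical template

// All placeholders resolved: a valid tableSpec.
String ok = StringSubstitutor.replace(template, ImmutableMap.of("document_type", "main"));
// -> "my-project:my_dataset.main_v1"

// Missing attribute: the placeholder survives, the spec still contains '$', and the
// element is routed to the error collection by the check above.
String incomplete = StringSubstitutor.replace(template, ImmutableMap.<String, String>of());
// -> "my-project:my_dataset.${document_type}_v1"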
 
Example #8
Source File: FeatureSetSpecToTableSchema.java    From feast with Apache License 2.0
private TableId generateTableId(String specKey) {
  TableDestination tableDestination = BigQuerySinkHelpers.getTableDestination(dataset, specKey);
  TableReference tableReference = BigQueryHelpers.parseTableSpec(tableDestination.getTableSpec());
  return TableId.of(
      tableReference.getProjectId(), tableReference.getDatasetId(), tableReference.getTableId());
}
 
Example #9
Source File: FeatureSetSpecToTableSchema.java    From feast with Apache License 2.0
@Override
public void encode(TableSchema value, OutputStream outStream)
    throws CoderException, IOException {
  stringCoder.encode(BigQueryHelpers.toJsonString(value), outStream);
}
 
Example #10
Source File: FeatureSetSpecToTableSchema.java    From feast with Apache License 2.0
@Override
public TableSchema decode(InputStream inStream) throws CoderException, IOException {
  return BigQueryHelpers.fromJsonString(stringCoder.decode(inStream), TableSchema.class);
}
 
Example #11
Source File: FakeJobService.java    From beam with Apache License 2.0
private JobStatus runLoadJob(JobReference jobRef, JobConfigurationLoad load)
    throws InterruptedException, IOException {
  TableReference destination = load.getDestinationTable();
  TableSchema schema = load.getSchema();
  checkArgument(schema != null, "No schema specified");
  List<ResourceId> sourceFiles = filesForLoadJobs.get(jobRef.getProjectId(), jobRef.getJobId());
  WriteDisposition writeDisposition = WriteDisposition.valueOf(load.getWriteDisposition());
  CreateDisposition createDisposition = CreateDisposition.valueOf(load.getCreateDisposition());

  Table existingTable = datasetService.getTable(destination);
  if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
    return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
  }
  if (existingTable == null) {
    TableReference strippedDestination =
        destination
            .clone()
            .setTableId(BigQueryHelpers.stripPartitionDecorator(destination.getTableId()));
    existingTable = new Table().setTableReference(strippedDestination).setSchema(schema);
    if (load.getTimePartitioning() != null) {
      existingTable = existingTable.setTimePartitioning(load.getTimePartitioning());
    }
    if (load.getClustering() != null) {
      existingTable = existingTable.setClustering(load.getClustering());
    }
    datasetService.createTable(existingTable);
  }

  List<TableRow> rows = Lists.newArrayList();
  for (ResourceId filename : sourceFiles) {
    if (load.getSourceFormat().equals("NEWLINE_DELIMITED_JSON")) {
      rows.addAll(readJsonTableRows(filename.toString()));
    } else if (load.getSourceFormat().equals("AVRO")) {
      rows.addAll(readAvroTableRows(filename.toString(), schema));
    }
  }

  datasetService.insertAll(destination, rows, null);
  FileSystems.delete(sourceFiles);
  return new JobStatus().setState("DONE");
}
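Both this fake job service and the insertAll implementation below lean on BigQueryHelpers.stripPartitionDecorator to drop a "$YYYYMMDD"-style partition decorator from the table id before looking the table up. A behavior sketch, assuming the method is accessible from your package (the fakes shown here live alongside BigQueryHelpers in org.apache.beam.sdk.io.gcp.bigquery):

// Partition decorators after '$' are removed; undecorated ids pass through unchanged.
String stripped = BigQueryHelpers.stripPartitionDecorator("events$20200101"); // "events"
String plain = BigQueryHelpers.stripPartitionDecorator("events");             // "events"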
 
Example #12
Source File: FakeDatasetService.java    From beam with Apache License 2.0
@Override
public <T> long insertAll(
    TableReference ref,
    List<ValueInSingleWindow<TableRow>> rowList,
    @Nullable List<String> insertIdList,
    InsertRetryPolicy retryPolicy,
    List<ValueInSingleWindow<T>> failedInserts,
    ErrorContainer<T> errorContainer,
    boolean skipInvalidRows,
    boolean ignoreUnknownValues,
    boolean ignoreInsertIds)
    throws IOException, InterruptedException {
  Map<TableRow, List<TableDataInsertAllResponse.InsertErrors>> insertErrors = getInsertErrors();
  synchronized (tables) {
    if (ignoreInsertIds) {
      insertIdList = null;
    }

    if (insertIdList != null) {
      assertEquals(rowList.size(), insertIdList.size());
    }

    long dataSize = 0;
    TableContainer tableContainer =
        getTableContainer(
            ref.getProjectId(),
            ref.getDatasetId(),
            BigQueryHelpers.stripPartitionDecorator(ref.getTableId()));
    for (int i = 0; i < rowList.size(); ++i) {
      TableRow row = rowList.get(i).getValue();
      List<TableDataInsertAllResponse.InsertErrors> allErrors = insertErrors.get(row);
      boolean shouldInsert = true;
      if (allErrors != null) {
        for (TableDataInsertAllResponse.InsertErrors errors : allErrors) {
          if (!retryPolicy.shouldRetry(new Context(errors))) {
            shouldInsert = false;
          }
        }
      }
      if (shouldInsert) {
        if (insertIdList == null) {
          dataSize += tableContainer.addRow(row, null);
        } else {
          dataSize += tableContainer.addRow(row, insertIdList.get(i));
        }
      } else {
        errorContainer.add(
            failedInserts, allErrors.get(allErrors.size() - 1), ref, rowList.get(i));
      }
    }
    return dataSize;
  }
}