org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord Java Examples

The following examples show how to use org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord. Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
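Most of the examples below follow the same pattern: a SerializableFunction<SchemaAndRecord, T> is passed to BigQueryIO.read(...), and for each row it receives the Avro GenericRecord together with the table's TableSchema and maps them to a user type. A minimal sketch of that pattern, using a hypothetical table and column name that do not come from any of the examples:

import org.apache.avro.generic.GenericRecord;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord;
import org.apache.beam.sdk.values.PCollection;

static PCollection<String> readNameColumn(Pipeline p) {
  return p.apply(
      "ReadFromBigQuery",
      BigQueryIO.read((SchemaAndRecord schemaAndRecord) -> {
            // Each element pairs one table row, as an Avro GenericRecord, with the TableSchema.
            GenericRecord record = schemaAndRecord.getRecord();
            // GenericRecord.get(...) returns Object (Avro strings are Utf8), so convert explicitly.
            return String.valueOf(record.get("name"));
          })
          .from("my-project:my_dataset.my_table") // placeholder table reference
          .withCoder(StringUtf8Coder.of()));
}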
Example #1
Source File: BeamUtils.java    From nomulus with Apache License 2.0
/**
 * Checks that no expected fields in the record are missing.
 *
 * <p>Note that this simply makes sure the field is not null; it may still generate a parse error
 * when interpreting the string representation of an object.
 *
 * @throws IllegalStateException if the record returns null for any field in {@code fieldNames}
 */
public static void checkFieldsNotNull(
    ImmutableList<String> fieldNames, SchemaAndRecord schemaAndRecord) {
  GenericRecord record = schemaAndRecord.getRecord();
  ImmutableList<String> nullFields =
      fieldNames
          .stream()
          .filter(fieldName -> record.get(fieldName) == null)
          .collect(ImmutableList.toImmutableList());
  String missingFieldList = Joiner.on(", ").join(nullFields);
  if (!nullFields.isEmpty()) {
    throw new IllegalStateException(
        String.format(
            "Read unexpected null value for field(s) %s for record %s",
            missingFieldList, record));
  }
}
 
Example #2
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link BigQueryConverters.AvroToEntity} creates an Entity without a valid key when a
 * field is of type Record.
 */
@Test
public void testAvroToEntityRecordField() throws Exception {
  // Create test data
  TableFieldSchema column = generateNestedTableFieldSchema();
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(column);
  TableSchema bqSchema = new TableSchema().setFields(fields);
  Record record = generateNestedAvroRecord();
  SchemaAndRecord inputBqData = new SchemaAndRecord(record, bqSchema);
  // Run the test
  Entity outputEntity = converter.apply(inputBqData);
  // Assess results
  String expectedCauseMessage = String.format("Column [address] of type [RECORD] not supported.");
  assertTrue(!outputEntity.hasKey());
  assertEquals(
      expectedCauseMessage, outputEntity.getPropertiesMap().get("cause").getStringValue());
  assertEquals(record.toString(), outputEntity.getPropertiesMap().get("row").getStringValue());
}
 
Example #3
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link BigQueryConverters.AvroToEntity} creates an Entity with a default namespace
 * when the namespace is not specified.
 */
@Test
public void testAvroToEntityDefaultNamespace() throws Exception {
  // Create test data
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(new TableFieldSchema().setName(idField).setType("STRING"));
  fields.add(new TableFieldSchema().setName(shortStringField).setType("STRING"));
  TableSchema bqSchema = new TableSchema().setFields(fields);
  Schema avroSchema =
      new Schema.Parser()
          .parse(
              String.format(
                  avroSchemaTemplate,
                  new StringBuilder()
                      .append(String.format(avroFieldTemplate, idField, "int", idFieldDesc))
                      .append(",")
                      .append(generateShortStringField())
                      .toString()));
  GenericRecordBuilder builder = new GenericRecordBuilder(avroSchema);
  builder.set(idField, 1);
  builder.set(shortStringField, shortStringFieldValue);
  Record record = builder.build();
  SchemaAndRecord inputBqData = new SchemaAndRecord(record, bqSchema);
  // Run the test
  AvroToEntity noNamespaceConverter =
      AvroToEntity.newBuilder()
          .setEntityKind(entityKind)
          .setUniqueNameColumn(uniqueNameColumn)
          .build();
  Entity outputEntity = noNamespaceConverter.apply(inputBqData);
  // Assess results
  assertTrue(outputEntity.hasKey());
  assertEquals("", outputEntity.getKey().getPartitionId().getNamespaceId());
}
 
Example #4
Source File: BeamUtilsTest.java    From nomulus with Apache License 2.0
@Before
public void initializeRecord() {
  // Create a record with a given JSON schema.
  GenericRecord record = new GenericData.Record(new Schema.Parser().parse(GENERIC_SCHEMA));
  record.put("aString", "hello world");
  record.put("aFloat", 2.54);
  schemaAndRecord = new SchemaAndRecord(record, null);
}
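The GENERIC_SCHEMA constant parsed above is not included in this snippet. Given the two fields the test populates, a plausible reconstruction is an Avro record schema along these lines (the actual constant in nomulus may differ):

// Hypothetical reconstruction of GENERIC_SCHEMA; the real nomulus constant may differ.
private static final String GENERIC_SCHEMA =
    "{"
        + "\"type\": \"record\","
        + "\"name\": \"TestRecord\","
        + "\"fields\": ["
        + "  {\"name\": \"aString\", \"type\": \"string\"},"
        + "  {\"name\": \"aFloat\", \"type\": \"double\"}"
        + "]"
        + "}";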
 
Example #5
Source File: BillingEventTest.java    From nomulus with Apache License 2.0
@Test
public void test_nonNullPoNumber() {
  GenericRecord record = createRecord();
  record.put("poNumber", "905610");
  BillingEvent event = BillingEvent.parseFromRecord(new SchemaAndRecord(record, null));
  assertThat(event.poNumber()).isEqualTo("905610");
  InvoiceGroupingKey invoiceKey = event.getInvoiceGroupingKey();
  assertThat(invoiceKey.poNumber()).isEqualTo("905610");
}
 
Example #6
Source File: Subdomain.java    From nomulus with Apache License 2.0
/**
 * Constructs a {@link Subdomain} from an Apache Avro {@code SchemaAndRecord}.
 *
 * @see <a
 *     href=http://avro.apache.org/docs/1.7.7/api/java/org/apache/avro/generic/GenericData.Record.html>
 *     Apache AVRO GenericRecord</a>
 */
static Subdomain parseFromRecord(SchemaAndRecord schemaAndRecord) {
  checkFieldsNotNull(FIELD_NAMES, schemaAndRecord);
  GenericRecord record = schemaAndRecord.getRecord();
  return create(
      extractField(record, "fullyQualifiedDomainName"),
      extractField(record, "registrarClientId"),
      extractField(record, "registrarEmailAddress"));
}
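The extractField helper called here (and in Example #7 below) is not part of these snippets. Since GenericRecord.get returns Objects whose string form is parsed further downstream, a minimal sketch of such a helper could look like the following; the actual nomulus implementation may handle nulls or Avro Utf8 values differently:

// Hedged sketch of an extractField helper, not the actual nomulus implementation.
private static String extractField(GenericRecord record, String fieldName) {
  // GenericRecord.get returns Object (often org.apache.avro.util.Utf8 for strings),
  // so convert to a plain String before any further parsing.
  return String.valueOf(record.get(fieldName));
}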
 
Example #7
Source File: BillingEvent.java    From nomulus with Apache License 2.0
/**
 * Constructs a {@code BillingEvent} from a {@code SchemaAndRecord}.
 *
 * @see <a
 *     href=http://avro.apache.org/docs/1.7.7/api/java/org/apache/avro/generic/GenericData.Record.html>
 *     Apache AVRO GenericRecord</a>
 */
static BillingEvent parseFromRecord(SchemaAndRecord schemaAndRecord) {
  checkFieldsNotNull(FIELD_NAMES, schemaAndRecord);
  GenericRecord record = schemaAndRecord.getRecord();
  String flags = extractField(record, "flags");
  double amount = getDiscountedAmount(Double.parseDouble(extractField(record, "amount")), flags);
  return create(
      // We need to chain parsers off extractField because GenericRecord only returns
      // Objects, which contain a string representation of their underlying types.
      Long.parseLong(extractField(record, "id")),
      // Bigquery provides UNIX timestamps with microsecond precision.
      Instant.ofEpochMilli(Long.parseLong(extractField(record, "billingTime")) / 1000)
          .atZone(ZoneId.of("UTC")),
      Instant.ofEpochMilli(Long.parseLong(extractField(record, "eventTime")) / 1000)
          .atZone(ZoneId.of("UTC")),
      extractField(record, "registrarId"),
      extractField(record, "billingId"),
      extractField(record, "poNumber"),
      extractField(record, "tld"),
      extractField(record, "action"),
      extractField(record, "domain"),
      extractField(record, "repositoryId"),
      Integer.parseInt(extractField(record, "years")),
      extractField(record, "currency"),
      amount,
      flags);
}
 
Example #8
Source File: BigQueryHllSketchCompatibilityIT.java    From beam with Apache License 2.0
private void readSketchFromBigQuery(String tableId, Long expectedCount) {
  String tableSpec = String.format("%s.%s", DATASET_ID, tableId);
  String query =
      String.format(
          "SELECT HLL_COUNT.INIT(%s) AS %s FROM %s",
          DATA_FIELD_NAME, QUERY_RESULT_FIELD_NAME, tableSpec);

  SerializableFunction<SchemaAndRecord, byte[]> parseQueryResultToByteArray =
      input ->
          // BigQuery BYTES type corresponds to Java java.nio.ByteBuffer type
          HllCount.getSketchFromByteBuffer(
              (ByteBuffer) input.getRecord().get(QUERY_RESULT_FIELD_NAME));

  TestPipelineOptions options =
      TestPipeline.testingPipelineOptions().as(TestPipelineOptions.class);

  Pipeline p = Pipeline.create(options);
  PCollection<Long> result =
      p.apply(
              BigQueryIO.read(parseQueryResultToByteArray)
                  .fromQuery(query)
                  .usingStandardSql()
                  .withMethod(Method.DIRECT_READ)
                  .withCoder(ByteArrayCoder.of()))
          .apply(HllCount.MergePartial.globally()) // no-op, only for testing MergePartial
          .apply(HllCount.Extract.globally());
  PAssert.thatSingleton(result).isEqualTo(expectedCount);
  p.run().waitUntilFinish();
}
 
Example #9
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link BigQueryConverters.AvroToEntity} creates an Entity without a valid key when a
 * Timestamp field is invalid.
 */
@Test
public void testAvroToEntityInvalidTimestampField() throws Exception {
  // Create test data
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(new TableFieldSchema().setName(idField).setType("STRING"));
  fields.add(new TableFieldSchema().setName(invalidTimestampField).setType("TIMESTAMP"));
  TableSchema bqSchema = new TableSchema().setFields(fields);
  Schema avroSchema =
      new Schema.Parser()
          .parse(
              String.format(
                  avroSchemaTemplate,
                  new StringBuilder()
                      .append(String.format(avroFieldTemplate, idField, "string", idFieldDesc))
                      .append(",")
                      .append(
                          String.format(
                              avroFieldTemplate,
                              invalidTimestampField,
                              "long",
                              invalidTimestampFieldDesc))
                      .toString()));
  GenericRecordBuilder builder = new GenericRecordBuilder(avroSchema);
  builder.set(idField, idFieldValueStr);
  builder.set(invalidTimestampField, invalidTimestampFieldValueNanos);
  Record record = builder.build();
  SchemaAndRecord inputBqData = new SchemaAndRecord(record, bqSchema);
  // Run the test
  Entity outputEntity = converter.apply(inputBqData);
  // Assess results
  assertTrue(!outputEntity.hasKey());
  assertTrue(
      outputEntity
          .getPropertiesMap()
          .get("cause")
          .getStringValue()
          .startsWith("Timestamp is not valid"));
  assertEquals(record.toString(), outputEntity.getPropertiesMap().get("row").getStringValue());
}
 
Example #10
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link BigQueryConverters.AvroToEntity} creates an Entity with a valid key when the
 * unique name column is string.
 */
@Test
public void testAvroToEntityStringIdColumn() throws Exception {
  // Create test data
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(new TableFieldSchema().setName(idField).setType("STRING"));
  fields.add(new TableFieldSchema().setName(shortStringField).setType("STRING"));
  TableSchema bqSchema = new TableSchema().setFields(fields);
  Schema avroSchema =
      new Schema.Parser()
          .parse(
              String.format(
                  avroSchemaTemplate,
                  new StringBuilder()
                      .append(String.format(avroFieldTemplate, idField, "string", idFieldDesc))
                      .append(",")
                      .append(generateShortStringField())
                      .toString()));
  GenericRecordBuilder builder = new GenericRecordBuilder(avroSchema);
  builder.set(idField, idFieldValueStr);
  builder.set(shortStringField, shortStringFieldValue);
  Record record = builder.build();
  SchemaAndRecord inputBqData = new SchemaAndRecord(record, bqSchema);
  // Run the test
  Entity outputEntity = converter.apply(inputBqData);
  assertTrue(outputEntity.hasKey());
  assertEquals(idFieldValueStr, outputEntity.getKey().getPath(0).getName());
  validateMetadata(outputEntity);
}
 
Example #11
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link BigQueryConverters.AvroToEntity} creates an Entity with a valid key when the
 * unique name column is integer.
 */
@Test
public void testAvroToEntityIntegerIdColumn() throws Exception {
  // Create test data
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(new TableFieldSchema().setName(idField).setType("INTEGER"));
  fields.add(new TableFieldSchema().setName(shortStringField).setType("STRING"));
  TableSchema bqSchema = new TableSchema().setFields(fields);
  Schema avroSchema =
      new Schema.Parser()
          .parse(
              String.format(
                  avroSchemaTemplate,
                  new StringBuilder()
                      .append(String.format(avroFieldTemplate, idField, "int", idFieldDesc))
                      .append(",")
                      .append(generateShortStringField())
                      .toString()));
  GenericRecordBuilder builder = new GenericRecordBuilder(avroSchema);
  builder.set(idField, idFieldValueInt);
  builder.set(shortStringField, shortStringFieldValue);
  Record record = builder.build();
  SchemaAndRecord inputBqData = new SchemaAndRecord(record, bqSchema);
  // Run the test
  Entity outputEntity = converter.apply(inputBqData);
  assertTrue(outputEntity.hasKey());
  assertEquals(idFieldValueStr, outputEntity.getKey().getPath(0).getName());
  validateMetadata(outputEntity);
}
 
Example #12
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link BigQueryConverters.AvroToEntity} creates an Entity without a key when the
 * unique name column exceeds the maximum size allowed of 1500 bytes.
 */
@Test
public void testAvroToEntityTooLongIdColumn() throws Exception {
  // Create test data
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(new TableFieldSchema().setName(idField).setType("STRING"));
  fields.add(new TableFieldSchema().setName(shortStringField).setType("STRING"));
  TableSchema bqSchema = new TableSchema().setFields(fields);
  Schema avroSchema =
      new Schema.Parser()
          .parse(
              String.format(
                  avroSchemaTemplate,
                  new StringBuilder()
                      .append(String.format(avroFieldTemplate, idField, "string", idFieldDesc))
                      .append(",")
                      .append(generateShortStringField())
                      .toString()));
  GenericRecordBuilder builder = new GenericRecordBuilder(avroSchema);
  builder.set(idField, longStringFieldValue);
  builder.set(shortStringField, shortStringFieldValue);
  Record record = builder.build();
  SchemaAndRecord inputBqData = new SchemaAndRecord(record, bqSchema);
  // Run the test
  Entity outputEntity = converter.apply(inputBqData);
  assertTrue(!outputEntity.hasKey());
}
 
Example #13
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link BigQueryConverters.AvroToEntity} creates an Entity without a key when the
 * unique name column is null.
 */
@Test
public void testAvroToEntityNullIdColumn() throws Exception {
  // Create test data
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(new TableFieldSchema().setName(idField).setType("STRING"));
  fields.add(new TableFieldSchema().setName(shortStringField).setType("STRING"));
  TableSchema bqSchema = new TableSchema().setFields(fields);
  Schema avroSchema =
      new Schema.Parser()
          .parse(
              String.format(
                  avroSchemaTemplate,
                  new StringBuilder()
                      .append(String.format(avroFieldTemplate, idField, "null", idFieldDesc))
                      .append(",")
                      .append(generateShortStringField())
                      .toString()));
  GenericRecordBuilder builder = new GenericRecordBuilder(avroSchema);
  builder.set(idField, null);
  builder.set(shortStringField, shortStringFieldValue);
  Record record = builder.build();
  SchemaAndRecord inputBqData = new SchemaAndRecord(record, bqSchema);
  // Run the test
  Entity outputEntity = converter.apply(inputBqData);
  assertTrue(!outputEntity.hasKey());
}
 
Example #14
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link BigQueryConverters.AvroToEntity} creates an Entity without a key when the
 * unique name column is missing.
 */
@Test
public void testAvroToEntityNoIdColumn() throws Exception {
  // Create test data
  List<TableFieldSchema> fields = new ArrayList<>();
  fields.add(new TableFieldSchema().setName(shortStringField).setType("STRING"));
  TableSchema bqSchema = new TableSchema().setFields(fields);
  Record record =
      generateSingleFieldAvroRecord(
          shortStringField, "string", shortStringFieldDesc, shortStringFieldValue);
  SchemaAndRecord inputBqData = new SchemaAndRecord(record, bqSchema);
  // Run the test
  Entity outputEntity = converter.apply(inputBqData);
  assertTrue(!outputEntity.hasKey());
}
 
Example #15
Source File: BigQueryToTFRecordTest.java    From DataflowTemplates with Apache License 2.0
@Test
public void testBigQueryToTFRecordWithExeception() throws Exception {
  expectedEx.expect(RuntimeException.class);
  expectedEx.expectMessage("Unsupported type: BOLEAN");

  Long i1 = new Long(0);
  double f1 = 0.0d;
  String s1 = "";
  byte[] b1 = new byte[8];

  record.put("int1", i1);
  record.put("float1", f1);
  record.put("string1", s1);
  record.put("bytes1", b1);
  record.put("bool1", true);

  List<TableFieldSchema> fields = new ArrayList<TableFieldSchema>();
  fields.add(new TableFieldSchema().setName("int1").setType("INTEGER"));
  fields.add(new TableFieldSchema().setName("float1").setType("FLOAT"));
  fields.add(new TableFieldSchema().setName("string1").setType("STRING"));
  fields.add(new TableFieldSchema().setName("bytes1").setType("BYTES"));
  fields.add(new TableFieldSchema().setName("bool1").setType("BOLEAN"));
  final TableSchema tableSchema = new TableSchema();
  tableSchema.setFields(fields);
  final SchemaAndRecord schemaAndRecord = new SchemaAndRecord(record, tableSchema);

  byte[] gotBytes = record2Example(schemaAndRecord);
}
 
Example #16
Source File: BigQueryToTFRecord.java    From DataflowTemplates with Apache License 2.0
/**
 * The {@link BigQueryToTFRecord#record2Example(SchemaAndRecord)} method takes a
 * SchemaAndRecord object returned from a BigQueryIO.read() step and builds a TensorFlow Example
 * from the record.
 */
@VisibleForTesting
protected static byte[] record2Example(SchemaAndRecord schemaAndRecord) {
  Example.Builder example = Example.newBuilder();
  Features.Builder features = example.getFeaturesBuilder();
  GenericRecord record = schemaAndRecord.getRecord();
  for (TableFieldSchema field : schemaAndRecord.getTableSchema().getFields()) {
    Feature feature = buildFeature(record.get(field.getName()), field.getType());
    features.putFeature(field.getName(), feature);
  }
  return example.build().toByteArray();
}
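The buildFeature helper is not shown in this snippet. Judging from the expectations in Examples #15 and #18 (INTEGER maps to an int64 list, FLOAT to a float list, STRING and BYTES to byte lists, BOOLEAN to 0/1, and an unsupported type raises a RuntimeException), a rough sketch might look like the following; it is an approximation, not the actual DataflowTemplates code:

// Approximation of buildFeature inferred from the tests above; the real implementation
// may handle values such as ByteBuffer or Avro Utf8 differently.
// Assumes org.tensorflow.example.Feature and com.google.protobuf.ByteString.
private static Feature buildFeature(Object value, String bigQueryType) {
  Feature.Builder feature = Feature.newBuilder();
  switch (bigQueryType) {
    case "INTEGER":
      feature.getInt64ListBuilder().addValue((Long) value);
      break;
    case "FLOAT":
      feature.getFloatListBuilder().addValue(((Number) value).floatValue());
      break;
    case "STRING":
      feature.getBytesListBuilder().addValue(ByteString.copyFromUtf8(value.toString()));
      break;
    case "BYTES":
      feature.getBytesListBuilder().addValue(ByteString.copyFrom((byte[]) value));
      break;
    case "BOOLEAN":
      feature.getInt64ListBuilder().addValue(((Boolean) value) ? 1 : 0);
      break;
    default:
      throw new RuntimeException("Unsupported type: " + bigQueryType);
  }
  return feature.build();
}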
 
Example #17
Source File: Read.java    From gcp-ingestion with Mozilla Public License 2.0
@Override
public PCollection<PubsubMessage> expand(PBegin input) {
  BigQueryIO.TypedRead<PubsubMessage> read = BigQueryIO
      .read((SchemaAndRecord schemaAndRecord) -> {
        TableSchema tableSchema = schemaAndRecord.getTableSchema();
        GenericRecord record = schemaAndRecord.getRecord();

        // We have to take care not to read additional bytes; see
        // https://github.com/mozilla/gcp-ingestion/issues/1266
        ByteBuffer byteBuffer = (ByteBuffer) record.get(FieldName.PAYLOAD);
        byte[] payload = new byte[byteBuffer.limit()];
        byteBuffer.get(payload);

        // We populate attributes for all simple string and timestamp fields, which is complete
        // for raw and error tables.
        // Decoded payload tables also have a top-level nested "metadata" struct; we can mostly
        // just drop this since the same metadata object is encoded in the payload, but we do
        // parse out the document namespace, type, and version since those are necessary in the
        // case of a Sink job that doesn't look at the payload but still may need those
        // attributes in order to route to the correct destination.
        Map<String, String> attributes = new HashMap<>();
        tableSchema.getFields().stream() //
            .filter(f -> !"REPEATED".equals(f.getMode())) //
            .forEach(f -> {
              Object value = record.get(f.getName());
              if (value != null) {
                switch (f.getType()) {
                  case "TIMESTAMP":
                    attributes.put(f.getName(), Time.epochMicrosToTimestamp((Long) value));
                    break;
                  case "STRING":
                  case "INTEGER":
                  case "INT64":
                    attributes.put(f.getName(), value.toString());
                    break;
                  case "RECORD":
                  case "STRUCT":
                    // The only struct we support is the top-level nested "metadata" and we
                    // extract only the attributes needed for destination routing.
                    GenericRecord metadata = (GenericRecord) value;
                    Arrays
                        .asList(Attribute.DOCUMENT_NAMESPACE, Attribute.DOCUMENT_TYPE,
                            Attribute.DOCUMENT_VERSION)
                        .forEach(v -> attributes.put(v, metadata.get(v).toString()));
                    break;
                  // Ignore any other types (only the payload BYTES field should hit this).
                  default:
                    break;
                }
              }
            });
        return new PubsubMessage(payload, attributes);
      }) //
      .withCoder(PubsubMessageWithAttributesCoder.of()) //
      .withTemplateCompatibility() //
      .withoutValidation() //
      .withMethod(method.method);
  switch (source) {
    case TABLE:
      read = read.from(tableSpec);
      break;
    default:
    case QUERY:
      read = read.fromQuery(tableSpec).usingStandardSql();
  }
  if (source == Source.TABLE && method == BigQueryReadMethod.storageapi) {
    if (rowRestriction != null) {
      read = read.withRowRestriction(rowRestriction);
    }
    if (selectedFields != null) {
      read = read.withSelectedFields(selectedFields);
    }
  }
  return input.apply(read);
}
 
Example #18
Source File: BigQueryToTFRecordTest.java    From DataflowTemplates with Apache License 2.0
/** Test {@link BigQueryToTFRecord} correctly outputs TFRecord. */
@Test
public void record2ExampleTest() throws InvalidProtocolBufferException {

  Long i1 = new Long(0);
  double f1 = 0.0d;
  String s1 = "";
  byte[] b1 = new byte[8];

  record.put("int1", i1);
  record.put("float1", f1);
  record.put("string1", s1);
  record.put("bytes1", b1);
  record.put("bool1", true);

  List<TableFieldSchema> fields = new ArrayList<TableFieldSchema>();
  fields.add(new TableFieldSchema().setName("int1").setType("INTEGER"));
  fields.add(new TableFieldSchema().setName("float1").setType("FLOAT"));
  fields.add(new TableFieldSchema().setName("string1").setType("STRING"));
  fields.add(new TableFieldSchema().setName("bytes1").setType("BYTES"));
  fields.add(new TableFieldSchema().setName("bool1").setType("BOOLEAN"));
  final TableSchema tableSchema = new TableSchema();
  tableSchema.setFields(fields);
  final SchemaAndRecord schemaAndRecord = new SchemaAndRecord(record, tableSchema);

  Example.Builder example = Example.newBuilder();
  Features.Builder features = example.getFeaturesBuilder();
  Feature.Builder int1 = Feature.newBuilder();
  Feature.Builder float1 = Feature.newBuilder();
  Feature.Builder string1 = Feature.newBuilder();
  Feature.Builder bytes1 = Feature.newBuilder();
  Feature.Builder bool1 = Feature.newBuilder();

  int1.getInt64ListBuilder().addValue(i1);
  float1.getFloatListBuilder().addValue((float) f1);
  string1.getBytesListBuilder().addValue(ByteString.copyFromUtf8(s1));
  bytes1.getBytesListBuilder().addValue(ByteString.copyFrom(b1));
  bool1.getInt64ListBuilder().addValue(1);

  features.putFeature("int1", int1.build());
  features.putFeature("float1", float1.build());
  features.putFeature("string1", string1.build());
  features.putFeature("bytes1", bytes1.build());
  features.putFeature("bool1", bool1.build());

  byte[] gotBytes = record2Example(schemaAndRecord);
  Example gotExample = Example.parseFrom(gotBytes);

  Map<String, Feature> gotFeatures = gotExample.getFeatures().getFeatureMap();
  Feature[] got = new Feature[5];
  got[0] = gotFeatures.get("int1");
  got[1] = gotFeatures.get("float1");
  got[2] = gotFeatures.get("string1");
  got[3] = gotFeatures.get("bytes1");
  got[4] = gotFeatures.get("bool1");

  final Example wantExample = example.build();
  Map<String, Feature> wantFeatures = wantExample.getFeatures().getFeatureMap();
  Feature[] want = new Feature[5];
  want[0] = wantFeatures.get("int1");
  want[1] = wantFeatures.get("float1");
  want[2] = wantFeatures.get("string1");
  want[3] = wantFeatures.get("bytes1");
  want[4] = wantFeatures.get("bool1");

  for (int i = 0; i < 5; i++) {
    Assert.assertThat(got[i], equalTo(want[i]));
  }
}
 
Example #19
Source File: BillingEventTest.java    From nomulus with Apache License 2.0
@Before
public void initializeRecord() {
  // Create a record with a given JSON schema.
  schemaAndRecord = new SchemaAndRecord(createRecord(), null);
}
 
Example #20
Source File: BigQueryToParquet.java    From DataflowTemplates with Apache License 2.0
/**
 * Runs the pipeline with the supplied options.
 *
 * @param options The execution parameters to the pipeline.
 * @return The result of the pipeline execution.
 */
private static PipelineResult run(BigQueryToParquetOptions options) {

  // Create the pipeline.
  Pipeline pipeline = Pipeline.create(options);

  TableReadOptions.Builder builder = TableReadOptions.newBuilder();

  /* Add fields to filter export on, if any. */
  if (options.getFields() != null) {
    builder.addAllSelectedFields(Arrays.asList(options.getFields().split(",\\s*")));
  }

  TableReadOptions tableReadOptions = builder.build();
  BigQueryStorageClient client = BigQueryStorageClientFactory.create();
  ReadSession session =
      ReadSessionFactory.create(client, options.getTableRef(), tableReadOptions);

  // Extract schema from ReadSession
  Schema schema = getTableSchema(session);
  client.close();

  /*
   * Steps: 1) Read records from BigQuery via BigQueryIO.
   *        2) Write records to Google Cloud Storage in Parquet format.
   */
  pipeline
      /*
       * Step 1: Read records via BigQueryIO using supplied schema as a PCollection of
       *         {@link GenericRecord}.
       */
      .apply(
          "ReadFromBigQuery",
          BigQueryIO.read(SchemaAndRecord::getRecord)
              .from(options.getTableRef())
              .withTemplateCompatibility()
              .withMethod(Method.DIRECT_READ)
              .withCoder(AvroCoder.of(schema))
              .withReadOptions(tableReadOptions))
      /*
       * Step 2: Write records to Google Cloud Storage as one or more Parquet files
       *         via {@link ParquetIO}.
       */
      .apply(
          "WriteToParquet",
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(schema))
              .to(options.getBucket())
              .withNumShards(options.getNumShards())
              .withSuffix(FILE_SUFFIX));

  // Execute the pipeline and return the result.
  return pipeline.run();
}