org.apache.beam.sdk.coders.AvroCoder Java Examples
The following examples show how to use
org.apache.beam.sdk.coders.AvroCoder.
The source project, file, and license are noted above each example.
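Before the project examples, here is a minimal, self-contained sketch of the two most common ways to obtain an AvroCoder: from a Java class via AvroCoder.of(Class), and from an Avro Schema via AvroCoder.of(GenericRecord.class, schema). The WeatherReading class and the inline schema are illustrative placeholders, not taken from any of the projects below.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.DefaultCoder;

public class AvroCoderSketch {

  // Hypothetical POJO encoded via Avro reflection; @DefaultCoder lets Beam pick AvroCoder for it.
  @DefaultCoder(AvroCoder.class)
  static class WeatherReading {
    String station;
    double tempC;

    WeatherReading() {} // reflect-based records generally need a no-arg constructor

    WeatherReading(String station, double tempC) {
      this.station = station;
      this.tempC = tempC;
    }
  }

  public static void main(String[] args) throws Exception {
    // 1) Coder for a Java class; the Avro schema is inferred from the class.
    AvroCoder<WeatherReading> pojoCoder = AvroCoder.of(WeatherReading.class);

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    pojoCoder.encode(new WeatherReading("KSFO", 17.5), out);
    WeatherReading roundTripped =
        pojoCoder.decode(new ByteArrayInputStream(out.toByteArray()));
    System.out.println(roundTripped.station + " " + roundTripped.tempC);

    // 2) Coder for GenericRecord; an explicit Schema is required because
    //    GenericRecord itself carries no compile-time type information.
    Schema schema =
        new Schema.Parser()
            .parse(
                "{\"type\":\"record\",\"name\":\"Reading\",\"fields\":["
                    + "{\"name\":\"station\",\"type\":\"string\"},"
                    + "{\"name\":\"tempC\",\"type\":\"double\"}]}");
    AvroCoder<GenericRecord> genericCoder = AvroCoder.of(GenericRecord.class, schema);

    GenericRecord record = new GenericData.Record(schema);
    record.put("station", "KSFO");
    record.put("tempC", 17.5);

    ByteArrayOutputStream out2 = new ByteArrayOutputStream();
    genericCoder.encode(record, out2);
    System.out.println(genericCoder.getSchema()); // the schema the coder encodes with
  }
}

In a pipeline, the same coders are typically attached with PCollection#setCoder or Create#withCoder, as the examples below show.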
Example #1
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testSchemaStringIsInterned() throws Exception {
  List<Bird> birds = createRandomRecords(100);
  String filename =
      generateTestFile(
          "tmp.avro",
          birds,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
  Metadata fileMetadata = FileSystems.matchSingleFileSpec(filename);
  String schema = AvroSource.readMetadataFromFile(fileMetadata.resourceId()).getSchemaString();
  // Add "" to the schema to make sure it is not interned.
  AvroSource<GenericRecord> sourceA = AvroSource.from(filename).withSchema("" + schema);
  AvroSource<GenericRecord> sourceB = AvroSource.from(filename).withSchema("" + schema);
  assertSame(sourceA.getReaderSchemaString(), sourceB.getReaderSchemaString());

  // Ensure that deserialization still goes through interning
  AvroSource<GenericRecord> sourceC = SerializableUtils.clone(sourceB);
  assertSame(sourceA.getReaderSchemaString(), sourceC.getReaderSchemaString());
}
Example #2
Source File: HadoopFormatIOReadTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * This test validates behavior of HadoopInputFormatSource if {@link
 * InputFormat#createRecordReader(InputSplit, TaskAttemptContext)} of InputFormat returns null.
 */
@Test
public void testReadWithNullCreateRecordReader() throws Exception {
  InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
  thrown.expect(IOException.class);
  thrown.expectMessage(
      String.format("Null RecordReader object returned by %s", mockInputFormat.getClass()));
  Mockito.when(
          mockInputFormat.createRecordReader(
              Mockito.any(InputSplit.class), Mockito.any(TaskAttemptContext.class)))
      .thenReturn(null);
  HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
      new HadoopInputFormatBoundedSource<>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          new SerializableSplit());
  boundedSource.setInputFormatObj(mockInputFormat);
  SourceTestUtils.readFromSource(boundedSource, p.getOptions());
}
Example #3
Source File: WriteToGCSParquet.java From DataflowTemplates with Apache License 2.0 | 6 votes |
@Override
public WriteFilesResult<Void> expand(PCollection<KV<String, String>> kafkaRecords) {
  return kafkaRecords
      /*
       * Converting KV<String, String> records to GenericRecord using DoFn and {@link
       * KeyValueToGenericRecordFn} class.
       */
      .apply("Create GenericRecord(s)", ParDo.of(new KeyValueToGenericRecordFn()))
      .setCoder(AvroCoder.of(GenericRecord.class, KeyValueToGenericRecordFn.SCHEMA))
      /*
       * Writing as parquet file using {@link FileIO} and {@link ParquetIO}.
       *
       * The {@link WindowedFilenamePolicy} class specifies the file path for writing the file.
       * The {@link withNumShards} option specifies the number of shards passed by the user.
       */
      .apply(
          "Writing as Parquet",
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(KeyValueToGenericRecordFn.SCHEMA))
              .to(outputDirectory())
              .withPrefix(outputFilenamePrefix())
              .withSuffix(
                  WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.PARQUET))
              .withNumShards(numShards()));
}
Example #4
Source File: ConfluentSchemaRegistryDeserializerProviderTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testGetCoder() {
  String schemaRegistryUrl = "mock://my-scope-name";
  String subject = "mytopic";
  SchemaRegistryClient mockRegistryClient = mockSchemaRegistryClient(schemaRegistryUrl, subject);
  CoderRegistry coderRegistry = CoderRegistry.createDefault();

  AvroCoder coderV0 =
      (AvroCoder)
          mockDeserializerProvider(schemaRegistryUrl, subject, null).getCoder(coderRegistry);
  assertEquals(AVRO_SCHEMA, coderV0.getSchema());

  try {
    Integer version = mockRegistryClient.register(subject, AVRO_SCHEMA_V1);
    AvroCoder coderV1 =
        (AvroCoder)
            mockDeserializerProvider(schemaRegistryUrl, subject, version).getCoder(coderRegistry);
    assertEquals(AVRO_SCHEMA_V1, coderV1.getSchema());
  } catch (IOException | RestClientException e) {
    throw new RuntimeException("Unable to register schema for subject: " + subject, e);
  }
}
Example #5
Source File: HadoopFormatIOReadTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * This test verifies that the method {@link
 * HadoopInputFormatBoundedSource.HadoopInputFormatReader#getCurrentSource() getCurrentSource()}
 * returns correct source object.
 */
@Test
public void testGetCurrentSourceFunction() throws Exception {
  SerializableSplit split = new SerializableSplit();
  BoundedSource<KV<Text, Employee>> source =
      new HadoopInputFormatBoundedSource<>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          split);
  BoundedReader<KV<Text, Employee>> hifReader = source.createReader(p.getOptions());
  BoundedSource<KV<Text, Employee>> hifSource = hifReader.getCurrentSource();
  assertEquals(hifSource, source);
}
Example #6
Source File: PubsubIOTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testAvroGenericRecords() {
  AvroCoder<GenericRecord> coder = AvroCoder.of(GenericRecord.class, SCHEMA);
  List<GenericRecord> inputs =
      ImmutableList.of(
          new AvroGeneratedUser("Bob", 256, null),
          new AvroGeneratedUser("Alice", 128, null),
          new AvroGeneratedUser("Ted", null, "white"));
  setupTestClient(inputs, coder);
  PCollection<GenericRecord> read =
      readPipeline.apply(
          PubsubIO.readAvroGenericRecords(SCHEMA)
              .fromSubscription(SUBSCRIPTION.getPath())
              .withClock(CLOCK)
              .withClientFactory(clientFactory));
  PAssert.that(read).containsInAnyOrder(inputs);
  readPipeline.run();
}
Example #7
Source File: WriteToGCSAvro.java From DataflowTemplates with Apache License 2.0 | 6 votes |
@Override
public PDone expand(PCollection<KV<String, String>> kafkaRecords) {
  return kafkaRecords
      /*
       * Converting KV<String, String> records to GenericRecord using DoFn and {@link
       * KeyValueToGenericRecordFn} class.
       */
      .apply("Create GenericRecord(s)", ParDo.of(new KeyValueToGenericRecordFn()))
      .setCoder(AvroCoder.of(GenericRecord.class, KeyValueToGenericRecordFn.SCHEMA))
      /*
       * Writing as avro file using {@link AvroIO}.
       *
       * The {@link WindowedFilenamePolicy} class specifies the file path for writing the file.
       * The {@link withNumShards} option specifies the number of shards passed by the user.
       * The {@link withTempDirectory} option sets the base directory used to generate temporary files.
       */
      .apply(
          "Writing as Avro",
          AvroIO.writeGenericRecords(KeyValueToGenericRecordFn.SCHEMA)
              .to(
                  new WindowedFilenamePolicy(
                      outputDirectory(),
                      outputFilenamePrefix(),
                      WriteToGCSUtility.SHARD_TEMPLATE,
                      WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.AVRO)))
              .withTempDirectory(
                  FileBasedSink.convertToFileResourceIfPossible(tempLocation())
                      .getCurrentDirectory())
              .withWindowedWrites()
              .withNumShards(numShards()));
}
Example #8
Source File: HadoopFormatIOReadTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * This test validates records emitted in PCollection are immutable if InputFormat's {@link
 * org.apache.hadoop.mapreduce.RecordReader RecordReader} returns different objects (i.e.
 * different locations in memory).
 */
@Test
public void testImmutablityOfOutputOfReadIfRecordReaderObjectsAreImmutable() throws Exception {
  List<BoundedSource<KV<Text, Employee>>> boundedSourceList =
      getBoundedSourceList(
          EmployeeInputFormat.class,
          Text.class,
          Employee.class,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class));
  List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
  for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
    List<KV<Text, Employee>> elems = SourceTestUtils.readFromSource(source, p.getOptions());
    bundleRecords.addAll(elems);
  }
  List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
  assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
}
Example #9
Source File: CsvToAvro.java From java-docs-samples with Apache License 2.0 | 6 votes |
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data", AvroIO.writeGenericRecords(schemaJson)
          .to(options.getOutput()).withCodec(CodecFactory.snappyCodec()).withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
Example #10
Source File: CsvConvertersTest.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/**
 * Tests {@link CsvConverters.StringToGenericRecordFn} throws an exception if incorrect header
 * information is provided (e.g. if a Csv file containing headers is passed and hasHeaders is
 * set to false).
 */
@Test(expected = RuntimeException.class)
public void testIncorrectHeaderInformation() {
  Schema schema = SchemaUtils.getAvroSchema(TEST_AVRO_SCHEMA_PATH);
  pipeline
      .apply(
          "TestIncorrectHeaderInformation",
          CsvConverters.ReadCsv.newBuilder()
              .setInputFileSpec(HEADER_CSV_FILE_PATH)
              .setHasHeaders(false)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setCsvFormat("Default")
              .setDelimiter(",")
              .build())
      .get(CSV_LINES)
      .apply(
          "ConvertStringToGenericRecord",
          ParDo.of(new CsvConverters.StringToGenericRecordFn(TEST_AVRO_SCHEMA_PATH, ",")))
      .setCoder(AvroCoder.of(GenericRecord.class, schema));

  pipeline.run();
}
Example #11
Source File: CsvConvertersTest.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/**
 * Test if {@link CsvConverters.StringToGenericRecordFn} throws an exception if the number of Csv
 * headers is less than the number of fields in Avro schema.
 */
@Test(expected = RuntimeException.class)
public void testIncorrectFieldSize() {
  Schema schema = SchemaUtils.getAvroSchema(TEST_AVRO_SCHEMA_TWO_PATH);
  pipeline
      .apply(
          "TestIncorrectFieldSize",
          CsvConverters.ReadCsv.newBuilder()
              .setInputFileSpec(HEADER_CSV_FILE_PATH)
              .setHasHeaders(true)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setCsvFormat("Default")
              .setDelimiter(",")
              .build())
      .get(CSV_LINES)
      .apply(
          "ConvertStringToGenericRecord",
          ParDo.of(new CsvConverters.StringToGenericRecordFn(TEST_AVRO_SCHEMA_TWO_PATH, ",")))
      .setCoder(AvroCoder.of(GenericRecord.class, schema));

  pipeline.run();
}
Example #12
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testCreateFromMetadata() throws Exception {
  List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
  String codec = DataFileConstants.NULL_CODEC;
  String filename =
      generateTestFile(
          codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);
  Metadata fileMeta = FileSystems.matchSingleFileSpec(filename);

  AvroSource<GenericRecord> source = AvroSource.from(fileMeta);
  AvroSource<Bird> sourceWithSchema = source.withSchema(Bird.class);
  AvroSource<Bird> sourceWithSchemaWithMinBundleSize = sourceWithSchema.withMinBundleSize(1234);

  assertEquals(FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, source.getMode());
  assertEquals(FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, sourceWithSchema.getMode());
  assertEquals(
      FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, sourceWithSchemaWithMinBundleSize.getMode());
}
Example #13
Source File: LazyAvroCoder.java From components with Apache License 2.0 | 6 votes |
@Override
public void encode(Object value, OutputStream outputStream) throws IOException {
  if (converter == null) {
    converter = ConvertToIndexedRecord.getConverter((T) value);
  }
  IndexedRecord ir = converter.convertToAvro((T) value);
  if (internalAvroCoder == null) {
    Schema s = converter.getSchema();
    avroSchemaHolder.put(s);
    @SuppressWarnings("unchecked")
    AvroCoder<IndexedRecord> tCoder =
        (AvroCoder<IndexedRecord>)
            (AvroCoder<? extends IndexedRecord>) AvroCoder.of(ir.getSchema());
    internalAvroCoder = tCoder;
  }
  LOG.debug("Internal AvroCoder's schema is {}", internalAvroCoder.getSchema());
  LOG.debug("Encode value is {}", value);
  internalAvroCoder.encode(convertToAvro(value), outputStream);
}
Example #14
Source File: ParquetIOTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testWriteAndReadUsingReflectDataSchemaWithDataModel() {
  Schema testRecordSchema = ReflectData.get().getSchema(TestRecord.class);

  List<GenericRecord> records = generateGenericRecords(1000);
  mainPipeline
      .apply(Create.of(records).withCoder(AvroCoder.of(testRecordSchema)))
      .apply(
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(testRecordSchema))
              .to(temporaryFolder.getRoot().getAbsolutePath()));
  mainPipeline.run().waitUntilFinish();

  PCollection<GenericRecord> readBack =
      readPipeline.apply(
          ParquetIO.read(testRecordSchema)
              .withAvroDataModel(GenericData.get())
              .from(temporaryFolder.getRoot().getAbsolutePath() + "/*"));

  PAssert.that(readBack).containsInAnyOrder(records);
  readPipeline.run().waitUntilFinish();
}
Example #15
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testCreationWithSchema() throws Exception {
  List<Bird> expected = createRandomRecords(100);
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);

  // Create a source with a schema object
  Schema schema = ReflectData.get().getSchema(Bird.class);
  AvroSource<GenericRecord> source = AvroSource.from(filename).withSchema(schema);
  List<GenericRecord> records = SourceTestUtils.readFromSource(source, null);
  assertEqualsWithGeneric(expected, records);

  // Create a source with a JSON schema
  String schemaString = ReflectData.get().getSchema(Bird.class).toString();
  source = AvroSource.from(filename).withSchema(schemaString);
  records = SourceTestUtils.readFromSource(source, null);
  assertEqualsWithGeneric(expected, records);
}
Example #16
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testMultipleFiles() throws Exception {
  String baseName = "tmp-";
  List<Bird> expected = new ArrayList<>();
  for (int i = 0; i < 10; i++) {
    List<Bird> contents = createRandomRecords(DEFAULT_RECORD_COUNT / 10);
    expected.addAll(contents);
    generateTestFile(
        baseName + i,
        contents,
        SyncBehavior.SYNC_DEFAULT,
        0,
        AvroCoder.of(Bird.class),
        DataFileConstants.NULL_CODEC);
  }
  AvroSource<Bird> source =
      AvroSource.from(new File(tmpFolder.getRoot().toString(), baseName + "*").toString())
          .withSchema(Bird.class);
  List<Bird> actual = SourceTestUtils.readFromSource(source, null);
  assertThat(actual, containsInAnyOrder(expected.toArray()));
}
Example #17
Source File: PubsubIOTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testAvroSpecificRecord() {
  AvroCoder<AvroGeneratedUser> coder = AvroCoder.of(AvroGeneratedUser.class);
  List<AvroGeneratedUser> inputs =
      ImmutableList.of(
          new AvroGeneratedUser("Bob", 256, null),
          new AvroGeneratedUser("Alice", 128, null),
          new AvroGeneratedUser("Ted", null, "white"));
  setupTestClient(inputs, coder);
  PCollection<AvroGeneratedUser> read =
      readPipeline.apply(
          PubsubIO.readAvrosWithBeamSchema(AvroGeneratedUser.class)
              .fromSubscription(SUBSCRIPTION.getPath())
              .withClock(CLOCK)
              .withClientFactory(clientFactory));
  PAssert.that(read).containsInAnyOrder(inputs);
  readPipeline.run();
}
Example #18
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testGetCurrentFromUnstartedReader() throws Exception {
  List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_DEFAULT,
          1000,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BlockBasedSource.BlockBasedReader<FixedRecord> reader =
      (BlockBasedSource.BlockBasedReader<FixedRecord>) source.createReader(null)) {
    assertEquals(null, reader.getCurrentBlock());

    expectedException.expect(NoSuchElementException.class);
    expectedException.expectMessage("No block has been successfully read from");
    reader.getCurrent();
  }
}
Example #19
Source File: AvroSourceTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testGetProgressFromUnstartedReader() throws Exception {
  List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_DEFAULT,
          1000,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);
  File file = new File(filename);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BoundedSource.BoundedReader<FixedRecord> reader = source.createReader(null)) {
    assertEquals(Double.valueOf(0.0), reader.getFractionConsumed());
  }

  List<? extends BoundedSource<FixedRecord>> splits = source.split(file.length() / 3, null);
  for (BoundedSource<FixedRecord> subSource : splits) {
    try (BoundedSource.BoundedReader<FixedRecord> reader = subSource.createReader(null)) {
      assertEquals(Double.valueOf(0.0), reader.getFractionConsumed());
    }
  }
}
Example #20
Source File: SnowflakeIOReadTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testConfigContainsFromQueryAndFromTable() {
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage("fromTable() and fromQuery() are not allowed together");

  pipeline.apply(
      SnowflakeIO.<GenericRecord>read(snowflakeService)
          .withDataSourceConfiguration(dataSourceConfiguration)
          .fromQuery("")
          .fromTable(FAKE_TABLE)
          .withStagingBucketName(options.getStagingBucketName())
          .withStorageIntegrationName(options.getStorageIntegrationName())
          .withCsvMapper(getCsvMapper())
          .withCoder(AvroCoder.of(AvroGeneratedUser.getClassSchema())));

  pipeline.run();
}
Example #21
Source File: HadoopFormatIOReadTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * This test validates behavior of {@link
 * HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} when Hadoop
 * InputFormat's {@link InputFormat#getSplits(JobContext)} returns empty list.
 */
@Test
public void testComputeSplitsIfGetSplitsReturnsEmptyList() throws Exception {
  InputFormat<?, ?> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
  SerializableSplit mockInputSplit = Mockito.mock(SerializableSplit.class);
  Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class)))
      .thenReturn(new ArrayList<>());
  HadoopInputFormatBoundedSource<Text, Employee> hifSource =
      new HadoopInputFormatBoundedSource<>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          mockInputSplit);
  thrown.expect(IOException.class);
  thrown.expectMessage("Error in computing splits, getSplits() returns a empty list");
  hifSource.setInputFormatObj(mockInputFormat);
  hifSource.computeSplitsIfNecessary();
}
Example #22
Source File: PubsubIO.java From beam with Apache License 2.0 | 6 votes |
/**
 * Returns a {@link PTransform} that continuously reads binary encoded Avro messages of the
 * specific type.
 *
 * <p>Beam will infer a Beam schema from the Avro schema. This allows the output to be used by
 * SQL and by the schema-transform library.
 */
@Experimental(Kind.SCHEMAS)
public static <T> Read<T> readAvrosWithBeamSchema(Class<T> clazz) {
  if (clazz.equals(GenericRecord.class)) {
    throw new IllegalArgumentException("For GenericRecord, please call readAvroGenericRecords");
  }
  org.apache.avro.Schema avroSchema = ReflectData.get().getSchema(clazz);
  AvroCoder<T> coder = AvroCoder.of(clazz);
  Schema schema = AvroUtils.getSchema(clazz, null);
  return Read.newBuilder(parsePayloadUsingCoder(coder))
      .setCoder(
          SchemaCoder.of(
              schema,
              TypeDescriptor.of(clazz),
              AvroUtils.getToRowFunction(clazz, avroSchema),
              AvroUtils.getFromRowFunction(clazz)))
      .build();
}
Example #23
Source File: HadoopFormatIOReadTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * This test validates behavior of {@link HadoopInputFormatBoundedSource} if RecordReader object
 * creation fails.
 */
@Test
public void testReadIfCreateRecordReaderFails() throws Exception {
  thrown.expect(Exception.class);
  thrown.expectMessage("Exception in creating RecordReader");
  InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
  Mockito.when(
          mockInputFormat.createRecordReader(
              Mockito.any(InputSplit.class), Mockito.any(TaskAttemptContext.class)))
      .thenThrow(new IOException("Exception in creating RecordReader"));
  HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
      new HadoopInputFormatBoundedSource<>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          new SerializableSplit());
  boundedSource.setInputFormatObj(mockInputFormat);
  SourceTestUtils.readFromSource(boundedSource, p.getOptions());
}
Example #24
Source File: HadoopFormatIOReadTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Test reading if InputFormat implements {@link org.apache.hadoop.conf.Configurable
 * Configurable}.
 */
@Test
public void testReadingWithConfigurableInputFormat() throws Exception {
  List<BoundedSource<KV<Text, Employee>>> boundedSourceList =
      getBoundedSourceList(
          ConfigurableEmployeeInputFormat.class,
          Text.class,
          Employee.class,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class));
  for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
    // Cast to HadoopInputFormatBoundedSource to access getInputFormat().
    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
        (HadoopInputFormatBoundedSource<Text, Employee>) source;
    hifSource.createInputFormatInstance();
    ConfigurableEmployeeInputFormat inputFormatObj =
        (ConfigurableEmployeeInputFormat) hifSource.getInputFormat();
    assertTrue(inputFormatObj.isConfSet);
  }
}
Example #25
Source File: ApproximateDistinctTest.java From beam with Apache License 2.0 | 5 votes |
@Test
public void customObject() {
  final int cardinality = 500;
  final int p = 15;
  final double expectedErr = 1.04 / Math.sqrt(p);
  Schema schema =
      SchemaBuilder.record("User")
          .fields()
          .requiredString("Pseudo")
          .requiredInt("Age")
          .endRecord();
  List<GenericRecord> users = new ArrayList<>();
  for (int i = 1; i <= cardinality; i++) {
    GenericData.Record newRecord = new GenericData.Record(schema);
    newRecord.put("Pseudo", "User" + i);
    newRecord.put("Age", i);
    users.add(newRecord);
  }
  PCollection<Long> results =
      tp.apply("Create stream", Create.of(users).withCoder(AvroCoder.of(schema)))
          .apply(
              "Test custom object",
              ApproximateDistinct.<GenericRecord>globally().withPrecision(p));
  PAssert.that("Verify Accuracy for custom object", results)
      .satisfies(new VerifyAccuracy(cardinality, expectedErr));
  tp.run();
}
Example #26
Source File: FixedInputRuntime.java From components with Apache License 2.0 | 5 votes |
@Override
public PCollection<IndexedRecord> expand(PBegin begin) {
  FixedDatasetRuntime runtime = new FixedDatasetRuntime();
  runtime.initialize(null, properties.getDatasetProperties());

  // The values to include in the PCollection
  List<IndexedRecord> values = new LinkedList<>();

  if (properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.NONE
      || properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.APPEND) {
    if (!properties.getDatasetProperties().values.getValue().trim().isEmpty()) {
      values.addAll(runtime.getValues(Integer.MAX_VALUE));
    }
  }

  if (properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.APPEND
      || properties.overrideValuesAction.getValue() == FixedInputProperties.OverrideValuesAction.REPLACE) {
    properties.getDatasetProperties().values.setValue(properties.overrideValues.getValue());
    if (!properties.getDatasetProperties().values.getValue().trim().isEmpty()) {
      values.addAll(runtime.getValues(Integer.MAX_VALUE));
    }
  }

  if (values.size() != 0) {
    PCollection<IndexedRecord> out =
        (PCollection<IndexedRecord>) begin
            .apply(Create.of(values).withCoder((AvroCoder) AvroCoder.of(runtime.getSchema())));
    if (properties.repeat.getValue() > 1) {
      PCollectionList<IndexedRecord> merged = PCollectionList.of(out);
      for (int i = 2; i < properties.repeat.getValue(); i++)
        merged = merged.and(out);
      out = merged.apply(Flatten.<IndexedRecord> pCollections());
    }
    return out;
  } else {
    return begin.apply(RowGeneratorIO.read().withSchema(runtime.getSchema()) //
        .withSeed(0L) //
        .withPartitions(1) //
        .withRows(properties.repeat.getValue()));
  }
}
Example #27
Source File: GenericRecordToRowTest.java From beam with Apache License 2.0 | 5 votes |
@Test
public void testConvertsGenericRecordToRow() {
  String schemaString =
      "{\"namespace\": \"example.avro\",\n"
          + " \"type\": \"record\",\n"
          + " \"name\": \"User\",\n"
          + " \"fields\": [\n"
          + "     {\"name\": \"name\", \"type\": \"string\"},\n"
          + "     {\"name\": \"favorite_number\", \"type\": \"int\"},\n"
          + "     {\"name\": \"favorite_color\", \"type\": \"string\"},\n"
          + "     {\"name\": \"price\", \"type\": \"double\"}\n"
          + " ]\n"
          + "}";
  Schema schema = (new Schema.Parser()).parse(schemaString);

  GenericRecord before = new GenericData.Record(schema);
  before.put("name", "Bob");
  before.put("favorite_number", 256);
  before.put("favorite_color", "red");
  before.put("price", 2.4);

  AvroCoder<GenericRecord> coder = AvroCoder.of(schema);

  PCollection<Row> rows =
      pipeline
          .apply("create PCollection<GenericRecord>", Create.of(before).withCoder(coder))
          .apply(
              "convert",
              GenericRecordReadConverter.builder().beamSchema(payloadSchema).build());

  PAssert.that(rows)
      .containsInAnyOrder(
          Row.withSchema(payloadSchema).addValues("Bob", 256, "red", 2.4).build());
  pipeline.run();
}
Example #28
Source File: AvroHdfsFileSource.java From components with Apache License 2.0 | 5 votes |
@Override
protected KV<AvroKey, NullWritable> nextPair() throws IOException, InterruptedException {
  // Not only is the AvroKey reused by the file format, but the underlying GenericRecord is as well.
  KV<AvroKey, NullWritable> kv = super.nextPair();
  GenericRecord gr = (GenericRecord) kv.getKey().datum();
  gr = CoderUtils.clone(AvroCoder.of(gr.getSchema()), gr);
  return KV.of(new AvroKey(gr), kv.getValue());
}
Example #29
Source File: NexmarkUtils.java From beam with Apache License 2.0 | 5 votes |
/** Setup pipeline with coders and some other options. */
public static void setupPipeline(CoderStrategy coderStrategy, Pipeline p) {
  CoderRegistry registry = p.getCoderRegistry();
  switch (coderStrategy) {
    case HAND:
      registry.registerCoderForClass(Auction.class, Auction.CODER);
      registry.registerCoderForClass(AuctionBid.class, AuctionBid.CODER);
      registry.registerCoderForClass(AuctionCount.class, AuctionCount.CODER);
      registry.registerCoderForClass(AuctionPrice.class, AuctionPrice.CODER);
      registry.registerCoderForClass(Bid.class, Bid.CODER);
      registry.registerCoderForClass(CategoryPrice.class, CategoryPrice.CODER);
      registry.registerCoderForClass(Event.class, Event.CODER);
      registry.registerCoderForClass(IdNameReserve.class, IdNameReserve.CODER);
      registry.registerCoderForClass(NameCityStateId.class, NameCityStateId.CODER);
      registry.registerCoderForClass(Person.class, Person.CODER);
      registry.registerCoderForClass(SellerPrice.class, SellerPrice.CODER);
      registry.registerCoderForClass(Done.class, Done.CODER);
      registry.registerCoderForClass(BidsPerSession.class, BidsPerSession.CODER);
      break;
    case AVRO:
      registry.registerCoderProvider(AvroCoder.getCoderProvider());
      break;
    case JAVA:
      registry.registerCoderProvider(SerializableCoder.getCoderProvider());
      break;
  }
}
Example #30
Source File: AvroIO.java From beam with Apache License 2.0 | 5 votes |
@Override
public PCollection<T> expand(PCollection<FileIO.ReadableFile> input) {
  checkNotNull(getSchema(), "schema");
  PCollection<T> read =
      input.apply(
          "Read all via FileBasedSource",
          new ReadAllViaFileBasedSource<>(
              getDesiredBundleSizeBytes(),
              new CreateSourceFn<>(
                  getRecordClass(), getSchema().toString(), getDatumReaderFactory()),
              AvroCoder.of(getRecordClass(), getSchema())));
  return getInferBeamSchema() ? setBeamSchema(read, getRecordClass(), getSchema()) : read;
}