org.apache.beam.sdk.transforms.Values Java Examples
The following examples show how to use org.apache.beam.sdk.transforms.Values.
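For orientation before the project examples: Values.create() turns a PCollection<KV<K, V>> into a PCollection<V> by dropping the keys. The following is a minimal, self-contained sketch of the transform in isolation; the class and test names are illustrative and not taken from any of the projects below.

import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.junit.Rule;
import org.junit.Test;

public class ValuesSketchTest {

  @Rule public final transient TestPipeline p = TestPipeline.create();

  @Test
  public void dropsKeysAndKeepsValues() {
    // Start from a keyed collection; the coder is given explicitly for clarity.
    PCollection<KV<String, Integer>> keyed =
        p.apply(
            Create.of(KV.of("a", 1), KV.of("b", 2), KV.of("c", 3))
                .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())));

    // Values.create() discards the keys, leaving only the values.
    PCollection<Integer> values = keyed.apply(Values.create());

    PAssert.that(values).containsInAnyOrder(1, 2, 3);
    p.run().waitUntilFinish();
  }
}

The examples that follow show the same transform used inside real pipelines, typically right after a GroupByKey, Reshuffle, or keyed IO read.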
Example #1
Source File: CreateStreamTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testElementsAtAlmostPositiveInfinity() throws IOException {
  Instant endOfGlobalWindow = GlobalWindow.INSTANCE.maxTimestamp();
  CreateStream<String> source =
      CreateStream.of(StringUtf8Coder.of(), batchDuration())
          .nextBatch(
              TimestampedValue.of("foo", endOfGlobalWindow),
              TimestampedValue.of("bar", endOfGlobalWindow))
          .advanceNextBatchWatermarkToInfinity();

  FixedWindows windows = FixedWindows.of(Duration.standardHours(6));
  PCollection<String> windowedValues =
      p.apply(source)
          .apply(Window.into(windows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(windowedValues)
      .inWindow(windows.assignWindow(GlobalWindow.INSTANCE.maxTimestamp()))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
Example #2
Source File: BigQueryMerger.java From DataflowTemplates with Apache License 2.0 | 6 votes |
@Override
public PCollection<Void> expand(PCollection<MergeInfo> input) {
  final MergeStatementBuilder mergeBuilder = new MergeStatementBuilder(mergeConfiguration);
  return input
      .apply(
          MapElements.into(
                  TypeDescriptors.kvs(
                      TypeDescriptors.strings(), TypeDescriptor.of(MergeInfo.class)))
              .via(mergeInfo -> KV.of(mergeInfo.getReplicaTable(), mergeInfo)))
      .apply(new TriggerPerKeyOnFixedIntervals<String, MergeInfo>(windowDuration))
      .apply(Values.create())
      .apply(
          MapElements.into(TypeDescriptors.strings())
              .via(mergeInfo -> {
                return mergeBuilder.buildMergeStatement(
                    mergeInfo.getReplicaTable(),
                    mergeInfo.getStagingTable(),
                    mergeInfo.getAllPkFields(),
                    mergeInfo.getAllFields());
              }))
      .apply(ParDo.of(new BigQueryStatementIssuingFn(this.testBigQueryClient)))
      .apply(
          MapElements.into(TypeDescriptors.voids())
              .via(
                  whatever ->
                      (Void) null)); // TODO(pabloem) Remove this line and find a return type
}
Example #3
Source File: ApproximateDistinctTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void perKey() {
  final int cardinality = 1000;
  final int p = 15;
  final double expectedErr = 1.04 / Math.sqrt(p);

  List<Integer> stream = new ArrayList<>();
  for (int i = 1; i <= cardinality; i++) {
    stream.addAll(Collections.nCopies(2, i));
  }
  Collections.shuffle(stream);

  PCollection<Long> results =
      tp.apply("per key stream", Create.of(stream))
          .apply("create keys", WithKeys.of(1))
          .apply(
              "per key cardinality",
              ApproximateDistinct.<Integer, Integer>perKey().withPrecision(p))
          .apply("extract values", Values.create());

  PAssert.that("Verify Accuracy for cardinality per key", results)
      .satisfies(new VerifyAccuracy(cardinality, expectedErr));

  tp.run();
}
Example #4
Source File: TestExpansionService.java From beam with Apache License 2.0 | 6 votes |
@Override
public PTransform<PCollection<GenericRecord>, PCollection<String>> buildExternal(
    StringConfiguration configuration) {
  return new PTransform<PCollection<GenericRecord>, PCollection<String>>() {
    @Override
    public PCollection<String> expand(PCollection<GenericRecord> input) {
      return input
          .apply(
              FileIO.<GenericRecord>write()
                  .via(ParquetIO.sink(schema))
                  .to(configuration.data))
          .getPerDestinationOutputFilenames()
          .apply(Values.create());
    }
  };
}
Example #5
Source File: FileIO.java From beam with Apache License 2.0 | 6 votes |
@Override
public PCollection<MatchResult.Metadata> expand(PCollection<String> input) {
  PCollection<MatchResult.Metadata> res;
  if (getConfiguration().getWatchInterval() == null) {
    res =
        input.apply(
            "Match filepatterns",
            ParDo.of(new MatchFn(getConfiguration().getEmptyMatchTreatment())));
  } else {
    res =
        input
            .apply(
                "Continuously match filepatterns",
                Watch.growthOf(
                        Contextful.of(new MatchPollFn(), Requirements.empty()),
                        new ExtractFilenameFn())
                    .withPollInterval(getConfiguration().getWatchInterval())
                    .withTerminationPerInput(getConfiguration().getWatchTerminationCondition()))
            .apply(Values.create());
  }
  return res.apply(Reshuffle.viaRandomKey());
}
Example #6
Source File: GatherAllPanes.java From beam with Apache License 2.0 | 6 votes |
@Override
public PCollection<Iterable<ValueInSingleWindow<T>>> expand(PCollection<T> input) {
  WindowFn<?, ?> originalWindowFn = input.getWindowingStrategy().getWindowFn();

  return input
      .apply(Reify.windows())
      .apply(
          WithKeys.<Integer, ValueInSingleWindow<T>>of(0)
              .withKeyType(new TypeDescriptor<Integer>() {}))
      .apply(
          Window.into(
                  new IdentityWindowFn<KV<Integer, ValueInSingleWindow<T>>>(
                      originalWindowFn.windowCoder()))
              .triggering(Never.ever())
              .withAllowedLateness(input.getWindowingStrategy().getAllowedLateness())
              .discardingFiredPanes())
      // all values have the same key so they all appear as a single output element
      .apply(GroupByKey.create())
      .apply(Values.create())
      .setWindowingStrategyInternal(input.getWindowingStrategy());
}
Example #7
Source File: WriteFiles.java From beam with Apache License 2.0 | 6 votes |
@Override
public PCollection<List<ResultT>> expand(PCollection<ResultT> input) {
  if (getWindowedWrites()) {
    // Reshuffle the results to make them stable against retries.
    // Use a single void key to maximize size of bundles for finalization.
    return input
        .apply("Add void key", WithKeys.of((Void) null))
        .apply("Reshuffle", Reshuffle.of())
        .apply("Drop key", Values.create())
        .apply("Gather bundles", ParDo.of(new GatherBundlesPerWindowFn<>()))
        .setCoder(ListCoder.of(resultCoder))
        // Reshuffle one more time to stabilize the contents of the bundle lists to finalize.
        .apply(Reshuffle.viaRandomKey());
  } else {
    // Pass results via a side input rather than reshuffle, because we need to get an empty
    // iterable to finalize if there are no results.
    return input
        .getPipeline()
        .apply(Reify.viewInGlobalWindow(input.apply(View.asList()), ListCoder.of(resultCoder)));
  }
}
Example #8
Source File: TestStreamTest.java From beam with Apache License 2.0 | 6 votes |
@Test
@Category({NeedsRunner.class, UsesTestStream.class})
public void testElementsAtAlmostPositiveInfinity() {
  Instant endOfGlobalWindow = GlobalWindow.INSTANCE.maxTimestamp();
  TestStream<String> stream =
      TestStream.create(StringUtf8Coder.of())
          .addElements(
              TimestampedValue.of("foo", endOfGlobalWindow),
              TimestampedValue.of("bar", endOfGlobalWindow))
          .advanceWatermarkToInfinity();

  FixedWindows windows = FixedWindows.of(Duration.standardHours(6));
  PCollection<String> windowedValues =
      p.apply(stream)
          .apply(into(windows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(windowedValues)
      .inWindow(windows.assignWindow(endOfGlobalWindow))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
Example #9
Source File: BigQueryToTableIT.java From beam with Apache License 2.0 | 6 votes |
private void runBigQueryToTablePipeline(BigQueryToTableOptions options) {
  Pipeline p = Pipeline.create(options);
  BigQueryIO.Read bigQueryRead = BigQueryIO.read().fromQuery(options.getQuery());
  if (options.getUsingStandardSql()) {
    bigQueryRead = bigQueryRead.usingStandardSql();
  }
  PCollection<TableRow> input = p.apply(bigQueryRead);
  if (options.getReshuffle()) {
    input =
        input
            .apply(WithKeys.<Void, TableRow>of((Void) null))
            .setCoder(KvCoder.of(VoidCoder.of(), TableRowJsonCoder.of()))
            .apply(Reshuffle.<Void, TableRow>of())
            .apply(Values.<TableRow>create());
  }
  input.apply(
      BigQueryIO.writeTableRows()
          .to(options.getOutput())
          .withSchema(options.getOutputSchema())
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  p.run().waitUntilFinish();
}
Example #10
Source File: HadoopFormatIOElasticTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Test to read data from embedded Elasticsearch instance and verify whether data is read
 * successfully.
 */
@Test
public void testHifIOWithElastic() {
  // Expected hashcode is evaluated during insertion time one time and hardcoded here.
  String expectedHashCode = "a62a85f5f081e3840baf1028d4d6c6bc";
  Configuration conf = getConfiguration();
  PCollection<KV<Text, LinkedMapWritable>> esData =
      pipeline.apply(HadoopFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
  PCollection<Long> count = esData.apply(Count.globally());
  // Verify that the count of objects fetched using HIFInputFormat IO is correct.
  PAssert.thatSingleton(count).isEqualTo((long) TEST_DATA_ROW_COUNT);
  PCollection<LinkedMapWritable> values = esData.apply(Values.create());
  PCollection<String> textValues = values.apply(transformFunc);
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
Example #11
Source File: HadoopFormatIOCassandraIT.java From beam with Apache License 2.0 | 6 votes |
/** This test reads data from the Cassandra instance and verifies if data is read successfully. */
@Test
public void testHIFReadForCassandra() {
  // Expected hashcode is evaluated during insertion time one time and hardcoded here.
  String expectedHashCode = "1a30ad400afe4ebf5fde75f5d2d95408";
  Long expectedRecordsCount = 1000L;
  Configuration conf = getConfiguration(options);
  PCollection<KV<Long, String>> cassandraData =
      pipeline.apply(
          HadoopFormatIO.<Long, String>read()
              .withConfiguration(conf)
              .withValueTranslation(myValueTranslate));
  PAssert.thatSingleton(cassandraData.apply("Count", Count.globally()))
      .isEqualTo(expectedRecordsCount);
  PCollection<String> textValues = cassandraData.apply(Values.create());
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
Example #12
Source File: HadoopFormatIOElasticIT.java From beam with Apache License 2.0 | 6 votes |
/**
 * This test reads data from the Elasticsearch instance and verifies whether data is read
 * successfully.
 */
@Test
public void testHifIOWithElastic() throws SecurityException {
  // Expected hashcode is evaluated during insertion time one time and hardcoded here.
  final long expectedRowCount = 1000L;
  String expectedHashCode = "42e254c8689050ed0a617ff5e80ea392";
  Configuration conf = getConfiguration(options);
  PCollection<KV<Text, LinkedMapWritable>> esData =
      pipeline.apply(HadoopFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
  // Verify that the count of objects fetched using HIFInputFormat IO is correct.
  PCollection<Long> count = esData.apply(Count.globally());
  PAssert.thatSingleton(count).isEqualTo(expectedRowCount);
  PCollection<LinkedMapWritable> values = esData.apply(Values.create());
  PCollection<String> textValues = values.apply(transformFunc);
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
Example #13
Source File: ParquetIOTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testWriteAndReadFiles() {
  List<GenericRecord> records = generateGenericRecords(1000);

  PCollection<GenericRecord> writeThenRead =
      mainPipeline
          .apply(Create.of(records).withCoder(AvroCoder.of(SCHEMA)))
          .apply(
              FileIO.<GenericRecord>write()
                  .via(ParquetIO.sink(SCHEMA))
                  .to(temporaryFolder.getRoot().getAbsolutePath()))
          .getPerDestinationOutputFilenames()
          .apply(Values.create())
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ParquetIO.readFiles(SCHEMA));

  PAssert.that(writeThenRead).containsInAnyOrder(records);

  mainPipeline.run().waitUntilFinish();
}
Example #14
Source File: KafkaIOTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testUnboundedSourceWithSingleTopic() {
  // same as testUnboundedSource, but with single topic
  int numElements = 1000;
  String topic = "my_topic";

  KafkaIO.Read<Integer, Long> reader =
      KafkaIO.<Integer, Long>read()
          .withBootstrapServers("none")
          .withTopic("my_topic")
          .withConsumerFactoryFn(
              new ConsumerFactoryFn(
                  ImmutableList.of(topic), 10, numElements, OffsetResetStrategy.EARLIEST))
          .withMaxNumRecords(numElements)
          .withKeyDeserializer(IntegerDeserializer.class)
          .withValueDeserializer(LongDeserializer.class);

  PCollection<Long> input = p.apply(reader.withoutMetadata()).apply(Values.create());

  addCountingAsserts(input, numElements);
  p.run();
}
Example #15
Source File: KafkaIOTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testUnboundedSourceWithExplicitPartitions() {
  int numElements = 1000;

  List<String> topics = ImmutableList.of("test");

  KafkaIO.Read<byte[], Long> reader =
      KafkaIO.<byte[], Long>read()
          .withBootstrapServers("none")
          .withTopicPartitions(ImmutableList.of(new TopicPartition("test", 5)))
          .withConsumerFactoryFn(
              new ConsumerFactoryFn(
                  topics, 10, numElements, OffsetResetStrategy.EARLIEST)) // 10 partitions
          .withKeyDeserializer(ByteArrayDeserializer.class)
          .withValueDeserializer(LongDeserializer.class)
          .withMaxNumRecords(numElements / 10);

  PCollection<Long> input = p.apply(reader.withoutMetadata()).apply(Values.create());

  // assert that every element is a multiple of 5.
  PAssert.that(input).satisfies(new AssertMultipleOf(5));

  PAssert.thatSingleton(input.apply(Count.globally())).isEqualTo(numElements / 10L);

  p.run();
}
Example #16
Source File: KafkaIOTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testUnboundedSourceTimestamps() {
  int numElements = 1000;

  PCollection<Long> input =
      p.apply(mkKafkaReadTransform(numElements, new ValueAsTimestampFn()).withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, numElements);

  PCollection<Long> diffs =
      input
          .apply("TimestampDiff", ParDo.of(new ElementValueDiff()))
          .apply("DistinctTimestamps", Distinct.create());
  // This assert also confirms that diffs only has one unique value.
  PAssert.thatSingleton(diffs).isEqualTo(0L);

  p.run();
}
Example #17
Source File: KafkaIOTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testUnboundedSourceLogAppendTimestamps() {
  // LogAppendTime (server side timestamp) for records is set based on record index
  // in MockConsumer above. Ensure that those exact timestamps are set by the source.
  int numElements = 1000;

  PCollection<Long> input =
      p.apply(mkKafkaReadTransform(numElements, null).withLogAppendTime().withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, numElements);

  PCollection<Long> diffs =
      input
          .apply(
              MapElements.into(TypeDescriptors.longs())
                  .via(t -> LOG_APPEND_START_TIME.plus(Duration.standardSeconds(t)).getMillis()))
          .apply("TimestampDiff", ParDo.of(new ElementValueDiff()))
          .apply("DistinctTimestamps", Distinct.create());

  // This assert also confirms that diff only has one unique value.
  PAssert.thatSingleton(diffs).isEqualTo(0L);

  p.run();
}
Example #18
Source File: KafkaIOTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testUnboundedSourceStartReadTime() {
  assumeTrue(new ConsumerSpEL().hasOffsetsForTimes());

  int numElements = 1000;
  // In this MockConsumer, we let the elements of the time and offset equal and there are 20
  // partitions. So set this startTime can read half elements.
  int startTime = numElements / 20 / 2;
  int maxNumRecords = numElements / 2;

  PCollection<Long> input =
      p.apply(
              mkKafkaReadTransform(numElements, maxNumRecords, new ValueAsTimestampFn())
                  .withStartReadTime(new Instant(startTime))
                  .withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, maxNumRecords, maxNumRecords, maxNumRecords, numElements - 1);
  p.run();
}
Example #19
Source File: KafkaIOTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testUnboundedSourceStartReadTimeException() {
  assumeTrue(new ConsumerSpEL().hasOffsetsForTimes());

  noMessagesException.expect(RuntimeException.class);

  int numElements = 1000;
  // In this MockConsumer, we let the elements of the time and offset equal and there are 20
  // partitions. So set this startTime can not read any element.
  int startTime = numElements / 20;

  p.apply(
          mkKafkaReadTransform(numElements, numElements, new ValueAsTimestampFn())
              .withStartReadTime(new Instant(startTime))
              .withoutMetadata())
      .apply(Values.create());

  p.run();
}
Example #20
Source File: HadoopFormatIOCassandraTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Test to read data from embedded Cassandra instance and verify whether data is read
 * successfully.
 */
@Test
public void testHIFReadForCassandra() {
  // Expected hashcode is evaluated during insertion time one time and hardcoded here.
  String expectedHashCode = "1b9780833cce000138b9afa25ba63486";
  Configuration conf = getConfiguration();
  PCollection<KV<Long, String>> cassandraData =
      p.apply(
          HadoopFormatIO.<Long, String>read()
              .withConfiguration(conf)
              .withValueTranslation(myValueTranslate));
  // Verify the count of data retrieved from Cassandra matches expected count.
  PAssert.thatSingleton(cassandraData.apply("Count", Count.globally()))
      .isEqualTo(TEST_DATA_ROW_COUNT);
  PCollection<String> textValues = cassandraData.apply(Values.create());
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  p.run().waitUntilFinish();
}
Example #21
Source File: HadoopFormatIOCassandraIT.java From beam with Apache License 2.0 | 5 votes |
/**
 * This test reads data from the Cassandra instance based on query and verifies if data is read
 * successfully.
 */
@Test
public void testHIFReadForCassandraQuery() {
  String expectedHashCode = "7bead6d6385c5f4dd0524720cd320b49";
  Long expectedNumRows = 1L;
  Configuration conf = getConfiguration(options);
  conf.set(
      "cassandra.input.cql",
      "select * from "
          + CASSANDRA_KEYSPACE
          + "."
          + CASSANDRA_TABLE
          + " where token(y_id) > ? and token(y_id) <= ? "
          + "and field0 = 'user48:field0:431531'");
  PCollection<KV<Long, String>> cassandraData =
      pipeline.apply(
          HadoopFormatIO.<Long, String>read()
              .withConfiguration(conf)
              .withValueTranslation(myValueTranslate));
  PAssert.thatSingleton(cassandraData.apply("Count", Count.globally()))
      .isEqualTo(expectedNumRows);
  PCollection<String> textValues = cassandraData.apply(Values.create());
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
Example #22
Source File: KafkaIOTest.java From beam with Apache License 2.0 | 5 votes |
@Test
public void testUnboundedSourceSplits() throws Exception {

  int numElements = 1000;
  int numSplits = 10;

  // Coders must be specified explicitly here due to the way the transform
  // is used in the test.
  UnboundedSource<KafkaRecord<Integer, Long>, ?> initial =
      mkKafkaReadTransform(numElements, null)
          .withKeyDeserializerAndCoder(IntegerDeserializer.class, BigEndianIntegerCoder.of())
          .withValueDeserializerAndCoder(LongDeserializer.class, BigEndianLongCoder.of())
          .makeSource();

  List<? extends UnboundedSource<KafkaRecord<Integer, Long>, ?>> splits =
      initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit))
                .apply("Remove Metadata " + i, ParDo.of(new RemoveKafkaMetadata<>()))
                .apply("collection " + i, Values.create()));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
Example #23
Source File: KafkaIOTest.java From beam with Apache License 2.0 | 5 votes |
@Test
public void testUnboundedSourceCustomTimestamps() {
  // The custom timestamps is set to customTimestampStartMillis + value.
  // Tests basic functionality of custom timestamps.
  final int numElements = 1000;
  final long customTimestampStartMillis = 80000L;

  PCollection<Long> input =
      p.apply(
              mkKafkaReadTransform(numElements, null)
                  .withTimestampPolicyFactory(
                      (tp, prevWatermark) ->
                          new CustomTimestampPolicyWithLimitedDelay<Integer, Long>(
                              (record ->
                                  new Instant(
                                      TimeUnit.SECONDS.toMillis(record.getKV().getValue())
                                          + customTimestampStartMillis)),
                              Duration.ZERO,
                              prevWatermark))
                  .withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, numElements);

  PCollection<Long> diffs =
      input
          .apply(
              MapElements.into(TypeDescriptors.longs())
                  .via(t -> TimeUnit.SECONDS.toMillis(t) + customTimestampStartMillis))
          .apply("TimestampDiff", ParDo.of(new ElementValueDiff()))
          .apply("DistinctTimestamps", Distinct.create());

  // This assert also confirms that diff only has one unique value.
  PAssert.thatSingleton(diffs).isEqualTo(0L);

  p.run();
}
Example #24
Source File: KafkaIOTest.java From beam with Apache License 2.0 | 5 votes |
@Test
public void testUnboundedSource() {
  int numElements = 1000;

  PCollection<Long> input =
      p.apply(mkKafkaReadTransform(numElements, new ValueAsTimestampFn()).withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, numElements);
  p.run();
}
Example #25
Source File: KafkaIOTest.java From beam with Apache License 2.0 | 5 votes |
@Test
public void testValuesSink() throws Exception {
  // similar to testSink(), but use values()' interface.

  int numElements = 1000;

  try (MockProducerWrapper producerWrapper = new MockProducerWrapper()) {

    ProducerSendCompletionThread completionThread =
        new ProducerSendCompletionThread(producerWrapper.mockProducer).start();

    String topic = "test";

    p.apply(mkKafkaReadTransform(numElements, new ValueAsTimestampFn()).withoutMetadata())
        .apply(Values.create()) // there are no keys
        .apply(
            KafkaIO.<Integer, Long>write()
                .withBootstrapServers("none")
                .withTopic(topic)
                .withValueSerializer(LongSerializer.class)
                .withProducerFactoryFn(new ProducerFactoryFn(producerWrapper.producerKey))
                .values());

    p.run();

    completionThread.shutdown();

    verifyProducerRecords(producerWrapper.mockProducer, topic, numElements, true, false);
  }
}
Example #26
Source File: HadoopFormatIOElasticIT.java From beam with Apache License 2.0 | 5 votes |
/**
 * This test reads data from the Elasticsearch instance based on a query and verifies if data is
 * read successfully.
 */
@Test
public void testHifIOWithElasticQuery() {
  String expectedHashCode = "d7a7e4e42c2ca7b83ef7c1ad1ebce000";
  Long expectedRecordsCount = 1L;
  Configuration conf = getConfiguration(options);
  String query =
      "{"
          + " \"query\": {"
          + " \"match\" : {"
          + " \"Title\" : {"
          + " \"query\" : \"Title9\","
          + " \"type\" : \"boolean\""
          + " }"
          + " }"
          + " }"
          + "}";
  conf.set(ConfigurationOptions.ES_QUERY, query);
  PCollection<KV<Text, LinkedMapWritable>> esData =
      pipeline.apply(HadoopFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
  PCollection<Long> count = esData.apply(Count.globally());
  // Verify that the count of objects fetched using HIFInputFormat IO is correct.
  PAssert.thatSingleton(count).isEqualTo(expectedRecordsCount);
  PCollection<LinkedMapWritable> values = esData.apply(Values.create());
  PCollection<String> textValues = values.apply(transformFunc);
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
Example #27
Source File: HadoopFormatIOCassandraTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Test to read data from embedded Cassandra instance based on query and verify whether data is
 * read successfully.
 */
@Test
public void testHIFReadForCassandraQuery() {
  Long expectedCount = 1L;
  String expectedChecksum = "f11caabc7a9fc170e22b41218749166c";
  Configuration conf = getConfiguration();
  conf.set(
      "cassandra.input.cql",
      "select * from "
          + CASSANDRA_KEYSPACE
          + "."
          + CASSANDRA_TABLE
          + " where token(id) > ? and token(id) <= ? and scientist='Faraday1' allow filtering");
  PCollection<KV<Long, String>> cassandraData =
      p.apply(
          HadoopFormatIO.<Long, String>read()
              .withConfiguration(conf)
              .withValueTranslation(myValueTranslate));
  // Verify the count of data retrieved from Cassandra matches expected count.
  PAssert.thatSingleton(cassandraData.apply("Count", Count.globally())).isEqualTo(expectedCount);
  PCollection<String> textValues = cassandraData.apply(Values.create());
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedChecksum);
  p.run().waitUntilFinish();
}
Example #28
Source File: SplittableParDoNaiveBounded.java From beam with Apache License 2.0 | 5 votes |
@Override
public PCollectionTuple expand(PCollection<KV<byte[], KV<InputT, RestrictionT>>> input) {
  return input
      .apply("Drop key", Values.create())
      .apply("Reshuffle", Reshuffle.of())
      .apply(
          "NaiveProcess",
          ParDo.of(
                  new NaiveProcessFn<
                      InputT, OutputT, RestrictionT, PositionT, WatermarkEstimatorStateT>(
                      original.getFn()))
              .withSideInputs(original.getSideInputs())
              .withOutputTags(original.getMainOutputTag(), original.getAdditionalOutputTags()));
}
Example #29
Source File: KafkaIOTest.java From beam with Apache License 2.0 | 5 votes |
@Test
public void testUnreachableKafkaBrokers() {
  // Expect an exception when the Kafka brokers are not reachable on the workers.
  // We specify partitions explicitly so that splitting does not involve server interaction.
  // Set request timeout to 10ms so that test does not take long.

  thrown.expect(Exception.class);
  thrown.expectMessage("Reader-0: Timeout while initializing partition 'test-0'");

  int numElements = 1000;
  PCollection<Long> input =
      p.apply(
              KafkaIO.<Integer, Long>read()
                  .withBootstrapServers("8.8.8.8:9092") // Google public DNS ip.
                  .withTopicPartitions(ImmutableList.of(new TopicPartition("test", 0)))
                  .withKeyDeserializer(IntegerDeserializer.class)
                  .withValueDeserializer(LongDeserializer.class)
                  .withConsumerConfigUpdates(
                      ImmutableMap.of(
                          ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG, 5,
                          ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG, 8,
                          ConsumerConfig.FETCH_MAX_WAIT_MS_CONFIG, 8,
                          "default.api.timeout.ms", 10))
                  .withMaxNumRecords(10)
                  .withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, numElements);
  p.run();
}