org.apache.beam.sdk.transforms.Combine Java Examples
The following examples show how to use org.apache.beam.sdk.transforms.Combine. Each example is taken from an open-source project; its source file and license are noted above the code.
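Before the examples, here is a minimal sketch of the two most common entry points: Combine.globally, which reduces an entire PCollection to one value, and Combine.perKey, which reduces the values of each key in a keyed PCollection. The pipeline and input values below are illustrative, not taken from any of the examples that follow.

PCollection<Integer> numbers = pipeline.apply(Create.of(1, 2, 3, 4));

// Combine.globally reduces the whole collection to a single element: 10.
PCollection<Integer> total = numbers.apply(Combine.globally(Sum.ofIntegers()));

PCollection<KV<String, Integer>> keyed =
    pipeline.apply(Create.of(KV.of("a", 1), KV.of("a", 2), KV.of("b", 3)));

// Combine.perKey reduces the values of each key: ("a", 3) and ("b", 3).
PCollection<KV<String, Integer>> sums = keyed.apply(Combine.perKey(Sum.ofIntegers()));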
Example #1
Source File: AggregatorCombiner.java, from beam (Apache License 2.0)
public AggregatorCombiner(
    Combine.CombineFn<InputT, AccumT, OutputT> combineFn,
    WindowingStrategy<?, ?> windowingStrategy,
    Coder<AccumT> accumulatorCoder,
    Coder<OutputT> outputCoder) {
  this.combineFn = combineFn;
  this.windowingStrategy = (WindowingStrategy<InputT, W>) windowingStrategy;
  this.timestampCombiner = windowingStrategy.getTimestampCombiner();
  this.accumulatorCoder =
      IterableCoder.of(
          WindowedValue.FullWindowedValueCoder.of(
              accumulatorCoder, windowingStrategy.getWindowFn().windowCoder()));
  this.outputCoder =
      IterableCoder.of(
          WindowedValue.FullWindowedValueCoder.of(
              outputCoder, windowingStrategy.getWindowFn().windowCoder()));
}
Example #2
Source File: HBaseIOIT.java, from beam (Apache License 2.0)
/** Read the test dataset from hbase and validate its contents. */
private void runRead() {
  PCollection<Result> tableRows =
      pipelineRead.apply(HBaseIO.read().withConfiguration(conf).withTableId(TABLE_NAME));

  PAssert.thatSingleton(tableRows.apply("Count All", Count.<Result>globally()))
      .isEqualTo((long) numberOfRows);

  PCollection<String> consolidatedHashcode =
      tableRows
          .apply(ParDo.of(new SelectNameFn()))
          .apply("Hash row contents", Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode)
      .containsInAnyOrder(TestRow.getExpectedHashForRowCount(numberOfRows));

  pipelineRead.run().waitUntilFinish();
}
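A note on the .withoutDefaults() call that recurs throughout these IO tests: by default Combine.globally emits the combiner's default value for an empty input, which Beam only allows under the default global windowing. When the input uses any other windowing, the pipeline must call withoutDefaults() (or asSingletonView()). A minimal sketch, assuming a hypothetical PCollection<String> lines:

PCollection<Long> perWindowCounts =
    lines
        .apply(Window.into(FixedWindows.of(Duration.standardMinutes(1))))
        // Required here: a non-globally-windowed Combine.globally cannot emit defaults.
        .apply(Combine.globally(Count.<String>combineFn()).withoutDefaults());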
Example #3
Source File: CassandraIOIT.java, from beam (Apache License 2.0)
private void runRead() {
  PCollection<Scientist> output =
      pipelineRead.apply(
          CassandraIO.<Scientist>read()
              .withHosts(options.getCassandraHost())
              .withPort(options.getCassandraPort())
              .withMinNumberOfSplits(20)
              .withKeyspace(KEYSPACE)
              .withTable(TABLE)
              .withEntity(Scientist.class)
              .withCoder(SerializableCoder.of(Scientist.class)));

  PCollection<String> consolidatedHashcode =
      output
          .apply(ParDo.of(new SelectNameFn()))
          .apply("Hash row contents", Combine.globally(new HashingFn()).withoutDefaults());

  PAssert.thatSingleton(consolidatedHashcode)
      .isEqualTo(TestRow.getExpectedHashForRowCount(options.getNumberOfRecords()));

  pipelineRead.run().waitUntilFinish();
}
Example #4
Source File: HadoopFormatIOElasticTest.java, from beam (Apache License 2.0)
/**
 * Test to read data from embedded Elasticsearch instance and verify whether data is read
 * successfully.
 */
@Test
public void testHifIOWithElastic() {
  // Expected hashcode is evaluated during insertion time one time and hardcoded here.
  String expectedHashCode = "a62a85f5f081e3840baf1028d4d6c6bc";
  Configuration conf = getConfiguration();
  PCollection<KV<Text, LinkedMapWritable>> esData =
      pipeline.apply(HadoopFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
  PCollection<Long> count = esData.apply(Count.globally());
  // Verify that the count of objects fetched using HIFInputFormat IO is correct.
  PAssert.thatSingleton(count).isEqualTo((long) TEST_DATA_ROW_COUNT);
  PCollection<LinkedMapWritable> values = esData.apply(Values.create());
  PCollection<String> textValues = values.apply(transformFunc);
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
Example #5
Source File: HadoopFormatIOCassandraIT.java, from beam (Apache License 2.0)
/** This test reads data from the Cassandra instance and verifies if data is read successfully. */
@Test
public void testHIFReadForCassandra() {
  // Expected hashcode is evaluated during insertion time one time and hardcoded here.
  String expectedHashCode = "1a30ad400afe4ebf5fde75f5d2d95408";
  Long expectedRecordsCount = 1000L;
  Configuration conf = getConfiguration(options);
  PCollection<KV<Long, String>> cassandraData =
      pipeline.apply(
          HadoopFormatIO.<Long, String>read()
              .withConfiguration(conf)
              .withValueTranslation(myValueTranslate));
  PAssert.thatSingleton(cassandraData.apply("Count", Count.globally()))
      .isEqualTo(expectedRecordsCount);
  PCollection<String> textValues = cassandraData.apply(Values.create());
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
Example #6
Source File: FlinkStreamingTransformTranslators.java, from beam (Apache License 2.0)
@Override
boolean canTranslate(
    PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> transform,
    FlinkStreamingTranslationContext context) {
  // if we have a merging window strategy and side inputs we cannot
  // translate as a proper combine. We have to group and then run the combine
  // over the final grouped values.
  PCollection<KV<K, InputT>> input = context.getInput(transform);

  @SuppressWarnings("unchecked")
  WindowingStrategy<?, BoundedWindow> windowingStrategy =
      (WindowingStrategy<?, BoundedWindow>) input.getWindowingStrategy();

  return windowingStrategy.getWindowFn().isNonMerging()
      || ((Combine.PerKey) transform).getSideInputs().isEmpty();
}
Example #7
Source File: HadoopFormatIOCassandraTest.java, from beam (Apache License 2.0)
/**
 * Test to read data from embedded Cassandra instance and verify whether data is read
 * successfully.
 */
@Test
public void testHIFReadForCassandra() {
  // Expected hashcode is evaluated during insertion time one time and hardcoded here.
  String expectedHashCode = "1b9780833cce000138b9afa25ba63486";
  Configuration conf = getConfiguration();
  PCollection<KV<Long, String>> cassandraData =
      p.apply(
          HadoopFormatIO.<Long, String>read()
              .withConfiguration(conf)
              .withValueTranslation(myValueTranslate));
  // Verify the count of data retrieved from Cassandra matches expected count.
  PAssert.thatSingleton(cassandraData.apply("Count", Count.globally()))
      .isEqualTo(TEST_DATA_ROW_COUNT);
  PCollection<String> textValues = cassandraData.apply(Values.create());
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  p.run().waitUntilFinish();
}
Example #8
Source File: SparkCombineFnTest.java, from beam (Apache License 2.0)
private static Combine.CombineFn<Integer, Long, Long> getSumFn() {
  return new Combine.CombineFn<Integer, Long, Long>() {

    @Override
    public Long createAccumulator() {
      return 0L;
    }

    @Override
    public Long addInput(Long mutableAccumulator, Integer input) {
      return mutableAccumulator + input;
    }

    @Override
    public Long mergeAccumulators(Iterable<Long> accumulators) {
      return StreamSupport.stream(accumulators.spliterator(), false).mapToLong(e -> e).sum();
    }

    @Override
    public Long extractOutput(Long accumulator) {
      return accumulator;
    }
  };
}
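For context, a CombineFn like this plugs straight into Combine.globally or Combine.perKey. A minimal sketch of how getSumFn() might be applied (the pipeline and input values are hypothetical):

PCollection<Integer> numbers = pipeline.apply(Create.of(1, 2, 3, 4));
// Emits a single Long accumulated by the CombineFn above: 10L.
PCollection<Long> total = numbers.apply(Combine.globally(getSumFn()));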
Example #9
Source File: CombineTest.java, from beam (Apache License 2.0)
@Test
public void testBinaryCombineWithSlidingWindows() {
  PCollection<Integer> input =
      pipeline
          .apply(
              Create.timestamped(
                  TimestampedValue.of(1, new Instant(1)),
                  TimestampedValue.of(3, new Instant(2)),
                  TimestampedValue.of(5, new Instant(3))))
          .apply(Window.into(SlidingWindows.of(Duration.millis(3)).every(Duration.millis(1))))
          .apply(
              Combine.globally(
                      Combine.BinaryCombineFn.of(
                          (SerializableBiFunction<Integer, Integer, Integer>)
                              (integer1, integer2) ->
                                  integer1 > integer2 ? integer1 : integer2))
                  .withoutDefaults());
  PAssert.that(input).containsInAnyOrder(1, 3, 5, 5, 5);
  pipeline.run();
}
Example #10
Source File: GroupByKeyTranslator.java, from beam (Apache License 2.0)
@SuppressWarnings("unchecked") private static <K, InputT, OutputT> SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow> getSystemReduceFn( PTransform<PCollection<KV<K, InputT>>, PCollection<KV<K, OutputT>>> transform, Pipeline pipeline, KvCoder<K, InputT> kvInputCoder) { if (transform instanceof GroupByKey) { return (SystemReduceFn<K, InputT, ?, OutputT, BoundedWindow>) SystemReduceFn.buffering(kvInputCoder.getValueCoder()); } else if (transform instanceof Combine.PerKey) { final CombineFnBase.GlobalCombineFn<? super InputT, ?, OutputT> combineFn = ((Combine.PerKey) transform).getFn(); return SystemReduceFn.combining( kvInputCoder.getKeyCoder(), AppliedCombineFn.withInputCoder(combineFn, pipeline.getCoderRegistry(), kvInputCoder)); } else { throw new RuntimeException("Transform " + transform + " cannot be translated as GroupByKey."); } }
Example #11
Source File: CombineTranslation.java, from beam (Apache License 2.0)
@Override
public FunctionSpec translate(
    AppliedPTransform<?, ?, Combine.PerKey<?, ?, ?>> transform, SdkComponents components)
    throws IOException {
  if (transform.getTransform().getSideInputs().isEmpty()) {
    GlobalCombineFn<?, ?, ?> combineFn = transform.getTransform().getFn();
    Coder<?> accumulatorCoder = extractAccumulatorCoder(combineFn, (AppliedPTransform) transform);
    return FunctionSpec.newBuilder()
        .setUrn(getUrn(transform.getTransform()))
        .setPayload(combinePayload(combineFn, accumulatorCoder, components).toByteString())
        .build();
  } else {
    // Combines with side inputs are translated as generic composites, which have a blank
    // FunctionSpec.
    return null;
  }
}
Example #12
Source File: CombineTranslation.java, from beam (Apache License 2.0)
private static <K, InputT, AccumT> Coder<AccumT> extractAccumulatorCoder(
    GlobalCombineFn<InputT, AccumT, ?> combineFn,
    AppliedPTransform<
            PCollection<KV<K, Iterable<InputT>>>, ?, Combine.GroupedValues<K, InputT, ?>>
        transform)
    throws IOException {
  try {
    @SuppressWarnings("unchecked")
    PCollection<KV<K, Iterable<InputT>>> mainInput =
        (PCollection<KV<K, Iterable<InputT>>>)
            Iterables.getOnlyElement(TransformInputs.nonAdditionalInputs(transform));
    KvCoder<K, Iterable<InputT>> kvCoder = (KvCoder<K, Iterable<InputT>>) mainInput.getCoder();
    IterableCoder<InputT> iterCoder = (IterableCoder<InputT>) kvCoder.getValueCoder();
    return combineFn.getAccumulatorCoder(
        transform.getPipeline().getCoderRegistry(), iterCoder.getElemCoder());
  } catch (CannotProvideCoderException e) {
    throw new IOException("Could not obtain a Coder for the accumulator", e);
  }
}
Example #13
Source File: CombineTest.java, from beam (Apache License 2.0)
@Test
public void testCombineGloballyPreservesWindowing() {
  PCollection<Integer> input =
      pipeline
          .apply(
              Create.timestamped(
                  TimestampedValue.of(1, new Instant(1)),
                  TimestampedValue.of(2, new Instant(2)),
                  TimestampedValue.of(3, new Instant(11)),
                  TimestampedValue.of(4, new Instant(3)),
                  TimestampedValue.of(5, new Instant(11)),
                  TimestampedValue.of(6, new Instant(12))))
          .apply(Window.into(FixedWindows.of(Duration.millis(10))))
          .apply(Combine.globally(Sum.ofIntegers()).withoutDefaults());
  PAssert.that(input).containsInAnyOrder(7, 14);
  // Without running the pipeline, the PAssert above would never execute.
  pipeline.run();
}
Example #14
Source File: WindowTest.java, from beam (Apache License 2.0)
@Test
@Category({ValidatesRunner.class, UsesCustomWindowMerging.class})
public void testMergingCustomWindows() {
  Instant startInstant = new Instant(0L);
  PCollection<String> inputCollection =
      pipeline.apply(
          Create.timestamped(
              TimestampedValue.of("big", startInstant.plus(Duration.standardSeconds(10))),
              TimestampedValue.of("small1", startInstant.plus(Duration.standardSeconds(20))),
              // This one will be outside of bigWindow thus not merged
              TimestampedValue.of("small2", startInstant.plus(Duration.standardSeconds(39)))));
  PCollection<String> windowedCollection =
      inputCollection.apply(Window.into(new CustomWindowFn<>()));
  PCollection<Long> count =
      windowedCollection.apply(Combine.globally(Count.<String>combineFn()).withoutDefaults());
  // "small1" and "big" elements merged into bigWindow; "small2" not merged
  // because timestamp is not in bigWindow
  PAssert.that("Wrong number of elements in output collection", count).containsInAnyOrder(2L, 1L);
  pipeline.run();
}
Example #15
Source File: WindowTest.java, from beam (Apache License 2.0)
@Test
@Category({ValidatesRunner.class, UsesCustomWindowMerging.class})
public void testMergingCustomWindowsKeyedCollection() {
  Instant startInstant = new Instant(0L);
  PCollection<KV<Integer, String>> inputCollection =
      pipeline.apply(
          Create.timestamped(
              TimestampedValue.of(
                  KV.of(0, "big"), startInstant.plus(Duration.standardSeconds(10))),
              TimestampedValue.of(
                  KV.of(1, "small1"), startInstant.plus(Duration.standardSeconds(20))),
              // This element is not contained within the bigWindow and not merged
              TimestampedValue.of(
                  KV.of(2, "small2"), startInstant.plus(Duration.standardSeconds(39)))));
  PCollection<KV<Integer, String>> windowedCollection =
      inputCollection.apply(Window.into(new CustomWindowFn<>()));
  PCollection<Long> count =
      windowedCollection.apply(
          Combine.globally(Count.<KV<Integer, String>>combineFn()).withoutDefaults());
  // "small1" and "big" elements merged into bigWindow; "small2" not merged
  // because it is not contained in bigWindow
  PAssert.that("Wrong number of elements in output collection", count).containsInAnyOrder(2L, 1L);
  pipeline.run();
}
Example #16
Source File: CombineTranslation.java, from beam (Apache License 2.0)
@Override
public FunctionSpec translate(
    AppliedPTransform<?, ?, Combine.Globally<?, ?>> transform, SdkComponents components)
    throws IOException {
  if (transform.getTransform().getSideInputs().isEmpty()) {
    return FunctionSpec.newBuilder()
        .setUrn(getUrn(transform.getTransform()))
        .setPayload(
            payloadForCombineGlobally((AppliedPTransform) transform, components).toByteString())
        .build();
  } else {
    // Combines with side inputs are translated as generic composites, which have a blank
    // FunctionSpec.
    return null;
  }
}
Example #17
Source File: CombineRunnersTest.java, from beam (Apache License 2.0)
@Before
public void createPipeline() throws Exception {
  // Create pipeline with an input pCollection, combine, and output pCollection.
  TestCombineFn combineFn = new TestCombineFn();
  Combine.PerKey<String, String, Integer> combine = Combine.perKey(combineFn);

  Pipeline p = Pipeline.create();
  PCollection<KV<String, String>> inputPCollection = p.apply(Create.of(KV.of("unused", "0")));
  inputPCollection.setCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()));
  PCollection<KV<String, Integer>> outputPCollection =
      inputPCollection.apply(TEST_COMBINE_ID, combine);
  outputPCollection.setCoder(KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of()));

  // Create FnApi protos needed for the runner.
  SdkComponents sdkComponents = SdkComponents.create(p.getOptions());
  pProto = PipelineTranslation.toProto(p, sdkComponents);
  inputPCollectionId = sdkComponents.registerPCollection(inputPCollection);
  outputPCollectionId = sdkComponents.registerPCollection(outputPCollection);
  pTransform = pProto.getComponents().getTransformsOrThrow(TEST_COMBINE_ID);
}
Example #18
Source File: HCatalogIOIT.java, from beam (Apache License 2.0)
@Test
public void writeAndReadAll() {
  pipelineWrite
      .apply("Generate sequence", Create.of(buildHCatRecords(options.getNumberOfRecords())))
      .apply(
          HCatalogIO.write()
              .withConfigProperties(configProperties)
              .withDatabase(options.getHCatalogHiveDatabaseName())
              .withTable(tableName));
  pipelineWrite.run().waitUntilFinish();

  PCollection<String> testRecords =
      pipelineRead
          .apply(
              HCatalogIO.read()
                  .withConfigProperties(configProperties)
                  .withDatabase(options.getHCatalogHiveDatabaseName())
                  .withTable(tableName))
          .apply(ParDo.of(new CreateHCatFn()));

  PCollection<String> consolidatedHashcode =
      testRecords.apply("Calculate hashcode", Combine.globally(new HashingFn()));
  String expectedHash = getHashForRecordCount(options.getNumberOfRecords(), EXPECTED_HASHES);
  PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);

  pipelineRead.run().waitUntilFinish();
}
Example #19
Source File: BucketingFunction.java, from beam (Apache License 2.0)
public BucketingFunction(
    long bucketWidthMs,
    int numSignificantBuckets,
    int numSignificantSamples,
    Combine.BinaryCombineLongFn function) {
  this.bucketWidthMs = bucketWidthMs;
  this.numSignificantBuckets = numSignificantBuckets;
  this.numSignificantSamples = numSignificantSamples;
  this.function = function;
  this.buckets = new HashMap<>();
}
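Beam's stock Combine.BinaryCombineLongFn implementations, such as Sum.ofLongs(), Min.ofLongs(), and Max.ofLongs(), can be passed as the function argument. A minimal sketch with illustrative parameter values (the bucket width and counts below are assumptions, not values from the source):

// Illustrative configuration: 60-second buckets, 10 significant buckets,
// 100 significant samples, aggregated with a long sum.
BucketingFunction bucketedSum = new BucketingFunction(60_000L, 10, 100, Sum.ofLongs());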
Example #20
Source File: AggregateRuntime.java, from components (Apache License 2.0)
@Override
public PCollection<IndexedRecord> expand(PCollection<IndexedRecord> indexedRecordPCollection) {
  // Return an empty result if there are no operations in the list. This is normally not a
  // permitted operation.
  if (operationFieldPathList.size() == 0)
    return (PCollection<IndexedRecord>) (PCollection) indexedRecordPCollection.getPipeline().apply(
        Create.empty(AvroCoder.of(AvroUtils.createEmptySchema())));

  PCollection<KV<IndexedRecord, IndexedRecord>> kv = indexedRecordPCollection
      .apply(ParDo.of(new ExtractKVFn(new ArrayList<>(groupByFieldPathList),
          new ArrayList<>(operationFieldPathList))))
      .setCoder(KvCoder.of(LazyAvroCoder.of(), LazyAvroCoder.of()));

  PCollection<KV<IndexedRecord, IndexedRecord>> aggregateResult = kv
      .apply(Combine.<IndexedRecord, IndexedRecord, IndexedRecord> perKey(
          new AggregateCombineFn(properties)))
      .setCoder(KvCoder.of(LazyAvroCoder.of(), NullableCoder.of(LazyAvroCoder.of())));

  PCollection<IndexedRecord> result = aggregateResult
      .apply(ParDo.of(
          new DoFn<KV<IndexedRecord, IndexedRecord>, KV<IndexedRecord, IndexedRecord>>() {

            @ProcessElement
            public void processElement(ProcessContext c) {
              /**
               * Filter null value when AggregateCombineFn for nothing, see {@link
               * org.talend.components.processing.runtime.aggregate.AggregateCombineFn#extractOutput(AggregateCombineFn.AggregateAccumulator)}
               */
              if (c.element().getValue() != null) {
                c.output(c.element());
              }
            }
          }))
      .apply(ParDo.of(new MergeKVFn()))
      .setCoder(LazyAvroCoder.of());
  return result;
}
Example #21
Source File: KinesisIOIT.java, from beam (Apache License 2.0)
/** Read test dataset from Kinesis stream. */
private void runRead() {
  PCollection<KinesisRecord> output =
      pipelineRead.apply(
          KinesisIO.read()
              .withStreamName(options.getAwsKinesisStream())
              .withAWSClientsProvider(
                  options.getAwsAccessKey(),
                  options.getAwsSecretKey(),
                  Regions.fromName(options.getAwsKinesisRegion()))
              .withMaxNumRecords(numberOfRows)
              // to prevent endless running in case of error
              .withMaxReadTime(Duration.standardMinutes(10))
              .withInitialPositionInStream(InitialPositionInStream.AT_TIMESTAMP)
              .withInitialTimestampInStream(now)
              .withRequestRecordsLimit(1000));

  PAssert.thatSingleton(output.apply("Count All", Count.globally()))
      .isEqualTo((long) numberOfRows);

  PCollection<String> consolidatedHashcode =
      output
          .apply(ParDo.of(new ExtractDataValues()))
          .apply("Hash row contents", Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode)
      .containsInAnyOrder(TestRow.getExpectedHashForRowCount(numberOfRows));

  pipelineRead.run().waitUntilFinish();
}
Example #22
Source File: FlinkStateInternals.java, from beam (Apache License 2.0)
FlinkCombiningState(
    KeyedStateBackend<ByteBuffer> flinkStateBackend,
    String stateId,
    Combine.CombineFn<InputT, AccumT, OutputT> combineFn,
    StateNamespace namespace,
    Coder<AccumT> accumCoder) {
  this.namespace = namespace;
  this.stateId = stateId;
  this.combineFn = combineFn;
  this.flinkStateBackend = flinkStateBackend;

  flinkStateDescriptor =
      new ValueStateDescriptor<>(stateId, new CoderTypeSerializer<>(accumCoder));
}
Example #23
Source File: SamzaPublishViewTransformOverride.java, from beam (Apache License 2.0)
@Override
public PCollection<ElemT> expand(PCollection<ElemT> input) {
  // This actually creates a branch in the graph that publishes the view but then returns
  // the original input. This is copied from the Flink runner.
  input
      .apply(Combine.globally(new Concatenate<ElemT>()).withoutDefaults())
      .apply(new SamzaPublishView<>(view));
  return input;
}
Example #24
Source File: CombineTranslationTest.java, from beam (Apache License 2.0)
@Test
public void testToProto() throws Exception {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3));
  input.apply(Combine.globally(combineFn));
  final AtomicReference<AppliedPTransform<?, ?, Combine.Globally<?, ?>>> combine =
      new AtomicReference<>();
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void leaveCompositeTransform(Node node) {
          if (node.getTransform() instanceof Combine.Globally) {
            checkState(combine.get() == null);
            combine.set((AppliedPTransform) node.toAppliedPTransform(getPipeline()));
          }
        }
      });
  checkState(combine.get() != null);
  assertEquals(combineFn, combine.get().getTransform().getFn());

  SdkComponents sdkComponents = SdkComponents.create();
  sdkComponents.registerEnvironment(Environments.createDockerEnvironment("java"));
  CombinePayload combineProto =
      CombineTranslation.CombineGloballyPayloadTranslator.payloadForCombineGlobally(
          (AppliedPTransform) combine.get(), sdkComponents);
  RunnerApi.Components componentsProto = sdkComponents.toComponents();

  assertEquals(
      combineFn.getAccumulatorCoder(pipeline.getCoderRegistry(), input.getCoder()),
      getAccumulatorCoder(combineProto, RehydratedComponents.forComponents(componentsProto)));
  assertEquals(
      combineFn,
      SerializableUtils.deserializeFromByteArray(
          combineProto.getCombineFn().getPayload().toByteArray(), "CombineFn"));
}
Example #25
Source File: Group.java, from beam (Apache License 2.0)
@Override
public PCollection<Row> expand(PCollection<InputT> input) {
  SchemaAggregateFn.Inner fn = schemaAggregateFn.withSchema(input.getSchema());
  return input
      .apply("toRows", Convert.toRows())
      .apply("Global Combine", Combine.globally(fn))
      .setRowSchema(fn.getOutputSchema());
}
Example #26
Source File: CombineTranslation.java, from beam (Apache License 2.0)
/** Produces a {@link RunnerApi.CombinePayload} from a {@link Combine.Globally}. */
@VisibleForTesting
static <InputT, OutputT> CombinePayload payloadForCombineGlobally(
    final AppliedPTransform<
            PCollection<InputT>, PCollection<OutputT>, Combine.Globally<InputT, OutputT>>
        transform,
    final SdkComponents components)
    throws IOException {
  GlobalCombineFn<?, ?, ?> combineFn = transform.getTransform().getFn();
  Coder<?> accumulatorCoder = extractAccumulatorCoder(combineFn, (AppliedPTransform) transform);
  return combinePayload(combineFn, accumulatorCoder, components);
}
Example #27
Source File: KafkaIOIT.java, from beam (Apache License 2.0)
@Test
public void testKafkaIOReadsAndWritesCorrectly() throws IOException {
  writePipeline
      .apply("Generate records", Read.from(new SyntheticBoundedSource(sourceOptions)))
      .apply("Measure write time", ParDo.of(new TimeMonitor<>(NAMESPACE, WRITE_TIME_METRIC_NAME)))
      .apply("Write to Kafka", writeToKafka());

  PCollection<String> hashcode =
      readPipeline
          .apply("Read from Kafka", readFromKafka())
          .apply(
              "Measure read time", ParDo.of(new TimeMonitor<>(NAMESPACE, READ_TIME_METRIC_NAME)))
          .apply("Map records to strings", MapElements.via(new MapKafkaRecordsToStrings()))
          .apply("Calculate hashcode", Combine.globally(new HashingFn()).withoutDefaults());

  PAssert.thatSingleton(hashcode).isEqualTo(expectedHashcode);

  PipelineResult writeResult = writePipeline.run();
  writeResult.waitUntilFinish();

  PipelineResult readResult = readPipeline.run();
  PipelineResult.State readState =
      readResult.waitUntilFinish(Duration.standardSeconds(options.getReadTimeout()));
  cancelIfTimeouted(readResult, readState);

  Set<NamedTestResult> metrics = readMetrics(writeResult, readResult);
  IOITMetrics.publish(options.getBigQueryDataset(), options.getBigQueryTable(), metrics);
  IOITMetrics.publishToInflux(TEST_ID, TIMESTAMP, metrics, settings);
}
Example #28
Source File: CombineValuesFnFactoryTest.java, from beam (Apache License 2.0)
@Test
public void testCombineValuesFnAll() throws Exception {
  TestReceiver receiver = new TestReceiver();

  Combine.CombineFn<Integer, CountSum, String> combiner = (new MeanInts());
  ParDoFn combineParDoFn =
      createCombineValuesFn(
          CombinePhase.ALL,
          combiner,
          StringUtf8Coder.of(),
          BigEndianIntegerCoder.of(),
          new CountSumCoder(),
          WindowingStrategy.globalDefault());

  combineParDoFn.startBundle(receiver);
  combineParDoFn.processElement(
      WindowedValue.valueInGlobalWindow(KV.of("a", Arrays.asList(5, 6, 7))));
  combineParDoFn.processElement(
      WindowedValue.valueInGlobalWindow(KV.of("b", Arrays.asList(1, 3, 7))));
  combineParDoFn.processElement(
      WindowedValue.valueInGlobalWindow(KV.of("c", Arrays.asList(3, 6, 8, 9))));
  combineParDoFn.finishBundle();

  Object[] expectedReceivedElems = {
    WindowedValue.valueInGlobalWindow(KV.of("a", String.format("%.1f", 6.0))),
    WindowedValue.valueInGlobalWindow(KV.of("b", String.format("%.1f", 3.7))),
    WindowedValue.valueInGlobalWindow(KV.of("c", String.format("%.1f", 6.5))),
  };
  assertArrayEquals(expectedReceivedElems, receiver.receivedElems.toArray());
}
Example #29
Source File: DataflowPipelineTranslator.java, from beam (Apache License 2.0)
private <K, InputT, OutputT> void translateHelper(
    final CombineGroupedValues<K, InputT, OutputT> primitiveTransform,
    TranslationContext context) {
  Combine.GroupedValues<K, InputT, OutputT> originalTransform =
      primitiveTransform.getOriginalCombine();
  StepTranslationContext stepContext = context.addStep(primitiveTransform, "CombineValues");
  translateInputs(
      stepContext,
      context.getInput(primitiveTransform),
      originalTransform.getSideInputs(),
      context);

  AppliedCombineFn<? super K, ? super InputT, ?, OutputT> fn =
      originalTransform.getAppliedFn(
          context.getInput(primitiveTransform).getPipeline().getCoderRegistry(),
          context.getInput(primitiveTransform).getCoder(),
          context.getInput(primitiveTransform).getWindowingStrategy());

  stepContext.addEncodingInput(fn.getAccumulatorCoder());

  List<String> experiments = context.getPipelineOptions().getExperiments();
  boolean isFnApi = experiments != null && experiments.contains("beam_fn_api");

  if (isFnApi) {
    String ptransformId =
        context.getSdkComponents().getPTransformIdOrThrow(context.getCurrentParent());
    stepContext.addInput(PropertyNames.SERIALIZED_FN, ptransformId);
  } else {
    stepContext.addInput(
        PropertyNames.SERIALIZED_FN, byteArrayToJsonString(serializeToByteArray(fn)));
  }

  stepContext.addOutput(PropertyNames.OUTPUT, context.getOutput(primitiveTransform));
}
Example #30
Source File: MultiStepCombineTest.java, from beam (Apache License 2.0)
@Test
public void testMultiStepCombineWindowed() {
  SlidingWindows windowFn = SlidingWindows.of(Duration.millis(6L)).every(Duration.millis(3L));
  PCollection<KV<String, Long>> combined =
      pipeline
          .apply(
              Create.timestamped(
                  TimestampedValue.of(KV.of("foo", 1L), new Instant(1L)),
                  TimestampedValue.of(KV.of("bar", 2L), new Instant(2L)),
                  TimestampedValue.of(KV.of("bizzle", 3L), new Instant(3L)),
                  TimestampedValue.of(KV.of("bar", 4L), new Instant(4L)),
                  TimestampedValue.of(KV.of("bizzle", 11L), new Instant(11L))))
          .apply(Window.into(windowFn))
          .apply(Combine.perKey(new MultiStepCombineFn()));

  PAssert.that("Windows should combine only elements in their windows", combined)
      .inWindow(new IntervalWindow(new Instant(0L), Duration.millis(6L)))
      .containsInAnyOrder(KV.of("foo", 1L), KV.of("bar", 6L), KV.of("bizzle", 3L));
  PAssert.that("Elements should appear in all the windows they are assigned to", combined)
      .inWindow(new IntervalWindow(new Instant(-3L), Duration.millis(6L)))
      .containsInAnyOrder(KV.of("foo", 1L), KV.of("bar", 2L));
  PAssert.that(combined)
      .inWindow(new IntervalWindow(new Instant(6L), Duration.millis(6L)))
      .containsInAnyOrder(KV.of("bizzle", 11L));
  PAssert.that(combined)
      .containsInAnyOrder(
          KV.of("foo", 1L),
          KV.of("foo", 1L),
          KV.of("bar", 6L),
          KV.of("bar", 2L),
          KV.of("bar", 4L),
          KV.of("bizzle", 11L),
          KV.of("bizzle", 11L),
          KV.of("bizzle", 3L),
          KV.of("bizzle", 3L));
  pipeline.run();
}