com.google.cloud.dataflow.sdk.transforms.Combine Java Examples

Source File: BreakFusion.java From dockerflow with Apache License 2.0

5 votes

/**
 * Sends the input through three chained steps: a ParDo named "BreakFusion" running
 * {@code DummyMapFn}, a per-key combine using {@code First}, and a {@code Values} extraction.
 * The result has the same element type as the input.
 *
 * @param input the collection to route through the three steps
 * @return the values produced after the per-key combine
 */
@Override
public PCollection<T> apply(PCollection<T> input) {
  // The per-key combine expects String-keyed pairs, so DummyMapFn presumably attaches
  // a String key to each element before the combine; Values then drops the keys again.
  PCollection<T> unkeyed =
      input
          .apply(ParDo.named("BreakFusion").of(new DummyMapFn<T>()))
          .apply(Combine.<String, T>perKey(new First<T>()))
          .apply(Values.<T>create());
  return unkeyed;
}

Source File: DockerDo.java From dockerflow with Apache License 2.0

5 votes

/**
 * Runs the keyed workflow arguments through a "Prepare" ParDo ({@code Gather} built from
 * {@code task}), a per-key combine using {@code SortArgs}, and a "CombineOutputs" ParDo
 * ({@code CombineArgs}).
 *
 * @param input keyed workflow arguments entering this step
 * @return the keyed workflow arguments emitted by the "CombineOutputs" ParDo
 */
@Override
public PCollection<KV<String, WorkflowArgs>> apply(
    PCollection<KV<String, WorkflowArgs>> input) {
  PCollection<KV<String, WorkflowArgs>> combinedOutputs =
      input
          .apply(ParDo.named("Prepare").of(new Gather(task)))
          .apply(Combine.perKey(new SortArgs()))
          .apply(ParDo.named("CombineOutputs").of(new CombineArgs()));
  return combinedOutputs;
}

Source File: MergeBranches.java From dockerflow with Apache License 2.0

5 votes

/**
 * Flattens every collection in the list into a single collection and then combines all
 * of its elements globally with {@code Merge}.
 *
 * @param input the list of branch collections to merge
 * @return the result of the global {@code Merge} combine over the flattened branches
 */
@Override
public PCollection<KV<String, WorkflowArgs>> apply(
    PCollectionList<KV<String, WorkflowArgs>> input) {
  // First collapse the branches into one collection ...
  PCollection<KV<String, WorkflowArgs>> flattened =
      input.apply(Flatten.<KV<String, WorkflowArgs>>pCollections());
  // ... then reduce that collection with the Merge combiner.
  return flattened.apply(Combine.globally(new Merge()));
}

Source File: LatestRides.java From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0

5 votes

/**
 * Builds and runs a pipeline that reads ride rows from a Pub/Sub topic, keys them by
 * {@code ride_id}, groups them into session windows with early firings, combines the points
 * of each ride with {@code LatestPointCombine}, and writes the resulting rows to a sink topic.
 *
 * @param args command-line arguments parsed into {@code CustomPipelineOptions}
 */
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline pipeline = Pipeline.create(options);

  // Fully qualified Pub/Sub topic names for the source and the sink.
  String sourceTopic =
      String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic());
  String sinkTopic =
      String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic());

  // Session gap and the processing-time delay used for early (speculative) panes.
  Duration sessionGap = Duration.standardMinutes(60);
  Duration earlyFiringDelay = Duration.millis(2000);

  pipeline
      .apply(
          PubsubIO.Read.named("read from PubSub")
              .topic(sourceTopic)
              .timestampLabel("ts")
              .withCoder(TableRowJsonCoder.of()))
      .apply(
          "key rides by rideid",
          MapElements.via((TableRow ride) -> KV.of(ride.get("ride_id").toString(), ride))
              .withOutputType(new TypeDescriptor<KV<String, TableRow>>() {}))
      .apply(
          "session windows on rides with early firings",
          Window.<KV<String, TableRow>>into(Sessions.withGapDuration(sessionGap))
              .triggering(
                  AfterWatermark.pastEndOfWindow()
                      .withEarlyFirings(
                          AfterProcessingTime.pastFirstElementInPane()
                              .plusDelayOf(earlyFiringDelay)))
              .accumulatingFiredPanes()
              .withAllowedLateness(Duration.ZERO))
      .apply("group ride points on same ride", Combine.perKey(new LatestPointCombine()))
      .apply(
          "discard key",
          MapElements.via((KV<String, TableRow> keyedRide) -> keyedRide.getValue())
              .withOutputType(TypeDescriptor.of(TableRow.class)))
      .apply(
          PubsubIO.Write.named("WriteToPubsub")
              .topic(sinkTopic)
              .withCoder(TableRowJsonCoder.of()));

  pipeline.run();
}

Source File: FXTimeSeriesPipelineSRGTests.java From data-timeseries-java with Apache License 2.0

5 votes

/**
 * Windows the supplied time-series data into fixed windows of the configured candle
 * resolution, generates values for streams detected as missing within a window, and merges
 * the generated values with the windowed live data.
 *
 * @param pipeline the pipeline to attach the input to; its options are read as
 *     {@code FXTimeSeriesPipelineOptions}
 * @param data the keyed time-series points used as input
 * @param packetConfig configuration handed to the missing-value detection combiner
 * @return the flattened union of the windowed live data and the generated values
 */
public PCollection<KV<String, TSProto>> generateCompleteWindowData(Pipeline pipeline,
    List<KV<String, TSProto>> data, WorkPacketConfig packetConfig) {

  LOG.info("Check to see that time streams with missing 'ticks' have been corrected");

  PCollection<KV<String, TSProto>> liveData = setupDataInput(pipeline, data);

  // Window size comes from the pipeline options (candle resolution, in seconds).
  FXTimeSeriesPipelineOptions fxOptions = (FXTimeSeriesPipelineOptions) pipeline.getOptions();
  Duration candleWindowSize = Duration.standardSeconds(fxOptions.getCandleResolution());

  PCollection<KV<String, TSProto>> windowed =
      liveData.apply(
          "CandleResolutionWindow",
          Window.<KV<String, TSProto>>into(FixedWindows.of(candleWindowSize)));

  // Detect the streams missing from each window and synthesize values for them.
  PCollection<KV<String, TSProto>> generated =
      windowed
          .apply(
              "DetectMissingTimeSeriesValues",
              Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig))
                  .withoutDefaults())
          .apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn()))
          .setName("CreateMissingTimeSeriesValues");

  // Merge the live and the generated streams into one collection.
  return PCollectionList.of(windowed)
      .and(generated)
      .apply("MergeGeneratedLiveValues", Flatten.<KV<String, TSProto>>pCollections());
}

Source File: FlinkAbstractParDoWrapper.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Returns the aggregator registered under {@code name} in the runtime context, creating and
 * registering a new {@code SerializableFnAggregatorWrapper} when none exists yet.
 *
 * @param name     the accumulator name to look up or register
 * @param combiner the combine function backing a newly created aggregator
 * @return the existing accumulator cast to an aggregator, or the newly registered wrapper
 */
@Override
protected <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT> createAggregatorInternal(String name, Combine.CombineFn<AggInputT, ?, AggOutputT> combiner) {
	Accumulator existing = getRuntimeContext().getAccumulator(name);
	if (existing == null) {
		// Nothing registered under this name yet: create the wrapper and register it.
		SerializableFnAggregatorWrapper<AggInputT, AggOutputT> created =
				new SerializableFnAggregatorWrapper<>(combiner);
		getRuntimeContext().addAccumulator(name, created);
		return created;
	}

	// Reuse the registered accumulator after checking its class against the wrapper type.
	AccumulatorHelper.compareAccumulatorTypes(name,
			SerializableFnAggregatorWrapper.class, existing.getClass());
	return (Aggregator<AggInputT, AggOutputT>) existing;
}

Source File: FlinkGroupAlsoByWindowWrapper.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Looks up the accumulator registered under {@code name}; if there is one it is type-checked
 * and reused, otherwise a fresh {@code SerializableFnAggregatorWrapper} is registered.
 *
 * @param name     the accumulator name to look up or register
 * @param combiner the combine function used when a new wrapper has to be created
 * @return the aggregator associated with {@code name}
 */
@Override
protected <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT> createAggregatorInternal(String name, Combine.CombineFn<AggInputT, ?, AggOutputT> combiner) {
	Accumulator registered = getRuntimeContext().getAccumulator(name);
	if (registered == null) {
		// First request for this name: build the wrapper and add it to the runtime context.
		SerializableFnAggregatorWrapper<AggInputT, AggOutputT> fresh =
				new SerializableFnAggregatorWrapper<>(combiner);
		getRuntimeContext().addAccumulator(name, fresh);
		return fresh;
	}

	// An accumulator already exists: compare its class with the expected wrapper class.
	AccumulatorHelper.compareAccumulatorTypes(name,
			SerializableFnAggregatorWrapper.class, registered.getClass());
	return (Aggregator<AggInputT, AggOutputT>) registered;
}

Source File: FlinkGroupAlsoByWindowWrapper.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Initializes the wrapper's configuration fields and then builds the group-also-by-window
 * operator from them.
 *
 * @param options           the pipeline options; must not be null
 * @param registry          the coder registry; must not be null
 * @param windowingStrategy the windowing strategy of the input; must not be null
 * @param inputCoder        the key/value coder of the input; must not be null
 * @param combiner          the keyed combine function; stored as given, without a null check
 */
private FlinkGroupAlsoByWindowWrapper(PipelineOptions options,
                                      CoderRegistry registry,
                                      WindowingStrategy<KV<K, VIN>, BoundedWindow> windowingStrategy,
                                      KvCoder<K, VIN> inputCoder,
                                      Combine.KeyedCombineFn<K, VIN, VACC, VOUT> combiner) {
	// Validate and store each required argument in one step.
	this.options = Preconditions.checkNotNull(options);
	this.coderRegistry = Preconditions.checkNotNull(registry);
	this.inputKvCoder = Preconditions.checkNotNull(inputCoder);
	this.windowingStrategy = Preconditions.checkNotNull(windowingStrategy);
	this.combineFn = combiner;

	// The operator is created only after all of the fields above have been assigned.
	this.operator = createGroupAlsoByWindowOperator();
	this.chainingStrategy = ChainingStrategy.ALWAYS;
}

Source File: FlinkGroupAlsoByWindowWrapper.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Creates a DataStream whose elements are grouped in windows according to the windowing
 * strategy of {@code input}. This method assumes that <b>elements are already grouped by key</b>.
 * <p>
 * Unlike {@link #createForIterable(PipelineOptions, PCollection, KeyedStream)}, this method
 * expects a combiner function to be provided
 * (see {@link com.google.cloud.dataflow.sdk.transforms.Combine.KeyedCombineFn}).
 * A combiner helps to increase the speed and, in most cases, to reduce the per-window state.
 *
 * @param options            the general job configuration options.
 * @param input              the input Dataflow {@link com.google.cloud.dataflow.sdk.values.PCollection}.
 * @param groupedStreamByKey the input stream, it is assumed to already be grouped by key.
 * @param combiner           the combiner to be used.
 * @param outputKvCoder      the coder for the output key/value pairs.
 * @return the stream produced by applying the windowing operator to {@code groupedStreamByKey}.
 */
public static <K, VIN, VACC, VOUT> DataStream<WindowedValue<KV<K, VOUT>>> create(
		PipelineOptions options,
		PCollection input,
		KeyedStream<WindowedValue<KV<K, VIN>>, K> groupedStreamByKey,
		Combine.KeyedCombineFn<K, VIN, VACC, VOUT> combiner,
		KvCoder<K, VOUT> outputKvCoder) {
	Preconditions.checkNotNull(options);

	// Build the windowing operator from the input's coder, registry and windowing strategy.
	KvCoder<K, VIN> kvInputCoder = (KvCoder<K, VIN>) input.getCoder();
	FlinkGroupAlsoByWindowWrapper windowOperator = new FlinkGroupAlsoByWindowWrapper<>(
			options,
			input.getPipeline().getCoderRegistry(),
			input.getWindowingStrategy(),
			kvInputCoder,
			combiner);

	// The output elements are windowed values of the output key/value type.
	Coder<WindowedValue<KV<K, VOUT>>> windowedOutputCoder =
			WindowedValue.FullWindowedValueCoder.of(
					outputKvCoder,
					input.getWindowingStrategy().getWindowFn().windowCoder());
	CoderTypeInformation<WindowedValue<KV<K, VOUT>>> windowedOutputTypeInfo =
			new CoderTypeInformation<>(windowedOutputCoder);

	DataStream<WindowedValue<KV<K, VOUT>>> result = groupedStreamByKey
			.transform("GroupByWindowWithCombiner",
					new CoderTypeInformation<>(outputKvCoder),
					windowOperator)
			.returns(windowedOutputTypeInfo);
	return result;
}

Source File: FlinkGroupAlsoByWindowWrapper.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Factory that exposes the private constructor, passing every argument through unchanged.
 * Judging by its name it is intended for tests.
 *
 * @param options           the pipeline options; must not be null
 * @param registry          the coder registry
 * @param windowingStrategy the windowing strategy of the input
 * @param inputCoder        the key/value coder of the input
 * @param combiner          the keyed combine function
 * @return a newly constructed wrapper
 */
public static <K, VIN, VACC, VOUT> FlinkGroupAlsoByWindowWrapper createForTesting(
		PipelineOptions options,
		CoderRegistry registry,
		WindowingStrategy<KV<K, VIN>, BoundedWindow> windowingStrategy,
		KvCoder<K, VIN> inputCoder,
		Combine.KeyedCombineFn<K, VIN, VACC, VOUT> combiner) {
	Preconditions.checkNotNull(options);

	FlinkGroupAlsoByWindowWrapper wrapper =
			new FlinkGroupAlsoByWindowWrapper(options, registry, windowingStrategy, inputCoder, combiner);
	return wrapper;
}

Source File: FlinkDoFnFunction.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Creates a {@code SerializableFnAggregatorWrapper} for {@code combiner}, registers it with
 * the runtime context under {@code name}, and returns it.
 *
 * @param name     the accumulator name to register under
 * @param combiner the combine function backing the aggregator
 * @return the newly registered aggregator
 */
@Override
protected <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT> createAggregatorInternal(String name, Combine.CombineFn<AggInputT, ?, AggOutputT> combiner) {
	SerializableFnAggregatorWrapper<AggInputT, AggOutputT> aggregator =
			new SerializableFnAggregatorWrapper<>(combiner);
	getRuntimeContext().addAccumulator(name, aggregator);
	return aggregator;
}

Source File: FlinkReduceFunction.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Creates the reduce function around the given keyed combine function.
 *
 * @param keyedCombineFn the combine function to store; it is not null-checked here
 */
public FlinkReduceFunction(Combine.KeyedCombineFn<K, ?, VA, VO> keyedCombineFn) {
	this.keyedCombineFn = keyedCombineFn;
}

Source File: FlinkPartialReduceFunction.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Creates the partial reduce function around the given keyed combine function.
 *
 * @param keyedCombineFn the combine function to store; it is not null-checked here
 */
public FlinkPartialReduceFunction(Combine.KeyedCombineFn<K, VI, VA, ?> keyedCombineFn) {
	this.keyedCombineFn = keyedCombineFn;
}

Source File: FlinkMultiOutputDoFnFunction.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Creates a {@code SerializableFnAggregatorWrapper} for {@code combiner}, registers it with
 * the runtime context under {@code name}, and returns it.
 *
 * @param name     the accumulator name to register under
 * @param combiner the combine function backing the aggregator
 * @return the newly registered aggregator (never {@code null})
 */
@Override
protected <AggInputT, AggOutputT> Aggregator<AggInputT, AggOutputT> createAggregatorInternal(String name, Combine.CombineFn<AggInputT, ?, AggOutputT> combiner) {
	SerializableFnAggregatorWrapper<AggInputT, AggOutputT> wrapper = new SerializableFnAggregatorWrapper<>(combiner);
	getRuntimeContext().addAccumulator(name, wrapper);
	// Return the registered wrapper rather than null so callers can actually use the
	// aggregator they asked for.
	return wrapper;
}

Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Translates a {@code Combine.PerKey} transform into two Flink operators: a group-combine that
 * turns input values into accumulators (VA), followed by a group-reduce that turns the
 * accumulators into output values (VO). Both groupings use the "key" field expression.
 *
 * @param transform the per-key combine transform to translate
 * @param context   the batch translation context providing input/output data sets and type info
 * @throws RuntimeException if the combine function cannot provide an accumulator coder
 */
@Override
public void translateNode(Combine.PerKey<K, VI, VO> transform, FlinkBatchTranslationContext context) {
	DataSet<KV<K, VI>> inputDataSet = context.getInputDataSet(context.getInput(transform));

	@SuppressWarnings("unchecked")
	Combine.KeyedCombineFn<K, VI, VA, VO> keyedCombineFn = (Combine.KeyedCombineFn<K, VI, VA, VO>) transform.getFn();

	KvCoder<K, VI> inputCoder = (KvCoder<K, VI>) context.getInput(transform).getCoder();

	// Fail fast when no accumulator coder is available: continuing with a null coder would
	// only produce a harder-to-diagnose failure further down the translation.
	Coder<VA> accumulatorCoder;
	try {
		accumulatorCoder = keyedCombineFn.getAccumulatorCoder(context.getInput(transform).getPipeline().getCoderRegistry(), inputCoder.getKeyCoder(), inputCoder.getValueCoder());
	} catch (CannotProvideCoderException e) {
		throw new RuntimeException(
				"Could not determine the accumulator coder for " + transform.getName(), e);
	}

	TypeInformation<KV<K, VI>> kvCoderTypeInformation = new KvCoderTypeInformation<>(inputCoder);
	TypeInformation<KV<K, VA>> partialReduceTypeInfo = new KvCoderTypeInformation<>(KvCoder.of(inputCoder.getKeyCoder(), accumulatorCoder));

	Grouping<KV<K, VI>> inputGrouping = new UnsortedGrouping<>(inputDataSet, new Keys.ExpressionKeys<>(new String[]{"key"}, kvCoderTypeInformation));

	FlinkPartialReduceFunction<K, VI, VA> partialReduceFunction = new FlinkPartialReduceFunction<>(keyedCombineFn);

	// Partially GroupReduce the values into the intermediate format VA (combine)
	GroupCombineOperator<KV<K, VI>, KV<K, VA>> groupCombine =
			new GroupCombineOperator<>(inputGrouping, partialReduceTypeInfo, partialReduceFunction,
					"GroupCombine: " + transform.getName());

	// Reduce fully to VO
	GroupReduceFunction<KV<K, VA>, KV<K, VO>> reduceFunction = new FlinkReduceFunction<>(keyedCombineFn);

	TypeInformation<KV<K, VO>> reduceTypeInfo = context.getTypeInfo(context.getOutput(transform));

	Grouping<KV<K, VA>> intermediateGrouping = new UnsortedGrouping<>(groupCombine, new Keys.ExpressionKeys<>(new String[]{"key"}, groupCombine.getType()));

	// Fully reduce the values and create output format VO
	GroupReduceOperator<KV<K, VA>, KV<K, VO>> outputDataSet =
			new GroupReduceOperator<>(intermediateGrouping, reduceTypeInfo, reduceFunction, transform.getName());

	context.setOutputDataSet(context.getOutput(transform), outputDataSet);
}

Source File: FlinkStateInternals.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Constructor overload for a keyed combine function: converts {@code combineFn} with
 * {@code withContext} and delegates to another constructor of this class.
 *
 * @param stateKey     passed through unchanged to the delegated constructor
 * @param combineFn    the keyed combine function to convert via {@code withContext}
 * @param accumCoder   passed through unchanged to the delegated constructor
 * @param stateContext passed through unchanged to the delegated constructor
 */
private FlinkInMemoryKeyedCombiningValue(ByteString stateKey,
                                         Combine.KeyedCombineFn<? super K, InputT, AccumT, OutputT> combineFn,
                                         Coder<AccumT> accumCoder,
                                         final StateContext<?> stateContext) {
	this(stateKey, withContext(combineFn), accumCoder, stateContext);
}

Source File: FlinkStateInternals.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Constructor overload for a plain (non-keyed) combine function: converts {@code combineFn}
 * with {@code withKeyAndContext} and delegates to another constructor of this class.
 *
 * @param stateKey     passed through unchanged to the delegated constructor
 * @param combineFn    the combine function to convert via {@code withKeyAndContext}
 * @param accumCoder   passed through unchanged to the delegated constructor
 * @param stateContext passed through unchanged to the delegated constructor
 */
private FlinkInMemoryKeyedCombiningValue(ByteString stateKey,
                                         Combine.CombineFn<InputT, AccumT, OutputT> combineFn,
                                         Coder<AccumT> accumCoder,
                                         final StateContext<?> stateContext) {
	this(stateKey, withKeyAndContext(combineFn), accumCoder, stateContext);
}

Source File: SerializableFnAggregatorWrapper.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Returns the combine function held in the {@code combiner} field.
 *
 * @return the {@code combiner} field
 */
@Override
public Combine.CombineFn<AI, ?, AO> getCombineFn() {
	return combiner;
}

Source File: SerializableFnAggregatorWrapper.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Creates the wrapper around the given combine function and then calls {@code resetLocal()}.
 *
 * @param combiner the combine function to wrap; it is not null-checked here
 */
public SerializableFnAggregatorWrapper(Combine.CombineFn<AI, ?, AO> combiner) {
	this.combiner = combiner;
	// NOTE(review): resetLocal() presumably initializes the local value using the combiner
	// assigned above, so the statement order likely matters — confirm before reordering.
	resetLocal();
}

Source File: CombineFnAggregatorWrapper.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Returns the combine function held in the {@code combiner} field.
 * The return type is the raw {@code Combine.CombineFn}.
 *
 * @return the {@code combiner} field
 */
@Override
public Combine.CombineFn getCombineFn() {
	return combiner;
}

Source File: CombineFnAggregatorWrapper.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Creates the wrapper around the given combine function and sets the {@code aa} field to a
 * new accumulator obtained from it.
 *
 * @param combiner the combine function to wrap; a null value causes a NullPointerException
 *                 when the accumulator is created
 */
public CombineFnAggregatorWrapper(Combine.CombineFn<? super AI, AA, AR> combiner) {
	this.combiner = combiner;
	// Start from a fresh accumulator created by the combine function itself.
	this.aa = combiner.createAccumulator();
}

Source File: FXTimeSeriesPipelineSRGTests.java From data-timeseries-java with Apache License 2.0

4 votes

/**
 * Builds the aggregation stages on top of {@link #generateCompleteWindowData}: partial
 * per-key aggregates, embedding of the window time, re-windowing into the global window with a
 * per-element trigger, complete per-key aggregates, and finally re-attachment of timestamps
 * taken from the aggregate values.
 *
 * @param pipeline the pipeline to build on
 * @param data the keyed time-series points used as input
 * @param packetConfig configuration forwarded to {@code generateCompleteWindowData}
 * @return the aggregates re-emitted with their close time as the element timestamp
 */
public PCollection<KV<String, TSAggValueProto>> createCompleteAggregates(Pipeline pipeline,
    List<KV<String, TSProto>> data, WorkPacketConfig packetConfig) {

  PCollection<KV<String, TSProto>> completeWindowData =
      generateCompleteWindowData(pipeline, data, packetConfig);

  // Stage 0: partial aggregates per key, then embed the window time into each aggregate.
  PCollection<KV<String, TSAggValueProto>> partialAggregates =
      completeWindowData.apply(
          "CreatePartialAggregates", Combine.perKey(new PartialTimeSeriesAggCombiner()));

  PCollection<KV<String, TSAggValueProto>> partialWithWindowBoundary =
      partialAggregates.apply(ParDo.of(new EmbedWindowTimeIntoAggregateDoFn()));

  // Stage 1: move into the global window, firing for every element and accumulating panes.
  PCollection<KV<String, TSAggValueProto>> globallyWindowed =
      partialWithWindowBoundary.apply(
          "completeAggregationStage1",
          Window.<KV<String, TSAggValueProto>>into(new GlobalWindows())
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .withOutputTimeFn(OutputTimeFns.outputAtEarliestInputTimestamp())
              .accumulatingFiredPanes());

  // Stage 2: complete aggregates per key, flattened back into individual elements.
  PCollection<KV<String, TSAggValueProto>> completeCandles =
      globallyWindowed
          .apply("CreateCompleteCandles", Combine.perKey(new CompleteTimeSeriesAggCombiner()))
          .apply("FlattenIterables", ParDo.of(new FlattenKVIterableDoFn()));

  // Stage 3: re-emit each aggregate with the close time stored in its value.
  return completeCandles.apply(
      "ResetTimestampsAfterGlobalWindow",
      ParDo.of(new DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>() {

        @Override
        public void processElement(
            DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>.ProcessContext c)
            throws Exception {
          Instant timestamp = c.timestamp();

          // Elements at or after this cutoff (epoch millis, roughly year 3000) are dropped.
          if (!timestamp.isBefore(new Instant(32530703764000L))) {
            return;
          }

          TSAggValueProto aggregate = c.element().getValue();

          // An element timestamp later than the close state's time is logged, not emitted.
          if (timestamp.isAfter(new Instant(aggregate.getCloseState().getTime()))) {
            LOG.error("BUG There was a timestamp before current :: "
                + TextFormat.shortDebugString(aggregate));
            return;
          }

          c.outputWithTimestamp(c.element(), new Instant(aggregate.getCloseTime()));
        }
      }));
}

Source File: CreateAggregatesTransform.java From data-timeseries-java with Apache License 2.0

4 votes

/**
 * Turns raw keyed time-series points into complete aggregates. The input is windowed by the
 * configured candle resolution, values are generated for streams missing in a window, partial
 * aggregates are computed per key, the aggregates are carried through a global window to build
 * complete candles, and the timestamps are finally reset from the data itself.
 *
 * @param input the keyed time-series points
 * @return the complete aggregates, timestamped with their close time
 */
@Override
public PCollection<KV<String, TSAggValueProto>> apply(PCollection<KV<String, TSProto>> input) {

  // Fixed windows sized by the candle resolution (seconds) from the options.
  Duration candleResolution = Duration.standardSeconds(options.getCandleResolution());
  PCollection<KV<String, TSProto>> windowed =
      input.apply(
          "CandleResolutionWindow",
          Window.<KV<String, TSProto>>into(FixedWindows.of(candleResolution)));

  // Find the streams that are missing in each window and generate values for them.
  PCollection<KV<String, TSProto>> generated =
      windowed
          .apply(
              "DetectMissingTimeSeriesValues",
              Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig))
                  .withoutDefaults())
          .apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn()))
          .setName("CreateMissingTimeSeriesValues");

  // Merge the live streams with the generated ones.
  PCollection<KV<String, TSProto>> completeWindowData =
      PCollectionList.of(windowed)
          .and(generated)
          .apply("MergeGeneratedLiveValues", Flatten.<KV<String, TSProto>>pCollections());

  // Partial aggregates: the previous window's close value is not brought forward yet.
  PCollection<KV<String, TSAggValueProto>> partialAggregates =
      completeWindowData.apply(
          "CreatePartialAggregates", Combine.perKey(new PartialTimeSeriesAggCombiner()));

  // Aggregates lose their time value in the global window, so the window close is embedded
  // into the data here and read back later.
  PCollection<KV<String, TSAggValueProto>> partialWithWindowBoundary =
      partialAggregates.apply(ParDo.of(new EmbedWindowTimeIntoAggregateDoFn()));

  // The global window retains the last value held in memory. outputAtEarliestInputTimestamp
  // keeps the output timestamp at the smallest value, which avoids 'skew' issues when the
  // timestamp from within the data point is re-attached further down.
  PCollection<KV<String, TSAggValueProto>> globallyWindowed =
      partialWithWindowBoundary.apply(
          "completeAggregationStage1",
          Window.<KV<String, TSAggValueProto>>into(new GlobalWindows())
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .withOutputTimeFn(OutputTimeFns.outputAtEarliestInputTimestamp())
              .accumulatingFiredPanes());

  PCollection<KV<String, TSAggValueProto>> completeCandles =
      globallyWindowed
          .apply("CreateCompleteCandles", Combine.perKey(new CompleteTimeSeriesAggCombiner()))
          .apply("FlattenIterables", ParDo.of(new FlattenKVIterableDoFn()));

  // Reset timestamps after the global window.
  return completeCandles.apply(
      "ResetTimestampsAfterGlobalWindow",
      ParDo.of(new DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>() {

        @Override
        public void processElement(
            DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>.ProcessContext c)
            throws Exception {
          Instant time = c.timestamp();

          // TODO When the local Dataflow runner shuts down it produces some values for the
          // end of the GlobalWindow. For now these are removed by filtering out anything
          // from year 3000+. A better solution would be to check the WINDOW PANE.
          if (!time.isBefore(new Instant(32530703764000L))) {
            return;
          }

          TSAggValueProto aggregate = c.element().getValue();

          // The timestamp produced by the Combiner after the GlobalWindow loses fidelity;
          // it is restored from the value carried in the data.
          if (time.isAfter(new Instant(aggregate.getCloseState().getTime()))) {
            LOG.error(
                "There was a timestamp before earlier than the window and skew must be 0 :: "
                    + TextFormat.shortDebugString(aggregate));
            return;
          }

          c.outputWithTimestamp(c.element(), new Instant(aggregate.getCloseTime()));
        }
      }));
}

com.google.cloud.dataflow.sdk.transforms.Combine Java Examples