com.google.cloud.dataflow.sdk.transforms.Flatten Java Exaples

Source File: MergeBranches.java From dockerflow with Apache License 2.0

5 votes

@Override
public PCollection<KV<String, WorkflowArgs>> apply(
    PCollectionList<KV<String, WorkflowArgs>> input) {
  return input
      .apply(Flatten.<KV<String, WorkflowArgs>>pCollections())
      .apply(Combine.globally(new Merge()));
}

Source File: FXTimeSeriesPipelineSRGTests.java From data-timeseries-java with Apache License 2.0

5 votes

public PCollection<KV<String, TSProto>> generateCompleteWindowData(Pipeline pipeline,
    List<KV<String, TSProto>> data, WorkPacketConfig packetConfig) {

  LOG.info("Check to see that time streams with missing 'ticks' have been corrected");

  PCollection<KV<String, TSProto>> tsData = setupDataInput(pipeline, data);


  PCollection<KV<String, TSProto>> windowedData =
      tsData.apply("CandleResolutionWindow", Window.<KV<String, TSProto>>into(FixedWindows
          .of(Duration.standardSeconds(((FXTimeSeriesPipelineOptions) pipeline.getOptions())
              .getCandleResolution()))));

  // Determine streams that are missing in this Window and generate values for them

  PCollection<KV<String, TSProto>> generatedValues =
      windowedData
          .apply(
              "DetectMissingTimeSeriesValues",
              Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig))
                  .withoutDefaults()).apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn()))
          .setName("CreateMissingTimeSeriesValues");

  // Flatten the live streams and the generated streams together

  PCollection<KV<String, TSProto>> completeWindowData =
      PCollectionList.of(windowedData).and(generatedValues)
          .apply("MergeGeneratedLiveValues", Flatten.<KV<String, TSProto>>pCollections());


  return completeWindowData;
}

Source File: TFIDF.java From flink-dataflow with Apache License 2.0

5 votes

@Override
public PCollection<KV<URI, String>> apply(PInput input) {
	Pipeline pipeline = input.getPipeline();

	// Create one TextIO.Read transform for each document
	// and add its output to a PCollectionList
	PCollectionList<KV<URI, String>> urisToLines =
			PCollectionList.empty(pipeline);

	// TextIO.Read supports:
	//  - file: URIs and paths locally
	//  - gs: URIs on the service
	for (final URI uri : uris) {
		String uriString;
		if (uri.getScheme().equals("file")) {
			uriString = new File(uri).getPath();
		} else {
			uriString = uri.toString();
		}

		PCollection<KV<URI, String>> oneUriToLines = pipeline
				.apply(TextIO.Read.from(uriString)
						.named("TextIO.Read(" + uriString + ")"))
				.apply("WithKeys(" + uriString + ")", WithKeys.<URI, String>of(uri));

		urisToLines = urisToLines.and(oneUriToLines);
	}

	return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
}

Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0

5 votes

@Override
public void translateNode(Flatten.FlattenPCollectionList<T> transform, FlinkBatchTranslationContext context) {
	List<PCollection<T>> allInputs = context.getInput(transform).getAll();
	DataSet<T> result = null;
	for(PCollection<T> collection : allInputs) {
		DataSet<T> current = context.getInputDataSet(collection);
		if (result == null) {
			result = current;
		} else {
			result = result.union(current);
		}
	}
	context.setOutputDataSet(context.getOutput(transform), result);
}

Source File: CreateAggregatesTransform.java From data-timeseries-java with Apache License 2.0

4 votes

@Override
public PCollection<KV<String, TSAggValueProto>> apply(PCollection<KV<String, TSProto>> input) {



  PCollection<KV<String, TSProto>> windowedData =
      input.apply("CandleResolutionWindow", Window.<KV<String, TSProto>>into(
          FixedWindows.of(Duration.standardSeconds(options.getCandleResolution()))));

  // Determine streams that are missing in this Window and generate values for them

  PCollection<KV<String, TSProto>> generatedValues = windowedData
      .apply("DetectMissingTimeSeriesValues",
          Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig))
              .withoutDefaults())
      .apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn()))
      .setName("CreateMissingTimeSeriesValues");

  // Flatten the live streams and the generated streams together

  PCollection<KV<String, TSProto>> completeWindowData =
      PCollectionList.of(windowedData).and(generatedValues).apply("MergeGeneratedLiveValues",
          Flatten.<KV<String, TSProto>>pCollections());

  // Create partial aggregates, at this stage we will not bring forward the previous windows close
  // value
  PCollection<KV<String, TSAggValueProto>> parital = completeWindowData
      .apply("CreatePartialAggregates", Combine.perKey(new PartialTimeSeriesAggCombiner()));

  // When these aggregates go through the Global Window they will lose their time value
  // We will embed the window close into the data so we can access it later on

  PCollection<KV<String, TSAggValueProto>> paritalWithWindowBoundary =
      parital.apply(ParDo.of(new EmbedWindowTimeIntoAggregateDoFn()));

  // Create a Global window which can retain the last value held in memory We must use
  // outputAtEarliestInputTimestamp as later on we re-attach the timestamp from within the data
  // point, for us not to hit 'skew' issues we need to ensure the output timestamp value is always
  // the smallest value
  PCollection<KV<String, TSAggValueProto>> completeAggregationStage1 =
      paritalWithWindowBoundary.apply("completeAggregationStage1",
          Window.<KV<String, TSAggValueProto>>into(new GlobalWindows())
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .withOutputTimeFn(OutputTimeFns.outputAtEarliestInputTimestamp())
              .accumulatingFiredPanes());

  PCollection<KV<String, TSAggValueProto>> completeAggregationStage2 = completeAggregationStage1
      .apply("CreateCompleteCandles", Combine.perKey(new CompleteTimeSeriesAggCombiner()))
      .apply("FlattenIterables", ParDo.of(new FlattenKVIterableDoFn()));



  // Reset timestamps after global window
  PCollection<KV<String, TSAggValueProto>> completeAggregationStage3 =
      completeAggregationStage2.apply("ResetTimestampsAfterGlobalWindow",
          ParDo.of(new DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>() {

            @Override
            public void processElement(
                DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>.ProcessContext c)
                throws Exception {
              //
              // TODO When the local Dataflow runners shuts down there will be some values
              // produced for the end of the the GlobalWindow. We can remove these values by
              // filtering out anything from year 3000+ for now. Better solution will be to check
              // the WINDOW PANE
              //
          	  Instant time = c.timestamp();
          	  
              if (time.isBefore(new Instant(32530703764000L))) {

                // The timestamp produced from the Combiner after the GlobalWindow loses fidelity,
                // we can add this back by looking at the value in the data

                if (time
                    .isAfter(new Instant(c.element().getValue().getCloseState().getTime()))) {

                  LOG.error(
                      "There was a timestamp before earlier than the window and skew must be 0 :: "
                          + TextFormat.shortDebugString(c.element().getValue()));

                } else {
                  c.outputWithTimestamp(c.element(),
                      new Instant(c.element().getValue().getCloseTime()));

                }
              }

            }

          }));

  return completeAggregationStage3;
}

Source File: FlattenizeITCase.java From flink-dataflow with Apache License 2.0

4 votes

@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> p1 = p.apply(Create.of(words));
	PCollection<String> p2 = p.apply(Create.of(words2));

	PCollectionList<String> list = PCollectionList.of(p1).and(p2);

	list.apply(Flatten.<String>pCollections()).apply(TextIO.Write.to(resultPath));

	PCollection<String> p3 = p.apply(Create.of(words3));

	PCollectionList<String> list2 = list.and(p3);

	list2.apply(Flatten.<String>pCollections()).apply(TextIO.Write.to(resultPath2));

	p.run();
}

com.google.cloud.dataflow.sdk.transforms.Flatten Java Examples