com.google.cloud.dataflow.sdk.values.PCollectionList Java Examples

Source File: MergeBranches.java From dockerflow with Apache License 2.0

5 votes

/**
 * Collapses every branch of the input list into one collection.
 *
 * <p>The branches are first flattened into a single PCollection; that collection is then
 * combined globally with a fresh {@code Merge} instance.
 *
 * @param input the list of branch collections to merge
 * @return the result of globally combining the flattened branches
 */
@Override
public PCollection<KV<String, WorkflowArgs>> apply(
    PCollectionList<KV<String, WorkflowArgs>> input) {
  PCollection<KV<String, WorkflowArgs>> flattened =
      input.apply(Flatten.<KV<String, WorkflowArgs>>pCollections());
  return flattened.apply(Combine.globally(new Merge()));
}

Source File: FXTimeSeriesPipelineSRGTests.java From data-timeseries-java with Apache License 2.0

5 votes

/**
 * Builds the windowed input together with generated values for streams that are missing
 * in a window.
 *
 * <p>Steps: load the data into the pipeline via {@code setupDataInput}, window it into fixed
 * windows sized by the pipeline's candle resolution (in seconds), run the
 * missing-value detector/creator over each window, and flatten the live and generated
 * values into one collection.
 *
 * @param pipeline the pipeline to attach the transforms to; its options must be
 *     castable to {@code FXTimeSeriesPipelineOptions}
 * @param data the input key/value pairs
 * @param packetConfig configuration handed to the missing-value detection combiner
 * @return the flattened collection of windowed live values and generated values
 */
public PCollection<KV<String, TSProto>> generateCompleteWindowData(Pipeline pipeline,
    List<KV<String, TSProto>> data, WorkPacketConfig packetConfig) {

  LOG.info("Check to see that time streams with missing 'ticks' have been corrected");

  PCollection<KV<String, TSProto>> rawInput = setupDataInput(pipeline, data);

  // Fixed windows whose length is the configured candle resolution, in seconds.
  FXTimeSeriesPipelineOptions fxOptions = (FXTimeSeriesPipelineOptions) pipeline.getOptions();
  FixedWindows candleWindows =
      FixedWindows.of(Duration.standardSeconds(fxOptions.getCandleResolution()));
  PCollection<KV<String, TSProto>> liveValues =
      rawInput.apply("CandleResolutionWindow", Window.<KV<String, TSProto>>into(candleWindows));

  // Find the streams absent from each window and synthesize values for them.
  PCollection<KV<String, TSProto>> syntheticValues =
      liveValues
          .apply(
              "DetectMissingTimeSeriesValues",
              Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig))
                  .withoutDefaults())
          .apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn()))
          .setName("CreateMissingTimeSeriesValues");

  // Live and synthesized streams are merged into a single collection.
  return PCollectionList.of(liveValues)
      .and(syntheticValues)
      .apply("MergeGeneratedLiveValues", Flatten.<KV<String, TSProto>>pCollections());
}

Source File: TFIDF.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Reads every document in {@code uris} and emits each line keyed by the URI it came from.
 *
 * <p>One {@code TextIO.Read} transform is created per document; the per-document
 * outputs are gathered in a {@code PCollectionList} and flattened into a single
 * collection.
 *
 * @param input used only to obtain the pipeline the reads are attached to
 * @return a collection of (document URI, line) pairs covering all documents
 */
@Override
public PCollection<KV<URI, String>> apply(PInput input) {
	Pipeline pipeline = input.getPipeline();

	// Create one TextIO.Read transform for each document
	// and add its output to a PCollectionList
	PCollectionList<KV<URI, String>> urisToLines =
			PCollectionList.empty(pipeline);

	// TextIO.Read supports:
	//  - file: URIs and paths locally
	//  - gs: URIs on the service
	for (final URI uri : uris) {
		// URI.getScheme() returns null for a scheme-less (relative) URI, so compare
		// with the constant on the left to avoid a NullPointerException; such URIs
		// fall through to the plain string form.
		final String uriString;
		if ("file".equals(uri.getScheme())) {
			uriString = new File(uri).getPath();
		} else {
			uriString = uri.toString();
		}

		PCollection<KV<URI, String>> oneUriToLines = pipeline
				.apply(TextIO.Read.from(uriString)
						.named("TextIO.Read(" + uriString + ")"))
				.apply("WithKeys(" + uriString + ")", WithKeys.<URI, String>of(uri));

		// PCollectionList.and() returns a new list, so the result must be reassigned.
		urisToLines = urisToLines.and(oneUriToLines);
	}

	return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
}

Source File: AutoComplete.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Computes the top completion candidates per prefix and returns them as a two-element
 * list of collections.
 *
 * <p>When {@code minPrefix} exceeds 10 the flat computation is used and its output is
 * partitioned in two. Otherwise the result for {@code minPrefix + 1} is computed
 * first and reused to derive the result for prefixes of length exactly
 * {@code minPrefix}.
 *
 * @param input the completion candidates to rank
 * @return a list holding two collections of (prefix, top candidates) pairs
 */
@Override
public PCollectionList<KV<String, List<CompletionCandidate>>> apply(
    PCollection<CompletionCandidate> input) {
  if (minPrefix > 10) {
    // Base case, partitioning to return the output in the expected format.
    return input
        .apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix))
        .apply(Partition.of(2, new KeySizePartitionFn()));
  }

  // If a candidate is in the top N for prefix a...b, it must also be in the top
  // N for a...bX for every X, which is typically a much smaller set to consider.
  // First, compute the top candidate for prefixes of size at least minPrefix + 1.
  PCollectionList<KV<String, List<CompletionCandidate>>> longerPrefixTops =
      input.apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1));

  // Consider the top candidates for each prefix of length minPrefix + 1...
  PCollection<CompletionCandidate> topsOfNextLength =
      longerPrefixTops.get(1).apply(ParDo.of(new FlattenTops()));

  // ...together with those (previously excluded) candidates of length
  // exactly minPrefix...
  PCollection<CompletionCandidate> exactLengthCandidates =
      input.apply(Filter.by(new SerializableFunction<CompletionCandidate, Boolean>() {
        private static final long serialVersionUID = 0;

        @Override
        public Boolean apply(CompletionCandidate c) {
          return c.getValue().length() == minPrefix;
        }
      }));

  PCollection<KV<String, List<CompletionCandidate>>> shortPrefixTops =
      PCollectionList.of(topsOfNextLength)
          .and(exactLengthCandidates)
          .apply("FlattenSmall", Flatten.<CompletionCandidate>pCollections())
          // ...set the key to be the minPrefix-length prefix...
          .apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix)))
          // ...and (re)apply the Top operator to all of them together.
          .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix));

  PCollection<KV<String, List<CompletionCandidate>>> longerPrefixTopsFlat =
      longerPrefixTops.apply(
          "FlattenLarge", Flatten.<KV<String, List<CompletionCandidate>>>pCollections());

  return PCollectionList.of(longerPrefixTopsFlat).and(shortPrefixTops);
}

Source File: GatkPairedSingleSampleAlt.java From dockerflow with Apache License 2.0

4 votes

/**
 * Only this one method is different from GatkPairedSingleSample.java.
 *
 * <p>Builds the workflow graph: every entry of {@code argsTable} is first merged with the
 * default workflow args, then the pipeline forks into two branches, merges them with
 * {@code MergeBranches}, runs the recalibration steps, forks again and merges once more.
 *
 * @param argsTable per-sample workflow arguments keyed by name; each value is mutated in
 *     place by {@code mergeDefaultArgs}
 * @param pipelineOptions not read by this implementation; the options are rebuilt from
 *     {@code args} instead
 * @param args command-line arguments used to construct the pipeline options
 * @return the constructed (not yet run) pipeline
 * @throws IOException declared by the overridden method
 */
@Override
public Pipeline createDataflow(
    Map<String, WorkflowArgs> argsTable,
    DataflowPipelineOptions pipelineOptions,
    String[] args) throws IOException {

  DataflowPipelineOptions o = DataflowFactory.pipelineOptions(args);
  o.setAppName(GatkPairedSingleSampleAlt.class.getSimpleName());
  Pipeline p = Pipeline.create(o);

  // Merge sample-specific args with default workflow args.
  // Only the values are needed, so iterate them directly instead of keySet() + get().
  for (WorkflowArgs instanceArgs : argsTable.values()) {
    instanceArgs.mergeDefaultArgs(workflowArgs);
  }

  // Declarations
  PCollection<KV<String, WorkflowArgs>> mainBranch, branchOne, branchTwo;
  PCollectionList<KV<String, WorkflowArgs>> mergeList;

  // Construct the workflow graph
  mainBranch = p.apply(Create.of(argsTable));

  // First fork: sequence grouping runs alongside the alignment chain.
  branchOne = mainBranch.apply(DockerDo.of(CreateSequenceGroupingTSV));
  branchTwo = mainBranch.apply(DockerDo.of(BwaVersion))
      .apply(DockerDo.of(SamToFastqAndBwaMem))
      .apply(DockerDo.of(MergeBamAlignment))
      .apply(DockerDo.of(SortAndFixReadGroupBam))
      .apply(DockerDo.of(MarkDuplicates))
      .apply(DockerDo.of(SortAndFixSampleBam));
  mergeList = PCollectionList.of(branchOne).and(branchTwo);

  // Join the branches, then continue on a single main branch.
  mainBranch = mergeList.apply(new MergeBranches())
      .apply(DockerDo.of(BaseRecalibrator))
      .apply(DockerDo.of(ApplyBQSR))
      .apply(DockerDo.of(GatherBqsrReports))
      .apply(DockerDo.of(ApplyBQSRToUnmappedReads))
      .apply(DockerDo.of(GatherBamFiles));

  // Second fork: CRAM conversion alongside variant calling.
  branchOne = mainBranch.apply(DockerDo.of(ConvertToCram));
  branchTwo = mainBranch.apply(DockerDo.of(HaplotypeCaller))
      .apply(DockerDo.of(GatherVCFs));
  mergeList = PCollectionList.of(branchOne).and(branchTwo);

  // The final merge is applied for its effect on the pipeline graph; its output is not
  // consumed further, so it is not stored.
  mergeList.apply(new MergeBranches());

  return p;
}

Source File: CreateAggregatesTransform.java From data-timeseries-java with Apache License 2.0

4 votes

/**
 * Turns a stream of time-series values into per-key aggregates.
 *
 * <p>The input is windowed into fixed windows of the configured candle resolution, values
 * for streams missing in a window are generated and merged in, partial aggregates are
 * computed per key, passed through a global window to be completed, and finally
 * re-stamped with the close time carried inside each aggregate.
 *
 * @param input keyed time-series values
 * @return keyed aggregates with their timestamps reset from the data
 */
@Override
public PCollection<KV<String, TSAggValueProto>> apply(PCollection<KV<String, TSProto>> input) {

  FixedWindows candleWindows =
      FixedWindows.of(Duration.standardSeconds(options.getCandleResolution()));
  PCollection<KV<String, TSProto>> liveValues =
      input.apply("CandleResolutionWindow", Window.<KV<String, TSProto>>into(candleWindows));

  // Find the streams absent from each window and synthesize values for them.
  PCollection<KV<String, TSProto>> syntheticValues =
      liveValues
          .apply(
              "DetectMissingTimeSeriesValues",
              Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig))
                  .withoutDefaults())
          .apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn()))
          .setName("CreateMissingTimeSeriesValues");

  // Live and synthesized streams are merged into a single collection.
  PCollection<KV<String, TSProto>> allWindowValues =
      PCollectionList.of(liveValues)
          .and(syntheticValues)
          .apply("MergeGeneratedLiveValues", Flatten.<KV<String, TSProto>>pCollections());

  // Partial aggregates only: the previous window's close value is not carried forward at
  // this stage.
  PCollection<KV<String, TSAggValueProto>> partialAggregates =
      allWindowValues.apply(
          "CreatePartialAggregates", Combine.perKey(new PartialTimeSeriesAggCombiner()));

  // Passing through the Global Window drops the time value of these aggregates, so the
  // window close is embedded into the data where it can be read back later.
  PCollection<KV<String, TSAggValueProto>> partialsWithWindowBoundary =
      partialAggregates.apply(ParDo.of(new EmbedWindowTimeIntoAggregateDoFn()));

  // A Global window that can retain the last value held in memory.
  // outputAtEarliestInputTimestamp is required because the timestamp is re-attached from
  // the data point later on; keeping the output timestamp at the smallest value avoids
  // 'skew' issues.
  PCollection<KV<String, TSAggValueProto>> globallyWindowed =
      partialsWithWindowBoundary.apply(
          "completeAggregationStage1",
          Window.<KV<String, TSAggValueProto>>into(new GlobalWindows())
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .withOutputTimeFn(OutputTimeFns.outputAtEarliestInputTimestamp())
              .accumulatingFiredPanes());

  PCollection<KV<String, TSAggValueProto>> completedCandles =
      globallyWindowed
          .apply("CreateCompleteCandles", Combine.perKey(new CompleteTimeSeriesAggCombiner()))
          .apply("FlattenIterables", ParDo.of(new FlattenKVIterableDoFn()));

  // Reset timestamps after global window
  return completedCandles.apply(
      "ResetTimestampsAfterGlobalWindow",
      ParDo.of(new DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>() {

        @Override
        public void processElement(
            DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>.ProcessContext c)
            throws Exception {
          // TODO When the local Dataflow runners shuts down there will be some values
          // produced for the end of the the GlobalWindow. We can remove these values by
          // filtering out anything from year 3000+ for now. Better solution will be to
          // check the WINDOW PANE
          Instant elementTime = c.timestamp();
          if (!elementTime.isBefore(new Instant(32530703764000L))) {
            return;
          }

          KV<String, TSAggValueProto> element = c.element();
          TSAggValueProto aggregate = element.getValue();

          // The timestamp produced from the Combiner after the GlobalWindow loses
          // fidelity, it is restored from the value carried in the data.
          // NOTE(review): the check reads getCloseState().getTime() while the output
          // below uses getCloseTime() -- confirm both are intended.
          if (elementTime.isAfter(new Instant(aggregate.getCloseState().getTime()))) {
            LOG.error(
                "There was a timestamp before earlier than the window and skew must be 0 :: "
                    + TextFormat.shortDebugString(aggregate));
            return;
          }

          c.outputWithTimestamp(element, new Instant(aggregate.getCloseTime()));
        }
      }));
}

Source File: FlattenizeITCase.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Builds a batch pipeline that flattens PCollectionLists of two and then three
 * collections, writing each flattened result to its own output path, and runs it.
 *
 * <p>The three-element list is derived from the two-element list via {@code and}, after
 * the two-element list has already been flattened and written.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Flatten the first two inputs and write them out.
	PCollection<String> first = pipeline.apply(Create.of(words));
	PCollection<String> second = pipeline.apply(Create.of(words2));
	PCollectionList<String> firstAndSecond = PCollectionList.of(first).and(second);
	PCollection<String> mergedTwo = firstAndSecond.apply(Flatten.<String>pCollections());
	mergedTwo.apply(TextIO.Write.to(resultPath));

	// Extend the existing list with a third input and write the larger merge separately.
	PCollection<String> third = pipeline.apply(Create.of(words3));
	PCollectionList<String> allThree = firstAndSecond.and(third);
	PCollection<String> mergedThree = allThree.apply(Flatten.<String>pCollections());
	mergedThree.apply(TextIO.Write.to(resultPath2));

	pipeline.run();
}

com.google.cloud.dataflow.sdk.values.PCollectionList Java Examples