com.google.cloud.dataflow.sdk.values.PCollectionList Java Examples
The following examples show how to use
com.google.cloud.dataflow.sdk.values.PCollectionList.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: MergeBranches.java From dockerflow with Apache License 2.0 | 5 votes |
/**
 * Merges all branches of the input list into a single collection.
 *
 * <p>The branches are first flattened into one collection, and the flattened elements are then
 * combined globally using {@code Merge}.
 *
 * @param input the list of branches to merge
 * @return the collection produced by globally combining the flattened branches
 */
@Override
public PCollection<KV<String, WorkflowArgs>> apply(
    PCollectionList<KV<String, WorkflowArgs>> input) {
  PCollection<KV<String, WorkflowArgs>> flattened =
      input.apply(Flatten.<KV<String, WorkflowArgs>>pCollections());
  return flattened.apply(Combine.globally(new Merge()));
}
Example #2
Source File: FXTimeSeriesPipelineSRGTests.java From data-timeseries-java with Apache License 2.0 | 5 votes |
public PCollection<KV<String, TSProto>> generateCompleteWindowData(Pipeline pipeline, List<KV<String, TSProto>> data, WorkPacketConfig packetConfig) { LOG.info("Check to see that time streams with missing 'ticks' have been corrected"); PCollection<KV<String, TSProto>> tsData = setupDataInput(pipeline, data); PCollection<KV<String, TSProto>> windowedData = tsData.apply("CandleResolutionWindow", Window.<KV<String, TSProto>>into(FixedWindows .of(Duration.standardSeconds(((FXTimeSeriesPipelineOptions) pipeline.getOptions()) .getCandleResolution())))); // Determine streams that are missing in this Window and generate values for them PCollection<KV<String, TSProto>> generatedValues = windowedData .apply( "DetectMissingTimeSeriesValues", Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig)) .withoutDefaults()).apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn())) .setName("CreateMissingTimeSeriesValues"); // Flatten the live streams and the generated streams together PCollection<KV<String, TSProto>> completeWindowData = PCollectionList.of(windowedData).and(generatedValues) .apply("MergeGeneratedLiveValues", Flatten.<KV<String, TSProto>>pCollections()); return completeWindowData; }
Example #3
Source File: TFIDF.java From flink-dataflow with Apache License 2.0 | 5 votes |
@Override public PCollection<KV<URI, String>> apply(PInput input) { Pipeline pipeline = input.getPipeline(); // Create one TextIO.Read transform for each document // and add its output to a PCollectionList PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline); // TextIO.Read supports: // - file: URIs and paths locally // - gs: URIs on the service for (final URI uri : uris) { String uriString; if (uri.getScheme().equals("file")) { uriString = new File(uri).getPath(); } else { uriString = uri.toString(); } PCollection<KV<URI, String>> oneUriToLines = pipeline .apply(TextIO.Read.from(uriString) .named("TextIO.Read(" + uriString + ")")) .apply("WithKeys(" + uriString + ")", WithKeys.<URI, String>of(uri)); urisToLines = urisToLines.and(oneUriToLines); } return urisToLines.apply(Flatten.<KV<URI, String>>pCollections()); }
Example #4
Source File: AutoComplete.java From flink-dataflow with Apache License 2.0 | 5 votes |
@Override public PCollectionList<KV<String, List<CompletionCandidate>>> apply( PCollection<CompletionCandidate> input) { if (minPrefix > 10) { // Base case, partitioning to return the output in the expected format. return input .apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix)) .apply(Partition.of(2, new KeySizePartitionFn())); } else { // If a candidate is in the top N for prefix a...b, it must also be in the top // N for a...bX for every X, which is typlically a much smaller set to consider. // First, compute the top candidate for prefixes of size at least minPrefix + 1. PCollectionList<KV<String, List<CompletionCandidate>>> larger = input .apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1)); // Consider the top candidates for each prefix of length minPrefix + 1... PCollection<KV<String, List<CompletionCandidate>>> small = PCollectionList .of(larger.get(1).apply(ParDo.of(new FlattenTops()))) // ...together with those (previously excluded) candidates of length // exactly minPrefix... .and(input.apply(Filter.by(new SerializableFunction<CompletionCandidate, Boolean>() { private static final long serialVersionUID = 0; @Override public Boolean apply(CompletionCandidate c) { return c.getValue().length() == minPrefix; } }))) .apply("FlattenSmall", Flatten.<CompletionCandidate>pCollections()) // ...set the key to be the minPrefix-length prefix... .apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix))) // ...and (re)apply the Top operator to all of them together. .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix)); PCollection<KV<String, List<CompletionCandidate>>> flattenLarger = larger .apply("FlattenLarge", Flatten.<KV<String, List<CompletionCandidate>>>pCollections()); return PCollectionList.of(flattenLarger).and(small); } }
Example #5
Source File: GatkPairedSingleSampleAlt.java From dockerflow with Apache License 2.0 | 4 votes |
/** * Only this one method is different from GatkPairedSingleSample.java. */ @Override public Pipeline createDataflow( Map<String, WorkflowArgs> argsTable, DataflowPipelineOptions pipelineOptions, String[] args) throws IOException { DataflowPipelineOptions o = DataflowFactory.pipelineOptions(args); o.setAppName(GatkPairedSingleSampleAlt.class.getSimpleName()); Pipeline p = Pipeline.create(o); // Merge sample-specific args with default workflow args for (String key : argsTable.keySet()) { WorkflowArgs instanceArgs = argsTable.get(key); instanceArgs.mergeDefaultArgs(workflowArgs); } // Declarations PCollection<KV<String, WorkflowArgs>> mainBranch, branchOne, branchTwo; PCollectionList<KV<String, WorkflowArgs>> mergeList; // Construct the workflow graph mainBranch = p.apply(Create.of(argsTable)); branchOne = mainBranch.apply(DockerDo.of(CreateSequenceGroupingTSV)); branchTwo = mainBranch.apply(DockerDo.of(BwaVersion)) .apply(DockerDo.of(SamToFastqAndBwaMem)) .apply(DockerDo.of(MergeBamAlignment)) .apply(DockerDo.of(SortAndFixReadGroupBam)) .apply(DockerDo.of(MarkDuplicates)) .apply(DockerDo.of(SortAndFixSampleBam)); mergeList = PCollectionList.of(branchOne).and(branchTwo); mainBranch = mergeList.apply(new MergeBranches()) .apply(DockerDo.of(BaseRecalibrator)) .apply(DockerDo.of(ApplyBQSR)) .apply(DockerDo.of(GatherBqsrReports)) .apply(DockerDo.of(ApplyBQSRToUnmappedReads)) .apply(DockerDo.of(GatherBamFiles)); branchOne = mainBranch.apply(DockerDo.of(ConvertToCram)); branchTwo = mainBranch.apply(DockerDo.of(HaplotypeCaller)) .apply(DockerDo.of(GatherVCFs)); mergeList = PCollectionList.of(branchOne).and(branchTwo); mainBranch = mergeList.apply(new MergeBranches()); return p; }
Example #6
Source File: CreateAggregatesTransform.java From data-timeseries-java with Apache License 2.0 | 4 votes |
@Override public PCollection<KV<String, TSAggValueProto>> apply(PCollection<KV<String, TSProto>> input) { PCollection<KV<String, TSProto>> windowedData = input.apply("CandleResolutionWindow", Window.<KV<String, TSProto>>into( FixedWindows.of(Duration.standardSeconds(options.getCandleResolution())))); // Determine streams that are missing in this Window and generate values for them PCollection<KV<String, TSProto>> generatedValues = windowedData .apply("DetectMissingTimeSeriesValues", Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig)) .withoutDefaults()) .apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn())) .setName("CreateMissingTimeSeriesValues"); // Flatten the live streams and the generated streams together PCollection<KV<String, TSProto>> completeWindowData = PCollectionList.of(windowedData).and(generatedValues).apply("MergeGeneratedLiveValues", Flatten.<KV<String, TSProto>>pCollections()); // Create partial aggregates, at this stage we will not bring forward the previous windows close // value PCollection<KV<String, TSAggValueProto>> parital = completeWindowData .apply("CreatePartialAggregates", Combine.perKey(new PartialTimeSeriesAggCombiner())); // When these aggregates go through the Global Window they will lose their time value // We will embed the window close into the data so we can access it later on PCollection<KV<String, TSAggValueProto>> paritalWithWindowBoundary = parital.apply(ParDo.of(new EmbedWindowTimeIntoAggregateDoFn())); // Create a Global window which can retain the last value held in memory We must use // outputAtEarliestInputTimestamp as later on we re-attach the timestamp from within the data // point, for us not to hit 'skew' issues we need to ensure the output timestamp value is always // the smallest value PCollection<KV<String, TSAggValueProto>> completeAggregationStage1 = paritalWithWindowBoundary.apply("completeAggregationStage1", Window.<KV<String, TSAggValueProto>>into(new GlobalWindows()) 
.triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))) .withOutputTimeFn(OutputTimeFns.outputAtEarliestInputTimestamp()) .accumulatingFiredPanes()); PCollection<KV<String, TSAggValueProto>> completeAggregationStage2 = completeAggregationStage1 .apply("CreateCompleteCandles", Combine.perKey(new CompleteTimeSeriesAggCombiner())) .apply("FlattenIterables", ParDo.of(new FlattenKVIterableDoFn())); // Reset timestamps after global window PCollection<KV<String, TSAggValueProto>> completeAggregationStage3 = completeAggregationStage2.apply("ResetTimestampsAfterGlobalWindow", ParDo.of(new DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>() { @Override public void processElement( DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>.ProcessContext c) throws Exception { // // TODO When the local Dataflow runners shuts down there will be some values // produced for the end of the the GlobalWindow. We can remove these values by // filtering out anything from year 3000+ for now. Better solution will be to check // the WINDOW PANE // Instant time = c.timestamp(); if (time.isBefore(new Instant(32530703764000L))) { // The timestamp produced from the Combiner after the GlobalWindow loses fidelity, // we can add this back by looking at the value in the data if (time .isAfter(new Instant(c.element().getValue().getCloseState().getTime()))) { LOG.error( "There was a timestamp before earlier than the window and skew must be 0 :: " + TextFormat.shortDebugString(c.element().getValue())); } else { c.outputWithTimestamp(c.element(), new Instant(c.element().getValue().getCloseTime())); } } } })); return completeAggregationStage3; }
Example #7
Source File: FlattenizeITCase.java From flink-dataflow with Apache License 2.0 | 4 votes |
/**
 * Builds and runs a batch pipeline that flattens a two-element collection list and, after
 * extending that list with a third collection, flattens the three-element list as well.
 *
 * <p>The first flattened result is written to {@code resultPath}, the second to
 * {@code resultPath2}.
 *
 * @throws Exception declared by the overridden method
 */
@Override
protected void testProgram() throws Exception {
  Pipeline pipeline = FlinkTestPipeline.createForBatch();

  PCollection<String> first = pipeline.apply(Create.of(words));
  PCollection<String> second = pipeline.apply(Create.of(words2));

  // Flatten the first two collections and write the result.
  PCollectionList<String> firstTwo = PCollectionList.of(first).and(second);
  PCollection<String> flattenedTwo = firstTwo.apply(Flatten.<String>pCollections());
  flattenedTwo.apply(TextIO.Write.to(resultPath));

  PCollection<String> third = pipeline.apply(Create.of(words3));

  // Extend the existing list with the third collection and flatten again.
  PCollectionList<String> allThree = firstTwo.and(third);
  PCollection<String> flattenedThree = allThree.apply(Flatten.<String>pCollections());
  flattenedThree.apply(TextIO.Write.to(resultPath2));

  pipeline.run();
}