Java Code Examples for com.google.cloud.dataflow.sdk.values.PCollection#apply()
The following examples show how to use
com.google.cloud.dataflow.sdk.values.PCollection#apply().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ExportedServiceAccountKeyRemover.java From policyscanner with Apache License 2.0 | 6 votes |
private PCollection<String> constructPipeline(Pipeline pipeline, String org) { // Read projects from the CRM API. PCollection<GCPProject> projects = pipeline.apply(Read.from(new LiveProjectSource(org))); // List the service accounts of the projects. PCollection<GCPServiceAccount> serviceAccounts = projects.apply(ParDo.named("List Service Accounts").of(new ListServiceAccounts())); // List the keys of the service accounts. PCollection<GCPServiceAccountKey> serviceAccountKeys = serviceAccounts.apply(ParDo.named("List Service Account Keys") .of(new ListServiceAccountKeys())); // Construct an alert message for all the discrepancies found. return serviceAccountKeys.apply(ParDo .named("Remove user-managed keys") .of(new ExportedServiceAccountKeyMessenger())); }
Example 2
Source File: ReadSourceITCase.java From flink-dataflow with Apache License 2.0 | 6 votes |
/**
 * Runs a batch pipeline that reads integers from a {@code ReadSource(1, 10)}, converts each
 * to its string form, and writes the strings to {@code resultPath}.
 *
 * @param resultPath output location handed to {@code TextIO.Write}
 */
private static void runProgram(String resultPath) {
  Pipeline pipeline = FlinkTestPipeline.createForBatch();

  PCollection<Integer> numbers = pipeline.apply(Read.from(new ReadSource(1, 10)));

  // Element-wise Integer -> String conversion.
  DoFn<Integer, String> toStringFn = new DoFn<Integer, String>() {
    @Override
    public void processElement(ProcessContext c) throws Exception {
      c.output(c.element().toString());
    }
  };
  PCollection<String> result = numbers.apply(ParDo.of(toStringFn));

  result.apply(TextIO.Write.to(resultPath));
  pipeline.run();
}
Example 3
Source File: UnboundedSourceITCase.java From flink-dataflow with Apache License 2.0 | 6 votes |
/**
 * Runs a streaming pipeline over a {@code RangeReadSource(1, 10)} and expects the run to end
 * with an exception whose nested cause carries the "source terminates" message.
 *
 * @param resultPath output location handed to {@code TextIO.Write}
 */
private static void runProgram(String resultPath) {
  Pipeline pipeline = FlinkTestPipeline.createForStreaming();

  // Global window that fires once at least 10 elements have arrived, discarding fired panes.
  PCollection<Integer> windowed = pipeline
      .apply(Read.from(new RangeReadSource(1, 10)))
      .apply(Window.<Integer>into(new GlobalWindows())
          .triggering(AfterPane.elementCountAtLeast(10))
          .discardingFiredPanes());

  PCollection<String> result = windowed.apply(ParDo.of(new DoFn<Integer, String>() {
    @Override
    public void processElement(ProcessContext c) throws Exception {
      c.output(c.element().toString());
    }
  }));

  result.apply(TextIO.Write.to(resultPath));

  try {
    pipeline.run();
    // Reaching this point means the run did not throw.
    fail();
  } catch (Exception e) {
    assertEquals("The source terminates as expected.", e.getCause().getCause().getMessage());
  }
}
Example 4
Source File: LoadBooks.java From cloud-bigtable-examples with Apache License 2.0 | 6 votes |
public static void main(String[] args) { // CloudBigtableOptions is one way to retrieve the options. It's not required. // https://github.com/GoogleCloudPlatform/cloud-bigtable-examples/blob/master/java/dataflow-connector-examples/src/main/java/com/google/cloud/bigtable/dataflow/example/HelloWorldWrite.java BigtableCsvOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableCsvOptions.class); CloudBigtableTableConfiguration config = CloudBigtableTableConfiguration.fromCBTOptions(options); Pipeline p = Pipeline.create(options); CloudBigtableIO.initializeForWrite(p); PCollection<KV<String, Integer>> ngrams = applyPipelineToParseBooks(p.apply(TextIO.Read.from(options.getInputFile()))); PCollection<Mutation> mutations = ngrams.apply(ParDo.of(ENCODE_NGRAM)); mutations.apply(CloudBigtableIO.writeToTable(config)); // Run the pipeline. p.run(); }
Example 5
Source File: RemoveDuplicatesITCase.java From flink-dataflow with Apache License 2.0 | 6 votes |
/**
 * Feeds a fixed list of keys containing repeats through {@code RemoveDuplicates} and writes
 * the result to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
  List<String> keysWithRepeats = Arrays.asList("k1", "k5", "k5", "k2", "k1", "k2", "k3");

  Pipeline pipeline = FlinkTestPipeline.createForBatch();

  pipeline
      .apply(Create.of(keysWithRepeats))
      .setCoder(StringUtf8Coder.of())
      .apply(RemoveDuplicates.<String>create())
      .apply(TextIO.Write.to(resultPath));

  pipeline.run();
}
Example 6
Source File: TfIdfITCase.java From flink-dataflow with Apache License 2.0 | 6 votes |
/**
 * Computes TF-IDF over three small in-memory documents, then writes the distinct words
 * (the keys of the TF-IDF result) to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
  Pipeline testPipeline = FlinkTestPipeline.createForBatch();

  // URI elements are encoded through a string-delegating coder.
  testPipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

  PCollection<KV<URI, String>> documents = testPipeline.apply(Create.of(
      KV.of(new URI("x"), "a b c d"),
      KV.of(new URI("y"), "a b c"),
      KV.of(new URI("z"), "a m n")));

  PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf =
      documents.apply(new TfIdf.ComputeTfIdf());

  // Keep only the words and collapse repeats before writing.
  wordToUriAndTfIdf
      .apply(Keys.<String>create())
      .apply(RemoveDuplicates.<String>create())
      .apply(TextIO.Write.to(resultPath));

  testPipeline.run();
}
Example 7
Source File: DockerDo.java From dockerflow with Apache License 2.0 | 5 votes |
@Override public PCollection<KV<String, WorkflowArgs>> apply( PCollection<KV<String, WorkflowArgs>> input) { PCollection<KV<String, WorkflowArgs>> pc = input; // Add retries for (int i = 1; i < ((WorkflowArgs) task.getArgs()).getMaxTries(); ++i) { pc = pc.apply(new RunTask(task, i)); } return pc; }
Example 8
Source File: DockerDo.java From dockerflow with Apache License 2.0 | 5 votes |
/**
 * Applies the start / break-fusion / wait sequence for one attempt of the task. On attempt
 * 0 only, a "Prepare" step running {@code ClearOperationStatus} is applied first.
 *
 * @param input the incoming keyed workflow arguments
 * @return the output of the "Wait" step
 */
@Override
public PCollection<KV<String, WorkflowArgs>> apply(
    PCollection<KV<String, WorkflowArgs>> input) {
  final PCollection<KV<String, WorkflowArgs>> prepared;
  if (attempt != 0) {
    prepared = input;
  } else {
    prepared = input.apply(ParDo.named("Prepare").of(new ClearOperationStatus()));
  }

  PCollection<KV<String, WorkflowArgs>> started =
      prepared.apply(ParDo.named("Start").of(new StartTask(task, attempt)));
  PCollection<KV<String, WorkflowArgs>> unfused =
      started.apply(new BreakFusion<KV<String, WorkflowArgs>>("AfterStarted"));
  return unfused.apply(ParDo.named("Wait").of(new WaitForOperation()));
}
Example 9
Source File: DataflowFactory.java From dockerflow with Apache License 2.0 | 5 votes |
/** * Recursively construct the dataflow pipeline. * * @param graphItem a node, edge or branch point * @param input the inputs to the graph element * @throws IOException */ private static PCollection<KV<String, WorkflowArgs>> dataflow( GraphItem graphItem, PCollection<KV<String, WorkflowArgs>> input) throws IOException { PCollection<KV<String, WorkflowArgs>> output = input; // It's a node if (graphItem instanceof Workflow) { Workflow w = (Workflow) graphItem; LOG.info("Adding task: " + w.getDefn().getName()); output = input.apply(DockerDo.of(w)); // It's a branch } else if (graphItem instanceof Branch) { LOG.info("Pipeline splits into branches. Adding branches"); output = branches(((Branch) graphItem), input); // It's an edge } else if (graphItem instanceof Steps) { LOG.info("Adding steps"); Steps steps = (Steps) graphItem; // For each sequential element, the output of one is the input to // the next if (steps.getSteps() != null) { for (GraphItem item : steps.getSteps()) { output = dataflow(item, output); } } } else { throw new IllegalStateException("Invalid graph element type: " + graphItem); } return output; }
Example 10
Source File: JoinExamples.java From flink-dataflow with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception { Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); Pipeline p = Pipeline.create(options); // the following two 'applys' create multiple inputs to our pipeline, one for each // of our two input sources. PCollection<TableRow> eventsTable = p.apply(BigQueryIO.Read.from(GDELT_EVENTS_TABLE)); PCollection<TableRow> countryCodes = p.apply(BigQueryIO.Read.from(COUNTRY_CODES)); PCollection<String> formattedResults = joinEvents(eventsTable, countryCodes); formattedResults.apply(TextIO.Write.to(options.getOutput())); p.run(); }
Example 11
Source File: FXTimeSeriesPipelineSRGTests.java From data-timeseries-java with Apache License 2.0 | 5 votes |
public PCollection<KV<String, TSProto>> generateCompleteWindowData(Pipeline pipeline, List<KV<String, TSProto>> data, WorkPacketConfig packetConfig) { LOG.info("Check to see that time streams with missing 'ticks' have been corrected"); PCollection<KV<String, TSProto>> tsData = setupDataInput(pipeline, data); PCollection<KV<String, TSProto>> windowedData = tsData.apply("CandleResolutionWindow", Window.<KV<String, TSProto>>into(FixedWindows .of(Duration.standardSeconds(((FXTimeSeriesPipelineOptions) pipeline.getOptions()) .getCandleResolution())))); // Determine streams that are missing in this Window and generate values for them PCollection<KV<String, TSProto>> generatedValues = windowedData .apply( "DetectMissingTimeSeriesValues", Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig)) .withoutDefaults()).apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn())) .setName("CreateMissingTimeSeriesValues"); // Flatten the live streams and the generated streams together PCollection<KV<String, TSProto>> completeWindowData = PCollectionList.of(windowedData).and(generatedValues) .apply("MergeGeneratedLiveValues", Flatten.<KV<String, TSProto>>pCollections()); return completeWindowData; }
Example 12
Source File: WordCount.java From flink-dataflow with Apache License 2.0 | 5 votes |
@Override public PCollection<KV<String, Long>> apply(PCollection<String> lines) { // Convert lines of text into individual words. PCollection<String> words = lines.apply( ParDo.of(new ExtractWordsFn())); // Count the number of times each word occurs. PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement()); return wordCounts; }
Example 13
Source File: KafkaWindowedWordCountExample.java From flink-dataflow with Apache License 2.0 | 5 votes |
public static void main(String[] args) { PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class); KafkaStreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class); options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds"); options.setStreaming(true); options.setCheckpointingInterval(1000L); options.setNumberOfExecutionRetries(5); options.setExecutionRetryDelay(3000L); options.setRunner(FlinkPipelineRunner.class); System.out.println(options.getKafkaTopic() +" "+ options.getZookeeper() +" "+ options.getBroker() +" "+ options.getGroup() ); Pipeline pipeline = Pipeline.create(options); Properties p = new Properties(); p.setProperty("zookeeper.connect", options.getZookeeper()); p.setProperty("bootstrap.servers", options.getBroker()); p.setProperty("group.id", options.getGroup()); // this is the Flink consumer that reads the input to // the program from a kafka topic. FlinkKafkaConsumer08<String> kafkaConsumer = new FlinkKafkaConsumer08<>( options.getKafkaTopic(), new SimpleStringSchema(), p); PCollection<String> words = pipeline .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount")) .apply(ParDo.of(new ExtractWordsFn())) .apply(Window.<String>into(FixedWindows.of(Duration.standardSeconds(options.getWindowSize()))) .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO) .discardingFiredPanes()); PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement()); wordCounts.apply(ParDo.of(new FormatAsStringFn())) .apply(TextIO.Write.to("./outputKafka.txt")); pipeline.run(); }
Example 14
Source File: JoinExamples.java From flink-dataflow with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception { Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); options.setStreaming(true); options.setCheckpointingInterval(1000L); options.setNumberOfExecutionRetries(5); options.setExecutionRetryDelay(3000L); options.setRunner(FlinkPipelineRunner.class); PTransform<? super PBegin, PCollection<String>> readSourceA = Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("FirstStream"); PTransform<? super PBegin, PCollection<String>> readSourceB = Read.from(new UnboundedSocketSource<>("localhost", 9998, '\n', 3)).named("SecondStream"); WindowFn<Object, ?> windowFn = FixedWindows.of(Duration.standardSeconds(options.getWindowSize())); Pipeline p = Pipeline.create(options); // the following two 'applys' create multiple inputs to our pipeline, one for each // of our two input sources. PCollection<String> streamA = p.apply(readSourceA) .apply(Window.<String>into(windowFn) .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO) .discardingFiredPanes()); PCollection<String> streamB = p.apply(readSourceB) .apply(Window.<String>into(windowFn) .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO) .discardingFiredPanes()); PCollection<String> formattedResults = joinEvents(streamA, streamB); formattedResults.apply(TextIO.Write.to("./outputJoin.txt")); p.run(); }
Example 15
Source File: WindowedWordCount.java From flink-dataflow with Apache License 2.0 | 5 votes |
/**
 * Entry point: configures a streaming Flink job that reads text from a local socket,
 * extracts words, counts them over sliding windows, and writes the formatted counts to a
 * text file.
 *
 * <p>The window size is set to 10 and the slide to 5 (both used as seconds) regardless of
 * what was passed on the command line.
 *
 * @param args command-line arguments parsed into {@code StreamingWordCountOptions}
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
  StreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation()
      .as(StreamingWordCountOptions.class);
  options.setStreaming(true);
  options.setWindowSize(10L);
  options.setSlide(5L);
  options.setCheckpointingInterval(1000L);
  options.setNumberOfExecutionRetries(5);
  options.setExecutionRetryDelay(3000L);
  options.setRunner(FlinkPipelineRunner.class);

  // Log message previously read "Windpwed"; corrected spelling.
  LOG.info("Windowed WordCount with Sliding Windows of " + options.getWindowSize()
      + " sec. and a slide of " + options.getSlide());

  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> words = pipeline
      .apply(Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3))
          .named("StreamingWordCount"))
      .apply(ParDo.of(new ExtractWordsFn()))
      // Sliding windows that fire at the watermark, with no allowed lateness.
      .apply(Window.<String>into(
              SlidingWindows.of(Duration.standardSeconds(options.getWindowSize()))
                  .every(Duration.standardSeconds(options.getSlide())))
          .triggering(AfterWatermark.pastEndOfWindow())
          .withAllowedLateness(Duration.ZERO)
          .discardingFiredPanes());

  PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

  wordCounts.apply(ParDo.of(new FormatAsStringFn()))
      .apply(TextIO.Write.to("./outputWordCount.txt"));

  pipeline.run();
}
Example 16
Source File: AutoComplete.java From flink-dataflow with Apache License 2.0 | 5 votes |
@Override public PCollectionList<KV<String, List<CompletionCandidate>>> apply( PCollection<CompletionCandidate> input) { if (minPrefix > 10) { // Base case, partitioning to return the output in the expected format. return input .apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix)) .apply(Partition.of(2, new KeySizePartitionFn())); } else { // If a candidate is in the top N for prefix a...b, it must also be in the top // N for a...bX for every X, which is typlically a much smaller set to consider. // First, compute the top candidate for prefixes of size at least minPrefix + 1. PCollectionList<KV<String, List<CompletionCandidate>>> larger = input .apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1)); // Consider the top candidates for each prefix of length minPrefix + 1... PCollection<KV<String, List<CompletionCandidate>>> small = PCollectionList .of(larger.get(1).apply(ParDo.of(new FlattenTops()))) // ...together with those (previously excluded) candidates of length // exactly minPrefix... .and(input.apply(Filter.by(new SerializableFunction<CompletionCandidate, Boolean>() { private static final long serialVersionUID = 0; @Override public Boolean apply(CompletionCandidate c) { return c.getValue().length() == minPrefix; } }))) .apply("FlattenSmall", Flatten.<CompletionCandidate>pCollections()) // ...set the key to be the minPrefix-length prefix... .apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix))) // ...and (re)apply the Top operator to all of them together. .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix)); PCollection<KV<String, List<CompletionCandidate>>> flattenLarger = larger .apply("FlattenLarge", Flatten.<KV<String, List<CompletionCandidate>>>pCollections()); return PCollectionList.of(flattenLarger).and(small); } }
Example 17
Source File: DesiredStateEnforcer.java From policyscanner with Apache License 2.0 | 5 votes |
/**
 * Builds the pipeline that compares known-good (desired) resource states against live
 * ones and passes every mismatch to the discrepancy auto-fix messenger.
 *
 * @param pipeline the pipeline the transforms are applied to
 * @param org the organization passed to the live project source
 * @param knownGoodSource source of the known-good file data
 * @return the strings emitted by the "Fix discrepancies" step
 */
private PCollection<String> constructPipeline(Pipeline pipeline, String org,
    BoundedSource<KV<List<String>, String>> knownGoodSource) {
  // Known-good half: read the files, convert them to state objects, and tag them as
  // DESIRED so they can be told apart from live states.
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> desiredStates = pipeline
      .apply("Read known-good data", Read.from(knownGoodSource))
      .apply(ParDo.named("Convert file data to Java Objects").of(new FileToState()))
      .apply(ParDo.named("Mark states as being known-good")
          .of(new TagStateWithSource(StateSource.DESIRED)));

  // Live half: read projects, extract their states, and tag them as LIVE.
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> currentStates = pipeline
      .apply("Read live projects", Read.from(new LiveProjectSource(org)))
      .apply(ParDo.named("Extract project policies").of(new ExtractState()))
      .apply(ParDo.named("Mark states as being live")
          .of(new TagStateWithSource(StateSource.LIVE)));

  // Join: expose the known-good states as a map side input while scanning the live states.
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> desiredStatesView =
      desiredStates.apply(View.<GCPResource, KV<StateSource, GCPResourceState>>asMap());

  PCollection<KV<GCPResource, Map<StateSource, GCPResourceState>>> mismatches =
      currentStates.apply(ParDo.named("Find states that don't match")
          .withSideInputs(desiredStatesView)
          .of(new FilterOutMatchingState(desiredStatesView)));

  // Hand every discrepancy to the auto-fix messenger.
  return mismatches
      .apply(ParDo.named("Fix discrepancies").of(discrepancyAutoFixMessenger));
}
Example 18
Source File: DataflowFactory.java From dockerflow with Apache License 2.0 | 4 votes |
/** * Dynamically construct a Dataflow from the workflow definition. The root PCollection has one * element, the root task's name. * * @param workflow * @param dataflowArgs * @return * @throws IOException */ public static Pipeline dataflow( Workflow workflow, Map<String, WorkflowArgs> workflowArgs, DataflowPipelineOptions o) throws IOException { assert (workflow != null); assert (o != null); assert (workflow.getDefn() != null); // Set defaults if (o.getAppName() == null) { o.setAppName(workflow.getDefn().getName()); } if (o.getProject() == null && workflow.getArgs() != null) { o.setProject(workflow.getArgs().getProjectId()); } if (o.getMaxNumWorkers() == 0) { o.setMaxNumWorkers(1); } if (o.getWorkerMachineType() == null) { o.setWorkerMachineType(DEFAULT_MACHINE_TYPE); } LOG.info("Initializing dataflow pipeline"); Pipeline p = Pipeline.create(o); LOG.info("Creating input collection of workflow args"); if (workflowArgs == null) { workflowArgs = new HashMap<String, WorkflowArgs>(); } if (workflowArgs.isEmpty()) { LOG.info("No workflow args were provided. Using default values."); workflowArgs.put(workflow.getDefn().getName(), new WorkflowArgs()); } else if (workflow.getArgs() != null) { LOG.info("Merging default workflow args with instance-specific args"); for (String key : workflowArgs.keySet()) { WorkflowArgs instanceArgs = workflowArgs.get(key); instanceArgs.mergeDefaultArgs(workflow.getArgs()); LOG.debug("Merged args: " + StringUtils.toJson(instanceArgs)); } } LOG.info("Creating dataflow pipeline for workflow " + workflow.getDefn().getName()); PCollection<KV<String, WorkflowArgs>> input = p.apply(Create.of(workflowArgs)); input = dataflow(Workflow.Steps.graph(workflow), input); if (workflowArgs.values().iterator().next().getDeleteFiles()) { LOG.info("Intermediate files will be deleted"); input = input.apply( ParDo.named("DeleteIntermediateFiles").of(new DeleteIntermediateFiles(workflow))); } return p; }
Example 19
Source File: ParDoMultiOutputITCase.java From flink-dataflow with Apache License 2.0 | 4 votes |
@Override protected void testProgram() throws Exception { Pipeline p = FlinkTestPipeline.createForBatch(); PCollection<String> words = p.apply(Create.of("Hello", "Whatupmyman", "hey", "SPECIALthere", "MAAA", "MAAFOOO")); // Select words whose length is below a cut off, // plus the lengths of words that are above the cut off. // Also select words starting with "MARKER". final int wordLengthCutOff = 3; // Create tags to use for the main and side outputs. final TupleTag<String> wordsBelowCutOffTag = new TupleTag<String>(){}; final TupleTag<Integer> wordLengthsAboveCutOffTag = new TupleTag<Integer>(){}; final TupleTag<String> markedWordsTag = new TupleTag<String>(){}; PCollectionTuple results = words.apply(ParDo .withOutputTags(wordsBelowCutOffTag, TupleTagList.of(wordLengthsAboveCutOffTag) .and(markedWordsTag)) .of(new DoFn<String, String>() { final TupleTag<String> specialWordsTag = new TupleTag<String>() { }; public void processElement(ProcessContext c) { String word = c.element(); if (word.length() <= wordLengthCutOff) { c.output(word); } else { c.sideOutput(wordLengthsAboveCutOffTag, word.length()); } if (word.startsWith("MAA")) { c.sideOutput(markedWordsTag, word); } if (word.startsWith("SPECIAL")) { c.sideOutput(specialWordsTag, word); } } })); // Extract the PCollection results, by tag. PCollection<String> wordsBelowCutOff = results.get(wordsBelowCutOffTag); PCollection<Integer> wordLengthsAboveCutOff = results.get (wordLengthsAboveCutOffTag); PCollection<String> markedWords = results.get(markedWordsTag); markedWords.apply(TextIO.Write.to(resultPath)); p.run(); }
Example 20
Source File: JoinExamplesITCase.java From flink-dataflow with Apache License 2.0 | 3 votes |
/**
 * Joins the in-memory event rows with the in-memory country-code rows through
 * {@code JoinExamples.joinEvents} and writes the joined strings to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
  Pipeline pipeline = FlinkTestPipeline.createForBatch();

  PCollection<TableRow> events = pipeline.apply(Create.of(EVENT_ARRAY));
  PCollection<TableRow> countryCodes = pipeline.apply(Create.of(CC_ARRAY));

  JoinExamples.joinEvents(events, countryCodes)
      .apply(TextIO.Write.to(resultPath));

  pipeline.run();
}