Java Code Examples for com.google.cloud.dataflow.sdk.Pipeline#create()
The following examples show how to use com.google.cloud.dataflow.sdk.Pipeline#create().
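All of the examples below follow the same basic pattern: build a PipelineOptions object, pass it to Pipeline.create(), apply one or more transforms, and call run(). The following minimal sketch shows that pattern in isolation; the TextIO transforms and the bucket paths are placeholders chosen for illustration and do not come from any of the examples that follow.

public static void main(String[] args) {
  // Parse command-line flags into a PipelineOptions instance.
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().create();

  // Pipeline.create(options) builds an empty pipeline bound to those options.
  Pipeline p = Pipeline.create(options);

  // Placeholder read/write; real pipelines apply whatever transforms they need.
  p.apply(TextIO.Read.from("gs://your-bucket/input.txt"))
      .apply(TextIO.Write.to("gs://your-bucket/output"));

  p.run();
}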
Example 1
Source File: CountRides.java From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0 | 6 votes |
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
          .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
          .timestampLabel("ts")
          .withCoder(TableRowJsonCoder.of()))
      .apply("window 1s", Window.into(FixedWindows.of(Duration.standardSeconds(1))))
      .apply("mark rides", MapElements.via(new MarkRides()))
      .apply("count similar", Count.perKey())
      .apply("format rides", MapElements.via(new TransformRides()))
      .apply(PubsubIO.Write.named("WriteToPubsub")
          .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
          .withCoder(TableRowJsonCoder.of()));

  p.run();
}
Example 2
Source File: FilterRides.java From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0 | 6 votes |
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
          .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
          .timestampLabel("ts")
          .withCoder(TableRowJsonCoder.of()))
      .apply("filter lower Manhattan", ParDo.of(new FilterLowerManhattan()))
      .apply(PubsubIO.Write.named("WriteToPubsub")
          .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
          .withCoder(TableRowJsonCoder.of()));

  p.run();
}
Example 3
Source File: LoadBooks.java From cloud-bigtable-examples with Apache License 2.0 | 6 votes |
public static void main(String[] args) {
  // CloudBigtableOptions is one way to retrieve the options. It's not required.
  // https://github.com/GoogleCloudPlatform/cloud-bigtable-examples/blob/master/java/dataflow-connector-examples/src/main/java/com/google/cloud/bigtable/dataflow/example/HelloWorldWrite.java
  BigtableCsvOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableCsvOptions.class);
  CloudBigtableTableConfiguration config = CloudBigtableTableConfiguration.fromCBTOptions(options);

  Pipeline p = Pipeline.create(options);
  CloudBigtableIO.initializeForWrite(p);

  PCollection<KV<String, Integer>> ngrams =
      applyPipelineToParseBooks(p.apply(TextIO.Read.from(options.getInputFile())));
  PCollection<Mutation> mutations = ngrams.apply(ParDo.of(ENCODE_NGRAM));
  mutations.apply(CloudBigtableIO.writeToTable(config));

  // Run the pipeline.
  p.run();
}
Example 4
Source File: ExactDollarRides.java From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
          .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
          .timestampLabel("ts")
          .withCoder(TableRowJsonCoder.of()))
      .apply("extract dollars",
          MapElements.via((TableRow x) -> Double.parseDouble(x.get("meter_increment").toString()))
              .withOutputType(TypeDescriptor.of(Double.class)))
      .apply("fixed window", Window.into(FixedWindows.of(Duration.standardMinutes(1))))
      .apply("trigger",
          Window.<Double>triggering(
                  AfterWatermark.pastEndOfWindow()
                      .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.standardSeconds(1)))
                      .withLateFirings(AfterPane.elementCountAtLeast(1)))
              .accumulatingFiredPanes()
              .withAllowedLateness(Duration.standardMinutes(5)))
      .apply("sum whole window", Sum.doublesGlobally().withoutDefaults())
      .apply("format rides", ParDo.of(new TransformRides()))
      .apply(PubsubIO.Write.named("WriteToPubsub")
          .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
          .withCoder(TableRowJsonCoder.of()));

  p.run();
}
Example 5
Source File: AutoComplete.java From flink-dataflow with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws IOException {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);
  options.setCheckpointingInterval(1000L);
  options.setNumberOfExecutionRetries(5);
  options.setExecutionRetryDelay(3000L);
  options.setRunner(FlinkPipelineRunner.class);

  PTransform<? super PBegin, PCollection<String>> readSource =
      Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("WordStream");
  WindowFn<Object, ?> windowFn = FixedWindows.of(Duration.standardSeconds(options.getWindowSize()));

  // Create the pipeline.
  Pipeline p = Pipeline.create(options);
  PCollection<KV<String, List<CompletionCandidate>>> toWrite = p
      .apply(readSource)
      .apply(ParDo.of(new ExtractWordsFn()))
      .apply(Window.<String>into(windowFn)
          .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
          .discardingFiredPanes())
      .apply(ComputeTopCompletions.top(10, options.getRecursive()));

  toWrite
      .apply(ParDo.named("FormatForPerTaskFile").of(new FormatForPerTaskLocalFile()))
      .apply(TextIO.Write.to("./outputAutoComplete.txt"));

  p.run();
}
Example 6
Source File: WindowedWordCount.java From flink-dataflow with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws IOException {
  StreamingWordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(StreamingWordCountOptions.class);
  options.setStreaming(true);
  options.setWindowSize(10L);
  options.setSlide(5L);
  options.setCheckpointingInterval(1000L);
  options.setNumberOfExecutionRetries(5);
  options.setExecutionRetryDelay(3000L);
  options.setRunner(FlinkPipelineRunner.class);

  LOG.info("Windowed WordCount with Sliding Windows of " + options.getWindowSize() +
      " sec. and a slide of " + options.getSlide());

  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> words = pipeline
      .apply(Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("StreamingWordCount"))
      .apply(ParDo.of(new ExtractWordsFn()))
      .apply(Window.<String>into(SlidingWindows.of(Duration.standardSeconds(options.getWindowSize()))
              .every(Duration.standardSeconds(options.getSlide())))
          .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
          .discardingFiredPanes());

  PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

  wordCounts.apply(ParDo.of(new FormatAsStringFn()))
      .apply(TextIO.Write.to("./outputWordCount.txt"));

  pipeline.run();
}
Example 7
Source File: JoinExamples.java From flink-dataflow with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setStreaming(true);
  options.setCheckpointingInterval(1000L);
  options.setNumberOfExecutionRetries(5);
  options.setExecutionRetryDelay(3000L);
  options.setRunner(FlinkPipelineRunner.class);

  PTransform<? super PBegin, PCollection<String>> readSourceA =
      Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("FirstStream");
  PTransform<? super PBegin, PCollection<String>> readSourceB =
      Read.from(new UnboundedSocketSource<>("localhost", 9998, '\n', 3)).named("SecondStream");

  WindowFn<Object, ?> windowFn = FixedWindows.of(Duration.standardSeconds(options.getWindowSize()));

  Pipeline p = Pipeline.create(options);

  // the following two 'applys' create multiple inputs to our pipeline, one for each
  // of our two input sources.
  PCollection<String> streamA = p.apply(readSourceA)
      .apply(Window.<String>into(windowFn)
          .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
          .discardingFiredPanes());
  PCollection<String> streamB = p.apply(readSourceB)
      .apply(Window.<String>into(windowFn)
          .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
          .discardingFiredPanes());

  PCollection<String> formattedResults = joinEvents(streamA, streamB);
  formattedResults.apply(TextIO.Write.to("./outputJoin.txt"));

  p.run();
}
Example 8
Source File: KafkaWindowedWordCountExample.java From flink-dataflow with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
  PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
  KafkaStreamingWordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
  options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
  options.setStreaming(true);
  options.setCheckpointingInterval(1000L);
  options.setNumberOfExecutionRetries(5);
  options.setExecutionRetryDelay(3000L);
  options.setRunner(FlinkPipelineRunner.class);

  System.out.println(options.getKafkaTopic() + " " + options.getZookeeper() + " " +
      options.getBroker() + " " + options.getGroup());

  Pipeline pipeline = Pipeline.create(options);

  Properties p = new Properties();
  p.setProperty("zookeeper.connect", options.getZookeeper());
  p.setProperty("bootstrap.servers", options.getBroker());
  p.setProperty("group.id", options.getGroup());

  // this is the Flink consumer that reads the input to
  // the program from a kafka topic.
  FlinkKafkaConsumer08<String> kafkaConsumer = new FlinkKafkaConsumer08<>(
      options.getKafkaTopic(),
      new SimpleStringSchema(), p);

  PCollection<String> words = pipeline
      .apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
      .apply(ParDo.of(new ExtractWordsFn()))
      .apply(Window.<String>into(FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
          .triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
          .discardingFiredPanes());

  PCollection<KV<String, Long>> wordCounts = words.apply(Count.<String>perElement());

  wordCounts.apply(ParDo.of(new FormatAsStringFn()))
      .apply(TextIO.Write.to("./outputKafka.txt"));

  pipeline.run();
}
Example 9
Source File: WordCount.java From flink-dataflow with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setRunner(FlinkPipelineRunner.class);

  Pipeline p = Pipeline.create(options);

  p.apply(TextIO.Read.named("ReadLines").from(options.getInput()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply(TextIO.Write.named("WriteCounts").to(options.getOutput()));

  p.run();
}
Example 10
Source File: TFIDF.java From flink-dataflow with Apache License 2.0 | 5 votes |
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  options.setRunner(FlinkPipelineRunner.class);

  Pipeline pipeline = Pipeline.create(options);
  pipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

  pipeline
      .apply(new ReadDocuments(listInputDocuments(options)))
      .apply(new ComputeTfIdf())
      .apply(new WriteTfIdf(options.getOutput()));

  pipeline.run();
}
Example 11
Source File: MultiLinearGraph.java From dockerflow with Apache License 2.0 | 5 votes |
/**
 * For simple linear graphs, it's not too hard to generate the Dataflow pipeline yourself. Here's
 * the equivalent Dataflow code for this simple example.
 */
public static void manualDataflow(String[] args) throws IOException {
  LOG.info("Parsing Dataflow options");
  DataflowPipelineOptions o = DataflowFactory.pipelineOptions(args);
  o.setAppName(MultiLinearGraph.class.getSimpleName());
  Pipeline p = Pipeline.create(o);

  p.apply(Create.of(ArgsTableBuilder.fromArgs(args).build()))
      .apply(DockerDo.of(taskOne()))
      .apply(DockerDo.of(taskTwo()));

  p.run();
}
Example 12
Source File: LiveStateChecker.java From policyscanner with Apache License 2.0 | 5 votes |
/**
 * Construct a LiveStateChecker to compare the live states of GCP resources
 * with their checked-in known-good counterparts.
 * @param options The options used to construct the pipeline.
 * @param knownGoodSource The source used to read the known-good.
 * @param org The organization the projects are to be read from.
 */
public LiveStateChecker(PipelineOptions options,
    BoundedSource<KV<List<String>, String>> knownGoodSource,
    String org) {
  this.pipeline = Pipeline.create(options);
  this.knownGoodSource = knownGoodSource;
  this.org = org;
}
Example 13
Source File: DesiredStateEnforcer.java From policyscanner with Apache License 2.0 | 5 votes |
/**
 * Construct a DesiredStateEnforcer to compare the live states of GCP resources
 * with their checked-in known-good counterparts.
 * @param options The options used to construct the pipeline.
 * @param knownGoodSource The source used to read the known-good.
 * @param org The organization the projects are to be read from.
 */
public DesiredStateEnforcer(PipelineOptions options,
    BoundedSource<KV<List<String>, String>> knownGoodSource,
    String org) {
  this.pipeline = Pipeline.create(options);
  this.outputMessages = constructPipeline(this.pipeline, org, knownGoodSource);
  this.enforcedStates = 0L;
}
Example 14
Source File: LatestRides.java From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
          .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
          .timestampLabel("ts")
          .withCoder(TableRowJsonCoder.of()))
      .apply("key rides by rideid",
          MapElements.via((TableRow ride) -> KV.of(ride.get("ride_id").toString(), ride))
              .withOutputType(new TypeDescriptor<KV<String, TableRow>>() {}))
      .apply("session windows on rides with early firings",
          Window.<KV<String, TableRow>>into(
                  Sessions.withGapDuration(Duration.standardMinutes(60)))
              .triggering(
                  AfterWatermark.pastEndOfWindow()
                      .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.millis(2000))))
              .accumulatingFiredPanes()
              .withAllowedLateness(Duration.ZERO))
      .apply("group ride points on same ride", Combine.perKey(new LatestPointCombine()))
      .apply("discard key",
          MapElements.via((KV<String, TableRow> a) -> a.getValue())
              .withOutputType(TypeDescriptor.of(TableRow.class)))
      .apply(PubsubIO.Write.named("WriteToPubsub")
          .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
          .withCoder(TableRowJsonCoder.of()));

  p.run();
}
Example 15
Source File: DebugFewRides.java From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
          .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
          .timestampLabel("ts")
          .withCoder(TableRowJsonCoder.of()))
      .apply("filter a few rides",
          Filter.byPredicate(
              (TableRow t) -> {
                String rideId = t.get("ride_id").toString();

                // You can change the filter here to allow more or fewer rides through:
                // rideIds starting with "a" are quite common
                // rideIds starting with "ab" are rarer
                // rideIds starting with "abc" are rarer still
                if (rideId.startsWith("ab")) {
                  LOG.info("Accepted point on ride {} with order number {} timestamp {}",
                      t.get("ride_id"), t.get("point_idx"), t.get("timestamp"));
                  return true;
                }
                return false;
              }))
      .apply(PubsubIO.Write.named("WriteToPubsub")
          .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
          .withCoder(TableRowJsonCoder.of()));

  p.run();
}
Example 16
Source File: TimestampRides.java From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply(PubsubIO.Read.named("read from PubSub")
          .topic(String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic()))
          .timestampLabel("ts")
          .withCoder(TableRowJsonCoder.of()))
      .apply("window 1s", Window.into(FixedWindows.of(Duration.standardSeconds(1))))
      .apply("parse timestamps",
          MapElements.via(
              (TableRow e) ->
                  Instant.from(DateTimeFormatter.ISO_DATE_TIME.parse(e.get("timestamp").toString())).toEpochMilli())
              .withOutputType(TypeDescriptor.of(Long.class)))
      .apply("max timestamp in window", Max.longsGlobally().withoutDefaults())
      .apply("transform",
          MapElements.via(
              (Long t) -> {
                TableRow ride = new TableRow();
                ride.set("timestamp", Instant.ofEpochMilli(t).toString());
                return ride;
              })
              .withOutputType(TypeDescriptor.of(TableRow.class)))
      .apply(PubsubIO.Write.named("write to PubSub")
          .topic(String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic()))
          .withCoder(TableRowJsonCoder.of()));

  p.run();
}
Example 17
Source File: DataflowFactory.java From dockerflow with Apache License 2.0 | 4 votes |
/**
 * Dynamically construct a Dataflow from the workflow definition. The root PCollection has one
 * element, the root task's name.
 *
 * @param workflow
 * @param workflowArgs
 * @param o
 * @return
 * @throws IOException
 */
public static Pipeline dataflow(
    Workflow workflow, Map<String, WorkflowArgs> workflowArgs, DataflowPipelineOptions o)
    throws IOException {

  assert (workflow != null);
  assert (o != null);
  assert (workflow.getDefn() != null);

  // Set defaults
  if (o.getAppName() == null) {
    o.setAppName(workflow.getDefn().getName());
  }
  if (o.getProject() == null && workflow.getArgs() != null) {
    o.setProject(workflow.getArgs().getProjectId());
  }
  if (o.getMaxNumWorkers() == 0) {
    o.setMaxNumWorkers(1);
  }
  if (o.getWorkerMachineType() == null) {
    o.setWorkerMachineType(DEFAULT_MACHINE_TYPE);
  }

  LOG.info("Initializing dataflow pipeline");
  Pipeline p = Pipeline.create(o);

  LOG.info("Creating input collection of workflow args");
  if (workflowArgs == null) {
    workflowArgs = new HashMap<String, WorkflowArgs>();
  }
  if (workflowArgs.isEmpty()) {
    LOG.info("No workflow args were provided. Using default values.");
    workflowArgs.put(workflow.getDefn().getName(), new WorkflowArgs());
  } else if (workflow.getArgs() != null) {
    LOG.info("Merging default workflow args with instance-specific args");
    for (String key : workflowArgs.keySet()) {
      WorkflowArgs instanceArgs = workflowArgs.get(key);
      instanceArgs.mergeDefaultArgs(workflow.getArgs());
      LOG.debug("Merged args: " + StringUtils.toJson(instanceArgs));
    }
  }

  LOG.info("Creating dataflow pipeline for workflow " + workflow.getDefn().getName());
  PCollection<KV<String, WorkflowArgs>> input = p.apply(Create.of(workflowArgs));
  input = dataflow(Workflow.Steps.graph(workflow), input);

  if (workflowArgs.values().iterator().next().getDeleteFiles()) {
    LOG.info("Intermediate files will be deleted");
    input = input.apply(
        ParDo.named("DeleteIntermediateFiles").of(new DeleteIntermediateFiles(workflow)));
  }

  return p;
}
Example 18
Source File: GatkPairedSingleSampleAlt.java From dockerflow with Apache License 2.0 | 4 votes |
/**
 * Only this one method is different from GatkPairedSingleSample.java.
 */
@Override
public Pipeline createDataflow(
    Map<String, WorkflowArgs> argsTable, DataflowPipelineOptions pipelineOptions, String[] args)
    throws IOException {
  DataflowPipelineOptions o = DataflowFactory.pipelineOptions(args);
  o.setAppName(GatkPairedSingleSampleAlt.class.getSimpleName());
  Pipeline p = Pipeline.create(o);

  // Merge sample-specific args with default workflow args
  for (String key : argsTable.keySet()) {
    WorkflowArgs instanceArgs = argsTable.get(key);
    instanceArgs.mergeDefaultArgs(workflowArgs);
  }

  // Declarations
  PCollection<KV<String, WorkflowArgs>> mainBranch, branchOne, branchTwo;
  PCollectionList<KV<String, WorkflowArgs>> mergeList;

  // Construct the workflow graph
  mainBranch = p.apply(Create.of(argsTable));
  branchOne = mainBranch.apply(DockerDo.of(CreateSequenceGroupingTSV));
  branchTwo = mainBranch.apply(DockerDo.of(BwaVersion))
      .apply(DockerDo.of(SamToFastqAndBwaMem))
      .apply(DockerDo.of(MergeBamAlignment))
      .apply(DockerDo.of(SortAndFixReadGroupBam))
      .apply(DockerDo.of(MarkDuplicates))
      .apply(DockerDo.of(SortAndFixSampleBam));
  mergeList = PCollectionList.of(branchOne).and(branchTwo);
  mainBranch = mergeList.apply(new MergeBranches())
      .apply(DockerDo.of(BaseRecalibrator))
      .apply(DockerDo.of(ApplyBQSR))
      .apply(DockerDo.of(GatherBqsrReports))
      .apply(DockerDo.of(ApplyBQSRToUnmappedReads))
      .apply(DockerDo.of(GatherBamFiles));
  branchOne = mainBranch.apply(DockerDo.of(ConvertToCram));
  branchTwo = mainBranch.apply(DockerDo.of(HaplotypeCaller))
      .apply(DockerDo.of(GatherVCFs));
  mergeList = PCollectionList.of(branchOne).and(branchTwo);
  mainBranch = mergeList.apply(new MergeBranches());

  return p;
}
Example 19
Source File: OnDemandLiveStateChecker.java From policyscanner with Apache License 2.0 | 2 votes |
/**
 * Construct an OnDemandLiveStateChecker to compare the live states of GCP resources
 * with their checked-in known-good counterparts.
 * @param options The options used to construct the pipeline.
 * @param knownGoodSource The source used to read the known-good.
 */
public OnDemandLiveStateChecker(PipelineOptions options,
    BoundedSource<KV<List<String>, String>> knownGoodSource) {
  this.pipeline = Pipeline.create(options);
  this.outputMessages = constructPipeline(this.pipeline, knownGoodSource);
}
Example 20
Source File: ExportedServiceAccountKeyRemover.java From policyscanner with Apache License 2.0 | 2 votes |
/**
 * Constructor for ExportedServiceAccountKeyRemover
 * @param options The options used to construct the pipeline.
 * @param org The organization the projects are to be read from.
 */
public ExportedServiceAccountKeyRemover(PipelineOptions options, String org) {
  this.pipeline = Pipeline.create(options);
  this.outputMessages = constructPipeline(this.pipeline, org);
}