com.google.cloud.dataflow.sdk.transforms.ParDo Java Examples

Source File: FilterRides.java From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0

6 votes

/**
 * Reads messages from a Pub/Sub topic, applies the {@code FilterLowerManhattan}
 * DoFn to them, and writes the output to a second Pub/Sub topic.
 *
 * @param args command-line arguments, parsed into {@code CustomPipelineOptions}
 */
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline pipeline = Pipeline.create(options);

  // Fully-qualified topic paths assembled from the parsed options.
  String sourceTopic =
      String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic());
  String sinkTopic =
      String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic());

  pipeline
      // "ts" is passed as the timestamp label of the incoming messages.
      .apply(PubsubIO.Read.named("read from PubSub")
          .topic(sourceTopic)
          .timestampLabel("ts")
          .withCoder(TableRowJsonCoder.of()))
      .apply("filter lower Manhattan", ParDo.of(new FilterLowerManhattan()))
      .apply(PubsubIO.Write.named("WriteToPubsub")
          .topic(sinkTopic)
          .withCoder(TableRowJsonCoder.of()));

  pipeline.run();
}

Source File: CoinbaseSource.java From cloud-bigtable-examples with Apache License 2.0

6 votes

/**
 * Builds and runs a streaming pipeline that reads from {@code CoinbaseSource},
 * passes the elements through the {@code DeserializeCoinbase} and
 * {@code HBaseBigtableWriter} DoFns, and writes the result to a Cloud Bigtable table.
 *
 * @param args command-line arguments, parsed into {@code CloudBigtableOptions}
 */
public static void main(String[] args) {
  CloudBigtableOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CloudBigtableOptions.class);

  // Project, instance and table ids are taken from the parsed options.
  CloudBigtableScanConfiguration tableConfig =
      new CloudBigtableScanConfiguration.Builder()
          .withProjectId(options.getBigtableProjectId())
          .withInstanceId(options.getBigtableInstanceId())
          .withTableId(options.getBigtableTableId())
          .build();

  // Streaming mode and the Dataflow runner are set here, after argument parsing.
  options.setStreaming(true);
  options.setRunner(DataflowPipelineRunner.class);

  Pipeline pipeline = Pipeline.create(options);
  CloudBigtableIO.initializeForWrite(pipeline);

  pipeline
      .apply(Read.from(new CoinbaseSource()))
      .apply(ParDo.named("DeserializeCoinbase").of(new DeserializeCoinbase()))
      .apply(ParDo.of(new HBaseBigtableWriter()))
      .apply(CloudBigtableIO.writeToTable(tableConfig));

  pipeline.run();
}

Source File: LoadBooks.java From cloud-bigtable-examples with Apache License 2.0

6 votes

/**
 * Reads the text input named by the options, turns it into n-gram counts via
 * {@code applyPipelineToParseBooks}, encodes each count with {@code ENCODE_NGRAM}
 * and writes the resulting mutations to a Cloud Bigtable table.
 *
 * @param args command-line arguments, parsed into {@code BigtableCsvOptions}
 */
public static void main(String[] args) {
  // CloudBigtableOptions is one way to retrieve the options.  It's not required.
  // https://github.com/GoogleCloudPlatform/cloud-bigtable-examples/blob/master/java/dataflow-connector-examples/src/main/java/com/google/cloud/bigtable/dataflow/example/HelloWorldWrite.java
  BigtableCsvOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableCsvOptions.class);
  CloudBigtableTableConfiguration tableConfig =
      CloudBigtableTableConfiguration.fromCBTOptions(options);

  Pipeline pipeline = Pipeline.create(options);
  CloudBigtableIO.initializeForWrite(pipeline);

  // Parse the input text into (n-gram, count) pairs.
  PCollection<KV<String, Integer>> ngrams =
      applyPipelineToParseBooks(pipeline.apply(TextIO.Read.from(options.getInputFile())));

  // Encode every pair as a mutation and write it to the table.
  PCollection<Mutation> encoded = ngrams.apply(ParDo.of(ENCODE_NGRAM));
  encoded.apply(CloudBigtableIO.writeToTable(tableConfig));

  pipeline.run();
}

Source File: UnboundedSourceITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Runs a streaming test pipeline over {@code RangeReadSource(1, 10)}, converting each
 * integer to its string form and writing the strings to {@code resultPath}. The run is
 * expected to throw; the nested cause's message is then checked.
 *
 * @param resultPath location the text output is written to
 */
private static void runProgram(String resultPath) {
	Pipeline pipeline = FlinkTestPipeline.createForStreaming();

	// Converts every integer element to its String representation.
	DoFn<Integer, String> toStringFn = new DoFn<Integer, String>() {
		@Override
		public void processElement(ProcessContext c) throws Exception {
			c.output(c.element().toString());
		}
	};

	PCollection<String> strings = pipeline
		.apply(Read.from(new RangeReadSource(1, 10)))
		// Global window that fires once at least 10 elements have arrived.
		.apply(Window.<Integer>into(new GlobalWindows())
			.triggering(AfterPane.elementCountAtLeast(10))
			.discardingFiredPanes())
		.apply(ParDo.of(toStringFn));

	strings.apply(TextIO.Write.to(resultPath));

	try {
		pipeline.run();
		// Reaching this line means the run did not throw, which the test treats as a failure.
		fail();
	} catch(Exception e) {
		assertEquals("The source terminates as expected.", e.getCause().getCause().getMessage());
	}
}

Source File: WordCountJoin2ITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Counts the words of two in-memory inputs, co-groups the two count collections by
 * word, formats the joined result and writes it to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	/* Per-word counts for the first input */
	PCollection<KV<String,Long>> firstCounts = pipeline
			.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	/* Per-word counts for the second input */
	PCollection<KV<String,Long>> secondCounts = pipeline
			.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	/* Join both count collections on the word key */
	PCollection<KV<String, CoGbkResult>> joinedCounts = KeyedPCollectionTuple
			.of(tag1, firstCounts)
			.and(tag2, secondCounts)
			.apply(CoGroupByKey.<String>create());

	/* Format the joined counts and write them out */
	joinedCounts
			.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	pipeline.run();
}

Source File: SideInputITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Exposes {@code expected} as a singleton side input and, for the single main-input
 * element, outputs the side-input value; the output is written to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Singleton view over the expected value, consumed below as a side input.
	final PCollectionView<String> expectedView = pipeline
			.apply(Create.of(expected))
			.apply(View.<String>asSingleton());

	// Outputs the side-input value; the main-input element itself is not used.
	DoFn<String, String> emitSideInput = new DoFn<String, String>() {
		@Override
		public void processElement(ProcessContext c) throws Exception {
			c.output(c.sideInput(expectedView));
		}
	};

	pipeline
			.apply(Create.of("bli"))
			.apply(ParDo.of(emitSideInput).withSideInputs(expectedView))
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: ReadSourceITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Runs a batch test pipeline that reads from {@code ReadSource(1, 10)}, converts each
 * integer to its string form and writes the strings to {@code resultPath}.
 *
 * @param resultPath location the text output is written to
 */
private static void runProgram(String resultPath) {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Converts every integer element to its String representation.
	DoFn<Integer, String> toStringFn = new DoFn<Integer, String>() {
		@Override
		public void processElement(ProcessContext c) throws Exception {
			c.output(c.element().toString());
		}
	};

	PCollection<String> strings = pipeline
			.apply(Read.from(new ReadSource(1, 10)))
			.apply(ParDo.of(toStringFn));

	strings.apply(TextIO.Write.to(resultPath));
	pipeline.run();
}

Source File: TFIDF.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Formats every (word, (URI, score)) element as one text line and writes the lines
 * to {@code output} with a {@code .csv} suffix.
 *
 * @param wordToUriAndTfIdf elements keyed by word, each carrying a URI and a score
 * @return the {@code PDone} produced by the text write
 */
@Override
public PDone apply(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
	// Renders one element as "word,<TAB>uri,<TAB>score".
	DoFn<KV<String, KV<URI, Double>>, String> formatFn =
			new DoFn<KV<String, KV<URI, Double>>, String>() {
				private static final long serialVersionUID = 0;

				@Override
				public void processElement(ProcessContext c) {
					KV<String, KV<URI, Double>> entry = c.element();
					KV<URI, Double> uriAndScore = entry.getValue();
					c.output(String.format("%s,\t%s,\t%f",
							entry.getKey(),
							uriAndScore.getKey(),
							uriAndScore.getValue()));
				}
			};

	return wordToUriAndTfIdf
			.apply(ParDo.named("Format").of(formatFn))
			.apply(TextIO.Write.to(output).withSuffix(".csv"));
}

Source File: FXTimeSeriesPipelineSRGTests.java From data-timeseries-java with Apache License 2.0

6 votes

/**
 * Creates a PCollection from {@code data} and re-emits every element with a timestamp
 * derived from the time stored in the element's {@code TSProto} value.
 *
 * @param pipeline pipeline the input is attached to
 * @param data     keyed time-series values to load
 * @return the timestamped collection, named "Assign TimeStamps"
 */
public PCollection<KV<String, TSProto>> setupDataInput(Pipeline pipeline,
    List<KV<String, TSProto>> data) {

  // Stamps each element with the time carried inside its own value.
  DoFn<KV<String, TSProto>, KV<String, TSProto>> assignTimestamps =
      new DoFn<KV<String, TSProto>, KV<String, TSProto>>() {

        @Override
        public void processElement(ProcessContext c) throws Exception {
          KV<String, TSProto> element = c.element();
          c.outputWithTimestamp(element,
              new DateTime(element.getValue().getTime()).toInstant());
        }

      };

  return pipeline
      .apply("ReadData", Create.of(data))
      .apply(ParDo.of(assignTimestamps))
      .setName("Assign TimeStamps");
}

Source File: ExportedServiceAccountKeyRemover.java From policyscanner with Apache License 2.0

6 votes

/**
 * Wires the pipeline: projects read from {@code LiveProjectSource} are expanded into
 * service accounts, then into service-account keys, and each key is finally handed to
 * {@code ExportedServiceAccountKeyMessenger}.
 *
 * @param pipeline pipeline to attach the steps to
 * @param org      value passed to {@code LiveProjectSource}
 * @return the string output of the final step
 */
private PCollection<String> constructPipeline(Pipeline pipeline, String org) {
  // Step 1: read the projects.
  PCollection<GCPProject> liveProjects =
      pipeline.apply(Read.from(new LiveProjectSource(org)));

  // Step 2: list the service accounts of each project.
  PCollection<GCPServiceAccount> accounts =
      liveProjects.apply(
          ParDo.named("List Service Accounts").of(new ListServiceAccounts()));

  // Step 3: list the keys of each service account.
  PCollection<GCPServiceAccountKey> accountKeys =
      accounts.apply(
          ParDo.named("List Service Account Keys").of(new ListServiceAccountKeys()));

  // Step 4: run the messenger over every key; its String output is the result.
  // NOTE(review): the step is named "Remove user-managed keys" -- presumably the
  // messenger also deletes keys; confirm against ExportedServiceAccountKeyMessenger.
  return accountKeys.apply(
      ParDo.named("Remove user-managed keys").of(new ExportedServiceAccountKeyMessenger()));
}

Source File: ExactDollarRides.java From cloud-dataflow-nyc-taxi-tycoon with Apache License 2.0

5 votes

/**
 * Reads messages from a Pub/Sub topic, extracts the {@code meter_increment} field of
 * each as a double, sums the values per one-minute fixed window (with early and late
 * firings), formats the sums with {@code TransformRides} and writes them to a second
 * Pub/Sub topic.
 *
 * @param args command-line arguments, parsed into {@code CustomPipelineOptions}
 */
public static void main(String[] args) {
  CustomPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(CustomPipelineOptions.class);
  Pipeline pipeline = Pipeline.create(options);

  // Fully-qualified topic paths assembled from the parsed options.
  String sourceTopic =
      String.format("projects/%s/topics/%s", options.getSourceProject(), options.getSourceTopic());
  String sinkTopic =
      String.format("projects/%s/topics/%s", options.getSinkProject(), options.getSinkTopic());

  pipeline
      .apply(PubsubIO.Read.named("read from PubSub")
          .topic(sourceTopic)
          .timestampLabel("ts")
          .withCoder(TableRowJsonCoder.of()))

      // Parse the "meter_increment" field of every row into a Double.
      .apply("extract dollars",
          MapElements.via((TableRow x) -> Double.parseDouble(x.get("meter_increment").toString()))
              .withOutputType(TypeDescriptor.of(Double.class)))

      // One-minute fixed windows.
      .apply("fixed window", Window.into(FixedWindows.of(Duration.standardMinutes(1))))

      // Early firings one second after the first element of a pane, a late firing for
      // every late element, accumulating panes, five minutes of allowed lateness.
      .apply("trigger",
          Window.<Double>triggering(
              AfterWatermark.pastEndOfWindow()
                  .withEarlyFirings(AfterProcessingTime.pastFirstElementInPane().plusDelayOf(Duration.standardSeconds(1)))
                  .withLateFirings(AfterPane.elementCountAtLeast(1)))
              .accumulatingFiredPanes()
              .withAllowedLateness(Duration.standardMinutes(5)))

      .apply("sum whole window", Sum.doublesGlobally().withoutDefaults())
      .apply("format rides", ParDo.of(new TransformRides()))

      .apply(PubsubIO.Write.named("WriteToPubsub")
          .topic(sinkTopic)
          .withCoder(TableRowJsonCoder.of()));

  pipeline.run();
}

Source File: DesiredStateEnforcer.java From policyscanner with Apache License 2.0

5 votes

/**
 * Wires the pipeline that compares known-good resource states with live ones.
 *
 * <p>The known-good side is read from {@code knownGoodSource}, converted to state
 * objects and tagged {@code StateSource.DESIRED}. The live side is read from
 * {@code LiveProjectSource}, converted to states and tagged {@code StateSource.LIVE}.
 * The known-good states are supplied as a map side input to
 * {@code FilterOutMatchingState}, which runs over the live states, and its output is
 * passed to {@code discrepancyAutoFixMessenger}.
 *
 * @param pipeline        pipeline to attach the steps to
 * @param org             value passed to {@code LiveProjectSource}
 * @param knownGoodSource source of the known-good file data
 * @return the string output of the final step
 */
private PCollection<String> constructPipeline(Pipeline pipeline, String org,
    BoundedSource<KV<List<String>, String>> knownGoodSource) {
  // ---- Known-good side: read, convert to state objects, tag as DESIRED. ----
  PCollection<KV<List<String>, String>> checkedInFiles =
      pipeline.apply("Read known-good data", Read.from(knownGoodSource));
  PCollection<KV<GCPResource, GCPResourceState>> desiredStates =
      checkedInFiles.apply(
          ParDo.named("Convert file data to Java Objects").of(new FileToState()));
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedDesiredStates =
      desiredStates.apply(
          ParDo.named("Mark states as being known-good")
              .of(new TagStateWithSource(StateSource.DESIRED)));

  // ---- Live side: read projects, extract their states, tag as LIVE. ----
  PCollection<GCPProject> liveProjects =
      pipeline.apply("Read live projects", Read.from(new LiveProjectSource(org)));
  PCollection<KV<GCPResource, GCPResourceState>> currentStates =
      liveProjects.apply(
          ParDo.named("Extract project policies").of(new ExtractState()));
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedCurrentStates =
      currentStates.apply(
          ParDo.named("Mark states as being live")
              .of(new TagStateWithSource(StateSource.LIVE)));

  // ---- Join: known-good states become a map side input for the live states. ----
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> desiredStatesView =
      taggedDesiredStates.apply(
          View.<GCPResource, KV<StateSource, GCPResourceState>>asMap());
  PCollection<KV<GCPResource, Map<StateSource, GCPResourceState>>> mismatches =
      taggedCurrentStates.apply(
          ParDo.named("Find states that don't match")
              .withSideInputs(desiredStatesView)
              .of(new FilterOutMatchingState(desiredStatesView)));

  // ---- Hand every mismatch to the auto-fix messenger. ----
  return mismatches.apply(
      ParDo.named("Fix discrepancies").of(discrepancyAutoFixMessenger));
}

Source File: DockerDo.java From dockerflow with Apache License 2.0

5 votes

/**
 * Applies three steps to {@code input}: a "Prepare" ParDo running {@code Gather} for
 * {@code task}, a per-key combine using {@code SortArgs}, and a "CombineOutputs" ParDo
 * running {@code CombineArgs}.
 *
 * @param input keyed workflow arguments
 * @return the keyed workflow arguments produced by the final step
 */
@Override
public PCollection<KV<String, WorkflowArgs>> apply(
    PCollection<KV<String, WorkflowArgs>> input) {
  return input
      .apply(ParDo.named("Prepare").of(new Gather(task)))
      // Combine all values that share a key.
      .apply(Combine.perKey(new SortArgs()))
      .apply(ParDo.named("CombineOutputs").of(new CombineArgs()));
}

Source File: FXTimeSeriesPipelineSRGTests.java From data-timeseries-java with Apache License 2.0

5 votes

/**
 * Timestamps {@code data}, places it in fixed windows sized by the pipeline's candle
 * resolution, generates values via {@code DetectMissingTimeSeriesValuesCombiner} and
 * {@code CreateMissingTimeSeriesValuesDoFn}, and returns the windowed input flattened
 * together with the generated values.
 *
 * @param pipeline     pipeline whose options must be {@code FXTimeSeriesPipelineOptions}
 * @param data         keyed time-series values to load
 * @param packetConfig configuration handed to the missing-value combiner
 * @return windowed input values merged with the generated values
 */
public PCollection<KV<String, TSProto>> generateCompleteWindowData(Pipeline pipeline,
    List<KV<String, TSProto>> data, WorkPacketConfig packetConfig) {

  LOG.info("Check to see that time streams with missing 'ticks' have been corrected");

  PCollection<KV<String, TSProto>> timestamped = setupDataInput(pipeline, data);

  // Window size (in seconds) comes from the FX-specific pipeline options.
  FXTimeSeriesPipelineOptions fxOptions = (FXTimeSeriesPipelineOptions) pipeline.getOptions();
  PCollection<KV<String, TSProto>> windowed =
      timestamped.apply("CandleResolutionWindow",
          Window.<KV<String, TSProto>>into(
              FixedWindows.of(Duration.standardSeconds(fxOptions.getCandleResolution()))));

  // Detect the streams missing from each window and generate values for them.
  PCollection<KV<String, TSProto>> generated =
      windowed
          .apply("DetectMissingTimeSeriesValues",
              Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig))
                  .withoutDefaults())
          .apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn()))
          .setName("CreateMissingTimeSeriesValues");

  // Merge the original windowed values with the generated ones.
  return PCollectionList.of(windowed).and(generated)
      .apply("MergeGeneratedLiveValues", Flatten.<KV<String, TSProto>>pCollections());
}

Source File: WordCountJoin3ITCase.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Counts the words of three in-memory inputs, co-groups the three count collections
 * by word, formats the joined result and writes it to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	/* Per-word counts for each of the three inputs */
	PCollection<KV<String,Long>> firstCounts = pipeline
			.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> secondCounts = pipeline
			.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	PCollection<KV<String,Long>> thirdCounts = pipeline
			.apply(Create.of(WORDS_3))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	/* Join the three count collections on the word key */
	PCollection<KV<String, CoGbkResult>> joinedCounts = KeyedPCollectionTuple
			.of(tag1, firstCounts)
			.and(tag2, secondCounts)
			.and(tag3, thirdCounts)
			.apply(CoGroupByKey.<String>create());

	/* Format the joined counts and write them out */
	joinedCounts
			.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	pipeline.run();
}

Source File: DockerDo.java From dockerflow with Apache License 2.0

5 votes

/**
 * Starts the task for every input element, breaks fusion, then waits for the started
 * operation. On the first attempt ({@code attempt == 0}) a "Prepare" step running
 * {@code ClearOperationStatus} is applied first.
 *
 * @param input keyed workflow arguments
 * @return the keyed workflow arguments emitted by the "Wait" step
 */
@Override
public PCollection<KV<String, WorkflowArgs>> apply(
    PCollection<KV<String, WorkflowArgs>> input) {
  // Only the first attempt gets the extra "Prepare" step.
  PCollection<KV<String, WorkflowArgs>> prepared =
      attempt == 0
          ? input.apply(ParDo.named("Prepare").of(new ClearOperationStatus()))
          : input;

  PCollection<KV<String, WorkflowArgs>> started =
      prepared.apply(ParDo.named("Start").of(new StartTask(task, attempt)));

  return started
      .apply(new BreakFusion<KV<String, WorkflowArgs>>("AfterStarted"))
      .apply(ParDo.named("Wait").of(new WaitForOperation()));
}

Source File: MaybeEmptyTestITCase.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Feeds a single {@code null} {@code Void} element through a DoFn that outputs
 * {@code expected}, and writes the output to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Ignores its (null) input element and outputs the expected string.
	DoFn<Void, String> emitExpected = new DoFn<Void, String>() {
		@Override
		public void processElement(ProcessContext c) {
			c.output(expected);
		}
	};

	pipeline
			.apply(Create.of((Void) null)).setCoder(VoidCoder.of())
			.apply(ParDo.of(emitExpected))
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: OnDemandLiveStateChecker.java From policyscanner with Apache License 2.0

5 votes

/**
 * Wires the pipeline that compares known-good resource states with live ones.
 *
 * <p>The known-good side is read from {@code knownGoodSource}, converted to state
 * objects and tagged {@code StateSource.DESIRED}. The file paths of the same input are
 * turned into live states by {@code FilePathToLiveState} and tagged
 * {@code StateSource.LIVE}. The live states are supplied as a map side input to
 * {@code FilterOutMatchingState}, which runs over the known-good states, and its
 * output is passed to {@code StateDiscrepancyMessenger}.
 *
 * @param pipeline        pipeline to attach the steps to
 * @param knownGoodSource source of the known-good file data
 * @return the string output of the final step
 */
private PCollection<String> constructPipeline(Pipeline pipeline,
    BoundedSource<KV<List<String>, String>> knownGoodSource) {
  // ---- Known-good side: read, convert to state objects, tag as DESIRED. ----
  PCollection<KV<List<String>, String>> checkedInFiles =
      pipeline.apply("Read known-good data", Read.from(knownGoodSource));
  PCollection<KV<GCPResource, GCPResourceState>> desiredStates =
      checkedInFiles.apply(
          ParDo.named("Convert file data to Java objects").of(new FileToState()));
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedDesiredStates =
      desiredStates.apply(
          ParDo.named("Mark states as being known-good")
              .of(new TagStateWithSource(StateSource.DESIRED)));

  // ---- Live side: derive file paths, look up live states, tag as LIVE. ----
  PCollection<List<String>> filePaths =
      checkedInFiles.apply("Extract just the file paths", ParDo.of(new FilePathFromPair()));
  PCollection<KV<GCPResource, GCPResourceState>> currentStates =
      filePaths.apply(
          ParDo.named("Get live resource and states from file path")
              .of(new FilePathToLiveState()));
  PCollection<KV<GCPResource, KV<StateSource, GCPResourceState>>> taggedCurrentStates =
      currentStates.apply(
          ParDo.named("Mark states as being live")
              .of(new TagStateWithSource(StateSource.LIVE)));

  // ---- Join: live states become a map side input for the known-good states. ----
  PCollectionView<Map<GCPResource, KV<StateSource, GCPResourceState>>> currentStatesView =
      taggedCurrentStates.apply(
          View.<GCPResource, KV<StateSource, GCPResourceState>>asMap());
  PCollection<KV<GCPResource, Map<StateSource, GCPResourceState>>> mismatches =
      taggedDesiredStates.apply(
          ParDo.named("Find states that don't match")
              .withSideInputs(currentStatesView)
              .of(new FilterOutMatchingState(currentStatesView)));

  // ---- Turn every mismatch into a notification message. ----
  return mismatches.apply(
      ParDo.named("Generate notification messages").of(new StateDiscrepancyMessenger()));
}

Source File: AvroITCase.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Runs two batch pipelines back to back: the first writes four {@code User} records
 * as Avro to {@code tmpPath}; the second reads them back, renders each as
 * "name color number" and writes the lines to {@code resultPath}.
 *
 * @param tmpPath    location of the intermediate Avro data
 * @param resultPath location the text output is written to
 */
private static void runProgram(String tmpPath, String resultPath) {
	// ---- Pipeline 1: write the users as Avro. ----
	Pipeline writePipeline = FlinkTestPipeline.createForBatch();

	writePipeline
		.apply(Create.of(
				new User("Joe", 3, "red"),
				new User("Mary", 4, "blue"),
				new User("Mark", 1, "green"),
				new User("Julia", 5, "purple"))
			.withCoder(AvroCoder.of(User.class)))
		.apply(AvroIO.Write.to(tmpPath)
			.withSchema(User.class));

	writePipeline.run();

	// ---- Pipeline 2: read the Avro data back and format it as text. ----
	Pipeline readPipeline = FlinkTestPipeline.createForBatch();

	// Renders a user as "<name> <favorite color> <favorite number>".
	DoFn<User, String> formatUser = new DoFn<User, String>() {
		@Override
		public void processElement(ProcessContext c) throws Exception {
			User u = c.element();
			c.output(u.getName() + " " + u.getFavoriteColor() + " " + u.getFavoriteNumber());
		}
	};

	readPipeline
		.apply(AvroIO.Read.from(tmpPath).withSchema(User.class).withoutValidation())
		.apply(ParDo.of(formatUser))
		.apply(TextIO.Write.to(resultPath));

	readPipeline.run();
}

Source File: BreakFusion.java From dockerflow with Apache License 2.0

5 votes

/**
 * Applies three steps to {@code input}: a "BreakFusion" ParDo running
 * {@code DummyMapFn}, a per-key combine with String keys using {@code First}, and a
 * final step that keeps only the values.
 *
 * @param input collection to pass through
 * @return a collection of the same element type {@code T}
 */
@Override
public PCollection<T> apply(PCollection<T> input) {
  return input
      .apply(ParDo.named("BreakFusion").of(new DummyMapFn<T>()))
      // Per-key combine over String keys; presumably the step that forces the
      // fusion break -- confirm against DummyMapFn / First.
      .apply(Combine.<String, T>perKey(new First<T>()))
      // Drop the keys again so the output type matches the input type.
      .apply(Values.<T>create());
}

Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Translates a multi-output {@code ParDo} into Flink operators.
 *
 * <p>The DoFn is wrapped in a single {@code MapPartitionOperator} that emits
 * {@code RawUnionValue}s tagged with an integer output index. One
 * {@code FlatMapOperator} per output then prunes that union stream down to the values
 * carrying its index, and is registered as the data set of that output.
 *
 * @param transform the multi-output ParDo to translate
 * @param context   translation context supplying inputs, outputs and pipeline options
 */
@Override
public void translateNode(ParDo.BoundMulti<IN, OUT> transform, FlinkBatchTranslationContext context) {
	DataSet<IN> inputDataSet = context.getInputDataSet(context.getInput(transform));

	final DoFn<IN, OUT> doFn = transform.getFn();

	Map<TupleTag<?>, PCollection<?>> outputs = context.getOutput(transform).getAll();

	Map<TupleTag<?>, Integer> outputMap = Maps.newHashMap();
	// put the main output at index 0, FlinkMultiOutputDoFnFunction also expects this
	outputMap.put(transform.getMainOutputTag(), 0);
	int count = 1;
	for (TupleTag<?> tag: outputs.keySet()) {
		if (!outputMap.containsKey(tag)) {
			outputMap.put(tag, count++);
		}
	}

	// Collect all output Coders and create a UnionCoder for our tagged outputs.
	// Each coder is stored at the index assigned to its tag in outputMap, so the
	// coder order always matches the union indices, even if the main output is not
	// the first entry returned when iterating over `outputs`.
	Coder<?>[] codersByIndex = new Coder<?>[outputMap.size()];
	for (Map.Entry<TupleTag<?>, PCollection<?>> output: outputs.entrySet()) {
		codersByIndex[outputMap.get(output.getKey())] = output.getValue().getCoder();
	}
	List<Coder<?>> outputCoders = Lists.newArrayList(codersByIndex);

	UnionCoder unionCoder = UnionCoder.of(outputCoders);

	@SuppressWarnings("unchecked")
	TypeInformation<RawUnionValue> typeInformation = new CoderTypeInformation<>(unionCoder);

	@SuppressWarnings("unchecked")
	FlinkMultiOutputDoFnFunction<IN, OUT> doFnWrapper = new FlinkMultiOutputDoFnFunction(doFn, context.getPipelineOptions(), outputMap);
	MapPartitionOperator<IN, RawUnionValue> outputDataSet = new MapPartitionOperator<>(inputDataSet, typeInformation, doFnWrapper, transform.getName());

	transformSideInputs(transform.getSideInputs(), outputDataSet, context);

	// One pruning operator per output: keeps only the union values tagged with that
	// output's index and registers the result as the output's data set.
	for (Map.Entry<TupleTag<?>, PCollection<?>> output: outputs.entrySet()) {
		TypeInformation<Object> outputType = context.getTypeInfo(output.getValue());
		int outputTag = outputMap.get(output.getKey());
		FlinkMultiOutputPruningFunction<Object> pruningFunction = new FlinkMultiOutputPruningFunction<>(outputTag);
		FlatMapOperator<RawUnionValue, Object> pruningOperator = new
				FlatMapOperator<>(outputDataSet, outputType,
				pruningFunction, output.getValue().getName());
		context.setOutputDataSet(output.getValue(), pruningOperator);

	}
}

Source File: ParDoMultiOutputITCase.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Runs a multi-output ParDo over a fixed list of words and writes the contents of the
 * "marked words" side output to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> words = p.apply(Create.of("Hello", "Whatupmyman", "hey", "SPECIALthere", "MAAA", "MAAFOOO"));

	// Select words whose length is at or below a cut off,
	// plus the lengths of words that are above the cut off.
	// Also select words starting with "MAA".
	final int wordLengthCutOff = 3;
	// Create tags to use for the main and side outputs.
	final TupleTag<String> wordsBelowCutOffTag = new TupleTag<String>(){};
	final TupleTag<Integer> wordLengthsAboveCutOffTag = new TupleTag<Integer>(){};
	final TupleTag<String> markedWordsTag = new TupleTag<String>(){};

	PCollectionTuple results =
			words.apply(ParDo
					.withOutputTags(wordsBelowCutOffTag, TupleTagList.of(wordLengthsAboveCutOffTag)
							.and(markedWordsTag))
					.of(new DoFn<String, String>() {
						// This tag is NOT passed to withOutputTags above; it is only used
						// inside processElement.
						final TupleTag<String> specialWordsTag = new TupleTag<String>() {
						};

						public void processElement(ProcessContext c) {
							String word = c.element();
							if (word.length() <= wordLengthCutOff) {
								// Short words go to the main output.
								c.output(word);
							} else {
								// For longer words only the length is emitted, to a side output.
								c.sideOutput(wordLengthsAboveCutOffTag, word.length());
							}
							if (word.startsWith("MAA")) {
								c.sideOutput(markedWordsTag, word);
							}

							if (word.startsWith("SPECIAL")) {
								// Emits to the tag that was not declared on the ParDo.
								c.sideOutput(specialWordsTag, word);
							}
						}
					}));

	// Extract the PCollection results, by tag.
	// NOTE(review): the first two collections are looked up but never used afterwards.
	PCollection<String> wordsBelowCutOff = results.get(wordsBelowCutOffTag);
	PCollection<Integer> wordLengthsAboveCutOff = results.get
			(wordLengthsAboveCutOffTag);
	PCollection<String> markedWords = results.get(markedWordsTag);

	// Only the marked words are written out.
	markedWords.apply(TextIO.Write.to(resultPath));

	p.run();
}

Source File: DockerDo.java From dockerflow with Apache License 2.0

4 votes

/**
 * Creates the "Scatter" ParDo, which runs {@code ScatterTasks} for the given task so
 * that multiple Docker tasks can run in parallel.
 *
 * @param t task handed to {@code ScatterTasks}
 * @return a ParDo named "Scatter" over keyed workflow arguments
 */
public static ParDo.Bound<KV<String, WorkflowArgs>, KV<String, WorkflowArgs>> scatter(Task t) {
  ScatterTasks scatterFn = new ScatterTasks(t);
  return ParDo.named("Scatter").of(scatterFn);
}

Source File: DockerDo.java From dockerflow with Apache License 2.0

4 votes

/**
 * Applies a single ParDo that runs {@code Outputs} for {@code task} over the input.
 *
 * @param input keyed workflow arguments
 * @return the keyed workflow arguments emitted by {@code Outputs}
 */
@Override
public PCollection<KV<String, WorkflowArgs>> apply(
    PCollection<KV<String, WorkflowArgs>> input) {
  Outputs outputsFn = new Outputs(task);
  return input.apply(ParDo.of(outputsFn));
}

Source File: FXTimeSeriesPipelineSRGTests.java From data-timeseries-java with Apache License 2.0

4 votes

/**
 * Builds complete aggregates from the sample data restricted to the
 * {@code GenerateSampleData.TS1} key, projects each aggregate onto a
 * {@code SimpleAggTester}, and asserts that the result matches five expected entries
 * (in any order).
 */
@org.junit.Test
public void testCompleteCandleDataOneStream() {

  Pipeline pipeline = setup();

  List<KV<String, TSProto>> pipelineData = GenerateSampleData.getTestData();
  WorkPacketConfig packetConfig =
      GenerateSampleData.generateWorkPacketConfig(2, new String[] {GenerateSampleData.TS1});

  Map<String, TSProto> map = generateMapData(pipelineData);

  // Run test with TS-1 data only

  List<KV<String, TSProto>> ts1Only = new ArrayList<>();

  // Keep only the entries whose extracted key equals TS1, re-keyed by that extracted key.
  for (String ts : map.keySet()) {
    if (extractKey(ts).equals(GenerateSampleData.TS1)) {
      ts1Only.add(KV.of(extractKey(ts), map.get(ts)));
    }
  }

  // NOTE(review): testData is a copy of ts1Only that is never read afterwards.
  List<KV<String, TSProto>> testData = new ArrayList<KV<String, TSProto>>(ts1Only);

  PCollection<KV<String, TSAggValueProto>> completeAggs =
      createCompleteAggregates(pipeline, ts1Only, packetConfig);

  // Reduce each aggregate to the fields that the expectations below compare.
  PCollection<SimpleAggTester> simpleAgg =
      completeAggs.apply(ParDo.of(new DoFn<KV<String, TSAggValueProto>, SimpleAggTester>() {

        @Override
        public void processElement(
            DoFn<KV<String, TSAggValueProto>, SimpleAggTester>.ProcessContext c) throws Exception {

          c.output(SimpleAggTester.newBuilder().setKey(c.element().getKey())
              .setCloseTime(c.element().getValue().getCloseTime())
              .setOpenStateTime(c.element().getValue().getOpenState().getTime())
              .setCloseStateTime(c.element().getValue().getCloseState().getTime())
              .setMinAskPrice(c.element().getValue().getMinAskValue().getAskPrice())
              .setMaxAskPrice(c.element().getValue().getMaxAskValue().getAskPrice())
              .setMinBidPrice(c.element().getValue().getMinBidValue().getBidPrice())
              .setMaxBidPrice(c.element().getValue().getMaxBidValue().getBidPrice()).build());

        }

      }));

  // Expected aggregates; successive close times are 120000 apart.
  List<SimpleAggTester> expectedList = new ArrayList<>();

  String key = GenerateSampleData.TS1;

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451577719999L)
          .setOpenStateTime(1451577660000L).setCloseStateTime(1451577660000L).setMinAskPrice(1)
          .setMaxAskPrice(2).setMinBidPrice(1).setMaxBidPrice(2).build());

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451577839999L)
          .setOpenStateTime(1451577660000L).setCloseStateTime(1451577780000L).setMinAskPrice(3)
          .setMaxAskPrice(4).setMinBidPrice(3).setMaxBidPrice(4).build());

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451577959999L)
          .setOpenStateTime(1451577780000L).setCloseStateTime(1451577900000L).setMinAskPrice(5)
          .setMaxAskPrice(5).setMinBidPrice(5).setMaxBidPrice(5).build());

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451578079999L)
          .setOpenStateTime(1451577900000L).setCloseStateTime(1451578020000L).setMinAskPrice(3)
          .setMaxAskPrice(4).setMinBidPrice(3).setMaxBidPrice(4).build());

  expectedList.add(SimpleAggTester.newBuilder().setKey(key).setCloseTime(1451578199999L)
          .setOpenStateTime(1451578020000L).setCloseStateTime(1451578140000L).setMinAskPrice(1)
          .setMaxAskPrice(2).setMinBidPrice(1).setMaxBidPrice(2).build());

  
    
  DataflowAssert.that(simpleAgg).containsInAnyOrder(expectedList);

  pipeline.run();
}

Source File: FXTimeSeriesPipelineSRGTests.java From data-timeseries-java with Apache License 2.0

4 votes

/**
 * Loads the sample data through {@code setupDataInput} and asserts that the resulting
 * collection contains exactly 42 elements.
 */
@org.junit.Test
public void testDataInput() {

  Pipeline pipeline = setup();

  PCollection<KV<String, TSProto>> timestamped =
      setupDataInput(pipeline, GenerateSampleData.getTestData());

  LOG.info("Check that we have 42 elements in the Input PCollection");

  // Maps every element to the constant 1 so that the global sum equals the element count.
  DoFn<KV<String, TSProto>, Integer> emitOne = new DoFn<KV<String, TSProto>, Integer>() {

    @Override
    public void processElement(ProcessContext c) throws Exception {
      c.output(1);
    }

  };

  PCollection<Integer> elementCount =
      timestamped
          .apply("TestInputElementCount", ParDo.of(emitOne))
          .apply(Sum.integersGlobally());

  DataflowAssert.that(elementCount).containsInAnyOrder(42);

  pipeline.run();

}

Source File: FXTimeSeriesPipelineSRGTests.java From data-timeseries-java with Apache License 2.0

4 votes

/**
 * Builds the aggregation stages on top of {@code generateCompleteWindowData}:
 * per-key partial aggregates, embedding of window time into each aggregate,
 * re-windowing into the global window (firing on every element), per-key complete
 * aggregates flattened back into single elements, and finally re-timestamping of each
 * element to its own close time.
 *
 * @param pipeline     pipeline to attach the stages to
 * @param data         keyed time-series values to load
 * @param packetConfig configuration forwarded to {@code generateCompleteWindowData}
 * @return the re-timestamped complete aggregates
 */
public PCollection<KV<String, TSAggValueProto>> createCompleteAggregates(Pipeline pipeline,
    List<KV<String, TSProto>> data, WorkPacketConfig packetConfig) {

  PCollection<KV<String, TSProto>> windowedValues =
      generateCompleteWindowData(pipeline, data, packetConfig);

  // Per-key partial aggregates within each window.
  PCollection<KV<String, TSAggValueProto>> partialAggs =
      windowedValues.apply("CreatePartialAggregates",
          Combine.perKey(new PartialTimeSeriesAggCombiner()));

  PCollection<KV<String, TSAggValueProto>> partialAggsWithBoundary =
      partialAggs.apply(ParDo.of(new EmbedWindowTimeIntoAggregateDoFn()));

  // Global window that fires for every element and accumulates fired panes; output
  // timestamps follow the earliest input timestamp.
  PCollection<KV<String, TSAggValueProto>> globallyWindowed =
      partialAggsWithBoundary.apply(
          "completeAggregationStage1",
          Window.<KV<String, TSAggValueProto>>into(new GlobalWindows())
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .withOutputTimeFn(OutputTimeFns.outputAtEarliestInputTimestamp())
              .accumulatingFiredPanes());

  // Per-key complete aggregates, flattened back to one element per aggregate.
  PCollection<KV<String, TSAggValueProto>> completeAggs =
      globallyWindowed
          .apply("CreateCompleteCandles", Combine.perKey(new CompleteTimeSeriesAggCombiner()))
          .apply("FlattenIterables", ParDo.of(new FlattenKVIterableDoFn()));

  // Re-emits each aggregate stamped with its own close time.
  DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>> resetTimestamps =
      new DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>() {

        @Override
        public void processElement(ProcessContext c) throws Exception {
          // Elements stamped at or after this fixed instant are dropped silently.
          // NOTE(review): presumably a far-future sentinel cut-off -- confirm.
          if (!c.timestamp().isBefore(new Instant(32530703764000L))) {
            return;
          }

          TSAggValueProto aggregate = c.element().getValue();

          // An element timestamp later than the aggregate's close-state time is
          // logged as a bug and the element is not emitted.
          if (c.timestamp().isAfter(new Instant(aggregate.getCloseState().getTime()))) {
            LOG.error("BUG There was a timestamp before current :: "
                + TextFormat.shortDebugString(aggregate));
            return;
          }

          c.outputWithTimestamp(c.element(), new Instant(aggregate.getCloseTime()));
        }

      };

  return completeAggs.apply("ResetTimestampsAfterGlobalWindow", ParDo.of(resetTimestamps));

}

Source File: CreateAggregatesTransform.java From data-timeseries-java with Apache License 2.0

4 votes

/**
 * Builds the aggregation sub-graph: windows the raw time series into fixed candle-resolution
 * windows, adds generated values for streams detected as missing, computes partial aggregates per
 * key, pushes them through a global window to complete them, and finally re-attaches a timestamp
 * taken from the aggregate itself.
 *
 * @param input keyed raw time series values
 * @return keyed aggregates, re-stamped with the close time stored in each aggregate
 */
@Override
public PCollection<KV<String, TSAggValueProto>> apply(PCollection<KV<String, TSProto>> input) {

  // Bucket the incoming values into fixed windows of the configured candle resolution.
  Duration candleSize = Duration.standardSeconds(options.getCandleResolution());
  PCollection<KV<String, TSProto>> liveValues =
      input.apply("CandleResolutionWindow",
          Window.<KV<String, TSProto>>into(FixedWindows.of(candleSize)));

  // Work out which streams have no value in this window and generate values for them.
  PCollection<KV<String, TSProto>> fillerValues =
      liveValues
          .apply("DetectMissingTimeSeriesValues",
              Combine.globally(new DetectMissingTimeSeriesValuesCombiner(packetConfig))
                  .withoutDefaults())
          .apply(ParDo.of(new CreateMissingTimeSeriesValuesDoFn()))
          .setName("CreateMissingTimeSeriesValues");

  // Merge the live streams with the generated ones.
  PCollection<KV<String, TSProto>> allValues =
      PCollectionList.of(liveValues)
          .and(fillerValues)
          .apply("MergeGeneratedLiveValues", Flatten.<KV<String, TSProto>>pCollections());

  // Partial aggregates only: the previous window's close value is not carried forward yet.
  PCollection<KV<String, TSAggValueProto>> partial =
      allValues.apply("CreatePartialAggregates",
          Combine.perKey(new PartialTimeSeriesAggCombiner()));

  // The aggregates lose their time value once they pass through the global window, so the
  // window close is embedded in the data first, to be read back further down.
  PCollection<KV<String, TSAggValueProto>> partialWithBoundary =
      partial.apply(ParDo.of(new EmbedWindowTimeIntoAggregateDoFn()));

  // The global window retains the last value held in memory. outputAtEarliestInputTimestamp is
  // required because the timestamp is later re-attached from within the data point; keeping
  // the output timestamp at the smallest value avoids 'skew' issues at that point.
  PCollection<KV<String, TSAggValueProto>> completeCandles =
      partialWithBoundary
          .apply("completeAggregationStage1",
              Window.<KV<String, TSAggValueProto>>into(new GlobalWindows())
                  .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
                  .withOutputTimeFn(OutputTimeFns.outputAtEarliestInputTimestamp())
                  .accumulatingFiredPanes())
          .apply("CreateCompleteCandles", Combine.perKey(new CompleteTimeSeriesAggCombiner()))
          .apply("FlattenIterables", ParDo.of(new FlattenKVIterableDoFn()));

  // Re-stamps every aggregate leaving the global window.
  DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>> resetTimestamps =
      new DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>() {

        @Override
        public void processElement(
            DoFn<KV<String, TSAggValueProto>, KV<String, TSAggValueProto>>.ProcessContext c)
            throws Exception {
          Instant stamp = c.timestamp();

          // TODO When the local Dataflow runner shuts down it emits some values for the end of
          // the GlobalWindow. For now they are dropped by ignoring anything stamped in year
          // 3000 or later; a better solution would be to check the WINDOW PANE.
          if (!stamp.isBefore(new Instant(32530703764000L))) {
            return;
          }

          // The timestamp coming out of the Combiner after the GlobalWindow loses fidelity;
          // it is restored from the value stored in the data.
          TSAggValueProto aggregate = c.element().getValue();

          if (stamp.isAfter(new Instant(aggregate.getCloseState().getTime()))) {
            LOG.error(
                "There was a timestamp before earlier than the window and skew must be 0 :: "
                    + TextFormat.shortDebugString(aggregate));
            return;
          }

          c.outputWithTimestamp(c.element(), new Instant(aggregate.getCloseTime()));
        }
      };

  return completeCandles.apply("ResetTimestampsAfterGlobalWindow", ParDo.of(resetTimestamps));
}

Source File: DataflowFactory.java From dockerflow with Apache License 2.0

4 votes

/**
 * Dynamically construct a Dataflow pipeline from the workflow definition. The root PCollection is
 * created from the workflow args map; when no args are supplied it holds a single default
 * {@code WorkflowArgs} entry keyed by the workflow definition's name.
 *
 * <p>Arguments are modified by this call: unset pipeline options are given defaults, a default
 * entry is added to an empty args map, and each supplied {@code WorkflowArgs} has the workflow's
 * default args merged into it.
 *
 * @param workflow the workflow to build the pipeline for; it and its definition must be non-null
 * @param workflowArgs instance-specific args by key; may be null or empty, in which case a
 *     single default entry is used
 * @param o pipeline options; must be non-null. App name, project, max worker count and worker
 *     machine type are defaulted when unset
 * @return the constructed pipeline; this method does not run it
 * @throws IOException if thrown while the pipeline is being constructed
 */
public static Pipeline dataflow(
    Workflow workflow, Map<String, WorkflowArgs> workflowArgs, DataflowPipelineOptions o)
    throws IOException {

  assert (workflow != null);
  assert (o != null);
  assert (workflow.getDefn() != null);

  // Set defaults
  if (o.getAppName() == null) {
    o.setAppName(workflow.getDefn().getName());
  }
  if (o.getProject() == null && workflow.getArgs() != null) {
    o.setProject(workflow.getArgs().getProjectId());
  }
  if (o.getMaxNumWorkers() == 0) {
    o.setMaxNumWorkers(1);
  }
  if (o.getWorkerMachineType() == null) {
    o.setWorkerMachineType(DEFAULT_MACHINE_TYPE);
  }

  LOG.info("Initializing dataflow pipeline");
  Pipeline p = Pipeline.create(o);

  LOG.info("Creating input collection of workflow args");
  if (workflowArgs == null) {
    workflowArgs = new HashMap<String, WorkflowArgs>();
  }
  if (workflowArgs.isEmpty()) {
    LOG.info("No workflow args were provided. Using default values.");
    workflowArgs.put(workflow.getDefn().getName(), new WorkflowArgs());
  } else if (workflow.getArgs() != null) {
    LOG.info("Merging default workflow args with instance-specific args");

    // Only the values are needed here, so iterate them directly rather than looking each one
    // up again by key.
    for (WorkflowArgs instanceArgs : workflowArgs.values()) {
      instanceArgs.mergeDefaultArgs(workflow.getArgs());
      LOG.debug("Merged args: " + StringUtils.toJson(instanceArgs));
    }
  }

  LOG.info("Creating dataflow pipeline for workflow " + workflow.getDefn().getName());
  PCollection<KV<String, WorkflowArgs>> input = p.apply(Create.of(workflowArgs));
  input = dataflow(Workflow.Steps.graph(workflow), input);

  // The map is guaranteed non-empty at this point. The delete-files setting is read from the
  // first entry only and applied to the whole pipeline.
  if (workflowArgs.values().iterator().next().getDeleteFiles()) {
    LOG.info("Intermediate files will be deleted");
    input =
        input.apply(
            ParDo.named("DeleteIntermediateFiles").of(new DeleteIntermediateFiles(workflow)));
  }

  return p;
}

Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0

3 votes

/**
 * Translates a single-output {@code ParDo} into a Flink map-partition operator: the transform's
 * {@code DoFn} is wrapped in a {@code FlinkDoFnFunction}, applied to the input data set, given
 * the transform's side inputs, and registered as the data set for the transform's output.
 *
 * @param transform the bound ParDo to translate
 * @param context translation context used to look up the input data set, output type
 *     information and pipeline options, and to register the resulting data set
 */
@Override
public void translateNode(ParDo.Bound<IN, OUT> transform, FlinkBatchTranslationContext context) {
	DataSet<IN> source = context.getInputDataSet(context.getInput(transform));

	final DoFn<IN, OUT> userFn = transform.getFn();

	TypeInformation<OUT> outputType = context.getTypeInfo(context.getOutput(transform));

	// Adapter that lets Flink run the user's DoFn.
	FlinkDoFnFunction<IN, OUT> flinkFn =
			new FlinkDoFnFunction<>(userFn, context.getPipelineOptions());

	// The operator carries the transform's name.
	MapPartitionOperator<IN, OUT> result =
			new MapPartitionOperator<>(source, outputType, flinkFn, transform.getName());

	transformSideInputs(transform.getSideInputs(), result, context);

	context.setOutputDataSet(context.getOutput(transform), result);
}

com.google.cloud.dataflow.sdk.transforms.ParDo Java Examples