com.google.cloud.dataflow.sdk.values.PCollection Java Examples

Source File: FXTimeSeriesPipelineSRGTests.java From data-timeseries-java with Apache License 2.0

6 votes

/**
 * Checks the output of {@code generateCompleteWindowData} against the sample input plus one
 * additional non-live element for each of the TS3 and TS4 keys.
 */
@org.junit.Test
public void testCompleteWindowData() {
  Pipeline testPipeline = setup();

  List<KV<String, TSProto>> sampleInput = GenerateSampleData.getTestData();
  // Copy the input up front so the expectation can be extended without touching the input list.
  List<KV<String, TSProto>> expected = new ArrayList<KV<String, TSProto>>(sampleInput);
  WorkPacketConfig config = GenerateSampleData.generateWorkPacketConfig(2);

  PCollection<KV<String, TSProto>> actual =
      generateCompleteWindowData(testPipeline, sampleInput, config);

  // One extra element, flagged as not live, is expected for each of these keys.
  final long extraElementTime = 1451577839999L;
  String[] extraKeys = {GenerateSampleData.TS3, GenerateSampleData.TS4};
  for (String key : extraKeys) {
    TSProto extra =
        TSProto.newBuilder().setKey(key).setIsLive(false).setTime(extraElementTime).build();
    expected.add(KV.of(key, extra));
  }

  DataflowAssert.that(actual).containsInAnyOrder(expected);
  testPipeline.run();
}

Source File: UnboundedSourceITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Builds and runs a streaming pipeline that reads from a {@code RangeReadSource}, converts each
 * integer to text and writes the text to {@code resultPath}. The run is expected to fail with an
 * exception whose second-level cause carries the "terminates as expected" message.
 *
 * @param resultPath location handed to {@code TextIO.Write}
 */
private static void runProgram(String resultPath) {

		Pipeline pipeline = FlinkTestPipeline.createForStreaming();

		// Renders each integer element as its string form.
		DoFn<Integer, String> toText = new DoFn<Integer, String>() {
			@Override
			public void processElement(ProcessContext c) throws Exception {
				c.output(c.element().toString());
			}
		};

		// Global window, firing once at least 10 elements have arrived, discarding fired panes.
		PCollection<String> lines = pipeline
			.apply(Read.from(new RangeReadSource(1, 10)))
			.apply(Window.<Integer>into(new GlobalWindows())
				.triggering(AfterPane.elementCountAtLeast(10))
				.discardingFiredPanes())
			.apply(ParDo.of(toText));

		lines.apply(TextIO.Write.to(resultPath));

		try {
			pipeline.run();
			// Reaching this point means the run did not throw, which the test treats as a failure.
			fail();
		} catch(Exception e) {
			String rootMessage = e.getCause().getCause().getMessage();
			assertEquals("The source terminates as expected.", rootMessage);
		}
	}

Source File: ExportedServiceAccountKeyRemover.java From policyscanner with Apache License 2.0

6 votes

/**
 * Wires up the pipeline stages: projects, then their service accounts, then the accounts' keys,
 * then one output string per key as produced by {@code ExportedServiceAccountKeyMessenger}.
 *
 * @param pipeline the pipeline the stages are attached to
 * @param org the organization passed to {@code LiveProjectSource}
 * @return the strings emitted by the final stage
 */
private PCollection<String> constructPipeline(Pipeline pipeline, String org) {
  return pipeline
      // Stage 1: read the projects through LiveProjectSource.
      .apply(Read.from(new LiveProjectSource(org)))
      // Stage 2: list the service accounts of each project.
      .apply(ParDo.named("List Service Accounts").of(new ListServiceAccounts()))
      // Stage 3: list the keys of each service account.
      .apply(ParDo.named("List Service Account Keys").of(new ListServiceAccountKeys()))
      // Stage 4: hand every key to the messenger DoFn.
      .apply(ParDo.named("Remove user-managed keys").of(new ExportedServiceAccountKeyMessenger()));
}

Source File: LoadBooks.java From cloud-bigtable-examples with Apache License 2.0

6 votes

/**
 * Reads the input file, parses it into n-gram counts, encodes each count as a mutation and
 * writes the mutations to the configured Cloud Bigtable table.
 *
 * @param args command-line arguments parsed into {@code BigtableCsvOptions}
 */
public static void main(String[] args) {
  // CloudBigtableOptions is one way to retrieve the options; it is not required. See
  // https://github.com/GoogleCloudPlatform/cloud-bigtable-examples/blob/master/java/dataflow-connector-examples/src/main/java/com/google/cloud/bigtable/dataflow/example/HelloWorldWrite.java
  BigtableCsvOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableCsvOptions.class);
  CloudBigtableTableConfiguration tableConfig =
      CloudBigtableTableConfiguration.fromCBTOptions(options);

  Pipeline pipeline = Pipeline.create(options);
  CloudBigtableIO.initializeForWrite(pipeline);

  PCollection<String> lines = pipeline.apply(TextIO.Read.from(options.getInputFile()));
  applyPipelineToParseBooks(lines)
      .apply(ParDo.of(ENCODE_NGRAM))
      .apply(CloudBigtableIO.writeToTable(tableConfig));

  pipeline.run();
}

Source File: WordCountJoin2ITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Counts the words of two inputs separately, co-groups the two count collections by word,
 * formats the grouped result and writes it to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Per-word counts for each of the two inputs.
	PCollection<KV<String,Long>> firstCounts = pipeline.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());
	PCollection<KV<String,Long>> secondCounts = pipeline.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	// Tag both collections so they can be co-grouped by key.
	KeyedPCollectionTuple<String> taggedCounts = KeyedPCollectionTuple
			.of(tag1, firstCounts)
			.and(tag2, secondCounts);

	// Co-group, format and write.
	taggedCounts
			.apply(CoGroupByKey.<String>create())
			.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	pipeline.run();
}

Source File: RemoveDuplicatesEmptyITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Runs {@code RemoveDuplicates} over an empty string collection and writes the result to
 * {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	List<String> noStrings = Collections.emptyList();
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	pipeline
			.apply(Create.of(noStrings))
			// The coder is set explicitly on the created collection.
			.setCoder(StringUtf8Coder.of())
			.apply(RemoveDuplicates.<String>create())
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: LoadBooksTest.java From cloud-bigtable-examples with Apache License 2.0

6 votes

/**
 * Feeds {@code testFile} through {@code LoadBooks.applyPipelineToParseBooks} and asserts the
 * expected n-gram counts, in any order.
 */
@Test
public void parseBooks_returnsNgramsCounts() {
  // Arrange + Act: parse the test input straight from the created collection.
  Pipeline pipeline = TestPipeline.create();
  PCollection<KV<String, Integer>> ngramCounts =
      LoadBooks.applyPipelineToParseBooks(pipeline.apply(Create.of(testFile)));

  // Assert
  DataflowAssert.that(ngramCounts).containsInAnyOrder(
      KV.of("despatch when art", 10),
      KV.of("despatch when came", 10),
      KV.of("despatch when published", 12),
      KV.of("despatch where was", 10),
      KV.of("despatch which made", 45),
      // "despatch which addressed" appears twice in the input, once per part of speech of
      // "addressed"; the expected count is the sum of both entries.
      KV.of("despatch which addressed", 12 + 46),
      KV.of("despatch which admitted", 13),
      KV.of("despatch which allow", 14),
      KV.of("despatch which announced", 50),
      KV.of("despatch which answer", 32));
}

Source File: TfIdfITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Computes TF-IDF over three small in-memory documents and writes the distinct words of the
 * result to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {

	Pipeline flinkPipeline = FlinkTestPipeline.createForBatch();

	// URI elements are encoded through a string-delegating coder.
	flinkPipeline.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

	PCollection<KV<String, KV<URI, Double>>> tfIdfByWord = flinkPipeline
			.apply(Create.of(
					KV.of(new URI("x"), "a b c d"),
					KV.of(new URI("y"), "a b c"),
					KV.of(new URI("z"), "a m n")))
			.apply(new TfIdf.ComputeTfIdf());

	// Keep only the keys (the words), drop duplicates, and write them out.
	tfIdfByWord
			.apply(Keys.<String>create())
			.apply(RemoveDuplicates.<String>create())
			.apply(TextIO.Write.to(resultPath));

	flinkPipeline.run();
}

Source File: RemoveDuplicatesITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Runs {@code RemoveDuplicates} over a list of keys containing repeats and writes the result to
 * {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	List<String> keysWithRepeats = Arrays.asList("k1", "k5", "k5", "k2", "k1", "k2", "k3");
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	pipeline
			.apply(Create.of(keysWithRepeats))
			// The coder is set explicitly on the created collection.
			.setCoder(StringUtf8Coder.of())
			.apply(RemoveDuplicates.<String>create())
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: ReadSourceITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Builds and runs a batch pipeline that reads from a {@code ReadSource}, converts each integer
 * to text and writes the text to {@code resultPath}.
 *
 * @param resultPath location handed to {@code TextIO.Write}
 */
private static void runProgram(String resultPath) {

		Pipeline pipeline = FlinkTestPipeline.createForBatch();

		// Renders each integer element as its string form.
		DoFn<Integer, String> toText = new DoFn<Integer, String>() {
			@Override
			public void processElement(ProcessContext c) throws Exception {
				c.output(c.element().toString());
			}
		};

		PCollection<String> lines = pipeline
				.apply(Read.from(new ReadSource(1, 10)))
				.apply(ParDo.of(toText));

		lines.apply(TextIO.Write.to(resultPath));
		pipeline.run();
	}

Source File: FlinkStreamingTransformTranslators.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Translates a {@code ParDo.Bound} into a Flink {@code flatMap} over the transform's input
 * stream and registers the resulting stream as the transform's output.
 *
 * @param transform the ParDo being translated
 * @param context translation context supplying inputs, outputs and pipeline options
 */
@Override
public void translateNode(ParDo.Bound<IN, OUT> transform, FlinkStreamingTranslationContext context) {
	PCollection<OUT> outputCollection = context.getOutput(transform);

	// The windowing strategy of the output is narrowed to the output element type.
	final WindowingStrategy<OUT, ? extends BoundedWindow> strategy =
			(WindowingStrategy<OUT, ? extends BoundedWindow>)
					context.getOutput(transform).getWindowingStrategy();

	// Type information for the windowed output values, derived from the element and window coders.
	WindowedValue.WindowedValueCoder<OUT> windowedCoder = WindowedValue.getFullCoder(
			outputCollection.getCoder(), strategy.getWindowFn().windowCoder());
	CoderTypeInformation<WindowedValue<OUT>> outputTypeInfo =
			new CoderTypeInformation<WindowedValue<OUT>>(windowedCoder);

	FlinkParDoBoundWrapper<IN, OUT> wrappedFn = new FlinkParDoBoundWrapper<IN, OUT>(
			context.getPipelineOptions(), strategy, transform.getFn());
	DataStream<WindowedValue<IN>> inputStream = context.getInputDataStream(context.getInput(transform));
	SingleOutputStreamOperator<WindowedValue<OUT>> outputStream =
			inputStream.flatMap(wrappedFn).returns(outputTypeInfo);

	context.setOutputDataStream(context.getOutput(transform), outputStream);
}

Source File: FlinkStreamingTransformTranslators.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Translates an unbounded read into a Flink source stream and registers it as the transform's
 * output. A source whose class is exactly {@code UnboundedFlinkSource} is added directly and
 * its strings are wrapped into windowed values; any other source goes through
 * {@code UnboundedSourceWrapper}.
 *
 * @param transform the unbounded read being translated
 * @param context translation context supplying the execution environment and options
 */
@Override
public void translateNode(Read.Unbounded<T> transform, FlinkStreamingTranslationContext context) {
	PCollection<T> outputCollection = context.getOutput(transform);

	DataStream<WindowedValue<T>> stream;
	// Exact class match: subclasses of UnboundedFlinkSource do not take the direct path.
	boolean isFlinkSource = transform.getSource().getClass().equals(UnboundedFlinkSource.class);
	if (!isFlinkSource) {
		stream = context.getExecutionEnvironment()
				.addSource(new UnboundedSourceWrapper<>(context.getPipelineOptions(), transform));
	} else {
		UnboundedFlinkSource wrappedSource = (UnboundedFlinkSource) transform.getSource();
		stream = context.getExecutionEnvironment()
				.addSource(wrappedSource.getFlinkSource())
				.flatMap(new FlatMapFunction<String, WindowedValue<String>>() {
					@Override
					public void flatMap(String s, Collector<WindowedValue<String>> collector) throws Exception {
						// Each string is stamped with the current time and placed in the global window.
						collector.collect(WindowedValue.<String>of(s, Instant.now(), GlobalWindow.INSTANCE, PaneInfo.NO_FIRING));
					}
				}).assignTimestampsAndWatermarks(new IngestionTimeExtractor());
	}
	context.setOutputDataStream(outputCollection, stream);
}

Source File: FXTimeSeriesPipelineSRGTests.java From data-timeseries-java with Apache License 2.0

6 votes

/**
 * Creates a collection from {@code data} and re-emits every element with an event timestamp
 * taken from the element's own {@code TSProto} time.
 *
 * @param pipeline the pipeline the input is attached to
 * @param data the elements to load
 * @return the timestamped collection, named "Assign TimeStamps"
 */
public PCollection<KV<String, TSProto>> setupDataInput(Pipeline pipeline,
    List<KV<String, TSProto>> data) {

  // Outputs each element unchanged, stamped with the time stored in its proto value.
  DoFn<KV<String, TSProto>, KV<String, TSProto>> stampWithProtoTime =
      new DoFn<KV<String, TSProto>, KV<String, TSProto>>() {

        @Override
        public void processElement(ProcessContext c) throws Exception {
          KV<String, TSProto> element = c.element();
          c.outputWithTimestamp(element,
              new DateTime(element.getValue().getTime()).toInstant());
        }
      };

  return pipeline
      .apply("ReadData", Create.of(data))
      .apply(ParDo.of(stampWithProtoTime))
      .setName("Assign TimeStamps");
}

Source File: TFIDF.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Formats every (word, (URI, score)) element as one comma-and-tab separated text line and
 * writes the lines to {@code output} with a ".csv" suffix.
 *
 * @param wordToUriAndTfIdf the scores to write, keyed by word
 * @return the {@code PDone} of the text write
 */
@Override
public PDone apply(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
	DoFn<KV<String, KV<URI, Double>>, String> formatFn =
			new DoFn<KV<String, KV<URI, Double>>, String>() {
				private static final long serialVersionUID = 0;

				@Override
				public void processElement(ProcessContext c) {
					KV<String, KV<URI, Double>> entry = c.element();
					KV<URI, Double> uriAndScore = entry.getValue();
					c.output(String.format("%s,\t%s,\t%f",
							entry.getKey(),
							uriAndScore.getKey(),
							uriAndScore.getValue()));
				}
			};

	return wordToUriAndTfIdf
			.apply(ParDo.named("Format").of(formatFn))
			.apply(TextIO.Write.to(output).withSuffix(".csv"));
}

Source File: FlinkStreamingTransformTranslators.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Translates a {@code Create.Values} into a Flink stream: the elements are encoded to byte
 * arrays with the output coder, and a {@code FlinkStreamingCreateFunction} applied to a
 * one-element seed stream is handed those bytes together with the coder.
 *
 * @param transform the Create being translated
 * @param context translation context supplying the output collection and environment
 * @throws RuntimeException if an element cannot be encoded; the {@link IOException} is kept as
 *     the cause
 */
@Override
public void translateNode(Create.Values<OUT> transform, FlinkStreamingTranslationContext context) {
	PCollection<OUT> output = context.getOutput(transform);
	Iterable<OUT> elements = transform.getElements();

	// we need to serialize the elements to byte arrays, since they might contain
	// elements that are not serializable by Java serialization. We deserialize them
	// in the FlatMap function using the Coder.

	List<byte[]> serializedElements = Lists.newArrayList();
	Coder<OUT> elementCoder = output.getCoder();
	for (OUT element: elements) {
		ByteArrayOutputStream bao = new ByteArrayOutputStream();
		try {
			elementCoder.encode(element, bao, Coder.Context.OUTER);
			serializedElements.add(bao.toByteArray());
		} catch (IOException e) {
			// Keep the original exception as the cause so its stack trace is not lost.
			throw new RuntimeException("Could not serialize Create elements using Coder: " + e, e);
		}
	}

	// A single-element seed stream; the create function is applied to it via flatMap.
	DataStream<Integer> initDataSet = context.getExecutionEnvironment().fromElements(1);

	FlinkStreamingCreateFunction<Integer, OUT> createFunction =
			new FlinkStreamingCreateFunction<>(serializedElements, elementCoder);

	WindowedValue.ValueOnlyWindowedValueCoder<OUT> windowCoder = WindowedValue.getValueOnlyCoder(elementCoder);
	TypeInformation<WindowedValue<OUT>> outputType = new CoderTypeInformation<>(windowCoder);

	DataStream<WindowedValue<OUT>> outputDataStream = initDataSet.flatMap(createFunction)
			.returns(outputType);

	context.setOutputDataStream(output, outputDataStream);
}

Source File: FlinkStreamingTransformTranslators.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Translates a flatten of several collections into the union of their Flink streams.
 * With no inputs the accumulated stream stays {@code null}, and that {@code null} is what gets
 * registered as the output.
 *
 * @param transform the flatten being translated
 * @param context translation context supplying the input streams
 */
@Override
public void translateNode(Flatten.FlattenPCollectionList<T> transform, FlinkStreamingTranslationContext context) {
	List<PCollection<T>> inputCollections = context.getInput(transform).getAll();

	DataStream<T> unioned = null;
	for (PCollection<T> inputCollection : inputCollections) {
		DataStream<T> next = context.getInputDataStream(inputCollection);
		if (unioned == null) {
			// The first stream seeds the union.
			unioned = next;
		} else {
			unioned = unioned.union(next);
		}
	}

	context.setOutputDataStream(context.getOutput(transform), unioned);
}

Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Translates a flatten of several collections into the union of their Flink data sets.
 * With no inputs the accumulated data set stays {@code null}, and that {@code null} is what
 * gets registered as the output.
 *
 * @param transform the flatten being translated
 * @param context translation context supplying the input data sets
 */
@Override
public void translateNode(Flatten.FlattenPCollectionList<T> transform, FlinkBatchTranslationContext context) {
	List<PCollection<T>> inputCollections = context.getInput(transform).getAll();

	DataSet<T> unioned = null;
	for (PCollection<T> inputCollection : inputCollections) {
		DataSet<T> next = context.getInputDataSet(inputCollection);
		// The first data set seeds the union; later ones are unioned onto it.
		unioned = (unioned == null) ? next : unioned.union(next);
	}

	context.setOutputDataSet(context.getOutput(transform), unioned);
}

Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Translates a {@code CoGroupByKey} into a Flink {@code CoGroupOperator} over the first two
 * keyed input collections, keyed on the "key" field of each side.
 *
 * <p>Only the collections at index 0 and 1 of the input tuple are read.
 *
 * @param transform the CoGroupByKey being translated
 * @param context translation context supplying inputs, type information and outputs
 */
@Override
public void translateNode(CoGroupByKey<K> transform, FlinkBatchTranslationContext context) {
	KeyedPCollectionTuple<K> keyedInput = context.getInput(transform);
	CoGbkResultSchema resultSchema = keyedInput.getCoGbkResultSchema();
	List<KeyedPCollectionTuple.TaggedKeyedPCollection<K, ?>> taggedInputs = keyedInput.getKeyedCollections();

	// Left and right sides of the co-group.
	KeyedPCollectionTuple.TaggedKeyedPCollection<K, ?> leftTagged = taggedInputs.get(0);
	KeyedPCollectionTuple.TaggedKeyedPCollection<K, ?> rightTagged = taggedInputs.get(1);

	TupleTag<?> leftTag = leftTagged.getTupleTag();
	TupleTag<?> rightTag = rightTagged.getTupleTag();

	DataSet<KV<K,V1>> leftDataSet = context.getInputDataSet(leftTagged.getCollection());
	DataSet<KV<K,V2>> rightDataSet = context.getInputDataSet(rightTagged.getCollection());

	TypeInformation<KV<K,CoGbkResult>> outputTypeInfo = context.getOutputTypeInfo();

	FlinkCoGroupKeyedListAggregator<K,V1,V2> coGroupFn =
			new FlinkCoGroupKeyedListAggregator<>(resultSchema, leftTag, rightTag);

	// Both sides are keyed by the "key" field expression.
	Keys.ExpressionKeys<KV<K,V1>> leftKeys =
			new Keys.ExpressionKeys<>(new String[]{"key"}, leftDataSet.getType());
	Keys.ExpressionKeys<KV<K,V2>> rightKeys =
			new Keys.ExpressionKeys<>(new String[]{"key"}, rightDataSet.getType());

	DataSet<KV<K, CoGbkResult>> coGrouped = new CoGroupOperator<>(
			leftDataSet, rightDataSet,
			leftKeys, rightKeys,
			coGroupFn, outputTypeInfo, null, transform.getName());
	context.setOutputDataSet(context.getOutput(transform), coGrouped);
}

Source File: JoinExamples.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Reads the events and country-code tables from BigQuery, joins them with {@code joinEvents}
 * and writes the formatted result to the configured output.
 *
 * @param args command-line arguments parsed into {@code Options}
 */
public static void main(String[] args) throws Exception {
	Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
	Pipeline pipeline = Pipeline.create(options);

	// Each of the two reads below adds one input to the pipeline, one per source table.
	PCollection<TableRow> events = pipeline.apply(BigQueryIO.Read.from(GDELT_EVENTS_TABLE));
	PCollection<TableRow> countries = pipeline.apply(BigQueryIO.Read.from(COUNTRY_CODES));

	joinEvents(events, countries)
			.apply(TextIO.Write.to(options.getOutput()));

	pipeline.run();
}

Source File: WordCountJoin3ITCase.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Counts the words of three inputs separately, co-groups the three count collections by word,
 * formats the grouped result and writes it to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {

	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Per-word counts for each of the three inputs.
	PCollection<KV<String,Long>> firstCounts = pipeline.apply(Create.of(WORDS_1))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());
	PCollection<KV<String,Long>> secondCounts = pipeline.apply(Create.of(WORDS_2))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());
	PCollection<KV<String,Long>> thirdCounts = pipeline.apply(Create.of(WORDS_3))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Count.<String>perElement());

	// Tag all three collections so they can be co-grouped by key.
	KeyedPCollectionTuple<String> taggedCounts = KeyedPCollectionTuple
			.of(tag1, firstCounts)
			.and(tag2, secondCounts)
			.and(tag3, thirdCounts);

	// Co-group, format and write.
	taggedCounts
			.apply(CoGroupByKey.<String>create())
			.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	pipeline.run();
}

Source File: WordCountITCase.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Counts the words in {@code WORDS}, formats each count as text and writes the text to
 * {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {

	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	PCollection<String> lines = pipeline.apply(Create.of(WORDS)).setCoder(StringUtf8Coder.of());
	PCollection<KV<String, Long>> counts = lines.apply(new WordCount.CountWords());
	PCollection<String> formatted = counts.apply(MapElements.via(new WordCount.FormatAsTextFn()));
	formatted.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Translates a bounded read into a Flink {@code DataSource} backed by a
 * {@code SourceInputFormat} and registers it as the transform's output.
 *
 * @param transform the bounded read being translated
 * @param context translation context supplying the environment, options and type information
 */
@Override
public void translateNode(Read.Bounded<T> transform, FlinkBatchTranslationContext context) {
	BoundedSource<T> boundedSource = transform.getSource();
	String transformName = transform.getName();

	PCollection<T> outputCollection = context.getOutput(transform);
	// The coder is fetched but not used further in this method; the lookup itself is kept.
	Coder<T> outputCoder = outputCollection.getCoder();
	TypeInformation<T> outputType = context.getTypeInfo(outputCollection);

	DataSource<T> flinkSource = new DataSource<>(
			context.getExecutionEnvironment(),
			new SourceInputFormat<>(boundedSource, context.getPipelineOptions()),
			outputType,
			transformName);

	context.setOutputDataSet(outputCollection, flinkSource);
}

Source File: AutoComplete.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Computes the top completion candidates per prefix recursively and returns them as a list of
 * two collections.
 *
 * <p>For {@code minPrefix > 10} the flat algorithm is used and its output is split into two
 * partitions. Otherwise the tops for prefixes of length at least {@code minPrefix + 1} are
 * computed first and then combined with the candidates whose value has length exactly
 * {@code minPrefix} to obtain the tops for the {@code minPrefix}-length prefixes.
 *
 * @param input the completion candidates
 * @return a two-element list: the flattened tops for the longer prefixes, followed by the tops
 *     for the {@code minPrefix}-length prefixes
 */
@Override
public PCollectionList<KV<String, List<CompletionCandidate>>> apply(
      PCollection<CompletionCandidate> input) {
    // Base case: run the flat computation and partition it into the expected two-part format.
    if (minPrefix > 10) {
      return input
        .apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix))
        .apply(Partition.of(2, new KeySizePartitionFn()));
    }

    // If a candidate is in the top N for prefix a...b, it must also be in the top
    // N for a...bX for every X, which is typically a much smaller set to consider.
    // So first compute the top candidates for prefixes of size at least minPrefix + 1.
    PCollectionList<KV<String, List<CompletionCandidate>>> longerPrefixTops = input
      .apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1));

    PCollection<KV<String, List<CompletionCandidate>>> currentPrefixTops =
      PCollectionList
      // Start from the second collection of the recursive result, flattened to candidates...
      .of(longerPrefixTops.get(1).apply(ParDo.of(new FlattenTops())))
      // ...add the candidates whose value is exactly minPrefix long...
      .and(input.apply(Filter.by(new SerializableFunction<CompletionCandidate, Boolean>() {
              private static final long serialVersionUID = 0;

              @Override
              public Boolean apply(CompletionCandidate c) {
                return c.getValue().length() == minPrefix;
              }
            })))
      .apply("FlattenSmall", Flatten.<CompletionCandidate>pCollections())
      // ...key everything by its minPrefix-length prefix...
      .apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix)))
      // ...and apply the Top operator again over the combined set.
      .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix));

    PCollection<KV<String, List<CompletionCandidate>>> flattenedLongerTops = longerPrefixTops
        .apply("FlattenLarge", Flatten.<KV<String, List<CompletionCandidate>>>pCollections());

    return PCollectionList.of(flattenedLongerTops).and(currentPrefixTops);
}

Source File: AutoComplete.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Computes the top completion candidates per prefix in a single pass: every candidate is
 * expanded to its prefixes and the largest {@code candidatesPerPrefix} are kept per prefix.
 *
 * @param input the completion candidates
 * @return for each prefix, the list of its top candidates
 */
@Override
public PCollection<KV<String, List<CompletionCandidate>>> apply(
    PCollection<CompletionCandidate> input) {
  // Expand each completion candidate into (prefix, candidate) pairs.
  PCollection<KV<String, CompletionCandidate>> candidatesByPrefix =
      input.apply(ParDo.of(new AllPrefixes(minPrefix)));

  // Keep the top candidates for each prefix, with hot-key fanout enabled.
  return candidatesByPrefix.apply(
      Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix)
          .withHotKeyFanout(new HotKeyFanout()));
}

Source File: AutoComplete.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Turns a collection of tokens into the top completion candidates per prefix.
 *
 * <p>Tokens are counted, each (token, count) pair becomes a {@code CompletionCandidate}, and
 * the tops are computed with either the recursive or the flat algorithm depending on
 * {@code recursive}.
 *
 * @param input the tokens
 * @return for each prefix, the list of its top candidates
 */
@Override
public PCollection<KV<String, List<CompletionCandidate>>> apply(PCollection<String> input) {
  PCollection<CompletionCandidate> candidates = input
    // Count the occurrences of each token.
    .apply(new Count.PerElement<String>())

    // Convert each (token, count) pair from Count into a CompletionCandidate.
    .apply(ParDo.named("CreateCompletionCandidates").of(
        new DoFn<KV<String, Long>, CompletionCandidate>() {
          private static final long serialVersionUID = 0;

          @Override
          public void processElement(ProcessContext c) {
            KV<String, Long> tokenCount = c.element();
            c.output(new CompletionCandidate(tokenCount.getKey(), tokenCount.getValue()));
          }
        }));

  // Flat algorithm: a single pass, no flattening needed.
  if (!recursive) {
    return candidates
      .apply(new ComputeTopFlat(candidatesPerPrefix, 1));
  }

  // Recursive algorithm: yields a list of collections that is flattened into one.
  return candidates
    .apply(new ComputeTopRecursive(candidatesPerPrefix, 1))
    .apply(Flatten.<KV<String, List<CompletionCandidate>>>pCollections());
}

Source File: WindowedWordCount.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Runs a streaming word count over sliding windows on the Flink runner.
 *
 * <p>Text is read from a socket on localhost:9999, split into words, windowed into sliding
 * windows, counted per word and written to {@code ./outputWordCount.txt}. The window size,
 * slide, checkpointing interval and retry settings are set programmatically after the
 * command-line arguments are parsed.
 *
 * @param args command-line arguments parsed into {@code StreamingWordCountOptions}
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
	StreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(StreamingWordCountOptions.class);
	options.setStreaming(true);
	options.setWindowSize(10L);
	options.setSlide(5L);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	// Log message spelling corrected ("Windowed"); the rest of the text is unchanged.
	LOG.info("Windowed WordCount with Sliding Windows of " + options.getWindowSize() +
			" sec. and a slide of " + options.getSlide());

	Pipeline pipeline = Pipeline.create(options);

	// Window size and slide are interpreted as seconds; panes fire once the watermark passes
	// the end of the window, with zero allowed lateness, and fired panes are discarded.
	PCollection<String> words = pipeline
			.apply(Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("StreamingWordCount"))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Window.<String>into(SlidingWindows.of(Duration.standardSeconds(options.getWindowSize()))
					.every(Duration.standardSeconds(options.getSlide())))
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<KV<String, Long>> wordCounts =
			words.apply(Count.<String>perElement());

	wordCounts.apply(ParDo.of(new FormatAsStringFn()))
			.apply(TextIO.Write.to("./outputWordCount.txt"));

	pipeline.run();
}

Source File: JoinExamples.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Runs a streaming join of two socket streams on the Flink runner.
 *
 * <p>Both streams (localhost:9999 and localhost:9998) are windowed into fixed windows of the
 * configured size, joined with {@code joinEvents} and written to {@code ./outputJoin.txt}.
 *
 * @param args command-line arguments parsed into {@code Options}
 */
public static void main(String[] args) throws Exception {
	Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
	options.setStreaming(true);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	// One unbounded socket read per input stream.
	PTransform<? super PBegin, PCollection<String>> firstRead =
			Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("FirstStream");
	PTransform<? super PBegin, PCollection<String>> secondRead =
			Read.from(new UnboundedSocketSource<>("localhost", 9998, '\n', 3)).named("SecondStream");

	// The same fixed-window function is used for both streams; the size is in seconds.
	WindowFn<Object, ?> fixedWindows = FixedWindows.of(Duration.standardSeconds(options.getWindowSize()));

	Pipeline pipeline = Pipeline.create(options);

	// Each stream fires at the end of the window with zero allowed lateness and discards
	// fired panes.
	PCollection<String> firstStream = pipeline.apply(firstRead)
			.apply(Window.<String>into(fixedWindows)
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());
	PCollection<String> secondStream = pipeline.apply(secondRead)
			.apply(Window.<String>into(fixedWindows)
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	joinEvents(firstStream, secondStream)
			.apply(TextIO.Write.to("./outputJoin.txt"));

	pipeline.run();
}

Source File: KafkaWindowedWordCountExample.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Runs a streaming word count over fixed windows, reading its input from a Kafka topic through
 * a Flink Kafka consumer, and writes the counts to {@code ./outputKafka.txt}.
 *
 * @param args command-line arguments parsed into {@code KafkaStreamingWordCountOptions}
 */
public static void main(String[] args) {
	PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
	KafkaStreamingWordCountOptions options = PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);
	options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
	options.setStreaming(true);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	// Echo the Kafka connection settings to standard output.
	System.out.println(options.getKafkaTopic() + " " + options.getZookeeper()
			+ " " + options.getBroker() + " " + options.getGroup());
	Pipeline pipeline = Pipeline.create(options);

	Properties consumerProps = new Properties();
	consumerProps.setProperty("zookeeper.connect", options.getZookeeper());
	consumerProps.setProperty("bootstrap.servers", options.getBroker());
	consumerProps.setProperty("group.id", options.getGroup());

	// The Flink consumer that feeds the program from the Kafka topic.
	FlinkKafkaConsumer08<String> kafkaConsumer = new FlinkKafkaConsumer08<>(
			options.getKafkaTopic(),
			new SimpleStringSchema(), consumerProps);

	// Fixed windows sized in seconds; panes fire once the watermark passes the end of the
	// window, with zero allowed lateness, and fired panes are discarded.
	PCollection<String> words = pipeline
			.apply(Read.from(new UnboundedFlinkSource<>(kafkaConsumer)).named("StreamingWordCount"))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Window.<String>into(FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	words
			.apply(Count.<String>perElement())
			.apply(ParDo.of(new FormatAsStringFn()))
			.apply(TextIO.Write.to("./outputKafka.txt"));

	pipeline.run();
}

Source File: WordCount.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Splits lines of text into words and counts the occurrences of each word.
 *
 * @param lines the lines of text
 * @return one (word, count) pair per distinct word
 */
@Override
public PCollection<KV<String, Long>> apply(PCollection<String> lines) {
	return lines
			// Convert lines of text into individual words.
			.apply(ParDo.of(new ExtractWordsFn()))
			// Count the number of times each word occurs.
			.apply(Count.<String>perElement());
}

Source File: TFIDF.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Reads every document in {@code uris} with its own {@code TextIO.Read} and returns all lines
 * keyed by the URI of the document they came from.
 *
 * <p>URIs with the {@code file} scheme are converted to a local path; every other URI,
 * including one without a scheme, is passed to {@code TextIO.Read} in its string form.
 *
 * @param input supplies the pipeline the reads are attached to
 * @return the flattened (URI, line) pairs of all documents
 */
@Override
public PCollection<KV<URI, String>> apply(PInput input) {
	Pipeline pipeline = input.getPipeline();

	// Create one TextIO.Read transform for each document
	// and add its output to a PCollectionList
	PCollectionList<KV<URI, String>> urisToLines =
			PCollectionList.empty(pipeline);

	// TextIO.Read supports:
	//  - file: URIs and paths locally
	//  - gs: URIs on the service
	for (final URI uri : uris) {
		String uriString;
		// Constant-first comparison: URI.getScheme() returns null for scheme-less URIs, which
		// previously caused a NullPointerException here.
		if ("file".equals(uri.getScheme())) {
			uriString = new File(uri).getPath();
		} else {
			uriString = uri.toString();
		}

		// The transform names embed the URI string, giving each document's steps their own name.
		PCollection<KV<URI, String>> oneUriToLines = pipeline
				.apply(TextIO.Read.from(uriString)
						.named("TextIO.Read(" + uriString + ")"))
				.apply("WithKeys(" + uriString + ")", WithKeys.<URI, String>of(uri));

		urisToLines = urisToLines.and(oneUriToLines);
	}

	return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
}

com.google.cloud.dataflow.sdk.values.PCollection Java Examples