com.google.cloud.dataflow.sdk.io.TextIO Java Examples

Source File: SideInputITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Builds a batch pipeline in which a ParDo reads a singleton side input and
 * emits its value, then writes the emitted strings to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Singleton view over the expected value; consumed below as a side input.
	final PCollectionView<String> singletonView =
			pipeline.apply(Create.of(expected)).apply(View.<String>asSingleton());

	DoFn<String, String> emitSideInput = new DoFn<String, String>() {
		@Override
		public void processElement(ProcessContext c) throws Exception {
			// The main-input element is not used; only the side-input value is emitted.
			c.output(c.sideInput(singletonView));
		}
	};

	pipeline.apply(Create.of("bli"))
			.apply(ParDo.of(emitSideInput).withSideInputs(singletonView))
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: LoadBooks.java From cloud-bigtable-examples with Apache License 2.0

6 votes

/**
 * Reads the text file named by the options, passes the lines through
 * {@code applyPipelineToParseBooks}, encodes each resulting key/count pair as a
 * {@code Mutation} and writes the mutations to the configured Bigtable table.
 *
 * @param args command-line arguments parsed into {@code BigtableCsvOptions}
 */
public static void main(String[] args) {
  // CloudBigtableOptions is one way to retrieve the options.  It's not required.
  // https://github.com/GoogleCloudPlatform/cloud-bigtable-examples/blob/master/java/dataflow-connector-examples/src/main/java/com/google/cloud/bigtable/dataflow/example/HelloWorldWrite.java
  BigtableCsvOptions csvOptions =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(BigtableCsvOptions.class);
  CloudBigtableTableConfiguration tableConfig =
      CloudBigtableTableConfiguration.fromCBTOptions(csvOptions);

  Pipeline pipeline = Pipeline.create(csvOptions);
  CloudBigtableIO.initializeForWrite(pipeline);

  // Read -> parse -> encode -> write, one stage per statement.
  PCollection<String> lines = pipeline.apply(TextIO.Read.from(csvOptions.getInputFile()));
  PCollection<KV<String, Integer>> ngramCounts = applyPipelineToParseBooks(lines);
  PCollection<Mutation> encoded = ngramCounts.apply(ParDo.of(ENCODE_NGRAM));
  encoded.apply(CloudBigtableIO.writeToTable(tableConfig));

  pipeline.run();
}

Source File: UnboundedSourceITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Runs a streaming pipeline over {@code RangeReadSource(1, 10)}, converts each
 * element to its string form and writes the strings to {@code resultPath}.
 * The run is expected to throw an exception whose second-level cause carries
 * the message "The source terminates as expected."; a normal return fails.
 */
private static void runProgram(String resultPath) {
	Pipeline pipeline = FlinkTestPipeline.createForStreaming();

	// Turns each integer from the source into its string representation.
	DoFn<Integer, String> stringify = new DoFn<Integer, String>() {
		@Override
		public void processElement(ProcessContext c) throws Exception {
			c.output(c.element().toString());
		}
	};

	pipeline
		.apply(Read.from(new RangeReadSource(1, 10)))
		.apply(Window.<Integer>into(new GlobalWindows())
			.triggering(AfterPane.elementCountAtLeast(10))
			.discardingFiredPanes())
		.apply(ParDo.of(stringify))
		.apply(TextIO.Write.to(resultPath));

	try {
		pipeline.run();
		// Reaching this line means the expected exception never surfaced.
		fail();
	} catch(Exception e) {
		Throwable rootCause = e.getCause().getCause();
		assertEquals("The source terminates as expected.", rootCause.getMessage());
	}
}

Source File: RemoveDuplicatesEmptyITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Applies {@code RemoveDuplicates} to an empty collection of strings and
 * writes the result to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	List<String> noElements = Collections.emptyList();

	// The coder is set explicitly on the created collection.
	pipeline.apply(Create.of(noElements))
			.setCoder(StringUtf8Coder.of())
			.apply(RemoveDuplicates.<String>create())
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: WordCountJoin2ITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Counts words in two inputs, co-groups the two count collections by word,
 * formats the grouped result and writes it to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Per-word counts for the first input.
	PCollection<String> firstWords =
			pipeline.apply(Create.of(WORDS_1)).apply(ParDo.of(new ExtractWordsFn()));
	PCollection<KV<String,Long>> firstCounts = firstWords.apply(Count.<String>perElement());

	// Per-word counts for the second input.
	PCollection<String> secondWords =
			pipeline.apply(Create.of(WORDS_2)).apply(ParDo.of(new ExtractWordsFn()));
	PCollection<KV<String,Long>> secondCounts = secondWords.apply(Count.<String>perElement());

	// Join the two count collections on the word key.
	KeyedPCollectionTuple<String> joinInput =
			KeyedPCollectionTuple.of(tag1, firstCounts).and(tag2, secondCounts);
	PCollection<KV<String, CoGbkResult>> joined = joinInput.apply(CoGroupByKey.<String>create());

	// Format and write.
	joined.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	pipeline.run();
}

Source File: TfIdfITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Runs {@code TfIdf.ComputeTfIdf} over three small in-memory documents and
 * writes the distinct words of the result to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	// URI keys are encoded through a string-delegating coder.
	p.getCoderRegistry().registerCoder(URI.class, StringDelegateCoder.of(URI.class));

	PCollection<KV<URI, String>> documents = p.apply(Create.of(
			KV.of(new URI("x"), "a b c d"),
			KV.of(new URI("y"), "a b c"),
			KV.of(new URI("z"), "a m n")));

	PCollection<KV<String, KV<URI, Double>>> tfIdfByWord =
			documents.apply(new TfIdf.ComputeTfIdf());

	// Keep only the word keys, de-duplicated, and write them out.
	tfIdfByWord
			.apply(Keys.<String>create())
			.apply(RemoveDuplicates.<String>create())
			.apply(TextIO.Write.to(resultPath));

	p.run();
}

Source File: UserManagedKeysApp.java From policyscanner with Apache License 2.0

6 votes

/**
 * Handles a GET request by running {@code ExportedServiceAccountKeyRemover}
 * with a text sink at {@code Constants.OUTPUT_PREFIX}, then reporting
 * completion on the response writer.
 *
 * @param req The request object.
 * @param resp The response object.
 * @throws IOException if the response writer cannot be obtained.
 */
@Override
public void doGet(HttpServletRequest req, HttpServletResponse resp)
    throws IOException {
  PrintWriter responseWriter = resp.getWriter();

  // Fail fast when any required constant is missing.
  Preconditions.checkNotNull(Constants.ORG_ID);
  Preconditions.checkNotNull(Constants.OUTPUT_PREFIX);
  Preconditions.checkNotNull(Constants.DATAFLOW_STAGING);

  PipelineOptions executionOptions = CloudUtil.willExecuteOnCloud()
      ? getCloudExecutionOptions(Constants.DATAFLOW_STAGING)
      : getLocalExecutionOptions();

  new ExportedServiceAccountKeyRemover(executionOptions, Constants.ORG_ID)
      .attachSink(TextIO.Write.named("Write output messages").to(Constants.OUTPUT_PREFIX))
      .run();

  responseWriter.println("Test passed! The output was written to GCS");
}

Source File: RemoveDuplicatesITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Applies {@code RemoveDuplicates} to a list of strings containing repeated
 * values and writes the result to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// The input deliberately repeats "k1", "k5" and "k2".
	List<String> withRepeats = Arrays.asList("k1", "k5", "k5", "k2", "k1", "k2", "k3");

	pipeline.apply(Create.of(withRepeats))
			.setCoder(StringUtf8Coder.of())
			.apply(RemoveDuplicates.<String>create())
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: ReadSourceITCase.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Reads integers from {@code ReadSource(1, 10)} in a batch pipeline, converts
 * each to its string form and writes the strings to {@code resultPath}.
 */
private static void runProgram(String resultPath) {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Turns each integer from the source into its string representation.
	DoFn<Integer, String> stringify = new DoFn<Integer, String>() {
		@Override
		public void processElement(ProcessContext c) throws Exception {
			c.output(c.element().toString());
		}
	};

	pipeline
			.apply(Read.from(new ReadSource(1, 10)))
			.apply(ParDo.of(stringify))
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Translates a {@code TextIO.Write} into a text sink on the input data set.
 * Only the filename prefix and a positive shard count are applied; validation,
 * filename suffix and shard name template are logged as unsupported.
 */
@Override
public void translateNode(TextIO.Write.Bound<T> transform, FlinkBatchTranslationContext context) {
	PValue upstream = context.getInput(transform);
	DataSet<T> records = context.getInputDataSet(upstream);

	// Snapshot every setting of the transform up front.
	String pathPrefix = transform.getFilenamePrefix();
	String pathSuffix = transform.getFilenameSuffix();
	boolean validate = transform.needsValidation();
	int shardCount = transform.getNumShards();
	String shardTemplate = transform.getShardNameTemplate();

	// TODO: Implement these. We need Flink support for this.
	LOG.warn("Translation of TextIO.Write.needsValidation not yet supported. Is: {}.", validate);
	LOG.warn("Translation of TextIO.Write.filenameSuffix not yet supported. Is: {}.", pathSuffix);
	LOG.warn("Translation of TextIO.Write.shardNameTemplate not yet supported. Is: {}.", shardTemplate);

	DataSink<T> textSink = records.writeAsText(pathPrefix);

	// A non-positive shard count leaves the sink parallelism untouched.
	if (shardCount > 0) {
		textSink.setParallelism(shardCount);
	}
}

Source File: LiveStateCheckerRunner.java From policyscanner with Apache License 2.0

6 votes

/**
 * Main function for the runner.
 * @param args The args this program was called with.
 * @throws IOException Thrown if there's an error reading from one of the APIs, or if the
 *     GCS file source cannot be created because of a security error (the original
 *     {@link GeneralSecurityException} is attached as the cause).
 */
public static void main(String[] args) throws IOException {
  // Fail fast when any required constant is missing.
  Preconditions.checkNotNull(Constants.ORG_NAME);
  Preconditions.checkNotNull(Constants.POLICY_BUCKET);
  Preconditions.checkNotNull(Constants.OUTPUT_PREFIX);
  Preconditions.checkNotNull(Constants.DATAFLOW_STAGING);

  final GCSFilesSource source;
  try {
    source = new GCSFilesSource(Constants.POLICY_BUCKET, Constants.ORG_NAME);
  } catch (GeneralSecurityException e) {
    // Chain the original exception so the underlying security failure stays diagnosable.
    throw new IOException("SecurityException: Cannot create GCSFileSource", e);
  }

  PipelineOptions options;
  if (CloudUtil.willExecuteOnCloud()) {
    options = getCloudExecutionOptions(Constants.DATAFLOW_STAGING);
  } else {
    options = getLocalExecutionOptions();
  }

  new OnDemandLiveStateChecker(options, source)
      .attachSink(TextIO.Write.named("Write messages to GCS").to(Constants.OUTPUT_PREFIX))
      .run();
}

Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Translates a {@code TextIO.Read} into a Flink {@code DataSource} that reads
 * the transform's file pattern through a {@code TextInputFormat}. Compression
 * type and validation are logged as unsupported.
 */
@Override
public void translateNode(TextIO.Read.Bound<String> transform, FlinkBatchTranslationContext context) {
	String filePattern = transform.getFilepattern();
	String transformName = transform.getName();

	TextIO.CompressionType compression = transform.getCompressionType();
	boolean validate = transform.needsValidation();

	// TODO: Implement these. We need Flink support for this.
	LOG.warn("Translation of TextIO.CompressionType not yet supported. Is: {}.", compression);
	LOG.warn("Translation of TextIO.Read.needsValidation not yet supported. Is: {}.", validate);

	PValue downstream = context.getOutput(transform);
	TypeInformation<String> outputType = context.getTypeInfo(downstream);

	DataSource<String> textSource = new DataSource<>(
			context.getExecutionEnvironment(),
			new TextInputFormat(new Path(filePattern)),
			outputType,
			transformName);

	// Register the source as the data set backing this transform's output.
	context.setOutputDataSet(downstream, textSource);
}

Source File: TFIDF.java From flink-dataflow with Apache License 2.0

6 votes

/**
 * Formats each (word, (URI, score)) entry as a tab-separated line and writes
 * the lines to {@code output} with a ".csv" suffix.
 */
@Override
public PDone apply(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
	DoFn<KV<String, KV<URI, Double>>, String> formatFn =
			new DoFn<KV<String, KV<URI, Double>>, String>() {
				private static final long serialVersionUID = 0;

				@Override
				public void processElement(ProcessContext c) {
					KV<String, KV<URI, Double>> entry = c.element();
					KV<URI, Double> uriAndScore = entry.getValue();
					c.output(String.format("%s,\t%s,\t%f",
							entry.getKey(),
							uriAndScore.getKey(),
							uriAndScore.getValue()));
				}
			};

	PCollection<String> formatted =
			wordToUriAndTfIdf.apply(ParDo.named("Format").of(formatFn));

	return formatted.apply(TextIO.Write.to(output).withSuffix(".csv"));
}

Source File: MaybeEmptyTestITCase.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Creates a collection holding a single {@code null} Void element, maps it to
 * the {@code expected} string and writes that to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Emits the expected value once per input element.
	DoFn<Void, String> emitExpected = new DoFn<Void, String>() {
		@Override
		public void processElement(ProcessContext c) {
			c.output(expected);
		}
	};

	pipeline.apply(Create.of((Void) null))
			.setCoder(VoidCoder.of())
			.apply(ParDo.of(emitExpected))
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: WordCountJoin3ITCase.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Counts words in three inputs, co-groups the three count collections by
 * word, formats the grouped result and writes it to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Per-word counts for the first input.
	PCollection<String> firstWords =
			pipeline.apply(Create.of(WORDS_1)).apply(ParDo.of(new ExtractWordsFn()));
	PCollection<KV<String,Long>> firstCounts = firstWords.apply(Count.<String>perElement());

	// Per-word counts for the second input.
	PCollection<String> secondWords =
			pipeline.apply(Create.of(WORDS_2)).apply(ParDo.of(new ExtractWordsFn()));
	PCollection<KV<String,Long>> secondCounts = secondWords.apply(Count.<String>perElement());

	// Per-word counts for the third input.
	PCollection<String> thirdWords =
			pipeline.apply(Create.of(WORDS_3)).apply(ParDo.of(new ExtractWordsFn()));
	PCollection<KV<String,Long>> thirdCounts = thirdWords.apply(Count.<String>perElement());

	// Join all three count collections on the word key.
	KeyedPCollectionTuple<String> joinInput = KeyedPCollectionTuple
			.of(tag1, firstCounts)
			.and(tag2, secondCounts)
			.and(tag3, thirdCounts);
	PCollection<KV<String, CoGbkResult>> joined = joinInput.apply(CoGroupByKey.<String>create());

	// Format and write.
	joined.apply(ParDo.of(new FormatCountsFn()))
			.apply(TextIO.Write.named("test").to(resultPath));

	pipeline.run();
}

Source File: JoinExamples.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Reads the two BigQuery tables, joins them through {@code joinEvents} and
 * writes the formatted result to the output location given in the options.
 *
 * @param args command-line arguments parsed into {@code Options}
 */
public static void main(String[] args) throws Exception {
	Options jobOptions = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
	Pipeline pipeline = Pipeline.create(jobOptions);

	// One read per input table; each becomes a separate root of the pipeline.
	PCollection<TableRow> events = pipeline.apply(BigQueryIO.Read.from(GDELT_EVENTS_TABLE));
	PCollection<TableRow> countries = pipeline.apply(BigQueryIO.Read.from(COUNTRY_CODES));

	joinEvents(events, countries).apply(TextIO.Write.to(jobOptions.getOutput()));

	pipeline.run();
}

Source File: WordCountITCase.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Runs the word-count transforms over the in-memory {@code WORDS} input and
 * writes the formatted counts to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Count, format as text and write in one chain starting from the created input.
	pipeline.apply(Create.of(WORDS))
			.setCoder(StringUtf8Coder.of())
			.apply(new WordCount.CountWords())
			.apply(MapElements.via(new WordCount.FormatAsTextFn()))
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

Source File: TFIDF.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Reads every document in {@code uris} with its own {@code TextIO.Read} and
 * flattens the results into a single collection of (URI, line) pairs.
 *
 * @param input supplies the pipeline to which the read transforms are applied
 * @return the flattened collection of lines, each keyed by the URI it came from
 */
@Override
public PCollection<KV<URI, String>> apply(PInput input) {
	Pipeline pipeline = input.getPipeline();

	// Create one TextIO.Read transform for each document
	// and add its output to a PCollectionList
	PCollectionList<KV<URI, String>> urisToLines =
			PCollectionList.empty(pipeline);

	// TextIO.Read supports:
	//  - file: URIs and paths locally
	//  - gs: URIs on the service
	for (final URI uri : uris) {
		// URI.getScheme() returns null for a URI without a scheme; comparing from
		// the literal side avoids a NullPointerException and sends such URIs down
		// the generic toString() branch.
		final String uriString;
		if ("file".equals(uri.getScheme())) {
			uriString = new File(uri).getPath();
		} else {
			uriString = uri.toString();
		}

		// Transform names include the URI string.
		PCollection<KV<URI, String>> oneUriToLines = pipeline
				.apply(TextIO.Read.from(uriString)
						.named("TextIO.Read(" + uriString + ")"))
				.apply("WithKeys(" + uriString + ")", WithKeys.<URI, String>of(uri));

		urisToLines = urisToLines.and(oneUriToLines);
	}

	return urisToLines.apply(Flatten.<KV<URI, String>>pCollections());
}

Source File: WordCount.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Reads lines from the input given in the options, counts words, formats the
 * counts as text and writes them to the output given in the options, using
 * {@code FlinkPipelineRunner}.
 *
 * @param args command-line arguments parsed into {@code Options}
 */
public static void main(String[] args) {
	Options jobOptions = PipelineOptionsFactory.fromArgs(args).withValidation()
			.as(Options.class);
	jobOptions.setRunner(FlinkPipelineRunner.class);

	Pipeline pipeline = Pipeline.create(jobOptions);

	PCollection<String> lines =
			pipeline.apply(TextIO.Read.named("ReadLines").from(jobOptions.getInput()));

	lines.apply(new CountWords())
			.apply(MapElements.via(new FormatAsTextFn()))
			.apply(TextIO.Write.named("WriteCounts").to(jobOptions.getOutput()));

	pipeline.run();
}

Source File: AvroITCase.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Runs two batch pipelines in sequence: the first writes four {@code User}
 * records as Avro to {@code tmpPath}; the second reads them back and writes
 * one "name color number" line per user to {@code resultPath}.
 */
private static void runProgram(String tmpPath, String resultPath) {
	// Stage 1: write the users as Avro.
	Pipeline writePipeline = FlinkTestPipeline.createForBatch();

	writePipeline
		.apply(Create.of(
				new User("Joe", 3, "red"),
				new User("Mary", 4, "blue"),
				new User("Mark", 1, "green"),
				new User("Julia", 5, "purple"))
			.withCoder(AvroCoder.of(User.class)))
		.apply(AvroIO.Write.to(tmpPath).withSchema(User.class));

	writePipeline.run();

	// Stage 2: read the Avro output back and render each user as text.
	Pipeline readPipeline = FlinkTestPipeline.createForBatch();

	DoFn<User, String> describeUser = new DoFn<User, String>() {
		@Override
		public void processElement(ProcessContext c) throws Exception {
			User user = c.element();
			c.output(user.getName() + " " + user.getFavoriteColor() + " " + user.getFavoriteNumber());
		}
	};

	readPipeline
		.apply(AvroIO.Read.from(tmpPath).withSchema(User.class).withoutValidation())
		.apply(ParDo.of(describeUser))
		.apply(TextIO.Write.to(resultPath));

	readPipeline.run();
}

Source File: KafkaWindowedWordCountExample.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Streaming word count over a Kafka topic: reads strings through a Flink
 * Kafka consumer, extracts words, counts them per fixed window and writes the
 * formatted counts to "./outputKafka.txt".
 *
 * @param args command-line arguments parsed into {@code KafkaStreamingWordCountOptions}
 */
public static void main(String[] args) {
	PipelineOptionsFactory.register(KafkaStreamingWordCountOptions.class);
	KafkaStreamingWordCountOptions options =
			PipelineOptionsFactory.fromArgs(args).as(KafkaStreamingWordCountOptions.class);

	// Job-level settings: streaming mode, checkpointing, retries and runner.
	options.setJobName("KafkaExample - WindowSize: " + options.getWindowSize() + " seconds");
	options.setStreaming(true);
	options.setCheckpointingInterval(1000L);
	options.setNumberOfExecutionRetries(5);
	options.setExecutionRetryDelay(3000L);
	options.setRunner(FlinkPipelineRunner.class);

	System.out.println(options.getKafkaTopic() +" "+ options.getZookeeper() +" "+ options.getBroker() +" "+ options.getGroup() );
	Pipeline pipeline = Pipeline.create(options);

	Properties consumerConfig = new Properties();
	consumerConfig.setProperty("zookeeper.connect", options.getZookeeper());
	consumerConfig.setProperty("bootstrap.servers", options.getBroker());
	consumerConfig.setProperty("group.id", options.getGroup());

	// The Flink consumer that supplies the program's input from a Kafka topic.
	FlinkKafkaConsumer08<String> topicConsumer = new FlinkKafkaConsumer08<>(
			options.getKafkaTopic(),
			new SimpleStringSchema(), consumerConfig);

	PCollection<String> windowedWords = pipeline
			.apply(Read.from(new UnboundedFlinkSource<>(topicConsumer)).named("StreamingWordCount"))
			.apply(ParDo.of(new ExtractWordsFn()))
			.apply(Window.<String>into(FixedWindows.of(Duration.standardSeconds(options.getWindowSize())))
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	PCollection<KV<String, Long>> countsPerWord = windowedWords.apply(Count.<String>perElement());

	countsPerWord
			.apply(ParDo.of(new FormatAsStringFn()))
			.apply(TextIO.Write.to("./outputKafka.txt"));

	pipeline.run();
}

Source File: JoinExamples.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Streaming join example: reads two socket streams (localhost ports 9999 and
 * 9998), applies the same fixed windowing to each, joins them through
 * {@code joinEvents} and writes the result to "./outputJoin.txt".
 *
 * @param args command-line arguments parsed into {@code Options}
 */
public static void main(String[] args) throws Exception {
	Options jobOptions = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);

	// Job-level settings: streaming mode, checkpointing, retries and runner.
	jobOptions.setStreaming(true);
	jobOptions.setCheckpointingInterval(1000L);
	jobOptions.setNumberOfExecutionRetries(5);
	jobOptions.setExecutionRetryDelay(3000L);
	jobOptions.setRunner(FlinkPipelineRunner.class);

	PTransform<? super PBegin, PCollection<String>> firstRead =
			Read.from(new UnboundedSocketSource<>("localhost", 9999, '\n', 3)).named("FirstStream");
	PTransform<? super PBegin, PCollection<String>> secondRead =
			Read.from(new UnboundedSocketSource<>("localhost", 9998, '\n', 3)).named("SecondStream");

	WindowFn<Object, ?> fixedWindows =
			FixedWindows.of(Duration.standardSeconds(jobOptions.getWindowSize()));

	Pipeline pipeline = Pipeline.create(jobOptions);

	// Each source gets its own read and its own window transform instance.
	PCollection<String> firstStream = pipeline.apply(firstRead)
			.apply(Window.<String>into(fixedWindows)
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());
	PCollection<String> secondStream = pipeline.apply(secondRead)
			.apply(Window.<String>into(fixedWindows)
					.triggering(AfterWatermark.pastEndOfWindow()).withAllowedLateness(Duration.ZERO)
					.discardingFiredPanes());

	joinEvents(firstStream, secondStream).apply(TextIO.Write.to("./outputJoin.txt"));

	pipeline.run();
}

Source File: FlinkStreamingTransformTranslators.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Translates a {@code TextIO.Write} for streaming: each windowed value is
 * converted to the string form of its payload and written as text to the
 * filename prefix in overwrite mode. Validation, filename suffix and shard
 * name template are logged as unsupported.
 */
@Override
public void translateNode(TextIO.Write.Bound<T> transform, FlinkStreamingTranslationContext context) {
	PValue upstream = context.getInput(transform);
	DataStream<WindowedValue<T>> windowedRecords = context.getInputDataStream(upstream);

	// Snapshot every setting of the transform up front.
	String pathPrefix = transform.getFilenamePrefix();
	String pathSuffix = transform.getFilenameSuffix();
	boolean validate = transform.needsValidation();
	int shardCount = transform.getNumShards();
	String shardTemplate = transform.getShardNameTemplate();

	// TODO: Implement these. We need Flink support for this.
	LOG.warn("Translation of TextIO.Write.needsValidation not yet supported. Is: {}.", validate);
	LOG.warn("Translation of TextIO.Write.filenameSuffix not yet supported. Is: {}.", pathSuffix);
	LOG.warn("Translation of TextIO.Write.shardNameTemplate not yet supported. Is: {}.", shardTemplate);

	// Unwraps the windowed value and emits the payload's toString().
	FlatMapFunction<WindowedValue<T>, String> unwrapToString =
			new FlatMapFunction<WindowedValue<T>, String>() {
				@Override
				public void flatMap(WindowedValue<T> value, Collector<String> out) throws Exception {
					out.collect(value.getValue().toString());
				}
			};

	DataStream<String> lines = windowedRecords.flatMap(unwrapToString);
	DataStreamSink<String> textSink = lines.writeAsText(pathPrefix, FileSystem.WriteMode.OVERWRITE);

	// A non-positive shard count leaves the sink parallelism untouched.
	if (shardCount > 0) {
		textSink.setParallelism(shardCount);
	}
}

Source File: FlattenizeITCase.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Flattens two collections and writes them to {@code resultPath}, then adds a
 * third collection to the same list, flattens again and writes that to
 * {@code resultPath2}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	PCollection<String> first = pipeline.apply(Create.of(words));
	PCollection<String> second = pipeline.apply(Create.of(words2));

	// First flatten: two inputs.
	PCollectionList<String> firstTwo = PCollectionList.of(first).and(second);
	PCollection<String> flattenedTwo = firstTwo.apply(Flatten.<String>pCollections());
	flattenedTwo.apply(TextIO.Write.to(resultPath));

	// Second flatten: the same two inputs plus a third.
	PCollection<String> third = pipeline.apply(Create.of(words3));
	PCollectionList<String> allThree = firstTwo.and(third);
	PCollection<String> flattenedThree = allThree.apply(Flatten.<String>pCollections());
	flattenedThree.apply(TextIO.Write.to(resultPath2));

	pipeline.run();
}

Source File: ParDoMultiOutputITCase.java From flink-dataflow with Apache License 2.0

4 votes

/**
 * Applies a ParDo with one main output and two declared side outputs to a
 * fixed list of words, then writes only the "marked words" side output to
 * {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline p = FlinkTestPipeline.createForBatch();

	PCollection<String> words = p.apply(Create.of("Hello", "Whatupmyman", "hey", "SPECIALthere", "MAAA", "MAAFOOO"));

	// Select words whose length is at most a cut off,
	// plus the lengths of words that are above the cut off.
	// Also select words starting with "MAA".
	final int wordLengthCutOff = 3;
	// Create tags to use for the main and side outputs.
	final TupleTag<String> wordsBelowCutOffTag = new TupleTag<String>(){};
	final TupleTag<Integer> wordLengthsAboveCutOffTag = new TupleTag<Integer>(){};
	final TupleTag<String> markedWordsTag = new TupleTag<String>(){};

	PCollectionTuple results =
			words.apply(ParDo
					.withOutputTags(wordsBelowCutOffTag, TupleTagList.of(wordLengthsAboveCutOffTag)
							.and(markedWordsTag))
					.of(new DoFn<String, String>() {
						// This tag is NOT listed in withOutputTags above, so output
						// sent to it goes to an undeclared side output.
						final TupleTag<String> specialWordsTag = new TupleTag<String>() {
						};

						public void processElement(ProcessContext c) {
							String word = c.element();
							// Short words go to the main output; for longer words only
							// the length is emitted, to a side output.
							if (word.length() <= wordLengthCutOff) {
								c.output(word);
							} else {
								c.sideOutput(wordLengthsAboveCutOffTag, word.length());
							}
							// Independent of length: words with the "MAA" prefix.
							if (word.startsWith("MAA")) {
								c.sideOutput(markedWordsTag, word);
							}

							// Independent of length: words with the "SPECIAL" prefix.
							if (word.startsWith("SPECIAL")) {
								c.sideOutput(specialWordsTag, word);
							}
						}
					}));

	// Extract the PCollection results, by tag.
	// NOTE(review): wordsBelowCutOff and wordLengthsAboveCutOff are not used
	// after being fetched; only markedWords is written below.
	PCollection<String> wordsBelowCutOff = results.get(wordsBelowCutOffTag);
	PCollection<Integer> wordLengthsAboveCutOff = results.get
			(wordLengthsAboveCutOffTag);
	PCollection<String> markedWords = results.get(markedWordsTag);

	markedWords.apply(TextIO.Write.to(resultPath));

	p.run();
}

Source File: DesiredStateEnforcerApp.java From policyscanner with Apache License 2.0

4 votes

/**
 * Handler for the GET request to this app.
 *
 * <p>Runs {@code DesiredStateEnforcer} with a timestamped text sink and reports on the
 * response writer whether any states had to be enforced.
 *
 * @param req The request object.
 * @param resp The response object.
 * @throws IOException Thrown if there's an error reading from one of the APIs, or if the
 *     GCS file source cannot be created because of a security error (the original
 *     {@link GeneralSecurityException} is attached as the cause).
 */
@Override
public void doGet(HttpServletRequest req, HttpServletResponse resp)
    throws IOException {
  PrintWriter out = resp.getWriter();

  // Fail fast when any required constant is missing.
  Preconditions.checkNotNull(Constants.ORG_NAME);
  Preconditions.checkNotNull(Constants.ORG_ID);
  Preconditions.checkNotNull(Constants.POLICY_BUCKET);
  Preconditions.checkNotNull(Constants.OUTPUT_PREFIX);
  Preconditions.checkNotNull(Constants.DATAFLOW_STAGING);

  final GCSFilesSource source;
  try {
    source = new GCSFilesSource(Constants.POLICY_BUCKET, Constants.ORG_NAME);
  } catch (GeneralSecurityException e) {
    // Chain the original exception so the underlying security failure stays diagnosable.
    throw new IOException("SecurityException: Cannot create GCSFileSource", e);
  }

  PipelineOptions options;
  if (CloudUtil.willExecuteOnCloud()) {
    options = getCloudExecutionOptions(Constants.DATAFLOW_STAGING);
  } else {
    options = getLocalExecutionOptions();
  }

  // The sink name combines the output prefix, the current timestamp and the enforcer label.
  String datetimestamp = new SimpleDateFormat(Constants.SINK_TIMESTAMP_FORMAT).format(new Date());
  String sinkName = MessageFormat.format(Constants.SINK_NAME_FORMAT,
      new Object[]{
          Constants.OUTPUT_PREFIX,
          datetimestamp,
          Constants.OUTPUT_LABEL_ENFORCER
          });

  try {
    DesiredStateEnforcer enforcer = new DesiredStateEnforcer(options, source, Constants.ORG_ID)
        .attachSink(TextIO.Write
            .named("Write messages to GCS")
            .to(sinkName))
        .run();
    if (enforcer.getTotalEnforcedStates() < 1) {
      out.println("Finished running Enforcer! No states needed to be enforced.");
    } else {
      out.println("Finished running Enforcer! The output was written to GCS");
    }
  } catch (AggregatorRetrievalException aggRetrievalException) {
    // TODO(carise): do something better than this
    aggRetrievalException.printStackTrace();
  }
}

Source File: JoinExamplesITCase.java From flink-dataflow with Apache License 2.0

3 votes

/**
 * Joins two in-memory {@code TableRow} collections through
 * {@code JoinExamples.joinEvents} and writes the result to {@code resultPath}.
 */
@Override
protected void testProgram() throws Exception {
	Pipeline pipeline = FlinkTestPipeline.createForBatch();

	// Two created inputs stand in for the event and country-code tables.
	PCollection<TableRow> events = pipeline.apply(Create.of(EVENT_ARRAY));
	PCollection<TableRow> countryCodes = pipeline.apply(Create.of(CC_ARRAY));

	JoinExamples.joinEvents(events, countryCodes)
			.apply(TextIO.Write.to(resultPath));

	pipeline.run();
}

com.google.cloud.dataflow.sdk.io.TextIO Java Examples