org.apache.beam.sdk.transforms.FlatMapElements Java Examples
The following examples show how to use
org.apache.beam.sdk.transforms.FlatMapElements.
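Before the full examples, a minimal sketch of the core API shape may help: into(...) fixes the output type descriptor, and via(...) supplies a function that returns an Iterable of outputs, so each input element can produce zero or more output elements. The pipeline and values below are illustrative only, not taken from any of the projects cited:

import java.util.Arrays;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.FlatMapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

Pipeline p = Pipeline.create();
PCollection<String> words = p
    .apply(Create.of("one two", "three"))
    // Each line maps to a list of words; FlatMapElements flattens the lists,
    // so the output PCollection contains "one", "two", "three".
    .apply(FlatMapElements.into(TypeDescriptors.strings())
        .via((String line) -> Arrays.asList(line.split(" "))));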
Example #1
Source File: WordCount.java From java-docs-samples with Apache License 2.0
public static void main(String[] args) {
  WordCountOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation().as(WordCountOptions.class);

  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply("Read lines", TextIO.read().from(options.getInputFile()))
      // [END value_provider]
      .apply("Find words", FlatMapElements.into(TypeDescriptors.strings())
          .via((String line) -> Arrays.asList(line.split("[^\\p{L}]+"))))
      .apply("Filter empty words", Filter.by((String word) -> !word.isEmpty()))
      .apply("Filter with substring", ParDo.of(new FilterWithSubstring(
          options.getWithSubstring(), options.getIsCaseSensitive())))
      .apply("Count words", Count.perElement())
      .apply("Format results", MapElements.into(TypeDescriptors.strings())
          .via((KV<String, Long> wordCount) ->
              wordCount.getKey() + ": " + wordCount.getValue()))
      // [START nested_value_provider]
      .apply("Write results", TextIO.write().to(NestedValueProvider.of(
          options.getOutputBucket(),
          (String bucket) -> String.format(
              "gs://%s/samples/dataflow/wordcount/outputs", bucket))));
  // [END nested_value_provider]
  pipeline.run();
}
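This sample depends on a WordCountOptions interface that is not shown here; its real definition lives in the original source file. A sketch of what it presumably looks like, reconstructed from the getters the pipeline calls (the @Description strings are assumptions; the getter names come from usage above). The ValueProvider getters are what let the [START/END value_provider] regions work as Dataflow template parameters, resolved at run time rather than at graph-construction time:

import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;

// Hypothetical reconstruction of the options interface used above.
public interface WordCountOptions extends PipelineOptions {
  @Description("Path of the file to read from")
  ValueProvider<String> getInputFile();
  void setInputFile(ValueProvider<String> value);

  @Description("Only keep words containing this substring")
  ValueProvider<String> getWithSubstring();
  void setWithSubstring(ValueProvider<String> value);

  @Description("Whether the substring match is case sensitive")
  ValueProvider<Boolean> getIsCaseSensitive();
  void setIsCaseSensitive(ValueProvider<Boolean> value);

  @Description("GCS bucket to write results to")
  ValueProvider<String> getOutputBucket();
  void setOutputBucket(ValueProvider<String> value);
}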
Example #2
Source File: ParsePayload.java From gcp-ingestion with Mozilla Public License 2.0
@Override
public Result<PCollection<PubsubMessage>, PubsubMessage> expand(
    PCollection<PubsubMessage> messages) {
  return messages.apply(FlatMapElements.into(TypeDescriptor.of(PubsubMessage.class))
      .via(new Fn())
      // Route exceptions thrown by Fn into a failure collection instead of
      // failing the pipeline.
      .exceptionsInto(TypeDescriptor.of(PubsubMessage.class))
      .exceptionsVia((WithFailures.ExceptionElement<PubsubMessage> ee) -> {
        try {
          throw ee.exception();
        } catch (IOException | SchemaNotFoundException | ValidationException
            | MessageScrubberException e) {
          // Known failure modes become attributed failure messages; any other
          // exception propagates out of the lambda and fails the bundle.
          return FailureMessage.of(ParsePayload.class.getSimpleName(),
              ee.element(),
              ee.exception());
        }
      }));
}
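Because expand returns a WithFailures.Result rather than a plain PCollection, a caller has to split it into successes and failures. A minimal sketch of how that consumption typically looks (Result.output() and Result.failures() are real Beam methods; the transform instance and the surrounding wiring here are assumptions for illustration):

WithFailures.Result<PCollection<PubsubMessage>, PubsubMessage> result =
    messages.apply(parsePayloadTransform);  // hypothetical transform instance

PCollection<PubsubMessage> parsed = result.output();    // successfully parsed messages
PCollection<PubsubMessage> failed = result.failures();  // FailureMessage payloads
// Typically the failures are routed to a dead-letter destination while the
// parsed messages continue through the rest of the pipeline.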
Example #3
Source File: DecryptPioneerPayloads.java From gcp-ingestion with Mozilla Public License 2.0
@Override
public Result<PCollection<PubsubMessage>, PubsubMessage> expand(
    PCollection<PubsubMessage> messages) {
  return messages.apply(FlatMapElements.into(TypeDescriptor.of(PubsubMessage.class))
      .via(new Fn())
      .exceptionsInto(TypeDescriptor.of(PubsubMessage.class))
      .exceptionsVia((WithFailures.ExceptionElement<PubsubMessage> ee) -> {
        try {
          throw ee.exception();
        } catch (IOException | JoseException | ValidationException e) {
          return FailureMessage.of(DecryptPioneerPayloads.class.getSimpleName(),
              ee.element(),
              ee.exception());
        }
      }));
}
Example #4
Source File: TextTableProvider.java From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<String> input) {
  return input
      .apply(
          "csvToRow",
          FlatMapElements.into(TypeDescriptors.rows())
              .via(s -> csvLines2BeamRows(csvFormat, s, schema)))
      .setRowSchema(schema);
}
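The trailing setRowSchema(schema) call is needed because FlatMapElements cannot infer a schema for the generic Row type on its own. The csvLines2BeamRows helper is defined elsewhere in the original file and is not reproduced here; a sketch of how a compatible Schema and Row are built with Beam's schema API (the field names are invented for illustration):

import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.values.Row;

// Hypothetical two-column schema; the real one comes from the table definition.
Schema schema = Schema.builder()
    .addStringField("name")
    .addInt64Field("count")
    .build();

// A Row must be constructed against a schema, with values in field order.
Row row = Row.withSchema(schema)
    .addValues("shakespeare", 154L)
    .build();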
Example #5
Source File: HL7v2IO.java From beam with Apache License 2.0
@Override
public PCollection<HL7v2Message> expand(PBegin input) {
  CoderRegistry coderRegistry = input.getPipeline().getCoderRegistry();
  coderRegistry.registerCoderForClass(HL7v2Message.class, HL7v2MessageCoder.of());
  return input
      .apply(Create.ofProvider(this.hl7v2Stores, ListCoder.of(StringUtf8Coder.of())))
      .apply(FlatMapElements.into(TypeDescriptors.strings()).via((x) -> x))
      .apply(ParDo.of(new ListHL7v2MessagesFn(filter, initialSplitDuration)))
      .setCoder(HL7v2MessageCoder.of())
      // Break fusion to encourage parallelization of downstream processing.
      .apply(Reshuffle.viaRandomKey());
}
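The via((x) -> x) line looks like a no-op but is actually a flatten: Create.ofProvider produces a PCollection<List<String>>, and the identity lambda satisfies via(...) because a List<String> is already an Iterable<String>, so each list is exploded into one String element per store. A self-contained sketch of the same idiom, with illustrative store names:

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.ListCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.FlatMapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

Pipeline p = Pipeline.create();
// One element, which is itself a list of two store names.
PCollection<List<String>> stores = p.apply(
    Create.of(Collections.singletonList(Arrays.asList("store-a", "store-b")))
        .withCoder(ListCoder.of(StringUtf8Coder.of())));
// Identity flatten: PCollection<List<String>> -> PCollection<String>.
PCollection<String> flattened =
    stores.apply(FlatMapElements.into(TypeDescriptors.strings())
        .via((List<String> x) -> x));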
Example #6
Source File: MinimalWordCount.java From deployment-examples with MIT License (the same file appears verbatim in beam under Apache License 2.0)
public static void main(String[] args) {
  // Create a PipelineOptions object. This object lets us set various execution
  // options for our pipeline, such as the runner you wish to use. This example
  // will run with the DirectRunner by default, based on the class path configured
  // in its dependencies.
  PipelineOptions options = PipelineOptionsFactory.create();

  // To run your pipeline, you need to make the following runner-specific changes:
  //
  // CHANGE 1/3: Select a Beam runner, such as BlockingDataflowRunner
  // or FlinkRunner.
  // CHANGE 2/3: Specify runner-required options.
  // For BlockingDataflowRunner, set project and temp location as follows:
  //   DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
  //   dataflowOptions.setRunner(BlockingDataflowRunner.class);
  //   dataflowOptions.setProject("SET_YOUR_PROJECT_ID_HERE");
  //   dataflowOptions.setTempLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_TEMP_DIRECTORY");
  // For FlinkRunner, set the runner as follows. See {@code FlinkPipelineOptions}
  // for more details.
  //   options.as(FlinkPipelineOptions.class)
  //       .setRunner(FlinkRunner.class);

  // Create the Pipeline object with the options we defined above.
  Pipeline p = Pipeline.create(options);

  // Concept #1: Apply a root transform to the pipeline; in this case, TextIO.Read to read a set
  // of input text files. TextIO.Read returns a PCollection where each element is one line from
  // the input text (a set of Shakespeare's texts).

  // This example reads a public data set consisting of the complete works of Shakespeare.
  p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*"))

      // Concept #2: Apply a FlatMapElements transform to the PCollection of text lines.
      // This transform splits the lines in PCollection<String>, where each element is an
      // individual word in Shakespeare's collected texts.
      .apply(
          FlatMapElements.into(TypeDescriptors.strings())
              .via((String line) -> Arrays.asList(line.split("[^\\p{L}]+"))))
      // We use a Filter transform to drop empty words.
      .apply(Filter.by((String word) -> !word.isEmpty()))

      // Concept #3: Apply the Count transform to our PCollection of individual words. The Count
      // transform returns a new PCollection of key/value pairs, where each key represents a
      // unique word in the text. The associated value is the occurrence count for that word.
      .apply(Count.perElement())

      // Apply a MapElements transform that formats our PCollection of word counts into a
      // printable string, suitable for writing to an output file.
      .apply(
          MapElements.into(TypeDescriptors.strings())
              .via(
                  (KV<String, Long> wordCount) ->
                      wordCount.getKey() + ": " + wordCount.getValue()))

      // Concept #4: Apply a write transform, TextIO.Write, at the end of the pipeline.
      // TextIO.Write writes the contents of a PCollection (in this case, our PCollection of
      // formatted strings) to a series of text files.
      //
      // By default, it will write to a set of files with names like wordcounts-00001-of-00005.
      .apply(TextIO.write().to("wordcounts"));

  p.run().waitUntilFinish();
}
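A detail worth noting: String.split keeps a leading empty token when a line starts with a non-letter character, which is exactly why the Filter step above exists. A quick illustration (the sample string is arbitrary):

// Splitting on runs of non-letter characters:
String[] words = "'Tis the wind and nothing more!".split("[^\\p{L}]+");
// words == ["", "Tis", "the", "wind", "and", "nothing", "more"]
// The leading "" appears because the line starts with an apostrophe;
// Filter.by((String word) -> !word.isEmpty()) removes such tokens.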
Example #7
Source File: Task.java From beam with Apache License 2.0
static PCollection<String> applyTransform(PCollection<String> input) {
  return input.apply(
      FlatMapElements.into(TypeDescriptors.strings())
          .via(sentence -> Arrays.asList(sentence.split(" "))));
}
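This is the smallest useful FlatMapElements pipeline: each sentence becomes several words. A sketch of how one might exercise it with Beam's testing utilities (TestPipeline and PAssert are real Beam classes, typically used via a JUnit @Rule; the test values here are made up):

import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;

TestPipeline p = TestPipeline.create();
PCollection<String> words =
    applyTransform(p.apply(Create.of("Apache Beam", "Unified Model")));
// Each two-word sentence is split into individual word elements.
PAssert.that(words).containsInAnyOrder("Apache", "Beam", "Unified", "Model");
p.run().waitUntilFinish();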
Example #8
Source File: Task.java From beam with Apache License 2.0
static PCollection<String> applyTransform(PCollection<String> input) {
  return input
      .apply(FlatMapElements.into(TypeDescriptors.strings())
          .via(line -> Arrays.asList(line.split(" "))))
      .apply(Count.perElement())
      .apply(ParDo.of(new DoFn<KV<String, Long>, String>() {
        @ProcessElement
        public void processElement(
            @Element KV<String, Long> element, OutputReceiver<String> out) {
          out.output(element.getKey() + ":" + element.getValue());
        }
      }));
}
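The final ParDo here only reformats each key/value pair, so it could equally be written with MapElements, as the WordCount examples above do. An equivalent fragment that would slot into the same chain in place of the ParDo step:

.apply(MapElements.into(TypeDescriptors.strings())
    .via((KV<String, Long> element) ->
        element.getKey() + ":" + element.getValue()))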