Java Code Examples for org.apache.beam.sdk.Pipeline#apply()
The following examples show how to use org.apache.beam.sdk.Pipeline#apply(). Each example is taken from an open-source project; the source file, project, and license are noted above each listing.
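Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the typical apply() pattern: a root transform is attached directly to the Pipeline, and further transforms are chained on the returned PCollection. The class name MinimalApplyExample is illustrative.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class MinimalApplyExample {
  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    Pipeline pipeline = Pipeline.create(options);

    // Pipeline#apply() attaches a root PTransform to the pipeline and returns its output.
    PCollection<String> words = pipeline.apply("CreateWords", Create.of("hello", "beam"));

    // Further transforms are chained with PCollection#apply().
    PCollection<String> shouted =
        words.apply("Uppercase",
            MapElements.into(TypeDescriptors.strings()).via((String w) -> w.toUpperCase()));

    // A real pipeline would normally write 'shouted' to a sink before running.
    pipeline.run().waitUntilFinish();
  }
}

The same pattern appears throughout the examples below: pipeline.apply(...) creates a root PCollection, and each subsequent .apply(...) adds another transform to the pipeline graph.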
Example 1
Source File: DataflowRunnerTest.java From beam with Apache License 2.0 | 6 votes |
/** Tests that all reads are consumed by at least one {@link PTransform}. */
@Test
public void testUnconsumedReads() throws IOException {
  DataflowPipelineOptions dataflowOptions = buildPipelineOptions();
  RuntimeTestOptions options = dataflowOptions.as(RuntimeTestOptions.class);
  Pipeline p = buildDataflowPipeline(dataflowOptions);
  p.apply(TextIO.read().from(options.getInput()));
  DataflowRunner.fromOptions(dataflowOptions).replaceTransforms(p);
  final AtomicBoolean unconsumedSeenAsInput = new AtomicBoolean();
  p.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          unconsumedSeenAsInput.set(true);
        }
      });
  assertThat(unconsumedSeenAsInput.get(), is(true));
}
Example 2
Source File: QueryablePipelineTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void rootTransforms() {
  Pipeline p = Pipeline.create();
  p.apply("UnboundedRead", Read.from(CountingSource.unbounded()))
      .apply(Window.into(FixedWindows.of(Duration.millis(5L))))
      .apply(Count.perElement());
  p.apply("BoundedRead", Read.from(CountingSource.upTo(100L)));
  Components components = PipelineTranslation.toProto(p).getComponents();
  QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components);
  assertThat(qp.getRootTransforms(), hasSize(2));
  for (PTransformNode rootTransform : qp.getRootTransforms()) {
    assertThat(
        "Root transforms should have no inputs",
        rootTransform.getTransform().getInputsCount(),
        equalTo(0));
    assertThat(
        "Only added source reads to the pipeline",
        rootTransform.getTransform().getSpec().getUrn(),
        equalTo(PTransformTranslation.READ_TRANSFORM_URN));
  }
}
Example 3
Source File: FlinkPipelineExecutionEnvironmentTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testTranslationModeNoOverrideWithoutUnboundedSources() {
  boolean[] testArgs = new boolean[] {true, false};
  for (boolean streaming : testArgs) {
    FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
    options.setRunner(FlinkRunner.class);
    options.setStreaming(streaming);
    FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options);
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply(GenerateSequence.from(0).to(10));
    flinkEnv.translate(pipeline);
    assertThat(options.isStreaming(), Matchers.is(streaming));
  }
}
Example 4
Source File: BeamBigQueryInputStepHandler.java From hop with Apache License 2.0 | 5 votes |
@Override
public void handleStep( ILogChannel log, TransformMeta transformMeta, Map<String, PCollection<HopRow>> stepCollectionMap,
                        Pipeline pipeline, IRowMeta rowMeta, List<TransformMeta> previousSteps,
                        PCollection<HopRow> input ) throws HopException {

  // Input handling
  //
  BeamBQInputMeta beamInputMeta = (BeamBQInputMeta) transformMeta.getTransform();

  // Output rows (fields selection)
  //
  IRowMeta outputRowMeta = new RowMeta();
  beamInputMeta.getFields( outputRowMeta, transformMeta.getName(), null, null, pipelineMeta, null );

  BeamBQInputTransform beamInputTransform = new BeamBQInputTransform(
    transformMeta.getName(),
    transformMeta.getName(),
    pipelineMeta.environmentSubstitute( beamInputMeta.getProjectId() ),
    pipelineMeta.environmentSubstitute( beamInputMeta.getDatasetId() ),
    pipelineMeta.environmentSubstitute( beamInputMeta.getTableId() ),
    pipelineMeta.environmentSubstitute( beamInputMeta.getQuery() ),
    JsonRowMeta.toJson( outputRowMeta ),
    transformPluginClasses,
    xpPluginClasses
  );
  PCollection<HopRow> afterInput = pipeline.apply( beamInputTransform );
  stepCollectionMap.put( transformMeta.getName(), afterInput );
  log.logBasic( "Handled transform (BQ INPUT) : " + transformMeta.getName() );
}
Example 5
Source File: SparkSimpleFileIOOutputRuntimeTestIT.java From components with Apache License 2.0 | 5 votes |
/**
 * Basic unit test using all default values (except for the path) on an in-memory DFS cluster.
 */
@Category(ValidatesRunner.class)
@Ignore("BEAM-1206")
@Test
public void testBasicDefaults() throws IOException {
    FileSystem fs = FileSystem.get(spark.createHadoopConfiguration());
    String fileSpec = fs.getUri().resolve(new Path(tmp.getRoot().toString(), "basic").toUri()).toString();

    // Configure the component.
    SimpleFileIOOutputProperties props = SimpleFileIOOutputRuntimeTest.createOutputComponentProperties();
    props.getDatasetProperties().path.setValue(fileSpec);
    props.getDatasetProperties().format.setValue(SimpleFileIOFormat.AVRO);

    // Create the runtime.
    SimpleFileIOOutputRuntime runtime = new SimpleFileIOOutputRuntime();
    runtime.initialize(null, props);

    // Use the runtime in a Spark pipeline to test.
    final Pipeline p = spark.createPipeline();
    PCollection<IndexedRecord> input = p.apply( //
            Create.of(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))); //
    input.apply(runtime);

    // And run the test.
    p.run().waitUntilFinish();

    // Check the expected values.
    MiniDfsResource.assertReadFile(fs, fileSpec, "1;one", "2;two");
}
Example 6
Source File: TestDataflowRunnerTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Tests that if a streaming pipeline crash loops for a non-assertion reason that the test run
 * throws an {@link AssertionError}.
 *
 * <p>This is a known limitation/bug of the runner that it does not distinguish the two modes of
 * failure.
 */
@Test
public void testStreamingPipelineFailsIfException() throws Exception {
  options.setStreaming(true);
  Pipeline pipeline = TestPipeline.create(options);
  PCollection<Integer> pc = pipeline.apply(Create.of(1, 2, 3));
  PAssert.that(pc).containsInAnyOrder(1, 2, 3);

  DataflowPipelineJob mockJob = Mockito.mock(DataflowPipelineJob.class);
  when(mockJob.getState()).thenReturn(State.RUNNING);
  when(mockJob.getProjectId()).thenReturn("test-project");
  when(mockJob.getJobId()).thenReturn("test-job");
  when(mockJob.waitUntilFinish(any(Duration.class), any(JobMessagesHandler.class)))
      .thenAnswer(
          invocation -> {
            JobMessage message = new JobMessage();
            message.setMessageText("FooException");
            message.setTime(TimeUtil.toCloudTime(Instant.now()));
            message.setMessageImportance("JOB_MESSAGE_ERROR");
            ((JobMessagesHandler) invocation.getArguments()[1]).process(Arrays.asList(message));
            return State.CANCELLED;
          });

  DataflowRunner mockRunner = Mockito.mock(DataflowRunner.class);
  when(mockRunner.run(any(Pipeline.class))).thenReturn(mockJob);

  when(mockClient.getJobMetrics(anyString()))
      .thenReturn(generateMockMetricResponse(false /* success */, true /* tentative */));
  TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);
  expectedException.expect(RuntimeException.class);
  runner.run(pipeline, mockRunner);
}
Example 7
Source File: IndexerPipeline.java From dataflow-opinion-analysis with Apache License 2.0 | 5 votes |
/**
 * @param contentToIndexNotSkipped
 * @param contentNotToIndexSkipped
 * @param pipeline
 * @param options
 * @return
 */
private static ContentToIndexOrNot filterAlreadyProcessedDocuments(
    PCollection<InputContent> contentToIndexNotSkipped, PCollection<InputContent> contentNotToIndexSkipped,
    Pipeline pipeline, IndexerPipelineOptions options) {
  PCollection<KV<String,Long>> alreadyProcessedDocs = null;

  if (!options.getWriteTruncate()) {
    String query = IndexerPipelineUtils.buildBigQueryProcessedDocsQuery(options);
    alreadyProcessedDocs = pipeline
        .apply("Get already processed Documents",BigQueryIO.read().fromQuery(query))
        .apply(ParDo.of(new GetDocumentHashFn()));
  } else {
    Map<String, Long> map = new HashMap<String,Long>();
    alreadyProcessedDocs = pipeline
        .apply("Create empty side input of Docs",
            Create.of(map).withCoder(KvCoder.of(StringUtf8Coder.of(),VarLongCoder.of())));
  }

  final PCollectionView<Map<String,Long>> alreadyProcessedDocsSideInput =
      alreadyProcessedDocs.apply(View.<String,Long>asMap());

  PCollectionTuple indexOrNotBasedOnExactDupes = contentToIndexNotSkipped
      .apply("Extract DocumentHash key", ParDo.of(new GetInputContentDocumentHashFn()))
      .apply("Group by DocumentHash key", GroupByKey.<String, InputContent>create())
      .apply("Eliminate InputContent Dupes", ParDo.of(new EliminateInputContentDupes(alreadyProcessedDocsSideInput))
          .withSideInputs(alreadyProcessedDocsSideInput)
          .withOutputTags(PipelineTags.contentToIndexNotExactDupesTag, // main output collection
              TupleTagList.of(PipelineTags.contentNotToIndexExactDupesTag))); // side output collection

  PCollection<InputContent> contentToIndexNotExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentToIndexNotExactDupesTag);
  PCollection<InputContent> contentNotToIndexExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentNotToIndexExactDupesTag);

  // Merge the sets of items that are dupes or skipped
  PCollectionList<InputContent> contentNotToIndexList = PCollectionList.of(contentNotToIndexExactDupes).and(contentNotToIndexSkipped);

  ContentToIndexOrNot content = new ContentToIndexOrNot(contentToIndexNotExactDupes, contentNotToIndexList.apply(Flatten.<InputContent>pCollections()));
  return content;
}
Example 8
Source File: S3OutputRuntimeTestIT.java From components with Apache License 2.0 | 5 votes |
@Test
public void testAvro_merge() throws IOException {
    S3DatasetProperties datasetProps = s3.createS3DatasetProperties();
    datasetProps.format.setValue(SimpleFileIOFormat.AVRO);
    S3OutputProperties outputProperties = new S3OutputProperties("out");
    outputProperties.init();
    outputProperties.setDatasetProperties(datasetProps);
    outputProperties.mergeOutput.setValue(true);

    // Create the runtime.
    S3OutputRuntime runtime = new S3OutputRuntime();
    runtime.initialize(null, outputProperties);

    // Use the runtime in a Spark pipeline to test.
    final Pipeline p = spark.createPipeline();
    PCollection<IndexedRecord> input = p.apply( //
            Create.of(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))); //
    input.apply(runtime);

    // And run the test.
    p.run().waitUntilFinish();

    FileSystem s3FileSystem = S3Connection.createFileSystem(datasetProps);
    MiniDfsResource.assertReadAvroFile(s3FileSystem, s3.getS3APath(datasetProps),
            new HashSet<IndexedRecord>(Arrays.asList(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))), false);
    MiniDfsResource.assertFileNumber(s3FileSystem, s3.getS3APath(datasetProps), 1);
}
Example 9
Source File: SparkSimpleFileIOOutputRuntimeTestIT.java From components with Apache License 2.0 | 5 votes |
@Test
public void testAvro_merge() throws IOException {
    FileSystem fs = FileSystem.get(spark.createHadoopConfiguration());
    String fileSpec = fs.getUri().resolve(new Path(tmp.getRoot().toString(), "output.avro").toUri()).toString();

    // Configure the component.
    SimpleFileIOOutputProperties props = SimpleFileIOOutputRuntimeTest.createOutputComponentProperties();
    props.getDatasetProperties().path.setValue(fileSpec);
    props.getDatasetProperties().format.setValue(SimpleFileIOFormat.AVRO);
    props.mergeOutput.setValue(true);

    // Create the runtime.
    SimpleFileIOOutputRuntime runtime = new SimpleFileIOOutputRuntime();
    runtime.initialize(null, props);

    // Use the runtime in a Spark pipeline to test.
    final Pipeline p = spark.createPipeline();
    PCollection<IndexedRecord> input = p.apply( //
            Create.of(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))); //
    input.apply(runtime);

    // And run the test.
    p.run().waitUntilFinish();

    // Check the expected values.
    MiniDfsResource.assertReadAvroFile(fs, fileSpec,
            new HashSet<IndexedRecord>(Arrays.asList(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))), false);
    MiniDfsResource.assertFileNumber(fs, fileSpec, 1);
}
Example 10
Source File: Task.java From beam with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<KV<String, String>> citiesToCountries =
      pipeline.apply("Cities and Countries",
          Create.of(
              KV.of("Beijing", "China"),
              KV.of("London", "United Kingdom"),
              KV.of("San Francisco", "United States"),
              KV.of("Singapore", "Singapore"),
              KV.of("Sydney", "Australia")
          ));

  PCollectionView<Map<String, String>> citiesToCountriesView = createView(citiesToCountries);

  PCollection<Person> persons =
      pipeline.apply("Persons",
          Create.of(
              new Person("Henry", "Singapore"),
              new Person("Jane", "San Francisco"),
              new Person("Lee", "Beijing"),
              new Person("John", "Sydney"),
              new Person("Alfred", "London")
          ));

  PCollection<Person> output = applyTransform(persons, citiesToCountriesView);

  output.apply(Log.ofElements());

  pipeline.run();
}
Example 11
Source File: SpannerGroupWrite.java From java-docs-samples with Apache License 2.0 | 4 votes |
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  String usersIdFile = options.getSuspiciousUsersFile();

  PCollection<String> suspiciousUserIds = p.apply(TextIO.read().from(usersIdFile));

  final Timestamp timestamp = Timestamp.now();

  // [START spanner_dataflow_writegroup]
  PCollection<MutationGroup> mutations = suspiciousUserIds
      .apply(MapElements.via(new SimpleFunction<String, MutationGroup>() {

        @Override
        public MutationGroup apply(String userId) {
          // Immediately block the user.
          Mutation userMutation = Mutation.newUpdateBuilder("Users")
              .set("id").to(userId)
              .set("state").to("BLOCKED")
              .build();
          long generatedId = Hashing.sha1().newHasher()
              .putString(userId, Charsets.UTF_8)
              .putLong(timestamp.getSeconds())
              .putLong(timestamp.getNanos())
              .hash()
              .asLong();

          // Add an entry to pending review requests.
          Mutation pendingReview = Mutation.newInsertOrUpdateBuilder("PendingReviews")
              .set("id").to(generatedId) // Must be deterministically generated.
              .set("userId").to(userId)
              .set("action").to("REVIEW ACCOUNT")
              .set("note").to("Suspicious activity detected.")
              .build();

          return MutationGroup.create(userMutation, pendingReview);
        }
      }));

  mutations.apply(SpannerIO.write()
      .withInstanceId(instanceId)
      .withDatabaseId(databaseId)
      .grouped());
  // [END spanner_dataflow_writegroup]

  p.run().waitUntilFinish();
}
Example 12
Source File: BigQueryReadWriteIT.java From beam with Apache License 2.0 | 4 votes |
private PCollection<Row> createPCollection(Pipeline pipeline, Row... rows) {
  return pipeline.apply(Create.of(Arrays.asList(rows)).withRowSchema(SOURCE_SCHEMA));
}
Example 13
Source File: TestUtils.java From beam with Apache License 2.0 | 4 votes |
public static <T> PCollection<T> createMockDataset(
    Pipeline pipeline, TypeDescriptor<T> typeDescriptor) {
  return pipeline.apply(Create.empty(typeDescriptor));
}
Example 14
Source File: SimpleFileIOOutputErrorTest.java From components with Apache License 2.0 | 4 votes |
/**
 * Basic unit test using all default values (except for the path) on an in-memory DFS cluster.
 */
@Test
public void testUnauthorizedOverwrite() throws IOException, URISyntaxException {
    Path parent = new Path(mini.newFolder().toString());
    Path dst = new Path(parent, "output");
    String fileSpec = mini.getLocalFs().getUri().resolve(dst.toUri()).toString();

    // Write something to the file before trying to run.
    try (OutputStream out = mini.getLocalFs().create(new Path(dst, "part-00000"))) {
        out.write(0);
    }

    // Ensure that the destination is unwritable.
    FileUtil.chmod(dst.toUri().toString(), "000", true);

    // Trying to overwrite an unmodifiable destination throws an exception.
    thrown.expect(TalendRuntimeException.class);
    thrown.expect(hasProperty("code", is(SimpleFileIOErrorCode.OUTPUT_NOT_AUTHORIZED)));
    thrown.expectMessage("Can not write to " + fileSpec
            + ". Please check user permissions or existence of base directory.");

    // Now try using the component.
    try {
        // Configure the component.
        SimpleFileIOOutputProperties props = SimpleFileIOOutputRuntimeTest.createOutputComponentProperties();
        props.getDatasetProperties().path.setValue(fileSpec);
        props.overwrite.setValue(true);

        // Create the runtime.
        SimpleFileIOOutputRuntime runtime = new SimpleFileIOOutputRuntime();
        runtime.initialize(null, props);

        // Use the runtime in a direct pipeline to test.
        final Pipeline p = beam.createPipeline();
        PCollection<IndexedRecord> input = p.apply( //
                Create.of(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                        ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))); //
        input.apply(runtime);

        // And run the test.
        runtime.runAtDriver(null);
        p.run().waitUntilFinish();
    } catch (Pipeline.PipelineExecutionException e) {
        if (e.getCause() instanceof TalendRuntimeException)
            throw (TalendRuntimeException) e.getCause();
        throw e;
    }
}
Example 15
Source File: Task.java From beam with Apache License 2.0 | 4 votes |
static PCollection<String> setupPipeline(Pipeline pipeline) {
  return pipeline.apply(Create.of("Hello Beam"));
}
Example 16
Source File: Task.java From beam with Apache License 2.0 | 3 votes |
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<Integer> numbers =
      pipeline.apply(Create.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));

  PCollection<Integer> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}
Example 17
Source File: Task.java From beam with Apache License 2.0 | 3 votes |
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<Integer> numbers =
      pipeline.apply(Create.of(10, 30, 50, 70, 90));

  PCollection<Integer> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}
Example 18
Source File: Task.java From beam with Apache License 2.0 | 3 votes |
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> events = pipeline.apply(GenerateEvent.everySecond());

  PCollection<Long> output = applyTransform(events);

  output.apply(Log.ofElements());

  pipeline.run();
}
Example 19
Source File: Task.java From beam with Apache License 2.0 | 3 votes |
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<Integer> numbers =
      pipeline.apply(Create.of(10, 20, 50, 70, 90));

  PCollection<Double> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}
Example 20
Source File: Task.java From beam with Apache License 2.0 | 3 votes |
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<Integer> numbers =
      pipeline.apply(Create.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));

  PCollection<Integer> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}