Java Code Examples for org.apache.beam.sdk.Pipeline#apply()
The following examples show how to use org.apache.beam.sdk.Pipeline#apply(). Each example is taken from an open-source project; the source file, project, and license are noted above each listing.
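Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the typical apply() pattern: a root transform is attached directly to the Pipeline, and further transforms are chained on the returned PCollection. The class name MinimalApplyExample is illustrative.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class MinimalApplyExample {
  public static void main(String[] args) {
    PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
    Pipeline pipeline = Pipeline.create(options);

    // Pipeline#apply() attaches a root PTransform to the pipeline and returns its output.
    PCollection<String> words = pipeline.apply("CreateWords", Create.of("hello", "beam"));

    // Further transforms are chained with PCollection#apply().
    PCollection<String> shouted =
        words.apply("Uppercase",
            MapElements.into(TypeDescriptors.strings()).via((String w) -> w.toUpperCase()));

    // A real pipeline would normally write 'shouted' to a sink before running.
    pipeline.run().waitUntilFinish();
  }
}

The same pattern appears throughout the examples below: pipeline.apply(...) creates a root PCollection, and each subsequent .apply(...) adds another transform to the pipeline graph.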
Example 1
Source File: DataflowRunnerTest.java From beam with Apache License 2.0 | 6 votes |
/** Tests that all reads are consumed by at least one {@link PTransform}. */
@Test
public void testUnconsumedReads() throws IOException {
  DataflowPipelineOptions dataflowOptions = buildPipelineOptions();
  RuntimeTestOptions options = dataflowOptions.as(RuntimeTestOptions.class);
  Pipeline p = buildDataflowPipeline(dataflowOptions);
  p.apply(TextIO.read().from(options.getInput()));
  DataflowRunner.fromOptions(dataflowOptions).replaceTransforms(p);
  final AtomicBoolean unconsumedSeenAsInput = new AtomicBoolean();
  p.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void visitPrimitiveTransform(Node node) {
          unconsumedSeenAsInput.set(true);
        }
      });
  assertThat(unconsumedSeenAsInput.get(), is(true));
}
Example 2
Source File: QueryablePipelineTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void rootTransforms() {
  Pipeline p = Pipeline.create();
  p.apply("UnboundedRead", Read.from(CountingSource.unbounded()))
      .apply(Window.into(FixedWindows.of(Duration.millis(5L))))
      .apply(Count.perElement());
  p.apply("BoundedRead", Read.from(CountingSource.upTo(100L)));
  Components components = PipelineTranslation.toProto(p).getComponents();
  QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components);
  assertThat(qp.getRootTransforms(), hasSize(2));
  for (PTransformNode rootTransform : qp.getRootTransforms()) {
    assertThat(
        "Root transforms should have no inputs",
        rootTransform.getTransform().getInputsCount(),
        equalTo(0));
    assertThat(
        "Only added source reads to the pipeline",
        rootTransform.getTransform().getSpec().getUrn(),
        equalTo(PTransformTranslation.READ_TRANSFORM_URN));
  }
}
Example 3
Source File: FlinkPipelineExecutionEnvironmentTest.java From beam with Apache License 2.0 | 6 votes |
@Test
public void testTranslationModeNoOverrideWithoutUnboundedSources() {
  boolean[] testArgs = new boolean[] {true, false};
  for (boolean streaming : testArgs) {
    FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
    options.setRunner(FlinkRunner.class);
    options.setStreaming(streaming);
    FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options);
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply(GenerateSequence.from(0).to(10));
    flinkEnv.translate(pipeline);
    assertThat(options.isStreaming(), Matchers.is(streaming));
  }
}
Example 4
Source File: BeamBigQueryInputStepHandler.java From hop with Apache License 2.0 | 5 votes |
@Override
public void handleStep( ILogChannel log, TransformMeta transformMeta, Map<String, PCollection<HopRow>> stepCollectionMap,
                        Pipeline pipeline, IRowMeta rowMeta, List<TransformMeta> previousSteps,
                        PCollection<HopRow> input ) throws HopException {

  // Input handling
  //
  BeamBQInputMeta beamInputMeta = (BeamBQInputMeta) transformMeta.getTransform();

  // Output rows (fields selection)
  //
  IRowMeta outputRowMeta = new RowMeta();
  beamInputMeta.getFields( outputRowMeta, transformMeta.getName(), null, null, pipelineMeta, null );

  BeamBQInputTransform beamInputTransform = new BeamBQInputTransform(
    transformMeta.getName(),
    transformMeta.getName(),
    pipelineMeta.environmentSubstitute( beamInputMeta.getProjectId() ),
    pipelineMeta.environmentSubstitute( beamInputMeta.getDatasetId() ),
    pipelineMeta.environmentSubstitute( beamInputMeta.getTableId() ),
    pipelineMeta.environmentSubstitute( beamInputMeta.getQuery() ),
    JsonRowMeta.toJson( outputRowMeta ),
    transformPluginClasses,
    xpPluginClasses
  );
  PCollection<HopRow> afterInput = pipeline.apply( beamInputTransform );
  stepCollectionMap.put( transformMeta.getName(), afterInput );
  log.logBasic( "Handled transform (BQ INPUT) : " + transformMeta.getName() );
}
Example 5
Source File: SparkSimpleFileIOOutputRuntimeTestIT.java From components with Apache License 2.0 | 5 votes |
/**
 * Basic unit test using all default values (except for the path) on an in-memory DFS cluster.
 */
@Category(ValidatesRunner.class)
@Ignore("BEAM-1206")
@Test
public void testBasicDefaults() throws IOException {
    FileSystem fs = FileSystem.get(spark.createHadoopConfiguration());
    String fileSpec = fs.getUri().resolve(new Path(tmp.getRoot().toString(), "basic").toUri()).toString();

    // Configure the component.
    SimpleFileIOOutputProperties props = SimpleFileIOOutputRuntimeTest.createOutputComponentProperties();
    props.getDatasetProperties().path.setValue(fileSpec);
    props.getDatasetProperties().format.setValue(SimpleFileIOFormat.AVRO);

    // Create the runtime.
    SimpleFileIOOutputRuntime runtime = new SimpleFileIOOutputRuntime();
    runtime.initialize(null, props);

    // Use the runtime in a Spark pipeline to test.
    final Pipeline p = spark.createPipeline();
    PCollection<IndexedRecord> input = p.apply( //
            Create.of(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))); //
    input.apply(runtime);

    // And run the test.
    p.run().waitUntilFinish();

    // Check the expected values.
    MiniDfsResource.assertReadFile(fs, fileSpec, "1;one", "2;two");
}
Example 6
Source File: TestDataflowRunnerTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Tests that if a streaming pipeline crash loops for a non-assertion reason that the test run
 * throws an {@link AssertionError}.
 *
 * <p>This is a known limitation/bug of the runner that it does not distinguish the two modes of
 * failure.
 */
@Test
public void testStreamingPipelineFailsIfException() throws Exception {
  options.setStreaming(true);
  Pipeline pipeline = TestPipeline.create(options);
  PCollection<Integer> pc = pipeline.apply(Create.of(1, 2, 3));
  PAssert.that(pc).containsInAnyOrder(1, 2, 3);

  DataflowPipelineJob mockJob = Mockito.mock(DataflowPipelineJob.class);
  when(mockJob.getState()).thenReturn(State.RUNNING);
  when(mockJob.getProjectId()).thenReturn("test-project");
  when(mockJob.getJobId()).thenReturn("test-job");
  when(mockJob.waitUntilFinish(any(Duration.class), any(JobMessagesHandler.class)))
      .thenAnswer(
          invocation -> {
            JobMessage message = new JobMessage();
            message.setMessageText("FooException");
            message.setTime(TimeUtil.toCloudTime(Instant.now()));
            message.setMessageImportance("JOB_MESSAGE_ERROR");
            ((JobMessagesHandler) invocation.getArguments()[1]).process(Arrays.asList(message));
            return State.CANCELLED;
          });

  DataflowRunner mockRunner = Mockito.mock(DataflowRunner.class);
  when(mockRunner.run(any(Pipeline.class))).thenReturn(mockJob);

  when(mockClient.getJobMetrics(anyString()))
      .thenReturn(generateMockMetricResponse(false /* success */, true /* tentative */));
  TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);
  expectedException.expect(RuntimeException.class);
  runner.run(pipeline, mockRunner);
}
Example 7
Source File: IndexerPipeline.java From dataflow-opinion-analysis with Apache License 2.0 | 5 votes |
/**
 * @param contentToIndexNotSkipped
 * @param contentNotToIndexSkipped
 * @param pipeline
 * @param options
 * @return
 */
private static ContentToIndexOrNot filterAlreadyProcessedDocuments(
    PCollection<InputContent> contentToIndexNotSkipped, PCollection<InputContent> contentNotToIndexSkipped,
    Pipeline pipeline, IndexerPipelineOptions options) {
  PCollection<KV<String,Long>> alreadyProcessedDocs = null;

  if (!options.getWriteTruncate()) {
    String query = IndexerPipelineUtils.buildBigQueryProcessedDocsQuery(options);
    alreadyProcessedDocs = pipeline
        .apply("Get already processed Documents",BigQueryIO.read().fromQuery(query))
        .apply(ParDo.of(new GetDocumentHashFn()));
  } else {
    Map<String, Long> map = new HashMap<String,Long>();
    alreadyProcessedDocs = pipeline
        .apply("Create empty side input of Docs",
            Create.of(map).withCoder(KvCoder.of(StringUtf8Coder.of(),VarLongCoder.of())));
  }

  final PCollectionView<Map<String,Long>> alreadyProcessedDocsSideInput =
      alreadyProcessedDocs.apply(View.<String,Long>asMap());

  PCollectionTuple indexOrNotBasedOnExactDupes = contentToIndexNotSkipped
      .apply("Extract DocumentHash key", ParDo.of(new GetInputContentDocumentHashFn()))
      .apply("Group by DocumentHash key", GroupByKey.<String, InputContent>create())
      .apply("Eliminate InputContent Dupes", ParDo.of(new EliminateInputContentDupes(alreadyProcessedDocsSideInput))
          .withSideInputs(alreadyProcessedDocsSideInput)
          .withOutputTags(PipelineTags.contentToIndexNotExactDupesTag, // main output collection
              TupleTagList.of(PipelineTags.contentNotToIndexExactDupesTag))); // side output collection

  PCollection<InputContent> contentToIndexNotExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentToIndexNotExactDupesTag);
  PCollection<InputContent> contentNotToIndexExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentNotToIndexExactDupesTag);

  // Merge the sets of items that are dupes or skipped
  PCollectionList<InputContent> contentNotToIndexList = PCollectionList.of(contentNotToIndexExactDupes).and(contentNotToIndexSkipped);

  ContentToIndexOrNot content = new ContentToIndexOrNot(contentToIndexNotExactDupes, contentNotToIndexList.apply(Flatten.<InputContent>pCollections()));
  return content;
}
Example 8
Source File: S3OutputRuntimeTestIT.java From components with Apache License 2.0 | 5 votes |
@Test
public void testAvro_merge() throws IOException {
    S3DatasetProperties datasetProps = s3.createS3DatasetProperties();
    datasetProps.format.setValue(SimpleFileIOFormat.AVRO);
    S3OutputProperties outputProperties = new S3OutputProperties("out");
    outputProperties.init();
    outputProperties.setDatasetProperties(datasetProps);
    outputProperties.mergeOutput.setValue(true);

    // Create the runtime.
    S3OutputRuntime runtime = new S3OutputRuntime();
    runtime.initialize(null, outputProperties);

    // Use the runtime in a Spark pipeline to test.
    final Pipeline p = spark.createPipeline();
    PCollection<IndexedRecord> input = p.apply( //
            Create.of(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))); //
    input.apply(runtime);

    // And run the test.
    p.run().waitUntilFinish();

    FileSystem s3FileSystem = S3Connection.createFileSystem(datasetProps);
    MiniDfsResource.assertReadAvroFile(s3FileSystem, s3.getS3APath(datasetProps),
            new HashSet<IndexedRecord>(Arrays.asList(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))), false);
    MiniDfsResource.assertFileNumber(s3FileSystem, s3.getS3APath(datasetProps), 1);
}
Example 9
Source File: SparkSimpleFileIOOutputRuntimeTestIT.java From components with Apache License 2.0 | 5 votes |
@Test
public void testAvro_merge() throws IOException {
    FileSystem fs = FileSystem.get(spark.createHadoopConfiguration());
    String fileSpec = fs.getUri().resolve(new Path(tmp.getRoot().toString(), "output.avro").toUri()).toString();

    // Configure the component.
    SimpleFileIOOutputProperties props = SimpleFileIOOutputRuntimeTest.createOutputComponentProperties();
    props.getDatasetProperties().path.setValue(fileSpec);
    props.getDatasetProperties().format.setValue(SimpleFileIOFormat.AVRO);
    props.mergeOutput.setValue(true);

    // Create the runtime.
    SimpleFileIOOutputRuntime runtime = new SimpleFileIOOutputRuntime();
    runtime.initialize(null, props);

    // Use the runtime in a Spark pipeline to test.
    final Pipeline p = spark.createPipeline();
    PCollection<IndexedRecord> input = p.apply( //
            Create.of(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))); //
    input.apply(runtime);

    // And run the test.
    p.run().waitUntilFinish();

    // Check the expected values.
    MiniDfsResource.assertReadAvroFile(fs, fileSpec,
            new HashSet<IndexedRecord>(Arrays.asList(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))), false);
    MiniDfsResource.assertFileNumber(fs, fileSpec, 1);
}
Example 10
Source File: Task.java From beam with Apache License 2.0 | 5 votes |
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<KV<String, String>> citiesToCountries =
      pipeline.apply("Cities and Countries",
          Create.of(
              KV.of("Beijing", "China"),
              KV.of("London", "United Kingdom"),
              KV.of("San Francisco", "United States"),
              KV.of("Singapore", "Singapore"),
              KV.of("Sydney", "Australia")
          ));

  PCollectionView<Map<String, String>> citiesToCountriesView = createView(citiesToCountries);

  PCollection<Person> persons =
      pipeline.apply("Persons",
          Create.of(
              new Person("Henry", "Singapore"),
              new Person("Jane", "San Francisco"),
              new Person("Lee", "Beijing"),
              new Person("John", "Sydney"),
              new Person("Alfred", "London")
          ));

  PCollection<Person> output = applyTransform(persons, citiesToCountriesView);

  output.apply(Log.ofElements());

  pipeline.run();
}
Example 11
Source File: SpannerGroupWrite.java From java-docs-samples with Apache License 2.0 | 4 votes |
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  String usersIdFile = options.getSuspiciousUsersFile();

  PCollection<String> suspiciousUserIds = p.apply(TextIO.read().from(usersIdFile));

  final Timestamp timestamp = Timestamp.now();

  // [START spanner_dataflow_writegroup]
  PCollection<MutationGroup> mutations = suspiciousUserIds
      .apply(MapElements.via(new SimpleFunction<String, MutationGroup>() {

        @Override
        public MutationGroup apply(String userId) {
          // Immediately block the user.
          Mutation userMutation = Mutation.newUpdateBuilder("Users")
              .set("id").to(userId)
              .set("state").to("BLOCKED")
              .build();
          long generatedId = Hashing.sha1().newHasher()
              .putString(userId, Charsets.UTF_8)
              .putLong(timestamp.getSeconds())
              .putLong(timestamp.getNanos())
              .hash()
              .asLong();

          // Add an entry to pending review requests.
          Mutation pendingReview = Mutation.newInsertOrUpdateBuilder("PendingReviews")
              .set("id").to(generatedId) // Must be deterministically generated.
              .set("userId").to(userId)
              .set("action").to("REVIEW ACCOUNT")
              .set("note").to("Suspicious activity detected.")
              .build();

          return MutationGroup.create(userMutation, pendingReview);
        }
      }));

  mutations.apply(SpannerIO.write()
      .withInstanceId(instanceId)
      .withDatabaseId(databaseId)
      .grouped());
  // [END spanner_dataflow_writegroup]

  p.run().waitUntilFinish();
}
Example 12
Source File: BigQueryReadWriteIT.java From beam with Apache License 2.0 | 4 votes |
private PCollection<Row> createPCollection(Pipeline pipeline, Row... rows) {
  return pipeline.apply(Create.of(Arrays.asList(rows)).withRowSchema(SOURCE_SCHEMA));
}
Example 13
Source File: TestUtils.java From beam with Apache License 2.0 | 4 votes |
public static <T> PCollection<T> createMockDataset(
    Pipeline pipeline, TypeDescriptor<T> typeDescriptor) {
  return pipeline.apply(Create.empty(typeDescriptor));
}
Example 14
Source File: SimpleFileIOOutputErrorTest.java From components with Apache License 2.0 | 4 votes |
/**
 * Basic unit test using all default values (except for the path) on an in-memory DFS cluster.
 */
@Test
public void testUnauthorizedOverwrite() throws IOException, URISyntaxException {
    Path parent = new Path(mini.newFolder().toString());
    Path dst = new Path(parent, "output");
    String fileSpec = mini.getLocalFs().getUri().resolve(dst.toUri()).toString();

    // Write something to the file before trying to run.
    try (OutputStream out = mini.getLocalFs().create(new Path(dst, "part-00000"))) {
        out.write(0);
    }

    // Ensure that the destination is unwritable.
    FileUtil.chmod(dst.toUri().toString(), "000", true);

    // Trying to overwrite an unmodifiable destination throws an exception.
    thrown.expect(TalendRuntimeException.class);
    thrown.expect(hasProperty("code", is(SimpleFileIOErrorCode.OUTPUT_NOT_AUTHORIZED)));
    thrown.expectMessage("Can not write to " + fileSpec
            + ". Please check user permissions or existence of base directory.");

    // Now try using the component.
    try {
        // Configure the component.
        SimpleFileIOOutputProperties props = SimpleFileIOOutputRuntimeTest.createOutputComponentProperties();
        props.getDatasetProperties().path.setValue(fileSpec);
        props.overwrite.setValue(true);

        // Create the runtime.
        SimpleFileIOOutputRuntime runtime = new SimpleFileIOOutputRuntime();
        runtime.initialize(null, props);

        // Use the runtime in a direct pipeline to test.
        final Pipeline p = beam.createPipeline();
        PCollection<IndexedRecord> input = p.apply( //
                Create.of(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                        ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))); //
        input.apply(runtime);

        // And run the test.
        runtime.runAtDriver(null);
        p.run().waitUntilFinish();
    } catch (Pipeline.PipelineExecutionException e) {
        if (e.getCause() instanceof TalendRuntimeException)
            throw (TalendRuntimeException) e.getCause();
        throw e;
    }
}
Example 15
Source File: Task.java From beam with Apache License 2.0 | 4 votes |
static PCollection<String> setupPipeline(Pipeline pipeline) {
  return pipeline.apply(Create.of("Hello Beam"));
}
Example 16
Source File: Task.java From beam with Apache License 2.0 | 3 votes |
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<Integer> numbers =
      pipeline.apply(Create.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));

  PCollection<Integer> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}
Example 17
Source File: Task.java From beam with Apache License 2.0 | 3 votes |
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<Integer> numbers =
      pipeline.apply(Create.of(10, 30, 50, 70, 90));

  PCollection<Integer> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}
Example 18
Source File: Task.java From beam with Apache License 2.0 | 3 votes |
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<String> events = pipeline.apply(GenerateEvent.everySecond());

  PCollection<Long> output = applyTransform(events);

  output.apply(Log.ofElements());

  pipeline.run();
}
Example 19
Source File: Task.java From beam with Apache License 2.0 | 3 votes |
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<Integer> numbers =
      pipeline.apply(Create.of(10, 20, 50, 70, 90));

  PCollection<Double> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}
Example 20
Source File: Task.java From beam with Apache License 2.0 | 3 votes |
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<Integer> numbers =
      pipeline.apply(Create.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10));

  PCollection<Integer> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}