org.apache.beam.runners.spark.SparkRunner Java Examples
The following examples show how to use
org.apache.beam.runners.spark.SparkRunner.
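All of the examples follow the same basic pattern: register SparkRunner on the pipeline options, build a Pipeline from those options, and run it. As a minimal, self-contained sketch of that pattern (the app name, master URL, and trivial Create transform are illustrative assumptions, not taken from any example below):

import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.runners.spark.SparkRunner;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

public class SparkRunnerSketch {
  public static void main(String[] args) {
    // Configure the Spark runner; "local[2]" runs an embedded Spark master with two threads.
    SparkPipelineOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(SparkPipelineOptions.class);
    options.setRunner(SparkRunner.class);
    options.setSparkMaster("local[2]");
    options.setAppName("spark-runner-sketch");

    // Build a trivial pipeline and block until Spark finishes executing it.
    Pipeline p = Pipeline.create(options);
    p.apply(Create.of("a", "b", "c"));
    p.run().waitUntilFinish();
  }
}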
Example #1
Source File: BeamPipelineEngine.java, from the hop project (Apache License 2.0)

private PipelineResult executePipeline( org.apache.beam.sdk.Pipeline pipeline ) throws HopException {

  RunnerType runnerType = beamEngineRunConfiguration.getRunnerType();
  switch ( runnerType ) {
    case Direct:
      return DirectRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
    case Flink:
      return FlinkRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
    case DataFlow:
      return DataflowRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
    case Spark:
      return SparkRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
    default:
      throw new HopException( "Execution on runner '" + runnerType.name() + "' is not supported yet." );
  }
}
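Note that each branch hands back the PipelineResult without blocking. A caller that needs the outcome can wait on the result; a hedged sketch of such a caller (the error handling is an illustrative assumption):

// Illustrative caller of the executePipeline( ... ) method above.
PipelineResult result = executePipeline( pipeline );
// Block until the pipeline reaches a terminal state, then inspect that state.
PipelineResult.State state = result.waitUntilFinish();
if ( state != PipelineResult.State.DONE ) {
  throw new HopException( "Pipeline finished in state " + state );
}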
Example #2
Source File: HopPipelineMetaToBeamPipelineConverter.java, from the hop project (Apache License 2.0)

public static Class<? extends PipelineRunner<?>> getPipelineRunnerClass( RunnerType runnerType ) throws HopException {
  if ( runnerType == null ) {
    throw new HopException( "Please specify a valid runner type" );
  }
  switch ( runnerType ) {
    case Direct:
      return DirectRunner.class;
    case Flink:
      return FlinkRunner.class;
    case Spark:
      return SparkRunner.class;
    case DataFlow:
      return DataflowRunner.class;
    default:
      throw new HopException( "Unsupported runner type: " + runnerType.name() );
  }
}
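The returned class is not instantiated directly; it is registered on the pipeline options so that Beam constructs the runner itself. A minimal sketch of that wiring (standard Beam API, with assumed variable names):

// Resolve the runner class for the configured RunnerType and register it on the options.
PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
pipelineOptions.setRunner( getPipelineRunnerClass( runnerType ) );
Pipeline pipeline = Pipeline.create( pipelineOptions );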
Example #3
Source File: TranslationUtils.java, from the beam project (Apache License 2.0)

/**
 * Reject state and timers {@link DoFn}.
 *
 * @param doFn the {@link DoFn} to possibly reject.
 */
public static void rejectStateAndTimers(DoFn<?, ?> doFn) {
  DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());

  if (signature.stateDeclarations().size() > 0) {
    throw new UnsupportedOperationException(
        String.format(
            "Found %s annotations on %s, but %s cannot yet be used with state in the %s.",
            DoFn.StateId.class.getSimpleName(),
            doFn.getClass().getName(),
            DoFn.class.getSimpleName(),
            SparkRunner.class.getSimpleName()));
  }

  if (signature.timerDeclarations().size() > 0
      || signature.timerFamilyDeclarations().size() > 0) {
    throw new UnsupportedOperationException(
        String.format(
            "Found %s annotations on %s, but %s cannot yet be used with timers in the %s.",
            DoFn.TimerId.class.getSimpleName(),
            doFn.getClass().getName(),
            DoFn.class.getSimpleName(),
            SparkRunner.class.getSimpleName()));
  }
}
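For context, a stateful DoFn such as the following would trip the first check, since DoFnSignatures picks up its @StateId declaration. This fragment is an illustrative assumption, not code from the Beam source above:

// Passing an instance of this DoFn to rejectStateAndTimers(...) throws
// UnsupportedOperationException because of the @StateId declaration.
static class CountFn extends DoFn<KV<String, Integer>, Integer> {

  @StateId("count")
  private final StateSpec<ValueState<Integer>> countSpec = StateSpecs.value(VarIntCoder.of());

  @ProcessElement
  public void processElement(
      ProcessContext context, @StateId("count") ValueState<Integer> count) {
    Integer current = count.read();
    int next = (current == null ? 0 : current) + 1;
    count.write(next);
    context.output(next);
  }
}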
Example #4
Source File: TrackStreamingSourcesTest.java, from the beam project (Apache License 2.0)

@Test
public void testTrackSingle() {
  options.setRunner(SparkRunner.class);
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  JavaStreamingContext jssc =
      new JavaStreamingContext(
          jsc, new org.apache.spark.streaming.Duration(options.getBatchIntervalMillis()));

  Pipeline p = Pipeline.create(options);

  CreateStream<Integer> emptyStream =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();

  p.apply(emptyStream).apply(ParDo.of(new PassthroughFn<>()));

  p.traverseTopologically(new StreamingSourceTracker(jssc, p, ParDo.MultiOutput.class, 0));
  assertThat(StreamingSourceTracker.numAssertions, equalTo(1));
}
Example #5
Source File: TrackStreamingSourcesTest.java, from the beam project (Apache License 2.0)

@Test
public void testTrackFlattened() {
  options.setRunner(SparkRunner.class);
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  JavaStreamingContext jssc =
      new JavaStreamingContext(
          jsc, new org.apache.spark.streaming.Duration(options.getBatchIntervalMillis()));

  Pipeline p = Pipeline.create(options);

  CreateStream<Integer> queueStream1 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();
  CreateStream<Integer> queueStream2 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();

  PCollection<Integer> pcol1 = p.apply(queueStream1);
  PCollection<Integer> pcol2 = p.apply(queueStream2);
  PCollection<Integer> flattened =
      PCollectionList.of(pcol1).and(pcol2).apply(Flatten.pCollections());
  flattened.apply(ParDo.of(new PassthroughFn<>()));

  p.traverseTopologically(new StreamingSourceTracker(jssc, p, ParDo.MultiOutput.class, 0, 1));
  assertThat(StreamingSourceTracker.numAssertions, equalTo(1));
}
Example #6
Source File: TransMetaPipelineConverter.java, from the kettle-beam project (Apache License 2.0)

public static Class<? extends PipelineRunner<?>> getPipelineRunnerClass( RunnerType runnerType ) throws KettleException {
  if ( runnerType == null ) {
    throw new KettleException( "Please specify a valid runner type" );
  }
  switch ( runnerType ) {
    case Direct:
      return DirectRunner.class;
    case Flink:
      return FlinkRunner.class;
    case Spark:
      return SparkRunner.class;
    case DataFlow:
      return DataflowRunner.class;
    default:
      throw new KettleException( "Unsupported runner type: " + runnerType.name() );
  }
}
Example #7
Source File: KettleBeamPipelineExecutor.java, from the kettle-beam project (Apache License 2.0)

private PipelineResult asyncExecutePipeline( Pipeline pipeline ) throws KettleException {

  RunnerType runnerType = RunnerType.getRunnerTypeByName( transMeta.environmentSubstitute( jobConfig.getRunnerTypeName() ) );
  if ( runnerType == null ) {
    throw new KettleException( "Runner type '" + jobConfig.getRunnerTypeName() + "' is not recognized" );
  }
  switch ( runnerType ) {
    case Direct:
      return DirectRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
    case Flink:
      return FlinkRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
    case DataFlow:
      return DataflowRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
    case Spark:
      return SparkRunner.fromOptions( pipeline.getOptions() ).run( pipeline );
    default:
      throw new KettleException( "Execution on runner '" + runnerType.name() + "' is not supported yet, sorry." );
  }
}
Example #8
Source File: SparkRunnerStreamingContextFactory.java, from the beam project (Apache License 2.0)

@Override
public JavaStreamingContext call() throws Exception {
  LOG.info("Creating a new Spark Streaming Context");
  // Validate unbounded read properties.
  checkArgument(
      options.getMinReadTimeMillis() < options.getBatchIntervalMillis(),
      "Minimum read time has to be less than batch time.");
  checkArgument(
      options.getReadTimePercentage() > 0 && options.getReadTimePercentage() < 1,
      "Read time percentage is bound to (0, 1).");

  SparkPipelineTranslator translator =
      new StreamingTransformTranslator.Translator(new TransformTranslator.Translator());
  Duration batchDuration = new Duration(options.getBatchIntervalMillis());
  LOG.info("Setting Spark streaming batchDuration to {} msec", batchDuration.milliseconds());

  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  JavaStreamingContext jssc = new JavaStreamingContext(jsc, batchDuration);

  // We must first init accumulators since translators expect them to be instantiated.
  SparkRunner.initAccumulators(options, jsc);
  // No need to create a MetricsPusher instance here because it is created in SparkRunner.run().
  EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options, jssc);
  // Update cache candidates.
  SparkRunner.updateCacheCandidates(pipeline, translator, ctxt);
  pipeline.traverseTopologically(new SparkRunner.Evaluator(translator, ctxt));
  ctxt.computeOutputs();

  checkpoint(jssc, checkpointDir);

  return jssc;
}
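This factory implements the creation callback that Spark Streaming uses for checkpoint recovery: Spark either rebuilds the context from the checkpoint directory or invokes call() above to create a fresh one. A hedged sketch of that wiring (the constructor arguments and variable names are assumptions based on the fields used above):

// Recover the streaming context from the checkpoint if one exists; otherwise
// the factory's call() builds, translates, and checkpoints a fresh context.
SparkRunnerStreamingContextFactory factory =
    new SparkRunnerStreamingContextFactory(pipeline, options, checkpointDir);
JavaStreamingContext jssc =
    JavaStreamingContext.getOrCreate(checkpointDirPath, factory);
jssc.start();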
Example #9
Source File: TrackStreamingSourcesTest.java, from the beam project (Apache License 2.0)

private StreamingSourceTracker(
    JavaStreamingContext jssc,
    Pipeline pipeline,
    Class<? extends PTransform> transformClassToAssert,
    Integer... expected) {
  this.ctxt = new EvaluationContext(jssc.sparkContext(), pipeline, options, jssc);
  this.evaluator =
      new SparkRunner.Evaluator(
          new StreamingTransformTranslator.Translator(new TransformTranslator.Translator()), ctxt);
  this.transformClassToAssert = transformClassToAssert;
  this.expected = expected;
}
Example #10
Source File: SparkRunnerTestUtils.java, from the components project (Apache License 2.0)

public Pipeline createPipeline() {
  SparkContextOptions sparkOpts = options.as(SparkContextOptions.class);
  sparkOpts.setFilesToStage(emptyList());

  SparkConf conf = new SparkConf();
  conf.setAppName(appName);
  conf.setMaster("local[2]");
  conf.set("spark.driver.allowMultipleContexts", "true");
  JavaSparkContext jsc = new JavaSparkContext(new SparkContext(conf));
  sparkOpts.setProvidedSparkContext(jsc);
  sparkOpts.setUsesProvidedSparkContext(true);
  sparkOpts.setRunner(SparkRunner.class);

  return Pipeline.create(sparkOpts);
}
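Because the options carry a provided SparkContext, any Pipeline created this way executes on that embedded local[2] context instead of spawning its own. A minimal usage sketch (the instance name, constructor, and trivial transform are illustrative assumptions):

SparkRunnerTestUtils testUtils = new SparkRunnerTestUtils(appName);  // assumed constructor
Pipeline pipeline = testUtils.createPipeline();
pipeline.apply(Create.of("a", "b", "c"));
// Runs on the provided SparkContext configured in createPipeline().
pipeline.run().waitUntilFinish();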
Example #11
Source File: S3SparkRuntimeTestIT.java, from the components project (Apache License 2.0)

@Before
public void setupLazyAvroCoder() {
  options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
  options.setRunner(SparkRunner.class);
  options.setSparkMaster("local");
  options.setStreaming(false);
  pWrite = Pipeline.create(options);
  pRead = Pipeline.create(options);
}
Example #12
Source File: SparkIntegrationTestResource.java, from the components project (Apache License 2.0)

/**
 * @return the options used to create this pipeline. These can be changed before the Pipeline is created.
 */
public SparkContextOptions getOptions() {
  if (options == null) {
    options = PipelineOptionsFactory.as(SparkContextOptions.class);
    options.setRunner(SparkRunner.class);
    // Useless for us and broken on Java > 8 with Beam <= 2.10.0.
    options.setFilesToStage(emptyList());
  }
  return options;
}