org.apache.beam.runners.spark.SparkPipelineOptions Java Examples
The following examples show how to use
org.apache.beam.runners.spark.SparkPipelineOptions.
Each example notes its source file, originating project, and license above the code.
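For orientation before the examples, here is a minimal sketch of obtaining SparkPipelineOptions and running a trivial pipeline on the Spark runner. The class name, the local[2] master URL, and the Create.of input are illustrative assumptions rather than code taken from any project below.

import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.runners.spark.SparkRunner;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

public class SparkPipelineOptionsSketch {
  public static void main(String[] args) {
    // Build SparkPipelineOptions from command-line args (or call PipelineOptionsFactory.as(...) directly).
    SparkPipelineOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(SparkPipelineOptions.class);
    options.setRunner(SparkRunner.class);
    options.setSparkMaster("local[2]"); // illustrative: embedded local Spark master

    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply(Create.of("a", "b", "c")); // placeholder transform so the pipeline has content
    pipeline.run().waitUntilFinish();
  }
}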
Example #1
Source File: SparkRunnerKryoRegistratorTest.java (from beam, Apache License 2.0)
private void runSimplePipelineWithSparkContext(SparkConf conf) {
  SparkPipelineOptions options = PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
  options.setRunner(TestSparkRunner.class);

  conf.set("spark.master", "local");
  conf.setAppName("test");
  JavaSparkContext javaSparkContext = new JavaSparkContext(conf);
  options.setUsesProvidedSparkContext(true);
  options.as(SparkContextOptions.class).setProvidedSparkContext(javaSparkContext);
  Pipeline p = Pipeline.create(options);
  p.apply(Create.of("a")); // some operation to trigger pipeline construction
  p.run().waitUntilFinish();
  javaSparkContext.stop();
}
Example #2
Source File: SourceRDD.java (from beam, Apache License 2.0)
public Bounded(
    SparkContext sc,
    BoundedSource<T> source,
    SerializablePipelineOptions options,
    String stepName) {
  super(sc, NIL, JavaSparkContext$.MODULE$.fakeClassTag());
  this.source = source;
  this.options = options;
  // the input parallelism is determined by Spark's scheduler backend.
  // when running on YARN/SparkDeploy it's the result of max(totalCores, 2).
  // when running on Mesos it's 8.
  // when running local it's the total number of cores (local = 1, local[N] = N,
  // local[*] = estimation of the machine's cores).
  // ** the configuration "spark.default.parallelism" takes precedence over all of the above **
  this.numPartitions = sc.defaultParallelism();
  checkArgument(this.numPartitions > 0, "Number of partitions must be greater than zero.");
  this.bundleSize = options.get().as(SparkPipelineOptions.class).getBundleSize();
  this.stepName = stepName;
  this.metricsAccum = MetricsAccumulator.getInstance();
}
Example #3
Source File: SparkContextFactory.java (from beam, Apache License 2.0)
public static synchronized JavaSparkContext getSparkContext(SparkPipelineOptions options) {
  SparkContextOptions contextOptions = options.as(SparkContextOptions.class);
  usesProvidedSparkContext = contextOptions.getUsesProvidedSparkContext();
  // reuse should be ignored if the context is provided.
  if (Boolean.getBoolean(TEST_REUSE_SPARK_CONTEXT) && !usesProvidedSparkContext) {
    // if the context is null or stopped for some reason, re-create it.
    if (sparkContext == null || sparkContext.sc().isStopped()) {
      sparkContext = createSparkContext(contextOptions);
      sparkMaster = options.getSparkMaster();
    } else if (!options.getSparkMaster().equals(sparkMaster)) {
      throw new IllegalArgumentException(
          String.format(
              "Cannot reuse spark context "
                  + "with different spark master URL. Existing: %s, requested: %s.",
              sparkMaster, options.getSparkMaster()));
    }
    return sparkContext;
  } else {
    return createSparkContext(contextOptions);
  }
}
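The provided-context branch above works together with SparkContextOptions, which Example #1 also exercises. A minimal sketch of handing an existing JavaSparkContext to the runner through the options could look like the following; the SparkConf settings, class name, and helper method name are illustrative assumptions.

import org.apache.beam.runners.spark.SparkContextOptions;
import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class ProvidedContextSketch {

  // Builds SparkPipelineOptions that reuse an externally managed JavaSparkContext.
  static SparkPipelineOptions optionsWithProvidedContext(JavaSparkContext providedContext) {
    SparkPipelineOptions options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
    options.setUsesProvidedSparkContext(true);
    // SparkContextOptions extends SparkPipelineOptions and carries the actual context instance.
    options.as(SparkContextOptions.class).setProvidedSparkContext(providedContext);
    return options;
  }

  public static void main(String[] args) {
    // Illustrative local context; in practice this would come from the hosting application.
    SparkConf conf = new SparkConf().setMaster("local").setAppName("provided-context-sketch");
    JavaSparkContext providedContext = new JavaSparkContext(conf);
    SparkPipelineOptions options = optionsWithProvidedContext(providedContext);
    // ... create and run a Pipeline with these options, then stop the context yourself.
    providedContext.stop();
  }
}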
Example #4
Source File: SparkBatchPortablePipelineTranslator.java (from beam, Apache License 2.0)
@Nullable
private static Partitioner getPartitioner(SparkTranslationContext context) {
  Long bundleSize =
      context.serializablePipelineOptions.get().as(SparkPipelineOptions.class).getBundleSize();
  return (bundleSize > 0)
      ? null
      : new HashPartitioner(context.getSparkContext().defaultParallelism());
}
Example #5
Source File: S3SparkRuntimeTestIT.java (from components, Apache License 2.0)
@Before
public void setupLazyAvroCoder() {
  options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
  options.setRunner(SparkRunner.class);
  options.setSparkMaster("local");
  options.setStreaming(false);
  pWrite = Pipeline.create(options);
  pRead = Pipeline.create(options);
}
Example #6
Source File: SparkMetricsSinkTest.java (from beam, Apache License 2.0)
@Category(StreamingTest.class)
@Test
public void testInStreamingMode() throws Exception {
  assertThat(InMemoryMetrics.valueOf("emptyLines"), is(nullValue()));

  Instant instant = new Instant(0);
  CreateStream<String> source =
      CreateStream.of(
              StringUtf8Coder.of(),
              Duration.millis(
                  (pipeline.getOptions().as(SparkPipelineOptions.class))
                      .getBatchIntervalMillis()))
          .emptyBatch()
          .advanceWatermarkForNextBatch(instant)
          .nextBatch(
              TimestampedValue.of(WORDS.get(0), instant),
              TimestampedValue.of(WORDS.get(1), instant),
              TimestampedValue.of(WORDS.get(2), instant))
          .advanceWatermarkForNextBatch(instant.plus(Duration.standardSeconds(2L)))
          .nextBatch(
              TimestampedValue.of(WORDS.get(3), instant.plus(Duration.standardSeconds(1L))),
              TimestampedValue.of(WORDS.get(4), instant.plus(Duration.standardSeconds(1L))),
              TimestampedValue.of(WORDS.get(5), instant.plus(Duration.standardSeconds(1L))))
          .advanceNextBatchWatermarkToInfinity();
  PCollection<String> output =
      pipeline
          .apply(source)
          .apply(
              Window.<String>into(FixedWindows.of(Duration.standardSeconds(3L)))
                  .withAllowedLateness(Duration.ZERO))
          .apply(new WordCount.CountWords())
          .apply(MapElements.via(new WordCount.FormatAsTextFn()));
  PAssert.that(output).containsInAnyOrder(EXPECTED_COUNTS);

  pipeline.run();

  assertThat(InMemoryMetrics.<Double>valueOf("emptyLines"), is(1d));
}
Example #7
Source File: SourceDStream.java (from beam, Apache License 2.0)
SourceDStream(
    StreamingContext ssc,
    UnboundedSource<T, CheckpointMarkT> unboundedSource,
    SerializablePipelineOptions options,
    Long boundMaxRecords) {
  super(ssc, JavaSparkContext$.MODULE$.fakeClassTag());
  this.unboundedSource = unboundedSource;
  this.options = options;

  SparkPipelineOptions sparkOptions = options.get().as(SparkPipelineOptions.class);

  // Reader cache expiration interval. 50% of batch interval is added to accommodate latency.
  this.readerCacheInterval = 1.5 * sparkOptions.getBatchIntervalMillis();

  this.boundReadDuration =
      boundReadDuration(sparkOptions.getReadTimePercentage(), sparkOptions.getMinReadTimeMillis());
  // set initial parallelism once.
  this.initialParallelism = ssc().sparkContext().defaultParallelism();
  checkArgument(this.initialParallelism > 0, "Number of partitions must be greater than zero.");

  this.boundMaxRecords = boundMaxRecords;

  try {
    this.numPartitions = createMicrobatchSource().split(sparkOptions).size();
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
Example #8
Source File: AggregatorsAccumulator.java (from beam, Apache License 2.0)
/** Init aggregators accumulator if it has not been initiated. This method is idempotent. */
public static void init(SparkPipelineOptions opts, JavaSparkContext jsc) {
  if (instance == null) {
    synchronized (AggregatorsAccumulator.class) {
      if (instance == null) {
        Optional<CheckpointDir> maybeCheckpointDir =
            opts.isStreaming()
                ? Optional.of(new CheckpointDir(opts.getCheckpointDir()))
                : Optional.absent();
        NamedAggregators namedAggregators = new NamedAggregators();
        NamedAggregatorsAccumulator accumulator =
            new NamedAggregatorsAccumulator(namedAggregators);
        jsc.sc().register(accumulator, ACCUMULATOR_NAME);

        if (maybeCheckpointDir.isPresent()) {
          Optional<NamedAggregators> maybeRecoveredValue =
              recoverValueFromCheckpoint(jsc, maybeCheckpointDir.get());
          if (maybeRecoveredValue.isPresent()) {
            accumulator = new NamedAggregatorsAccumulator(maybeRecoveredValue.get());
          }
        }
        instance = accumulator;
      }
    }
    LOG.info("Instantiated aggregators accumulator: " + instance.value());
  }
}
Example #9
Source File: SparkTranslationContext.java (from beam, Apache License 2.0)
/** Add output of transform to context. */
public void pushDataset(String pCollectionId, Dataset dataset) {
  dataset.setName(pCollectionId);
  SparkPipelineOptions sparkOptions =
      serializablePipelineOptions.get().as(SparkPipelineOptions.class);
  if (!sparkOptions.isCacheDisabled() && consumptionCount.getOrDefault(pCollectionId, 0) > 1) {
    String storageLevel = sparkOptions.getStorageLevel();
    @Nullable Coder coder = coderMap.get(pCollectionId);
    dataset.cache(storageLevel, coder);
  }
  datasets.put(pCollectionId, dataset);
  leaves.add(dataset);
}
Example #10
Source File: TransformTranslator.java (from beam, Apache License 2.0)
@Nullable
private static Partitioner getPartitioner(EvaluationContext context) {
  Long bundleSize =
      context.getSerializableOptions().get().as(SparkPipelineOptions.class).getBundleSize();
  return (bundleSize > 0)
      ? null
      : new HashPartitioner(context.getSparkContext().defaultParallelism());
}
Example #11
Source File: MetricsAccumulator.java (from beam, Apache License 2.0)
/** Init metrics accumulator if it has not been initiated. This method is idempotent. */
public static void init(SparkPipelineOptions opts, JavaSparkContext jsc) {
  if (instance == null) {
    synchronized (MetricsAccumulator.class) {
      if (instance == null) {
        Optional<CheckpointDir> maybeCheckpointDir =
            opts.isStreaming()
                ? Optional.of(new CheckpointDir(opts.getCheckpointDir()))
                : Optional.absent();
        MetricsContainerStepMap metricsContainerStepMap = new MetricsContainerStepMap();
        MetricsContainerStepMapAccumulator accumulator =
            new MetricsContainerStepMapAccumulator(metricsContainerStepMap);
        jsc.sc().register(accumulator, ACCUMULATOR_NAME);

        if (maybeCheckpointDir.isPresent()) {
          Optional<MetricsContainerStepMap> maybeRecoveredValue =
              recoverValueFromCheckpoint(jsc, maybeCheckpointDir.get());
          if (maybeRecoveredValue.isPresent()) {
            accumulator = new MetricsContainerStepMapAccumulator(maybeRecoveredValue.get());
          }
        }
        instance = accumulator;
      }
    }
    LOG.info("Instantiated metrics accumulator: " + instance.value());
  } else {
    instance.reset();
  }
}
Example #12
Source File: EvaluationContext.java (from beam, Apache License 2.0)
public String storageLevel() {
  return serializableOptions.get().as(SparkPipelineOptions.class).getStorageLevel();
}
Example #13
Source File: SparkRunnerStreamingContextFactory.java (from beam, Apache License 2.0)
public SparkRunnerStreamingContextFactory(
    Pipeline pipeline, SparkPipelineOptions options, CheckpointDir checkpointDir) {
  this.pipeline = pipeline;
  this.options = options;
  this.checkpointDir = checkpointDir;
}
Example #14
Source File: BeamSparkPipelineRunConfiguration.java (from hop, Apache License 2.0)
@Override
public PipelineOptions getPipelineOptions() throws HopException {
  SparkPipelineOptions options = PipelineOptionsFactory.as( SparkPipelineOptions.class );

  if ( StringUtils.isNotEmpty( getSparkMaster() ) ) {
    options.setSparkMaster( environmentSubstitute( getSparkMaster() ) );
  }
  if ( StringUtils.isNotEmpty( getSparkBatchIntervalMillis() ) ) {
    long interval = Const.toLong( environmentSubstitute( getSparkBatchIntervalMillis() ), -1L );
    if ( interval >= 0 ) {
      options.setBatchIntervalMillis( interval );
    }
  }
  if ( StringUtils.isNotEmpty( getSparkCheckpointDir() ) ) {
    options.setCheckpointDir( environmentSubstitute( getSparkCheckpointDir() ) );
  }
  if ( StringUtils.isNotEmpty( getSparkCheckpointDurationMillis() ) ) {
    long duration = Const.toLong( environmentSubstitute( getSparkCheckpointDurationMillis() ), -1L );
    if ( duration >= 0 ) {
      options.setCheckpointDurationMillis( duration );
    }
  }
  if ( StringUtils.isNotEmpty( getSparkMaxRecordsPerBatch() ) ) {
    long records = Const.toLong( environmentSubstitute( getSparkMaxRecordsPerBatch() ), -1L );
    if ( records >= 0 ) {
      options.setMaxRecordsPerBatch( records );
    }
  }
  if ( StringUtils.isNotEmpty( getSparkMinReadTimeMillis() ) ) {
    long readTime = Const.toLong( environmentSubstitute( getSparkMinReadTimeMillis() ), -1L );
    if ( readTime >= 0 ) {
      options.setMinReadTimeMillis( readTime );
    }
  }
  if ( StringUtils.isNotEmpty( getSparkReadTimePercentage() ) ) {
    double percentage = Const.toDouble( environmentSubstitute( getSparkReadTimePercentage() ), -1.0 );
    if ( percentage >= 0 ) {
      options.setReadTimePercentage( percentage / 100 );
    }
  }
  if ( StringUtils.isNotEmpty( getSparkBundleSize() ) ) {
    long bundleSize = Const.toLong( environmentSubstitute( getSparkBundleSize() ), -1L );
    if ( bundleSize >= 0 ) {
      options.setBundleSize( bundleSize );
    }
  }
  if ( StringUtils.isNotEmpty( getSparkStorageLevel() ) ) {
    options.setStorageLevel( environmentSubstitute( getSparkStorageLevel() ) );
  }
  if ( StringUtils.isNotEmpty( getFatJar() ) ) {
    options.setFilesToStage( Arrays.asList( fatJar ) );
  }

  return options;
}
Example #15
Source File: SparkGroupAlsoByWindowViaWindowSet.java (from beam, Apache License 2.0)
private static Long getBatchDuration(final SerializablePipelineOptions options) {
  return options.get().as(SparkPipelineOptions.class).getCheckpointDurationMillis();
}
Example #16
Source File: SparkUnboundedSource.java (from beam, Apache License 2.0)
public static <T, CheckpointMarkT extends CheckpointMark> UnboundedDataset<T> read(
    JavaStreamingContext jssc,
    SerializablePipelineOptions rc,
    UnboundedSource<T, CheckpointMarkT> source,
    String stepName) {

  SparkPipelineOptions options = rc.get().as(SparkPipelineOptions.class);
  Long maxRecordsPerBatch = options.getMaxRecordsPerBatch();
  SourceDStream<T, CheckpointMarkT> sourceDStream =
      new SourceDStream<>(jssc.ssc(), source, rc, maxRecordsPerBatch);

  JavaPairInputDStream<Source<T>, CheckpointMarkT> inputDStream =
      JavaPairInputDStream$.MODULE$.fromInputDStream(
          sourceDStream,
          JavaSparkContext$.MODULE$.fakeClassTag(),
          JavaSparkContext$.MODULE$.fakeClassTag());

  // call mapWithState to read from a checkpointable sources.
  JavaMapWithStateDStream<
          Source<T>, CheckpointMarkT, Tuple2<byte[], Instant>, Tuple2<Iterable<byte[]>, Metadata>>
      mapWithStateDStream =
          inputDStream.mapWithState(
              StateSpec.function(
                      StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(rc, stepName))
                  .numPartitions(sourceDStream.getNumPartitions()));

  // set checkpoint duration for read stream, if set.
  checkpointStream(mapWithStateDStream, options);

  // report the number of input elements for this InputDStream to the InputInfoTracker.
  int id = inputDStream.inputDStream().id();
  JavaDStream<Metadata> metadataDStream = mapWithStateDStream.map(new Tuple2MetadataFunction());

  // register ReadReportDStream to report information related to this read.
  new ReadReportDStream(metadataDStream.dstream(), id, getSourceName(source, id), stepName)
      .register();

  // output the actual (deserialized) stream.
  WindowedValue.FullWindowedValueCoder<T> coder =
      WindowedValue.FullWindowedValueCoder.of(
          source.getOutputCoder(), GlobalWindow.Coder.INSTANCE);
  JavaDStream<WindowedValue<T>> readUnboundedStream =
      mapWithStateDStream
          .flatMap(new Tuple2byteFlatMapFunction())
          .map(CoderHelpers.fromByteFunction(coder));
  return new UnboundedDataset<>(readUnboundedStream, Collections.singletonList(id));
}
Example #17
Source File: SparkUnboundedSource.java (from beam, Apache License 2.0)
private static void checkpointStream(JavaDStream<?> dStream, SparkPipelineOptions options) {
  long checkpointDurationMillis = options.getCheckpointDurationMillis();
  if (checkpointDurationMillis > 0) {
    dStream.checkpoint(new Duration(checkpointDurationMillis));
  }
}
Example #18
Source File: SparkMetricsPusherTest.java (from beam, Apache License 2.0)
private Duration batchDuration() {
  return Duration.millis(
      (pipeline.getOptions().as(SparkPipelineOptions.class)).getBatchIntervalMillis());
}
Example #19
Source File: CreateStreamTest.java (from beam, Apache License 2.0)
private Duration batchDuration() {
  return Duration.millis(
      (p.getOptions().as(SparkPipelineOptions.class)).getBatchIntervalMillis());
}
Example #20
Source File: SparkCoGroupByKeyStreamingTest.java (from beam, Apache License 2.0)
private Duration batchDuration() {
  return Duration.millis(
      (pipeline.getOptions().as(SparkPipelineOptions.class)).getBatchIntervalMillis());
}
Example #21
Source File: KettleBeamPipelineExecutor.java (from kettle-beam, Apache License 2.0)
private void configureSparkOptions( BeamJobConfig config, SparkPipelineOptions options, VariableSpace space, String transformationName ) throws IOException {

  // options.setFilesToStage( BeamConst.findLibraryFilesToStage( null, config.getPluginsToStage(), true, true ) );

  if ( StringUtils.isNotEmpty( config.getSparkMaster() ) ) {
    options.setSparkMaster( space.environmentSubstitute( config.getSparkMaster() ) );
  }
  if ( StringUtils.isNotEmpty( config.getSparkBatchIntervalMillis() ) ) {
    long interval = Const.toLong( space.environmentSubstitute( config.getSparkBatchIntervalMillis() ), -1L );
    if ( interval >= 0 ) {
      options.setBatchIntervalMillis( interval );
    }
  }
  if ( StringUtils.isNotEmpty( config.getSparkCheckpointDir() ) ) {
    options.setCheckpointDir( space.environmentSubstitute( config.getSparkCheckpointDir() ) );
  }
  if ( StringUtils.isNotEmpty( config.getSparkCheckpointDurationMillis() ) ) {
    long duration = Const.toLong( space.environmentSubstitute( config.getSparkCheckpointDurationMillis() ), -1L );
    if ( duration >= 0 ) {
      options.setCheckpointDurationMillis( duration );
    }
  }
  if ( StringUtils.isNotEmpty( config.getSparkMaxRecordsPerBatch() ) ) {
    long records = Const.toLong( space.environmentSubstitute( config.getSparkMaxRecordsPerBatch() ), -1L );
    if ( records >= 0 ) {
      options.setMaxRecordsPerBatch( records );
    }
  }
  if ( StringUtils.isNotEmpty( config.getSparkMinReadTimeMillis() ) ) {
    long readTime = Const.toLong( space.environmentSubstitute( config.getSparkMinReadTimeMillis() ), -1L );
    if ( readTime >= 0 ) {
      options.setMinReadTimeMillis( readTime );
    }
  }
  if ( StringUtils.isNotEmpty( config.getSparkReadTimePercentage() ) ) {
    double percentage = Const.toDouble( space.environmentSubstitute( config.getSparkReadTimePercentage() ), -1.0 );
    if ( percentage >= 0 ) {
      options.setReadTimePercentage( percentage / 100 );
    }
  }
  if ( StringUtils.isNotEmpty( config.getSparkBundleSize() ) ) {
    long bundleSize = Const.toLong( space.environmentSubstitute( config.getSparkBundleSize() ), -1L );
    if ( bundleSize >= 0 ) {
      options.setBundleSize( bundleSize );
    }
  }
  if ( StringUtils.isNotEmpty( config.getSparkStorageLevel() ) ) {
    options.setStorageLevel( space.environmentSubstitute( config.getSparkStorageLevel() ) );
  }
  String appName = transformationName.replace( " ", "_" );
  options.setAppName( appName );
}
Example #22
Source File: KettleBeamPipelineExecutor.java (from kettle-beam, Apache License 2.0)
public Pipeline getPipeline( TransMeta transMeta, BeamJobConfig config ) throws KettleException {

  try {
    if ( StringUtils.isEmpty( config.getRunnerTypeName() ) ) {
      throw new KettleException( "You need to specify a runner type, one of : " + RunnerType.values().toString() );
    }
    PipelineOptions pipelineOptions = null;
    VariableSpace space = transMeta;

    RunnerType runnerType = RunnerType.getRunnerTypeByName( transMeta.environmentSubstitute( config.getRunnerTypeName() ) );
    switch ( runnerType ) {
      case Direct:
        pipelineOptions = PipelineOptionsFactory.create();
        break;
      case DataFlow:
        DataflowPipelineOptions dfOptions = PipelineOptionsFactory.as( DataflowPipelineOptions.class );
        configureDataFlowOptions( config, dfOptions, space );
        pipelineOptions = dfOptions;
        break;
      case Spark:
        SparkPipelineOptions sparkOptions;
        if ( sparkContext != null ) {
          SparkContextOptions sparkContextOptions = PipelineOptionsFactory.as( SparkContextOptions.class );
          sparkContextOptions.setProvidedSparkContext( sparkContext );
          sparkOptions = sparkContextOptions;
        } else {
          sparkOptions = PipelineOptionsFactory.as( SparkPipelineOptions.class );
        }
        configureSparkOptions( config, sparkOptions, space, transMeta.getName() );
        pipelineOptions = sparkOptions;
        break;
      case Flink:
        FlinkPipelineOptions flinkOptions = PipelineOptionsFactory.as( FlinkPipelineOptions.class );
        configureFlinkOptions( config, flinkOptions, space );
        pipelineOptions = flinkOptions;
        break;
      default:
        throw new KettleException( "Sorry, this isn't implemented yet" );
    }

    configureStandardOptions( config, transMeta.getName(), pipelineOptions, space );

    setVariablesInTransformation( config, transMeta );

    TransMetaPipelineConverter converter;
    if ( stepPluginClasses != null && xpPluginClasses != null ) {
      converter = new TransMetaPipelineConverter( transMeta, metaStore, stepPluginClasses, xpPluginClasses, jobConfig );
    } else {
      converter = new TransMetaPipelineConverter( transMeta, metaStore, config.getPluginsToStage(), jobConfig );
    }
    Pipeline pipeline = converter.createPipeline( pipelineOptions );

    // Also set the pipeline options...
    // FileSystems.setDefaultPipelineOptions(pipelineOptions);

    return pipeline;
  } catch ( Exception e ) {
    throw new KettleException( "Error configuring local Beam Engine", e );
  }
}
Example #23
Source File: EvaluationContext.java (from beam, Apache License 2.0)
/**
 * Cache a PCollection if SparkPipelineOptions.isCacheDisabled is false, the transform isn't a
 * GroupByKey transformation, and the PCollection is used more than once in the Pipeline.
 *
 * <p>PCollection is not cached in GroupByKey transformation, because Spark automatically persists
 * some intermediate data in shuffle operations, even without users calling persist.
 *
 * @param pvalue output of transform
 * @param transform the transform to check
 * @return whether the PCollection will be cached
 */
public boolean shouldCache(PTransform<?, ? extends PValue> transform, PValue pvalue) {
  if (serializableOptions.get().as(SparkPipelineOptions.class).isCacheDisabled()
      || transform instanceof GroupByKey) {
    return false;
  }
  return pvalue instanceof PCollection && cacheCandidates.getOrDefault(pvalue, 0L) > 1;
}
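The caching decision above is driven entirely by pipeline options. As a rough sketch, disabling the cache and choosing a storage level might look like the following; the setCacheDisabled setter is assumed from the isCacheDisabled getter used in Examples #9 and #23, and MEMORY_ONLY is just an illustrative Spark storage level name.

import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class CacheOptionsSketch {
  public static void main(String[] args) {
    SparkPipelineOptions options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
    // Assumed setter paired with the isCacheDisabled() getter above; turns off dataset caching.
    options.setCacheDisabled(true);
    // Storage level consulted when caching is enabled (see Examples #9 and #12).
    options.setStorageLevel("MEMORY_ONLY");
    System.out.println(options.isCacheDisabled() + " / " + options.getStorageLevel());
  }
}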