org.apache.beam.runners.spark.SparkPipelineOptions Java Examples
The following examples show how to use
org.apache.beam.runners.spark.SparkPipelineOptions.
Each example notes its source file, originating project, and license above the code.
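For orientation before the examples, here is a minimal sketch of obtaining SparkPipelineOptions and running a trivial pipeline on the Spark runner. The class name, the local[2] master URL, and the Create.of input are illustrative assumptions rather than code taken from any project below.

import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.runners.spark.SparkRunner;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

public class SparkPipelineOptionsSketch {
  public static void main(String[] args) {
    // Build SparkPipelineOptions from command-line args (or call PipelineOptionsFactory.as(...) directly).
    SparkPipelineOptions options =
        PipelineOptionsFactory.fromArgs(args).withValidation().as(SparkPipelineOptions.class);
    options.setRunner(SparkRunner.class);
    options.setSparkMaster("local[2]"); // illustrative: embedded local Spark master

    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply(Create.of("a", "b", "c")); // placeholder transform so the pipeline has content
    pipeline.run().waitUntilFinish();
  }
}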
Example #1
Source File: SparkRunnerKryoRegistratorTest.java (from beam, Apache License 2.0)
private void runSimplePipelineWithSparkContext(SparkConf conf) {
  SparkPipelineOptions options = PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
  options.setRunner(TestSparkRunner.class);

  conf.set("spark.master", "local");
  conf.setAppName("test");
  JavaSparkContext javaSparkContext = new JavaSparkContext(conf);
  options.setUsesProvidedSparkContext(true);
  options.as(SparkContextOptions.class).setProvidedSparkContext(javaSparkContext);
  Pipeline p = Pipeline.create(options);
  p.apply(Create.of("a")); // some operation to trigger pipeline construction
  p.run().waitUntilFinish();
  javaSparkContext.stop();
}
Example #2
Source File: SourceRDD.java (from beam, Apache License 2.0)
public Bounded(
    SparkContext sc,
    BoundedSource<T> source,
    SerializablePipelineOptions options,
    String stepName) {
  super(sc, NIL, JavaSparkContext$.MODULE$.fakeClassTag());
  this.source = source;
  this.options = options;
  // the input parallelism is determined by Spark's scheduler backend.
  // when running on YARN/SparkDeploy it's the result of max(totalCores, 2).
  // when running on Mesos it's 8.
  // when running local it's the total number of cores (local = 1, local[N] = N,
  // local[*] = estimation of the machine's cores).
  // ** the configuration "spark.default.parallelism" takes precedence over all of the above **
  this.numPartitions = sc.defaultParallelism();
  checkArgument(this.numPartitions > 0, "Number of partitions must be greater than zero.");
  this.bundleSize = options.get().as(SparkPipelineOptions.class).getBundleSize();
  this.stepName = stepName;
  this.metricsAccum = MetricsAccumulator.getInstance();
}
Example #3
Source File: SparkContextFactory.java (from beam, Apache License 2.0)
public static synchronized JavaSparkContext getSparkContext(SparkPipelineOptions options) {
  SparkContextOptions contextOptions = options.as(SparkContextOptions.class);
  usesProvidedSparkContext = contextOptions.getUsesProvidedSparkContext();
  // reuse should be ignored if the context is provided.
  if (Boolean.getBoolean(TEST_REUSE_SPARK_CONTEXT) && !usesProvidedSparkContext) {
    // if the context is null or stopped for some reason, re-create it.
    if (sparkContext == null || sparkContext.sc().isStopped()) {
      sparkContext = createSparkContext(contextOptions);
      sparkMaster = options.getSparkMaster();
    } else if (!options.getSparkMaster().equals(sparkMaster)) {
      throw new IllegalArgumentException(
          String.format(
              "Cannot reuse spark context "
                  + "with different spark master URL. Existing: %s, requested: %s.",
              sparkMaster, options.getSparkMaster()));
    }
    return sparkContext;
  } else {
    return createSparkContext(contextOptions);
  }
}
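The provided-context branch above works together with SparkContextOptions, which Example #1 also exercises. A minimal sketch of handing an existing JavaSparkContext to the runner through the options could look like the following; the SparkConf settings, class name, and helper method name are illustrative assumptions.

import org.apache.beam.runners.spark.SparkContextOptions;
import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class ProvidedContextSketch {

  // Builds SparkPipelineOptions that reuse an externally managed JavaSparkContext.
  static SparkPipelineOptions optionsWithProvidedContext(JavaSparkContext providedContext) {
    SparkPipelineOptions options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
    options.setUsesProvidedSparkContext(true);
    // SparkContextOptions extends SparkPipelineOptions and carries the actual context instance.
    options.as(SparkContextOptions.class).setProvidedSparkContext(providedContext);
    return options;
  }

  public static void main(String[] args) {
    // Illustrative local context; in practice this would come from the hosting application.
    SparkConf conf = new SparkConf().setMaster("local").setAppName("provided-context-sketch");
    JavaSparkContext providedContext = new JavaSparkContext(conf);
    SparkPipelineOptions options = optionsWithProvidedContext(providedContext);
    // ... create and run a Pipeline with these options, then stop the context yourself.
    providedContext.stop();
  }
}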
Example #4
Source File: SparkBatchPortablePipelineTranslator.java (from beam, Apache License 2.0)
@Nullable
private static Partitioner getPartitioner(SparkTranslationContext context) {
  Long bundleSize =
      context.serializablePipelineOptions.get().as(SparkPipelineOptions.class).getBundleSize();
  return (bundleSize > 0)
      ? null
      : new HashPartitioner(context.getSparkContext().defaultParallelism());
}
Example #5
Source File: S3SparkRuntimeTestIT.java (from components, Apache License 2.0)
@Before
public void setupLazyAvroCoder() {
  options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
  options.setRunner(SparkRunner.class);
  options.setSparkMaster("local");
  options.setStreaming(false);
  pWrite = Pipeline.create(options);
  pRead = Pipeline.create(options);
}
Example #6
Source File: SparkMetricsSinkTest.java (from beam, Apache License 2.0)
@Category(StreamingTest.class)
@Test
public void testInStreamingMode() throws Exception {
  assertThat(InMemoryMetrics.valueOf("emptyLines"), is(nullValue()));

  Instant instant = new Instant(0);
  CreateStream<String> source =
      CreateStream.of(
              StringUtf8Coder.of(),
              Duration.millis(
                  (pipeline.getOptions().as(SparkPipelineOptions.class))
                      .getBatchIntervalMillis()))
          .emptyBatch()
          .advanceWatermarkForNextBatch(instant)
          .nextBatch(
              TimestampedValue.of(WORDS.get(0), instant),
              TimestampedValue.of(WORDS.get(1), instant),
              TimestampedValue.of(WORDS.get(2), instant))
          .advanceWatermarkForNextBatch(instant.plus(Duration.standardSeconds(2L)))
          .nextBatch(
              TimestampedValue.of(WORDS.get(3), instant.plus(Duration.standardSeconds(1L))),
              TimestampedValue.of(WORDS.get(4), instant.plus(Duration.standardSeconds(1L))),
              TimestampedValue.of(WORDS.get(5), instant.plus(Duration.standardSeconds(1L))))
          .advanceNextBatchWatermarkToInfinity();
  PCollection<String> output =
      pipeline
          .apply(source)
          .apply(
              Window.<String>into(FixedWindows.of(Duration.standardSeconds(3L)))
                  .withAllowedLateness(Duration.ZERO))
          .apply(new WordCount.CountWords())
          .apply(MapElements.via(new WordCount.FormatAsTextFn()));
  PAssert.that(output).containsInAnyOrder(EXPECTED_COUNTS);

  pipeline.run();

  assertThat(InMemoryMetrics.<Double>valueOf("emptyLines"), is(1d));
}
Example #7
Source File: SourceDStream.java (from beam, Apache License 2.0)
SourceDStream(
    StreamingContext ssc,
    UnboundedSource<T, CheckpointMarkT> unboundedSource,
    SerializablePipelineOptions options,
    Long boundMaxRecords) {
  super(ssc, JavaSparkContext$.MODULE$.fakeClassTag());
  this.unboundedSource = unboundedSource;
  this.options = options;

  SparkPipelineOptions sparkOptions = options.get().as(SparkPipelineOptions.class);

  // Reader cache expiration interval. 50% of batch interval is added to accommodate latency.
  this.readerCacheInterval = 1.5 * sparkOptions.getBatchIntervalMillis();

  this.boundReadDuration =
      boundReadDuration(sparkOptions.getReadTimePercentage(), sparkOptions.getMinReadTimeMillis());
  // set initial parallelism once.
  this.initialParallelism = ssc().sparkContext().defaultParallelism();
  checkArgument(this.initialParallelism > 0, "Number of partitions must be greater than zero.");

  this.boundMaxRecords = boundMaxRecords;

  try {
    this.numPartitions = createMicrobatchSource().split(sparkOptions).size();
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
Example #8
Source File: AggregatorsAccumulator.java (from beam, Apache License 2.0)
/** Init aggregators accumulator if it has not been initiated. This method is idempotent. */
public static void init(SparkPipelineOptions opts, JavaSparkContext jsc) {
  if (instance == null) {
    synchronized (AggregatorsAccumulator.class) {
      if (instance == null) {
        Optional<CheckpointDir> maybeCheckpointDir =
            opts.isStreaming()
                ? Optional.of(new CheckpointDir(opts.getCheckpointDir()))
                : Optional.absent();
        NamedAggregators namedAggregators = new NamedAggregators();
        NamedAggregatorsAccumulator accumulator =
            new NamedAggregatorsAccumulator(namedAggregators);
        jsc.sc().register(accumulator, ACCUMULATOR_NAME);

        if (maybeCheckpointDir.isPresent()) {
          Optional<NamedAggregators> maybeRecoveredValue =
              recoverValueFromCheckpoint(jsc, maybeCheckpointDir.get());
          if (maybeRecoveredValue.isPresent()) {
            accumulator = new NamedAggregatorsAccumulator(maybeRecoveredValue.get());
          }
        }
        instance = accumulator;
      }
    }
    LOG.info("Instantiated aggregators accumulator: " + instance.value());
  }
}
Example #9
Source File: SparkTranslationContext.java (from beam, Apache License 2.0)
/** Add output of transform to context. */
public void pushDataset(String pCollectionId, Dataset dataset) {
  dataset.setName(pCollectionId);
  SparkPipelineOptions sparkOptions =
      serializablePipelineOptions.get().as(SparkPipelineOptions.class);
  if (!sparkOptions.isCacheDisabled() && consumptionCount.getOrDefault(pCollectionId, 0) > 1) {
    String storageLevel = sparkOptions.getStorageLevel();
    @Nullable Coder coder = coderMap.get(pCollectionId);
    dataset.cache(storageLevel, coder);
  }
  datasets.put(pCollectionId, dataset);
  leaves.add(dataset);
}
Example #10
Source File: TransformTranslator.java (from beam, Apache License 2.0)
@Nullable
private static Partitioner getPartitioner(EvaluationContext context) {
  Long bundleSize =
      context.getSerializableOptions().get().as(SparkPipelineOptions.class).getBundleSize();
  return (bundleSize > 0)
      ? null
      : new HashPartitioner(context.getSparkContext().defaultParallelism());
}
Example #11
Source File: MetricsAccumulator.java (from beam, Apache License 2.0)
/** Init metrics accumulator if it has not been initiated. This method is idempotent. */
public static void init(SparkPipelineOptions opts, JavaSparkContext jsc) {
  if (instance == null) {
    synchronized (MetricsAccumulator.class) {
      if (instance == null) {
        Optional<CheckpointDir> maybeCheckpointDir =
            opts.isStreaming()
                ? Optional.of(new CheckpointDir(opts.getCheckpointDir()))
                : Optional.absent();
        MetricsContainerStepMap metricsContainerStepMap = new MetricsContainerStepMap();
        MetricsContainerStepMapAccumulator accumulator =
            new MetricsContainerStepMapAccumulator(metricsContainerStepMap);
        jsc.sc().register(accumulator, ACCUMULATOR_NAME);

        if (maybeCheckpointDir.isPresent()) {
          Optional<MetricsContainerStepMap> maybeRecoveredValue =
              recoverValueFromCheckpoint(jsc, maybeCheckpointDir.get());
          if (maybeRecoveredValue.isPresent()) {
            accumulator = new MetricsContainerStepMapAccumulator(maybeRecoveredValue.get());
          }
        }
        instance = accumulator;
      }
    }
    LOG.info("Instantiated metrics accumulator: " + instance.value());
  } else {
    instance.reset();
  }
}
Example #12
Source File: EvaluationContext.java (from beam, Apache License 2.0)
public String storageLevel() {
  return serializableOptions.get().as(SparkPipelineOptions.class).getStorageLevel();
}
Example #13
Source File: SparkRunnerStreamingContextFactory.java (from beam, Apache License 2.0)
public SparkRunnerStreamingContextFactory(
    Pipeline pipeline, SparkPipelineOptions options, CheckpointDir checkpointDir) {
  this.pipeline = pipeline;
  this.options = options;
  this.checkpointDir = checkpointDir;
}
Example #14
Source File: BeamSparkPipelineRunConfiguration.java (from hop, Apache License 2.0)
@Override
public PipelineOptions getPipelineOptions() throws HopException {
  SparkPipelineOptions options = PipelineOptionsFactory.as( SparkPipelineOptions.class );

  if ( StringUtils.isNotEmpty( getSparkMaster() ) ) {
    options.setSparkMaster( environmentSubstitute( getSparkMaster() ) );
  }
  if ( StringUtils.isNotEmpty( getSparkBatchIntervalMillis() ) ) {
    long interval = Const.toLong( environmentSubstitute( getSparkBatchIntervalMillis() ), -1L );
    if ( interval >= 0 ) {
      options.setBatchIntervalMillis( interval );
    }
  }
  if ( StringUtils.isNotEmpty( getSparkCheckpointDir() ) ) {
    options.setCheckpointDir( environmentSubstitute( getSparkCheckpointDir() ) );
  }
  if ( StringUtils.isNotEmpty( getSparkCheckpointDurationMillis() ) ) {
    long duration = Const.toLong( environmentSubstitute( getSparkCheckpointDurationMillis() ), -1L );
    if ( duration >= 0 ) {
      options.setCheckpointDurationMillis( duration );
    }
  }
  if ( StringUtils.isNotEmpty( getSparkMaxRecordsPerBatch() ) ) {
    long records = Const.toLong( environmentSubstitute( getSparkMaxRecordsPerBatch() ), -1L );
    if ( records >= 0 ) {
      options.setMaxRecordsPerBatch( records );
    }
  }
  if ( StringUtils.isNotEmpty( getSparkMinReadTimeMillis() ) ) {
    long readTime = Const.toLong( environmentSubstitute( getSparkMinReadTimeMillis() ), -1L );
    if ( readTime >= 0 ) {
      options.setMinReadTimeMillis( readTime );
    }
  }
  if ( StringUtils.isNotEmpty( getSparkReadTimePercentage() ) ) {
    double percentage = Const.toDouble( environmentSubstitute( getSparkReadTimePercentage() ), -1.0 );
    if ( percentage >= 0 ) {
      options.setReadTimePercentage( percentage / 100 );
    }
  }
  if ( StringUtils.isNotEmpty( getSparkBundleSize() ) ) {
    long bundleSize = Const.toLong( environmentSubstitute( getSparkBundleSize() ), -1L );
    if ( bundleSize >= 0 ) {
      options.setBundleSize( bundleSize );
    }
  }
  if ( StringUtils.isNotEmpty( getSparkStorageLevel() ) ) {
    options.setStorageLevel( environmentSubstitute( getSparkStorageLevel() ) );
  }
  if ( StringUtils.isNotEmpty( getFatJar() ) ) {
    options.setFilesToStage( Arrays.asList( fatJar ) );
  }

  return options;
}
Example #15
Source File: SparkGroupAlsoByWindowViaWindowSet.java (from beam, Apache License 2.0)
private static Long getBatchDuration(final SerializablePipelineOptions options) {
  return options.get().as(SparkPipelineOptions.class).getCheckpointDurationMillis();
}
Example #16
Source File: SparkUnboundedSource.java (from beam, Apache License 2.0)
public static <T, CheckpointMarkT extends CheckpointMark> UnboundedDataset<T> read(
    JavaStreamingContext jssc,
    SerializablePipelineOptions rc,
    UnboundedSource<T, CheckpointMarkT> source,
    String stepName) {

  SparkPipelineOptions options = rc.get().as(SparkPipelineOptions.class);
  Long maxRecordsPerBatch = options.getMaxRecordsPerBatch();
  SourceDStream<T, CheckpointMarkT> sourceDStream =
      new SourceDStream<>(jssc.ssc(), source, rc, maxRecordsPerBatch);

  JavaPairInputDStream<Source<T>, CheckpointMarkT> inputDStream =
      JavaPairInputDStream$.MODULE$.fromInputDStream(
          sourceDStream,
          JavaSparkContext$.MODULE$.fakeClassTag(),
          JavaSparkContext$.MODULE$.fakeClassTag());

  // call mapWithState to read from a checkpointable sources.
  JavaMapWithStateDStream<
          Source<T>, CheckpointMarkT, Tuple2<byte[], Instant>, Tuple2<Iterable<byte[]>, Metadata>>
      mapWithStateDStream =
          inputDStream.mapWithState(
              StateSpec.function(
                      StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(rc, stepName))
                  .numPartitions(sourceDStream.getNumPartitions()));

  // set checkpoint duration for read stream, if set.
  checkpointStream(mapWithStateDStream, options);

  // report the number of input elements for this InputDStream to the InputInfoTracker.
  int id = inputDStream.inputDStream().id();
  JavaDStream<Metadata> metadataDStream = mapWithStateDStream.map(new Tuple2MetadataFunction());

  // register ReadReportDStream to report information related to this read.
  new ReadReportDStream(metadataDStream.dstream(), id, getSourceName(source, id), stepName)
      .register();

  // output the actual (deserialized) stream.
  WindowedValue.FullWindowedValueCoder<T> coder =
      WindowedValue.FullWindowedValueCoder.of(
          source.getOutputCoder(), GlobalWindow.Coder.INSTANCE);
  JavaDStream<WindowedValue<T>> readUnboundedStream =
      mapWithStateDStream
          .flatMap(new Tuple2byteFlatMapFunction())
          .map(CoderHelpers.fromByteFunction(coder));
  return new UnboundedDataset<>(readUnboundedStream, Collections.singletonList(id));
}
Example #17
Source File: SparkUnboundedSource.java (from beam, Apache License 2.0)
private static void checkpointStream(JavaDStream<?> dStream, SparkPipelineOptions options) {
  long checkpointDurationMillis = options.getCheckpointDurationMillis();
  if (checkpointDurationMillis > 0) {
    dStream.checkpoint(new Duration(checkpointDurationMillis));
  }
}
Example #18
Source File: SparkMetricsPusherTest.java (from beam, Apache License 2.0)
private Duration batchDuration() {
  return Duration.millis(
      (pipeline.getOptions().as(SparkPipelineOptions.class)).getBatchIntervalMillis());
}
Example #19
Source File: CreateStreamTest.java (from beam, Apache License 2.0)
private Duration batchDuration() {
  return Duration.millis(
      (p.getOptions().as(SparkPipelineOptions.class)).getBatchIntervalMillis());
}
Example #20
Source File: SparkCoGroupByKeyStreamingTest.java (from beam, Apache License 2.0)
private Duration batchDuration() {
  return Duration.millis(
      (pipeline.getOptions().as(SparkPipelineOptions.class)).getBatchIntervalMillis());
}
Example #21
Source File: KettleBeamPipelineExecutor.java (from kettle-beam, Apache License 2.0)
private void configureSparkOptions( BeamJobConfig config, SparkPipelineOptions options, VariableSpace space, String transformationName ) throws IOException {

  // options.setFilesToStage( BeamConst.findLibraryFilesToStage( null, config.getPluginsToStage(), true, true ) );

  if ( StringUtils.isNotEmpty( config.getSparkMaster() ) ) {
    options.setSparkMaster( space.environmentSubstitute( config.getSparkMaster() ) );
  }
  if ( StringUtils.isNotEmpty( config.getSparkBatchIntervalMillis() ) ) {
    long interval = Const.toLong( space.environmentSubstitute( config.getSparkBatchIntervalMillis() ), -1L );
    if ( interval >= 0 ) {
      options.setBatchIntervalMillis( interval );
    }
  }
  if ( StringUtils.isNotEmpty( config.getSparkCheckpointDir() ) ) {
    options.setCheckpointDir( space.environmentSubstitute( config.getSparkCheckpointDir() ) );
  }
  if ( StringUtils.isNotEmpty( config.getSparkCheckpointDurationMillis() ) ) {
    long duration = Const.toLong( space.environmentSubstitute( config.getSparkCheckpointDurationMillis() ), -1L );
    if ( duration >= 0 ) {
      options.setCheckpointDurationMillis( duration );
    }
  }
  if ( StringUtils.isNotEmpty( config.getSparkMaxRecordsPerBatch() ) ) {
    long records = Const.toLong( space.environmentSubstitute( config.getSparkMaxRecordsPerBatch() ), -1L );
    if ( records >= 0 ) {
      options.setMaxRecordsPerBatch( records );
    }
  }
  if ( StringUtils.isNotEmpty( config.getSparkMinReadTimeMillis() ) ) {
    long readTime = Const.toLong( space.environmentSubstitute( config.getSparkMinReadTimeMillis() ), -1L );
    if ( readTime >= 0 ) {
      options.setMinReadTimeMillis( readTime );
    }
  }
  if ( StringUtils.isNotEmpty( config.getSparkReadTimePercentage() ) ) {
    double percentage = Const.toDouble( space.environmentSubstitute( config.getSparkReadTimePercentage() ), -1.0 );
    if ( percentage >= 0 ) {
      options.setReadTimePercentage( percentage / 100 );
    }
  }
  if ( StringUtils.isNotEmpty( config.getSparkBundleSize() ) ) {
    long bundleSize = Const.toLong( space.environmentSubstitute( config.getSparkBundleSize() ), -1L );
    if ( bundleSize >= 0 ) {
      options.setBundleSize( bundleSize );
    }
  }
  if ( StringUtils.isNotEmpty( config.getSparkStorageLevel() ) ) {
    options.setStorageLevel( space.environmentSubstitute( config.getSparkStorageLevel() ) );
  }
  String appName = transformationName.replace( " ", "_" );
  options.setAppName( appName );
}
Example #22
Source File: KettleBeamPipelineExecutor.java (from kettle-beam, Apache License 2.0)
public Pipeline getPipeline( TransMeta transMeta, BeamJobConfig config ) throws KettleException {

  try {
    if ( StringUtils.isEmpty( config.getRunnerTypeName() ) ) {
      throw new KettleException( "You need to specify a runner type, one of : " + RunnerType.values().toString() );
    }
    PipelineOptions pipelineOptions = null;
    VariableSpace space = transMeta;

    RunnerType runnerType = RunnerType.getRunnerTypeByName( transMeta.environmentSubstitute( config.getRunnerTypeName() ) );
    switch ( runnerType ) {
      case Direct:
        pipelineOptions = PipelineOptionsFactory.create();
        break;
      case DataFlow:
        DataflowPipelineOptions dfOptions = PipelineOptionsFactory.as( DataflowPipelineOptions.class );
        configureDataFlowOptions( config, dfOptions, space );
        pipelineOptions = dfOptions;
        break;
      case Spark:
        SparkPipelineOptions sparkOptions;
        if ( sparkContext != null ) {
          SparkContextOptions sparkContextOptions = PipelineOptionsFactory.as( SparkContextOptions.class );
          sparkContextOptions.setProvidedSparkContext( sparkContext );
          sparkOptions = sparkContextOptions;
        } else {
          sparkOptions = PipelineOptionsFactory.as( SparkPipelineOptions.class );
        }
        configureSparkOptions( config, sparkOptions, space, transMeta.getName() );
        pipelineOptions = sparkOptions;
        break;
      case Flink:
        FlinkPipelineOptions flinkOptions = PipelineOptionsFactory.as( FlinkPipelineOptions.class );
        configureFlinkOptions( config, flinkOptions, space );
        pipelineOptions = flinkOptions;
        break;
      default:
        throw new KettleException( "Sorry, this isn't implemented yet" );
    }

    configureStandardOptions( config, transMeta.getName(), pipelineOptions, space );

    setVariablesInTransformation( config, transMeta );

    TransMetaPipelineConverter converter;
    if ( stepPluginClasses != null && xpPluginClasses != null ) {
      converter = new TransMetaPipelineConverter( transMeta, metaStore, stepPluginClasses, xpPluginClasses, jobConfig );
    } else {
      converter = new TransMetaPipelineConverter( transMeta, metaStore, config.getPluginsToStage(), jobConfig );
    }
    Pipeline pipeline = converter.createPipeline( pipelineOptions );

    // Also set the pipeline options...
    // FileSystems.setDefaultPipelineOptions(pipelineOptions);

    return pipeline;
  } catch ( Exception e ) {
    throw new KettleException( "Error configuring local Beam Engine", e );
  }
}
Example #23
Source File: EvaluationContext.java (from beam, Apache License 2.0)
/**
 * Cache a PCollection if SparkPipelineOptions.isCacheDisabled is false, the transform isn't a
 * GroupByKey transformation, and the PCollection is used more than once in the Pipeline.
 *
 * <p>PCollection is not cached in GroupByKey transformation, because Spark automatically persists
 * some intermediate data in shuffle operations, even without users calling persist.
 *
 * @param pvalue output of transform
 * @param transform the transform to check
 * @return whether the PCollection will be cached
 */
public boolean shouldCache(PTransform<?, ? extends PValue> transform, PValue pvalue) {
  if (serializableOptions.get().as(SparkPipelineOptions.class).isCacheDisabled()
      || transform instanceof GroupByKey) {
    return false;
  }
  return pvalue instanceof PCollection && cacheCandidates.getOrDefault(pvalue, 0L) > 1;
}
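The caching decision above is driven entirely by pipeline options. As a rough sketch, disabling the cache and choosing a storage level might look like the following; the setCacheDisabled setter is assumed from the isCacheDisabled getter used in Examples #9 and #23, and MEMORY_ONLY is just an illustrative Spark storage level name.

import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class CacheOptionsSketch {
  public static void main(String[] args) {
    SparkPipelineOptions options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
    // Assumed setter paired with the isCacheDisabled() getter above; turns off dataset caching.
    options.setCacheDisabled(true);
    // Storage level consulted when caching is enabled (see Examples #9 and #12).
    options.setStorageLevel("MEMORY_ONLY");
    System.out.println(options.isCacheDisabled() + " / " + options.getStorageLevel());
  }
}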