Java Code Examples for org.apache.spark.SparkConf#set()
The following examples show how to use org.apache.spark.SparkConf#set().
Each example is taken from an open source project; the originating project, source file, and license are noted above each listing.
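As a quick orientation before the project examples, here is a minimal, hypothetical sketch of the pattern they all share: build a SparkConf, apply string key/value properties with set(), and hand the configuration to a context. The application name, master URL, and property values below are illustrative placeholders, not recommendations.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class SparkConfSetSketch {
    public static void main(String[] args) {
        // Spark properties are plain string key/value pairs; set() returns the
        // same SparkConf instance, so calls can be chained.
        SparkConf conf = new SparkConf()
                .setAppName("sparkconf-set-sketch")   // illustrative application name
                .setMaster("local[2]");               // illustrative master URL
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        conf.set("spark.ui.port", "4041");            // arbitrary example value

        // Properties set explicitly here take precedence over spark-defaults.conf entries.
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            System.out.println(jsc.getConf().get("spark.serializer"));
        }
    }
}

Because set() returns the SparkConf itself, the project examples below freely mix chained calls with standalone set() statements.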
Example 1
Source File: StreamingContextConfiguration.java From Decision with Apache License 2.0 | 6 votes |
private JavaStreamingContext create(String streamingContextName, int port, long streamingBatchTime, String sparkHost) {
    SparkConf conf = new SparkConf();
    conf.set("spark.ui.port", String.valueOf(port));
    conf.setAppName(streamingContextName);
    conf.setJars(JavaStreamingContext.jarOfClass(StreamingEngine.class));
    conf.setMaster(sparkHost);

    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.registerKryoClasses(new Class[] { StratioStreamingMessage.class, InsertMessage.class, ColumnType.class,
            Action.class });

    HashMap<String, String> tuningProperties = configurationContext.getSparkTunningProperties();
    if (tuningProperties != null && tuningProperties.size() > 0) {
        tuningProperties.forEach((key, value) -> conf.set(key, value));
    }

    JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(streamingBatchTime));

    return streamingContext;
}
Example 2
Source File: ChronixSparkLoader.java From chronix.spark with Apache License 2.0 | 6 votes |
public ChronixSparkContext createChronixSparkContext() throws IOException {
    if (chronixSparkContext != null) {
        return chronixSparkContext;
    }

    SparkConf sparkConf = new SparkConf()
            .setMaster(chronixYAMLConfiguration.getSparkMaster())
            .setAppName(chronixYAMLConfiguration.getAppName());
    ChronixSparkContext.tuneSparkConf(sparkConf);

    //Set spark values given in yaml config
    for (Map.Entry<String, String> setting : chronixYAMLConfiguration.getSparkSettings().entrySet()) {
        sparkConf.set(setting.getKey(), setting.getValue());
    }

    if (chronixYAMLConfiguration.isDistributed()) {
        sparkConf.setJars(chronixYAMLConfiguration.getJars());
    }

    chronixSparkContext = new ChronixSparkContext(new JavaSparkContext(sparkConf));
    return chronixSparkContext;
}
Example 3
Source File: SparkOnYarnContainer.java From liteflow with Apache License 2.0 | 6 votes |
private SparkConf initSparkConf(JSONObject configObj) {
    String jobName = configObj.getString(CommonConstants.PARAM_EXECUTOR_JOB_NAME);
    String yarnQueue = configObj.getString(CommonConstants.SPARK_PARAM_YARN_QUEUE);
    String instanceNum = configObj.getString(CommonConstants.SPARK_PARAM_INSTANCE_NUM);

    SparkConf sparkConf = new SparkConf();
    sparkConf.setAppName(jobName);
    sparkConf.set("spark.app.name", jobName);
    sparkConf.set("spark.yarn.queue", yarnQueue);
    sparkConf.set("spark.driver.cores", configObj.getString(CommonConstants.SPARK_PARAM_DRIVER_CORES));
    sparkConf.set("spark.driver.memory",
            configObj.getString(CommonConstants.SPARK_PARAM_DRIVER_MEMORY) + CommonConstants.SPARK_PARAM_MEMORY_UNIT);
    sparkConf.set("spark.executor.cores", configObj.getString(CommonConstants.SPARK_PARAM_EXECUTOR_CORES));
    sparkConf.set("spark.executor.memory",
            configObj.getString(CommonConstants.SPARK_PARAM_EXECUTOR_MEMORY) + CommonConstants.SPARK_PARAM_MEMORY_UNIT);

    // Set the number of concurrent executor instances
    Boolean isDynamicAllocation = HadoopConfig.getHadoopConf().getIsDynamicAllocation();
    if (isDynamicAllocation != null && isDynamicAllocation) {
        sparkConf.set("spark.shuffle.service.enabled", "true");
        sparkConf.set("spark.dynamicAllocation.enabled", "true");
        sparkConf.set("spark.dynamicAllocation.minExecutors", "1");
        sparkConf.set("spark.dynamicAllocation.maxExecutors", String.valueOf(instanceNum));
    } else {
        sparkConf.set("spark.executor.instances", String.valueOf(instanceNum));
    }

    /**
     * Hadoop and Hive configuration files
     */
    String hadoopFiles = HadoopConfig.getHadoopConf().getSparkYarnDistFiles();
    sparkConf.set("spark.yarn.dist.files",
            hadoopFiles + CommonConstants.COMMA + configObj.getString(Constants.JOB_CONFIG_PATH));

    return sparkConf;
}
Example 4
Source File: UtilHelpers.java From hudi with Apache License 2.0 | 6 votes |
private static SparkConf buildSparkConf(String appName, String defaultMaster, Map<String, String> additionalConfigs) {
    final SparkConf sparkConf = new SparkConf().setAppName(appName);
    String master = sparkConf.get("spark.master", defaultMaster);
    sparkConf.setMaster(master);
    if (master.startsWith("yarn")) {
        sparkConf.set("spark.eventLog.overwrite", "true");
        sparkConf.set("spark.eventLog.enabled", "true");
    }
    sparkConf.setIfMissing("spark.driver.maxResultSize", "2g");

    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    sparkConf.set("spark.hadoop.mapred.output.compress", "true");
    // note: the codec key is set twice in the original source; the GzipCodec value below overwrites "true"
    sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
    sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");

    additionalConfigs.forEach(sparkConf::set);
    return HoodieWriteClient.registerClasses(sparkConf);
}
Example 5
Source File: SparkUtil.java From hudi with Apache License 2.0 | 5 votes |
public static JavaSparkContext initJavaSparkConf(String name, Option<String> master, Option<String> executorMemory) {
    SparkConf sparkConf = new SparkConf().setAppName(name);

    String defMaster = master.orElse(sparkConf.getenv(HoodieCliSparkConfig.CLI_SPARK_MASTER));
    if ((null == defMaster) || (defMaster.isEmpty())) {
        sparkConf.setMaster(DEFAULT_SPARK_MASTER);
    } else {
        sparkConf.setMaster(defMaster);
    }

    sparkConf.set(HoodieCliSparkConfig.CLI_SERIALIZER, "org.apache.spark.serializer.KryoSerializer");
    sparkConf.set(HoodieCliSparkConfig.CLI_DRIVER_MAX_RESULT_SIZE, "2g");
    sparkConf.set(HoodieCliSparkConfig.CLI_EVENT_LOG_OVERWRITE, "true");
    sparkConf.set(HoodieCliSparkConfig.CLI_EVENT_LOG_ENABLED, "true");
    if (executorMemory.isPresent()) {
        sparkConf.set(HoodieCliSparkConfig.CLI_EXECUTOR_MEMORY, executorMemory.get());
    }

    // Configure hadoop conf
    sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESS, "true");
    sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESSION_CODEC, "true");
    sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESSION_CODEC, "org.apache.hadoop.io.compress.GzipCodec");
    sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESSION_TYPE, "BLOCK");

    HoodieWriteClient.registerClasses(sparkConf);
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    jsc.hadoopConfiguration().setBoolean(HoodieCliSparkConfig.CLI_PARQUET_ENABLE_SUMMARY_METADATA, false);
    FSUtils.prepareHadoopConf(jsc.hadoopConfiguration());
    return jsc;
}
Example 6
Source File: SqoopSparkJob.java From sqoop-on-spark with Apache License 2.0 | 5 votes |
public SparkConf init(CommandLine cArgs) throws ClassNotFoundException {
    System.setProperty(ConfigurationConstants.SYSPROP_CONFIG_DIR, cArgs.getOptionValue("confDir"));
    // by default it is local, override based on the submit parameter
    SparkConf conf = new SparkConf().setAppName("sqoop-spark").setMaster("local");
    if (cArgs.getOptionValue("defaultExtractors") != null) {
        conf.set(SqoopSparkDriver.DEFAULT_EXTRACTORS, cArgs.getOptionValue("defaultExtractors"));
    }
    if (cArgs.getOptionValue("numL") != null) {
        conf.set(SqoopSparkDriver.NUM_LOADERS, cArgs.getOptionValue("numL"));
    }
    // hack to load extra classes directly
    Class.forName("com.mysql.jdbc.Driver");
    SqoopServer.initialize();
    return conf;
}
Example 7
Source File: JavaHBaseStreamingBulkPutExample.java From learning-hadoop with Apache License 2.0 | 5 votes |
public static void main(String args[]) {
    if (args.length == 0) {
        System.out
                .println("JavaHBaseBulkPutExample {master} {host} {post} {tableName} {columnFamily}");
    }

    String master = args[0];
    String host = args[1];
    String port = args[2];
    String tableName = args[3];
    String columnFamily = args[4];

    System.out.println("master:" + master);
    System.out.println("host:" + host);
    System.out.println("port:" + Integer.parseInt(port));
    System.out.println("tableName:" + tableName);
    System.out.println("columnFamily:" + columnFamily);

    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.cleaner.ttl", "120000");

    JavaSparkContext jsc = new JavaSparkContext(master, "JavaHBaseBulkPutExample");
    jsc.addJar("SparkHBase.jar");

    JavaStreamingContext jssc = new JavaStreamingContext(jsc, new Duration(1000));

    JavaReceiverInputDStream<String> javaDstream = jssc.socketTextStream(host, Integer.parseInt(port));

    Configuration conf = HBaseConfiguration.create();
    conf.addResource(new Path("/etc/hbase/conf/core-site.xml"));
    conf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"));

    JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf);

    hbaseContext.streamBulkPut(javaDstream, tableName, new PutFunction(), true);
}
Example 8
Source File: GeoWaveSparkConf.java From geowave with Apache License 2.0 | 5 votes |
public static SparkSession createSessionFromParams(
        final String appName,
        String master,
        final String host,
        final String jars) {
    // Grab default config for GeoWave
    SparkConf defaultConfig = GeoWaveSparkConf.getDefaultConfig();
    // Apply master from default
    if (master == null) {
        master = "yarn";
    }

    // Apply user options if set, correctly handling host for yarn.
    if (appName != null) {
        defaultConfig = defaultConfig.setAppName(appName);
    }
    defaultConfig = defaultConfig.setMaster(master);
    if (host != null) {
        if (!"yarn".equals(master)) {
            defaultConfig = defaultConfig.set("spark.driver.host", host);
        } else {
            LOGGER.warn(
                    "Attempting to set spark driver host for yarn master. Normally this is handled via hadoop configuration. Remove host or set another master designation and try again.");
        }
    }

    if (jars != null) {
        defaultConfig = defaultConfig.set("spark.jars", jars);
    }

    // Finally return the session from builder
    return GeoWaveSparkConf.internalCreateSession(defaultConfig, null);
}
Example 9
Source File: JavaEmbeddedIgniteRDDWithLocalStoreSelfTest.java From ignite with Apache License 2.0 | 5 votes |
/**
 * Creates default spark context
 *
 * @return Context.
 */
private JavaSparkContext createContext() {
    SparkConf conf = new SparkConf();

    conf.set("spark.executor.instances", String.valueOf(GRID_CNT));

    return new JavaSparkContext("local[" + GRID_CNT + "]", "test", conf);
}
Example 10
Source File: SqoopSparkClientFactory.java From sqoop-on-spark with Apache License 2.0 | 5 votes |
static SparkConf generateSparkConf(Map<String, String> conf) {
    SparkConf sparkConf = new SparkConf(false);
    for (Map.Entry<String, String> entry : conf.entrySet()) {
        sparkConf.set(entry.getKey(), entry.getValue());
    }
    return sparkConf;
}
Example 11
Source File: SpliceSparkWatcher.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
@Override
protected void starting(Description description) {
    super.starting(description);
    SpliceLogUtils.trace(LOG, "starting spark");
    SparkConf sparkConf = new SparkConf().setAppName(appName).setMaster("local");
    sparkConf.set("spark.broadcast.compress", "false"); // Will attempt to use Snappy without this set.
    sparkConf.set("spark.driver.allowMultipleContexts", "true"); // SPARK-2243
    jsc = new JavaSparkContext(sparkConf);
}
Example 12
Source File: SparkEngineBase.java From beakerx with Apache License 2.0 | 5 votes |
protected void configureSparkConf(SparkConf sparkConf) {
    if (!sparkConf.contains(SPARK_APP_NAME)) {
        sparkConf.setAppName("beaker_" + UUID.randomUUID().toString());
    }
    if (sparkConf.contains(SPARK_MASTER) && !isLocalSpark(sparkConf)) {
        sparkConf.set(SPARK_REPL_CLASS_OUTPUT_DIR, KernelManager.get().getOutDir());
    }
}
Example 13
Source File: SparkIntegrationTestResource.java From components with Apache License 2.0 | 5 votes |
/**
 * @return a clean spark configuration created from the options in this resource.
 */
public SparkConf createSparkConf(String appName) {
    SparkConf conf = new SparkConf();
    conf.setAppName(appName);
    conf.setMaster(sm);
    // conf.set("spark.driver.host", "10.42.30.148");
    for (Map.Entry<String, String> kv : hadoopConf.entrySet())
        conf.set("spark.hadoop." + kv.getKey(), kv.getValue());
    return conf;
}
Example 14
Source File: SparkRunnerKryoRegistratorTest.java From beam with Apache License 2.0 | 4 votes |
@Test
public void testDefaultSerializerNotCallingKryo() {
    SparkConf conf = new SparkConf();
    conf.set("spark.kryo.registrator", KryoRegistratorIsNotCalled.class.getName());
    runSimplePipelineWithSparkContext(conf);
}
Example 15
Source File: ComputeResponse.java From incubator-retired-pirk with Apache License 2.0 | 4 votes |
public ComputeResponse(FileSystem fileSys) throws PIRException {
    fs = fileSys;
    storage = new HadoopFileSystemStore(fs);

    dataInputFormat = SystemConfiguration.getProperty("pir.dataInputFormat");
    if (!InputFormatConst.ALLOWED_FORMATS.contains(dataInputFormat)) {
        throw new IllegalArgumentException("inputFormat = " + dataInputFormat + " is of an unknown form");
    }
    logger.info("inputFormat = " + dataInputFormat);
    if (dataInputFormat.equals(InputFormatConst.BASE_FORMAT)) {
        inputData = SystemConfiguration.getProperty("pir.inputData", "none");
        if (inputData.equals("none")) {
            throw new IllegalArgumentException("For inputFormat = " + dataInputFormat + " an inputFile must be specified");
        }
        logger.info("inputFile = " + inputData);
    } else if (dataInputFormat.equals(InputFormatConst.ES)) {
        esQuery = SystemConfiguration.getProperty("pir.esQuery", "none");
        esResource = SystemConfiguration.getProperty("pir.esResource", "none");
        if (esQuery.equals("none")) {
            throw new IllegalArgumentException("esQuery must be specified");
        }
        if (esResource.equals("none")) {
            throw new IllegalArgumentException("esResource must be specified");
        }
        logger.info("esQuery = " + esQuery + " esResource = " + esResource);
    }
    outputFile = SystemConfiguration.getProperty("pir.outputFile");
    outputDirExp = outputFile + "_exp";

    queryInput = SystemConfiguration.getProperty("pir.queryInput");
    String stopListFile = SystemConfiguration.getProperty("pir.stopListFile");
    useModExpJoin = SystemConfiguration.getBooleanProperty("pir.useModExpJoin", false);

    logger.info("outputFile = " + outputFile + " queryInputDir = " + queryInput + " stopListFile = " + stopListFile
            + " esQuery = " + esQuery + " esResource = " + esResource);

    // Set the necessary configurations
    SparkConf conf = new SparkConf().setAppName("SparkPIR").setMaster("yarn-cluster");
    conf.set("es.nodes", SystemConfiguration.getProperty("es.nodes", "none"));
    conf.set("es.port", SystemConfiguration.getProperty("es.port", "none"));
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.memory.storageFraction", "0.10");
    conf.set("spark.memory.fraction", "0.25");
    // conf.set("spark.memory.fraction", "0.25");
    // conf.set("spark.executor.extraJavaOptions", "-XX:+UseCompressedOops");

    sc = new JavaSparkContext(conf);

    // Setup, run query, teardown
    logger.info("Setting up for query run");
    try {
        setup();
    } catch (IOException e) {
        throw new PIRException("An error occurred setting up the Spark responder.", e);
    }
    logger.info("Setup complete");
}
Example 16
Source File: WordCountingAppWithCheckpoint.java From tutorials with MIT License | 4 votes |
public static void main(String[] args) throws InterruptedException {

    Logger.getLogger("org")
        .setLevel(Level.OFF);
    Logger.getLogger("akka")
        .setLevel(Level.OFF);

    Map<String, Object> kafkaParams = new HashMap<>();
    kafkaParams.put("bootstrap.servers", "localhost:9092");
    kafkaParams.put("key.deserializer", StringDeserializer.class);
    kafkaParams.put("value.deserializer", StringDeserializer.class);
    kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
    kafkaParams.put("auto.offset.reset", "latest");
    kafkaParams.put("enable.auto.commit", false);

    Collection<String> topics = Arrays.asList("messages");

    SparkConf sparkConf = new SparkConf();
    sparkConf.setMaster("local[2]");
    sparkConf.setAppName("WordCountingAppWithCheckpoint");
    sparkConf.set("spark.cassandra.connection.host", "127.0.0.1");

    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

    sparkContext = streamingContext.sparkContext();

    streamingContext.checkpoint("./.checkpoint");

    JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(streamingContext,
            LocationStrategies.PreferConsistent(),
            ConsumerStrategies.<String, String> Subscribe(topics, kafkaParams));

    JavaPairDStream<String, String> results = messages.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

    JavaDStream<String> lines = results.map(tuple2 -> tuple2._2());

    JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(x.split("\\s+"))
        .iterator());

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
        .reduceByKey((Function2<Integer, Integer, Integer>) (i1, i2) -> i1 + i2);

    JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> cumulativeWordCounts = wordCounts
        .mapWithState(StateSpec.function((word, one, state) -> {
            int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
            Tuple2<String, Integer> output = new Tuple2<>(word, sum);
            state.update(sum);
            return output;
        }));

    cumulativeWordCounts.foreachRDD(javaRdd -> {
        List<Tuple2<String, Integer>> wordCountList = javaRdd.collect();
        for (Tuple2<String, Integer> tuple : wordCountList) {
            List<Word> wordList = Arrays.asList(new Word(tuple._1, tuple._2));
            JavaRDD<Word> rdd = sparkContext.parallelize(wordList);
            javaFunctions(rdd).writerBuilder("vocabulary", "words", mapToRow(Word.class))
                .saveToCassandra();
        }
    });

    streamingContext.start();
    streamingContext.awaitTermination();
}
Example 17
Source File: SparkMergingDictionary.java From kylin with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String segmentIds = optionsHelper.getOptionValue(OPTION_MERGE_SEGMENT_IDS);
    final String dictOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_DICT);
    final String statOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_STAT);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("scala.collection.mutable.WrappedArray$ofRef") };

    SparkConf conf = new SparkConf().setAppName("Merge dictionary for cube:" + cubeName + ", segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        KylinSparkJobListener jobListener = new KylinSparkJobListener();
        sc.sc().addSparkListener(jobListener);

        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(dictOutputPath));

        final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
        final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
        final CubeDesc cubeDesc = CubeDescManager.getInstance(envConfig).getCubeDesc(cubeInstance.getDescName());

        logger.info("Dictionary output path: {}", dictOutputPath);
        logger.info("Statistics output path: {}", statOutputPath);

        final TblColRef[] tblColRefs = cubeDesc.getAllColumnsNeedDictionaryBuilt().toArray(new TblColRef[0]);
        final int columnLength = tblColRefs.length;

        List<Integer> indexs = Lists.newArrayListWithCapacity(columnLength);

        for (int i = 0; i <= columnLength; i++) {
            indexs.add(i);
        }

        JavaRDD<Integer> indexRDD = sc.parallelize(indexs, columnLength + 1);

        JavaPairRDD<Text, Text> colToDictPathRDD = indexRDD.mapToPair(new MergeDictAndStatsFunction(cubeName,
                metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf));

        colToDictPathRDD.coalesce(1, false).saveAsNewAPIHadoopFile(dictOutputPath, Text.class, Text.class,
                SequenceFileOutputFormat.class);
    }
}
Example 18
Source File: JavaStocks.java From spark-ts-examples with Apache License 2.0 | 4 votes |
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("Spark-TS Ticker Example").setMaster("local");
    conf.set("spark.io.compression.codec", "org.apache.spark.io.LZ4CompressionCodec");
    JavaSparkContext context = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(context);

    DataFrame tickerObs = loadObservations(context, sqlContext, "../data/ticker.tsv");

    // Create a daily DateTimeIndex over August and September 2015
    ZoneId zone = ZoneId.systemDefault();
    DateTimeIndex dtIndex = DateTimeIndexFactory.uniformFromInterval(
            ZonedDateTime.of(LocalDateTime.parse("2015-08-03T00:00:00"), zone),
            ZonedDateTime.of(LocalDateTime.parse("2015-09-22T00:00:00"), zone),
            new BusinessDayFrequency(1, 0));

    // Align the ticker data on the DateTimeIndex to create a TimeSeriesRDD
    JavaTimeSeriesRDD tickerTsrdd = JavaTimeSeriesRDDFactory.timeSeriesRDDFromObservations(
            dtIndex, tickerObs, "timestamp", "symbol", "price");

    // Cache it in memory
    tickerTsrdd.cache();

    // Count the number of series (number of symbols)
    System.out.println(tickerTsrdd.count());

    // Impute missing values using linear interpolation
    JavaTimeSeriesRDD<String> filled = tickerTsrdd.fill("linear");

    // Compute return rates
    JavaTimeSeriesRDD<String> returnRates = filled.returnRates();

    // Compute Durbin-Watson stats for each series
    JavaPairRDD<String, Double> dwStats = returnRates.mapValues(
            (Vector x) -> TimeSeriesStatisticalTests.dwtest(x));

    class StatsComparator implements Comparator<Tuple2<String, Double>>, java.io.Serializable {
        public int compare(Tuple2<String, Double> a, Tuple2<String, Double> b) {
            return a._2.compareTo(b._2);
        }
    }

    System.out.println(dwStats.min(new StatsComparator()));
    System.out.println(dwStats.max(new StatsComparator()));
}
Example 19
Source File: SparkUHCDictionary.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey") };

    SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

        final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
        KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        final Job job = Job.getInstance(sConf.get());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();
        String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();
        if (reducerCount == 0) {
            return;
        }

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerCount);
        logger.info("counter path {}", counterPath);

        JavaPairRDD<String, String> wholeSequenceFileNames = null;
        for (TblColRef tblColRef : uhcColumns) {
            String columnPath = inputPath + "/" + tblColRef.getIdentity();
            if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
                continue;
            }
            if (wholeSequenceFileNames == null) {
                wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
            } else {
                wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
            }
        }

        if (wholeSequenceFileNames == null) {
            logger.error("There're no sequence files at " + inputPath + " !");
            return;
        }

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
                .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
                .filter(tuple -> tuple._1 != -1)
                .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
                .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
        //prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        logger.info("Map input records={}", reducerCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);

        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
Example 20
Source File: SparkCubingByLayer.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);

    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl, sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);

    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);

    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}