Java Code Examples for org.apache.spark.SparkConf#set()
The following examples show how to use org.apache.spark.SparkConf#set().
Each example is taken from an open source project; the originating project, source file, and license are noted above each listing.
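As a quick orientation before the project examples, here is a minimal, hypothetical sketch of the pattern they all share: build a SparkConf, apply string key/value properties with set(), and hand the configuration to a context. The application name, master URL, and property values below are illustrative placeholders, not recommendations.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class SparkConfSetSketch {
    public static void main(String[] args) {
        // Spark properties are plain string key/value pairs; set() returns the
        // same SparkConf instance, so calls can be chained.
        SparkConf conf = new SparkConf()
                .setAppName("sparkconf-set-sketch")   // illustrative application name
                .setMaster("local[2]");               // illustrative master URL
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        conf.set("spark.ui.port", "4041");            // arbitrary example value

        // Properties set explicitly here take precedence over spark-defaults.conf entries.
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            System.out.println(jsc.getConf().get("spark.serializer"));
        }
    }
}

Because set() returns the SparkConf itself, the project examples below freely mix chained calls with standalone set() statements.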
Example 1
Source File: StreamingContextConfiguration.java From Decision with Apache License 2.0 | 6 votes |
private JavaStreamingContext create(String streamingContextName, int port, long streamingBatchTime, String sparkHost) {
    SparkConf conf = new SparkConf();
    conf.set("spark.ui.port", String.valueOf(port));
    conf.setAppName(streamingContextName);
    conf.setJars(JavaStreamingContext.jarOfClass(StreamingEngine.class));
    conf.setMaster(sparkHost);

    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.registerKryoClasses(new Class[] { StratioStreamingMessage.class, InsertMessage.class, ColumnType.class,
            Action.class });

    HashMap<String, String> tuningProperties = configurationContext.getSparkTunningProperties();
    if (tuningProperties != null && tuningProperties.size() > 0) {
        tuningProperties.forEach((key, value) -> conf.set(key, value));
    }

    JavaStreamingContext streamingContext = new JavaStreamingContext(conf, new Duration(streamingBatchTime));

    return streamingContext;
}
Example 2
Source File: ChronixSparkLoader.java From chronix.spark with Apache License 2.0 | 6 votes |
public ChronixSparkContext createChronixSparkContext() throws IOException {
    if (chronixSparkContext != null) {
        return chronixSparkContext;
    }

    SparkConf sparkConf = new SparkConf()
            .setMaster(chronixYAMLConfiguration.getSparkMaster())
            .setAppName(chronixYAMLConfiguration.getAppName());
    ChronixSparkContext.tuneSparkConf(sparkConf);

    //Set spark values given in yaml config
    for (Map.Entry<String, String> setting : chronixYAMLConfiguration.getSparkSettings().entrySet()) {
        sparkConf.set(setting.getKey(), setting.getValue());
    }

    if (chronixYAMLConfiguration.isDistributed()) {
        sparkConf.setJars(chronixYAMLConfiguration.getJars());
    }

    chronixSparkContext = new ChronixSparkContext(new JavaSparkContext(sparkConf));
    return chronixSparkContext;
}
Example 3
Source File: SparkOnYarnContainer.java From liteflow with Apache License 2.0 | 6 votes |
private SparkConf initSparkConf(JSONObject configObj) {
    String jobName = configObj.getString(CommonConstants.PARAM_EXECUTOR_JOB_NAME);
    String yarnQueue = configObj.getString(CommonConstants.SPARK_PARAM_YARN_QUEUE);
    String instanceNum = configObj.getString(CommonConstants.SPARK_PARAM_INSTANCE_NUM);

    SparkConf sparkConf = new SparkConf();
    sparkConf.setAppName(jobName);
    sparkConf.set("spark.app.name", jobName);
    sparkConf.set("spark.yarn.queue", yarnQueue);
    sparkConf.set("spark.driver.cores", configObj.getString(CommonConstants.SPARK_PARAM_DRIVER_CORES));
    sparkConf.set("spark.driver.memory",
            configObj.getString(CommonConstants.SPARK_PARAM_DRIVER_MEMORY) + CommonConstants.SPARK_PARAM_MEMORY_UNIT);
    sparkConf.set("spark.executor.cores", configObj.getString(CommonConstants.SPARK_PARAM_EXECUTOR_CORES));
    sparkConf.set("spark.executor.memory",
            configObj.getString(CommonConstants.SPARK_PARAM_EXECUTOR_MEMORY) + CommonConstants.SPARK_PARAM_MEMORY_UNIT);

    // Set the number of concurrent executor instances
    Boolean isDynamicAllocation = HadoopConfig.getHadoopConf().getIsDynamicAllocation();
    if (isDynamicAllocation != null && isDynamicAllocation) {
        sparkConf.set("spark.shuffle.service.enabled", "true");
        sparkConf.set("spark.dynamicAllocation.enabled", "true");
        sparkConf.set("spark.dynamicAllocation.minExecutors", "1");
        sparkConf.set("spark.dynamicAllocation.maxExecutors", String.valueOf(instanceNum));
    } else {
        sparkConf.set("spark.executor.instances", String.valueOf(instanceNum));
    }

    /**
     * Hadoop and Hive configuration files
     */
    String hadoopFiles = HadoopConfig.getHadoopConf().getSparkYarnDistFiles();
    sparkConf.set("spark.yarn.dist.files",
            hadoopFiles + CommonConstants.COMMA + configObj.getString(Constants.JOB_CONFIG_PATH));

    return sparkConf;
}
Example 4
Source File: UtilHelpers.java From hudi with Apache License 2.0 | 6 votes |
private static SparkConf buildSparkConf(String appName, String defaultMaster, Map<String, String> additionalConfigs) {
    final SparkConf sparkConf = new SparkConf().setAppName(appName);
    String master = sparkConf.get("spark.master", defaultMaster);
    sparkConf.setMaster(master);
    if (master.startsWith("yarn")) {
        sparkConf.set("spark.eventLog.overwrite", "true");
        sparkConf.set("spark.eventLog.enabled", "true");
    }
    sparkConf.setIfMissing("spark.driver.maxResultSize", "2g");

    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    sparkConf.set("spark.hadoop.mapred.output.compress", "true");
    // note: the codec key is set twice in the original source; the GzipCodec value below overwrites "true"
    sparkConf.set("spark.hadoop.mapred.output.compression.codec", "true");
    sparkConf.set("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    sparkConf.set("spark.hadoop.mapred.output.compression.type", "BLOCK");

    additionalConfigs.forEach(sparkConf::set);
    return HoodieWriteClient.registerClasses(sparkConf);
}
Example 5
Source File: SparkUtil.java From hudi with Apache License 2.0 | 5 votes |
public static JavaSparkContext initJavaSparkConf(String name, Option<String> master, Option<String> executorMemory) {
    SparkConf sparkConf = new SparkConf().setAppName(name);

    String defMaster = master.orElse(sparkConf.getenv(HoodieCliSparkConfig.CLI_SPARK_MASTER));
    if ((null == defMaster) || (defMaster.isEmpty())) {
        sparkConf.setMaster(DEFAULT_SPARK_MASTER);
    } else {
        sparkConf.setMaster(defMaster);
    }

    sparkConf.set(HoodieCliSparkConfig.CLI_SERIALIZER, "org.apache.spark.serializer.KryoSerializer");
    sparkConf.set(HoodieCliSparkConfig.CLI_DRIVER_MAX_RESULT_SIZE, "2g");
    sparkConf.set(HoodieCliSparkConfig.CLI_EVENT_LOG_OVERWRITE, "true");
    sparkConf.set(HoodieCliSparkConfig.CLI_EVENT_LOG_ENABLED, "true");
    if (executorMemory.isPresent()) {
        sparkConf.set(HoodieCliSparkConfig.CLI_EXECUTOR_MEMORY, executorMemory.get());
    }

    // Configure hadoop conf
    sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESS, "true");
    sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESSION_CODEC, "true");
    sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESSION_CODEC, "org.apache.hadoop.io.compress.GzipCodec");
    sparkConf.set(HoodieCliSparkConfig.CLI_MAPRED_OUTPUT_COMPRESSION_TYPE, "BLOCK");

    HoodieWriteClient.registerClasses(sparkConf);
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    jsc.hadoopConfiguration().setBoolean(HoodieCliSparkConfig.CLI_PARQUET_ENABLE_SUMMARY_METADATA, false);
    FSUtils.prepareHadoopConf(jsc.hadoopConfiguration());
    return jsc;
}
Example 6
Source File: SqoopSparkJob.java From sqoop-on-spark with Apache License 2.0 | 5 votes |
public SparkConf init(CommandLine cArgs) throws ClassNotFoundException {
    System.setProperty(ConfigurationConstants.SYSPROP_CONFIG_DIR, cArgs.getOptionValue("confDir"));
    // by default it is local, override based on the submit parameter
    SparkConf conf = new SparkConf().setAppName("sqoop-spark").setMaster("local");
    if (cArgs.getOptionValue("defaultExtractors") != null) {
        conf.set(SqoopSparkDriver.DEFAULT_EXTRACTORS, cArgs.getOptionValue("defaultExtractors"));
    }
    if (cArgs.getOptionValue("numL") != null) {
        conf.set(SqoopSparkDriver.NUM_LOADERS, cArgs.getOptionValue("numL"));
    }
    // hack to load extra classes directly
    Class.forName("com.mysql.jdbc.Driver");
    SqoopServer.initialize();
    return conf;
}
Example 7
Source File: JavaHBaseStreamingBulkPutExample.java From learning-hadoop with Apache License 2.0 | 5 votes |
public static void main(String args[]) {
    if (args.length == 0) {
        System.out
                .println("JavaHBaseBulkPutExample {master} {host} {post} {tableName} {columnFamily}");
    }

    String master = args[0];
    String host = args[1];
    String port = args[2];
    String tableName = args[3];
    String columnFamily = args[4];

    System.out.println("master:" + master);
    System.out.println("host:" + host);
    System.out.println("port:" + Integer.parseInt(port));
    System.out.println("tableName:" + tableName);
    System.out.println("columnFamily:" + columnFamily);

    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.cleaner.ttl", "120000");

    JavaSparkContext jsc = new JavaSparkContext(master, "JavaHBaseBulkPutExample");
    jsc.addJar("SparkHBase.jar");

    JavaStreamingContext jssc = new JavaStreamingContext(jsc, new Duration(1000));

    JavaReceiverInputDStream<String> javaDstream = jssc.socketTextStream(host, Integer.parseInt(port));

    Configuration conf = HBaseConfiguration.create();
    conf.addResource(new Path("/etc/hbase/conf/core-site.xml"));
    conf.addResource(new Path("/etc/hbase/conf/hbase-site.xml"));

    JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf);

    hbaseContext.streamBulkPut(javaDstream, tableName, new PutFunction(), true);
}
Example 8
Source File: GeoWaveSparkConf.java From geowave with Apache License 2.0 | 5 votes |
public static SparkSession createSessionFromParams(
        final String appName,
        String master,
        final String host,
        final String jars) {
    // Grab default config for GeoWave
    SparkConf defaultConfig = GeoWaveSparkConf.getDefaultConfig();
    // Apply master from default
    if (master == null) {
        master = "yarn";
    }

    // Apply user options if set, correctly handling host for yarn.
    if (appName != null) {
        defaultConfig = defaultConfig.setAppName(appName);
    }
    defaultConfig = defaultConfig.setMaster(master);
    if (host != null) {
        if (!"yarn".equals(master)) {
            defaultConfig = defaultConfig.set("spark.driver.host", host);
        } else {
            LOGGER.warn(
                    "Attempting to set spark driver host for yarn master. Normally this is handled via hadoop configuration. Remove host or set another master designation and try again.");
        }
    }

    if (jars != null) {
        defaultConfig = defaultConfig.set("spark.jars", jars);
    }

    // Finally return the session from builder
    return GeoWaveSparkConf.internalCreateSession(defaultConfig, null);
}
Example 9
Source File: JavaEmbeddedIgniteRDDWithLocalStoreSelfTest.java From ignite with Apache License 2.0 | 5 votes |
/**
 * Creates default spark context
 *
 * @return Context.
 */
private JavaSparkContext createContext() {
    SparkConf conf = new SparkConf();

    conf.set("spark.executor.instances", String.valueOf(GRID_CNT));

    return new JavaSparkContext("local[" + GRID_CNT + "]", "test", conf);
}
Example 10
Source File: SqoopSparkClientFactory.java From sqoop-on-spark with Apache License 2.0 | 5 votes |
static SparkConf generateSparkConf(Map<String, String> conf) {
    SparkConf sparkConf = new SparkConf(false);
    for (Map.Entry<String, String> entry : conf.entrySet()) {
        sparkConf.set(entry.getKey(), entry.getValue());
    }
    return sparkConf;
}
Example 11
Source File: SpliceSparkWatcher.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
@Override
protected void starting(Description description) {
    super.starting(description);
    SpliceLogUtils.trace(LOG, "starting spark");
    SparkConf sparkConf = new SparkConf().setAppName(appName).setMaster("local");
    sparkConf.set("spark.broadcast.compress", "false"); // Will attempt to use Snappy without this set.
    sparkConf.set("spark.driver.allowMultipleContexts", "true"); // SPARK-2243
    jsc = new JavaSparkContext(sparkConf);
}
Example 12
Source File: SparkEngineBase.java From beakerx with Apache License 2.0 | 5 votes |
protected void configureSparkConf(SparkConf sparkConf) {
    if (!sparkConf.contains(SPARK_APP_NAME)) {
        sparkConf.setAppName("beaker_" + UUID.randomUUID().toString());
    }
    if (sparkConf.contains(SPARK_MASTER) && !isLocalSpark(sparkConf)) {
        sparkConf.set(SPARK_REPL_CLASS_OUTPUT_DIR, KernelManager.get().getOutDir());
    }
}
Example 13
Source File: SparkIntegrationTestResource.java From components with Apache License 2.0 | 5 votes |
/**
 * @return a clean spark configuration created from the options in this resource.
 */
public SparkConf createSparkConf(String appName) {
    SparkConf conf = new SparkConf();
    conf.setAppName(appName);
    conf.setMaster(sm);
    // conf.set("spark.driver.host", "10.42.30.148");
    for (Map.Entry<String, String> kv : hadoopConf.entrySet())
        conf.set("spark.hadoop." + kv.getKey(), kv.getValue());
    return conf;
}
Example 14
Source File: SparkRunnerKryoRegistratorTest.java From beam with Apache License 2.0 | 4 votes |
@Test
public void testDefaultSerializerNotCallingKryo() {
    SparkConf conf = new SparkConf();
    conf.set("spark.kryo.registrator", KryoRegistratorIsNotCalled.class.getName());
    runSimplePipelineWithSparkContext(conf);
}
Example 15
Source File: ComputeResponse.java From incubator-retired-pirk with Apache License 2.0 | 4 votes |
public ComputeResponse(FileSystem fileSys) throws PIRException {
    fs = fileSys;
    storage = new HadoopFileSystemStore(fs);

    dataInputFormat = SystemConfiguration.getProperty("pir.dataInputFormat");
    if (!InputFormatConst.ALLOWED_FORMATS.contains(dataInputFormat)) {
        throw new IllegalArgumentException("inputFormat = " + dataInputFormat + " is of an unknown form");
    }
    logger.info("inputFormat = " + dataInputFormat);
    if (dataInputFormat.equals(InputFormatConst.BASE_FORMAT)) {
        inputData = SystemConfiguration.getProperty("pir.inputData", "none");
        if (inputData.equals("none")) {
            throw new IllegalArgumentException("For inputFormat = " + dataInputFormat + " an inputFile must be specified");
        }
        logger.info("inputFile = " + inputData);
    } else if (dataInputFormat.equals(InputFormatConst.ES)) {
        esQuery = SystemConfiguration.getProperty("pir.esQuery", "none");
        esResource = SystemConfiguration.getProperty("pir.esResource", "none");
        if (esQuery.equals("none")) {
            throw new IllegalArgumentException("esQuery must be specified");
        }
        if (esResource.equals("none")) {
            throw new IllegalArgumentException("esResource must be specified");
        }
        logger.info("esQuery = " + esQuery + " esResource = " + esResource);
    }
    outputFile = SystemConfiguration.getProperty("pir.outputFile");
    outputDirExp = outputFile + "_exp";

    queryInput = SystemConfiguration.getProperty("pir.queryInput");
    String stopListFile = SystemConfiguration.getProperty("pir.stopListFile");
    useModExpJoin = SystemConfiguration.getBooleanProperty("pir.useModExpJoin", false);

    logger.info("outputFile = " + outputFile + " queryInputDir = " + queryInput + " stopListFile = " + stopListFile
            + " esQuery = " + esQuery + " esResource = " + esResource);

    // Set the necessary configurations
    SparkConf conf = new SparkConf().setAppName("SparkPIR").setMaster("yarn-cluster");
    conf.set("es.nodes", SystemConfiguration.getProperty("es.nodes", "none"));
    conf.set("es.port", SystemConfiguration.getProperty("es.port", "none"));
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.memory.storageFraction", "0.10");
    conf.set("spark.memory.fraction", "0.25");
    // conf.set("spark.memory.fraction", "0.25");
    // conf.set("spark.executor.extraJavaOptions", "-XX:+UseCompressedOops");

    sc = new JavaSparkContext(conf);

    // Setup, run query, teardown
    logger.info("Setting up for query run");
    try {
        setup();
    } catch (IOException e) {
        throw new PIRException("An error occurred setting up the Spark responder.", e);
    }
    logger.info("Setup complete");
}
Example 16
Source File: WordCountingAppWithCheckpoint.java From tutorials with MIT License | 4 votes |
public static void main(String[] args) throws InterruptedException {

    Logger.getLogger("org")
        .setLevel(Level.OFF);
    Logger.getLogger("akka")
        .setLevel(Level.OFF);

    Map<String, Object> kafkaParams = new HashMap<>();
    kafkaParams.put("bootstrap.servers", "localhost:9092");
    kafkaParams.put("key.deserializer", StringDeserializer.class);
    kafkaParams.put("value.deserializer", StringDeserializer.class);
    kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
    kafkaParams.put("auto.offset.reset", "latest");
    kafkaParams.put("enable.auto.commit", false);

    Collection<String> topics = Arrays.asList("messages");

    SparkConf sparkConf = new SparkConf();
    sparkConf.setMaster("local[2]");
    sparkConf.setAppName("WordCountingAppWithCheckpoint");
    sparkConf.set("spark.cassandra.connection.host", "127.0.0.1");

    JavaStreamingContext streamingContext = new JavaStreamingContext(sparkConf, Durations.seconds(1));

    sparkContext = streamingContext.sparkContext();

    streamingContext.checkpoint("./.checkpoint");

    JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(streamingContext,
            LocationStrategies.PreferConsistent(),
            ConsumerStrategies.<String, String> Subscribe(topics, kafkaParams));

    JavaPairDStream<String, String> results = messages.mapToPair(record -> new Tuple2<>(record.key(), record.value()));

    JavaDStream<String> lines = results.map(tuple2 -> tuple2._2());

    JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(x.split("\\s+"))
        .iterator());

    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(s -> new Tuple2<>(s, 1))
        .reduceByKey((Function2<Integer, Integer, Integer>) (i1, i2) -> i1 + i2);

    JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> cumulativeWordCounts = wordCounts
        .mapWithState(StateSpec.function((word, one, state) -> {
            int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
            Tuple2<String, Integer> output = new Tuple2<>(word, sum);
            state.update(sum);
            return output;
        }));

    cumulativeWordCounts.foreachRDD(javaRdd -> {
        List<Tuple2<String, Integer>> wordCountList = javaRdd.collect();
        for (Tuple2<String, Integer> tuple : wordCountList) {
            List<Word> wordList = Arrays.asList(new Word(tuple._1, tuple._2));
            JavaRDD<Word> rdd = sparkContext.parallelize(wordList);
            javaFunctions(rdd).writerBuilder("vocabulary", "words", mapToRow(Word.class))
                .saveToCassandra();
        }
    });

    streamingContext.start();
    streamingContext.awaitTermination();
}
Example 17
Source File: SparkMergingDictionary.java From kylin with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    final String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    final String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    final String segmentIds = optionsHelper.getOptionValue(OPTION_MERGE_SEGMENT_IDS);
    final String dictOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_DICT);
    final String statOutputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH_STAT);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("scala.collection.mutable.WrappedArray$ofRef") };

    SparkConf conf = new SparkConf().setAppName("Merge dictionary for cube:" + cubeName + ", segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        KylinSparkJobListener jobListener = new KylinSparkJobListener();
        sc.sc().addSparkListener(jobListener);

        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(dictOutputPath));

        final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
        final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
        final CubeDesc cubeDesc = CubeDescManager.getInstance(envConfig).getCubeDesc(cubeInstance.getDescName());

        logger.info("Dictionary output path: {}", dictOutputPath);
        logger.info("Statistics output path: {}", statOutputPath);

        final TblColRef[] tblColRefs = cubeDesc.getAllColumnsNeedDictionaryBuilt().toArray(new TblColRef[0]);
        final int columnLength = tblColRefs.length;

        List<Integer> indexs = Lists.newArrayListWithCapacity(columnLength);

        for (int i = 0; i <= columnLength; i++) {
            indexs.add(i);
        }

        JavaRDD<Integer> indexRDD = sc.parallelize(indexs, columnLength + 1);

        JavaPairRDD<Text, Text> colToDictPathRDD = indexRDD.mapToPair(new MergeDictAndStatsFunction(cubeName,
                metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf));

        colToDictPathRDD.coalesce(1, false).saveAsNewAPIHadoopFile(dictOutputPath, Text.class, Text.class,
                SequenceFileOutputFormat.class);
    }
}
Example 18
Source File: JavaStocks.java From spark-ts-examples with Apache License 2.0 | 4 votes |
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("Spark-TS Ticker Example").setMaster("local");
    conf.set("spark.io.compression.codec", "org.apache.spark.io.LZ4CompressionCodec");
    JavaSparkContext context = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(context);

    DataFrame tickerObs = loadObservations(context, sqlContext, "../data/ticker.tsv");

    // Create a daily DateTimeIndex over August and September 2015
    ZoneId zone = ZoneId.systemDefault();
    DateTimeIndex dtIndex = DateTimeIndexFactory.uniformFromInterval(
            ZonedDateTime.of(LocalDateTime.parse("2015-08-03T00:00:00"), zone),
            ZonedDateTime.of(LocalDateTime.parse("2015-09-22T00:00:00"), zone),
            new BusinessDayFrequency(1, 0));

    // Align the ticker data on the DateTimeIndex to create a TimeSeriesRDD
    JavaTimeSeriesRDD tickerTsrdd = JavaTimeSeriesRDDFactory.timeSeriesRDDFromObservations(
            dtIndex, tickerObs, "timestamp", "symbol", "price");

    // Cache it in memory
    tickerTsrdd.cache();

    // Count the number of series (number of symbols)
    System.out.println(tickerTsrdd.count());

    // Impute missing values using linear interpolation
    JavaTimeSeriesRDD<String> filled = tickerTsrdd.fill("linear");

    // Compute return rates
    JavaTimeSeriesRDD<String> returnRates = filled.returnRates();

    // Compute Durbin-Watson stats for each series
    JavaPairRDD<String, Double> dwStats = returnRates.mapValues(
            (Vector x) -> TimeSeriesStatisticalTests.dwtest(x));

    class StatsComparator implements Comparator<Tuple2<String, Double>>, java.io.Serializable {
        public int compare(Tuple2<String, Double> a, Tuple2<String, Double> b) {
            return a._2.compareTo(b._2);
        }
    }

    System.out.println(dwStats.min(new StatsComparator()));
    System.out.println(dwStats.max(new StatsComparator()));
}
Example 19
Source File: SparkUHCDictionary.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey") };

    SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        Configuration hadoopConf = sc.hadoopConfiguration();
        hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

        final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
        KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        CubeManager cubeMgr = CubeManager.getInstance(config);
        CubeInstance cube = cubeMgr.getCube(cubeName);
        final Job job = Job.getInstance(sConf.get());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();
        String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();
        if (reducerCount == 0) {
            return;
        }

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerCount);
        logger.info("counter path {}", counterPath);

        JavaPairRDD<String, String> wholeSequenceFileNames = null;
        for (TblColRef tblColRef : uhcColumns) {
            String columnPath = inputPath + "/" + tblColRef.getIdentity();
            if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
                continue;
            }
            if (wholeSequenceFileNames == null) {
                wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
            } else {
                wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
            }
        }

        if (wholeSequenceFileNames == null) {
            logger.error("There're no sequence files at " + inputPath + " !");
            return;
        }

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
                .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
                .filter(tuple -> tuple._1 != -1)
                .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
                .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);
        //prevent to create zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        logger.info("Map input records={}", reducerCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);

        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
Example 20
Source File: SparkCubingByLayer.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);

    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl, sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);

    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);

    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}