Java Code Examples for org.apache.spark.storage.StorageLevel#fromString()
The following examples show how to use
org.apache.spark.storage.StorageLevel#fromString() .
Each example is taken from an open-source project; the project, source file, and license are listed above each snippet.
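Before the project examples, here is a minimal, self-contained sketch of the call itself (the class name StorageLevelFromStringSketch, the chosen level string, and the tiny local job are illustrative assumptions, not taken from the projects below). StorageLevel.fromString maps a level name such as "MEMORY_AND_DISK" onto the corresponding StorageLevel constant, which can then be passed to persist; an unrecognized name makes it throw an IllegalArgumentException.

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;

// Illustrative sketch: parse a storage-level string and use it to persist an RDD.
public class StorageLevelFromStringSketch {
    public static void main(String[] args) {
        // Recognized names include "NONE", "DISK_ONLY", "MEMORY_ONLY", "MEMORY_ONLY_SER",
        // "MEMORY_AND_DISK", "MEMORY_AND_DISK_SER" and "OFF_HEAP"; an unknown name
        // causes fromString to throw an IllegalArgumentException.
        StorageLevel level = StorageLevel.fromString("MEMORY_AND_DISK");

        try (JavaSparkContext sc = new JavaSparkContext("local[*]", "fromString-sketch")) {
            JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3)).persist(level);
            System.out.println(rdd.count() + " elements cached at " + level.toString());
        }
    }
}

Most of the examples below follow the same pattern: read a level name from configuration, resolve it with fromString, and hand the result to persist or cache.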
Example 1
Source File: BoundedDataset.java From beam with Apache License 2.0 | 6 votes |
@Override
@SuppressWarnings("unchecked")
public void cache(String storageLevel, Coder<?> coder) {
    StorageLevel level = StorageLevel.fromString(storageLevel);
    if (TranslationUtils.canAvoidRddSerialization(level)) {
        // if it is memory only reduce the overhead of moving to bytes
        this.rdd = getRDD().persist(level);
    } else {
        // Caching can cause Serialization, we need to code to bytes
        // more details in https://issues.apache.org/jira/browse/BEAM-2669
        Coder<WindowedValue<T>> windowedValueCoder = (Coder<WindowedValue<T>>) coder;
        this.rdd =
            getRDD()
                .map(v -> ValueAndCoderLazySerializable.of(v, windowedValueCoder))
                .persist(level)
                .map(v -> v.getOrDecode(windowedValueCoder));
    }
}
Example 2
Source File: PersistedOutputRDD.java From tinkerpop with Apache License 2.0 | 6 votes |
@Override
public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<Object, VertexWritable> graphRDD) {
    if (!configuration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false))
        LOGGER.warn("The SparkContext should be persisted in order for the RDD to persist across jobs. To do so, set "
                + Constants.GREMLIN_SPARK_PERSIST_CONTEXT + " to true");
    if (!configuration.containsKey(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))
        throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION
                + " to write the persisted RDD to");
    SparkContextStorage.open(configuration).rm(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)); // this might be bad cause it unpersists the job RDD
    // determine which storage level to persist the RDD as with MEMORY_ONLY being the default cache()
    final StorageLevel storageLevel = StorageLevel.fromString(
            configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY"));
    if (!configuration.getBoolean(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES, true))
        graphRDD.mapValues(vertex -> {
                    vertex.get().dropEdges(Direction.BOTH);
                    return vertex;
                }).setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)))
                .persist(storageLevel)
                // call action to eager store rdd
                .count();
    else
        graphRDD.setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)))
                .persist(storageLevel)
                // call action to eager store rdd
                .count();
    Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD
}
Example 3
Source File: CheckpointSPInstruction.java From systemds with Apache License 2.0 | 5 votes |
public static CheckpointSPInstruction parseInstruction(String str) {
    String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
    InstructionUtils.checkNumFields(parts, 3);

    String opcode = parts[0];
    CPOperand in = new CPOperand(parts[1]);
    CPOperand out = new CPOperand(parts[2]);
    StorageLevel level = StorageLevel.fromString(parts[3]);

    return new CheckpointSPInstruction(null, in, out, level, opcode, str);
}
Example 4
Source File: StorageLevelDeserializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Override
public StorageLevel deserialize(JsonParser jsonParser, DeserializationContext deserializationContext)
        throws IOException, JsonProcessingException {
    JsonNode node = jsonParser.getCodec().readTree(jsonParser);
    String value = node.textValue();
    if (value == null || "null".equals(value)) {
        return null;
    }
    return StorageLevel.fromString(value);
}
Example 5
Source File: SparkCubingByLayer.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);
    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl, sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);
    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);
    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);
        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}
Example 6
Source File: StorageLevelConverter.java From rdf2x with Apache License 2.0 | 4 votes |
@Override
public StorageLevel convert(String value) {
    return StorageLevel.fromString(value);
}
Example 7
Source File: SparkConfigUtils.java From hudi with Apache License 2.0 | 4 votes |
public static StorageLevel getWriteStatusStorageLevel(Properties properties) {
    return StorageLevel.fromString(properties.getProperty(WRITE_STATUS_STORAGE_LEVEL));
}
Example 8
Source File: SparkConfigUtils.java From hudi with Apache License 2.0 | 4 votes |
public static StorageLevel getBloomIndexInputStorageLevel(Properties properties) {
    return StorageLevel.fromString(properties.getProperty(HoodieIndexConfig.BLOOM_INDEX_INPUT_STORAGE_LEVEL));
}
Example 9
Source File: SparkConfigUtils.java From hudi with Apache License 2.0 | 4 votes |
public static StorageLevel getSimpleIndexInputStorageLevel(Properties properties) {
    return StorageLevel.fromString(properties.getProperty(HoodieIndexConfig.SIMPLE_INDEX_INPUT_STORAGE_LEVEL));
}
Example 10
Source File: SparkCubingByLayer.java From kylin with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc(), AbstractHadoopJob
            .loadKylinConfigFromHdfs(new SerializableConfiguration(sc.hadoopConfiguration()), metaUrl)); // set dfs.replication and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);
    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl, sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);
    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);
    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);
        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}
Example 11
Source File: Checkpoint.java From systemds with Apache License 2.0 | 3 votes |
/**
 * TODO change string parameter storage.level to StorageLevel as soon as we can assume
 * that Spark libraries are always available.
 *
 * @param input low-level operator
 * @param dt data type
 * @param vt value type
 * @param level storage level
 */
public Checkpoint(Lop input, DataType dt, ValueType vt, String level) {
    super(Lop.Type.Checkpoint, dt, vt);
    this.addInput(input);
    input.addOutput(this);

    _storageLevel = StorageLevel.fromString(level);
    lps.setProperties(inputs, ExecType.SPARK);
}
Example 12
Source File: Checkpoint.java From systemds with Apache License 2.0 | 3 votes |
/**
 * TODO change string parameter storage.level to StorageLevel as soon as we can assume
 * that Spark libraries are always available.
 *
 * @param input low-level operator
 * @param dt data type
 * @param vt value type
 * @param level storage level
 */
public Checkpoint(Lop input, DataType dt, ValueType vt, String level) {
    super(Lop.Type.Checkpoint, dt, vt);
    addInput(input);
    input.addOutput(this);

    _storageLevel = StorageLevel.fromString(level);
    lps.setProperties(inputs, ExecType.SPARK);
}