Java Code Examples for org.apache.spark.storage.StorageLevel#fromString()
The following examples show how to use
org.apache.spark.storage.StorageLevel#fromString() .
Each example is taken from an open-source project; the project, source file, and license are listed above each snippet.
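Before the project examples, here is a minimal, self-contained sketch of the call itself (the class name StorageLevelFromStringSketch, the chosen level string, and the tiny local job are illustrative assumptions, not taken from the projects below). StorageLevel.fromString maps a level name such as "MEMORY_AND_DISK" onto the corresponding StorageLevel constant, which can then be passed to persist; an unrecognized name makes it throw an IllegalArgumentException.

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.storage.StorageLevel;

// Illustrative sketch: parse a storage-level string and use it to persist an RDD.
public class StorageLevelFromStringSketch {
    public static void main(String[] args) {
        // Recognized names include "NONE", "DISK_ONLY", "MEMORY_ONLY", "MEMORY_ONLY_SER",
        // "MEMORY_AND_DISK", "MEMORY_AND_DISK_SER" and "OFF_HEAP"; an unknown name
        // causes fromString to throw an IllegalArgumentException.
        StorageLevel level = StorageLevel.fromString("MEMORY_AND_DISK");

        try (JavaSparkContext sc = new JavaSparkContext("local[*]", "fromString-sketch")) {
            JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3)).persist(level);
            System.out.println(rdd.count() + " elements cached at " + level.toString());
        }
    }
}

Most of the examples below follow the same pattern: read a level name from configuration, resolve it with fromString, and hand the result to persist or cache.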
Example 1
Source File: BoundedDataset.java From beam with Apache License 2.0 | 6 votes |
@Override
@SuppressWarnings("unchecked")
public void cache(String storageLevel, Coder<?> coder) {
    StorageLevel level = StorageLevel.fromString(storageLevel);
    if (TranslationUtils.canAvoidRddSerialization(level)) {
        // if it is memory only reduce the overhead of moving to bytes
        this.rdd = getRDD().persist(level);
    } else {
        // Caching can cause Serialization, we need to code to bytes
        // more details in https://issues.apache.org/jira/browse/BEAM-2669
        Coder<WindowedValue<T>> windowedValueCoder = (Coder<WindowedValue<T>>) coder;
        this.rdd =
            getRDD()
                .map(v -> ValueAndCoderLazySerializable.of(v, windowedValueCoder))
                .persist(level)
                .map(v -> v.getOrDecode(windowedValueCoder));
    }
}
Example 2
Source File: PersistedOutputRDD.java From tinkerpop with Apache License 2.0 | 6 votes |
@Override
public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<Object, VertexWritable> graphRDD) {
    if (!configuration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false))
        LOGGER.warn("The SparkContext should be persisted in order for the RDD to persist across jobs. To do so, set "
                + Constants.GREMLIN_SPARK_PERSIST_CONTEXT + " to true");
    if (!configuration.containsKey(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))
        throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION
                + " to write the persisted RDD to");
    SparkContextStorage.open(configuration).rm(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)); // this might be bad cause it unpersists the job RDD
    // determine which storage level to persist the RDD as with MEMORY_ONLY being the default cache()
    final StorageLevel storageLevel = StorageLevel.fromString(
            configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY"));
    if (!configuration.getBoolean(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES, true))
        graphRDD.mapValues(vertex -> {
                    vertex.get().dropEdges(Direction.BOTH);
                    return vertex;
                }).setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)))
                .persist(storageLevel)
                // call action to eager store rdd
                .count();
    else
        graphRDD.setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)))
                .persist(storageLevel)
                // call action to eager store rdd
                .count();
    Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD
}
Example 3
Source File: CheckpointSPInstruction.java From systemds with Apache License 2.0 | 5 votes |
public static CheckpointSPInstruction parseInstruction(String str) {
    String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
    InstructionUtils.checkNumFields(parts, 3);

    String opcode = parts[0];
    CPOperand in = new CPOperand(parts[1]);
    CPOperand out = new CPOperand(parts[2]);
    StorageLevel level = StorageLevel.fromString(parts[3]);

    return new CheckpointSPInstruction(null, in, out, level, opcode, str);
}
Example 4
Source File: StorageLevelDeserializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Override
public StorageLevel deserialize(JsonParser jsonParser, DeserializationContext deserializationContext)
        throws IOException, JsonProcessingException {
    JsonNode node = jsonParser.getCodec().readTree(jsonParser);
    String value = node.textValue();
    if (value == null || "null".equals(value)) {
        return null;
    }
    return StorageLevel.fromString(value);
}
Example 5
Source File: SparkCubingByLayer.java From kylin-on-parquet-v2 with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);
    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl, sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);
    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);
    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);
        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}
Example 6
Source File: StorageLevelConverter.java From rdf2x with Apache License 2.0 | 4 votes |
@Override
public StorageLevel convert(String value) {
    return StorageLevel.fromString(value);
}
Example 7
Source File: SparkConfigUtils.java From hudi with Apache License 2.0 | 4 votes |
public static StorageLevel getWriteStatusStorageLevel(Properties properties) {
    return StorageLevel.fromString(properties.getProperty(WRITE_STATUS_STORAGE_LEVEL));
}
Example 8
Source File: SparkConfigUtils.java From hudi with Apache License 2.0 | 4 votes |
public static StorageLevel getBloomIndexInputStorageLevel(Properties properties) {
    return StorageLevel.fromString(properties.getProperty(HoodieIndexConfig.BLOOM_INDEX_INPUT_STORAGE_LEVEL));
}
Example 9
Source File: SparkConfigUtils.java From hudi with Apache License 2.0 | 4 votes |
public static StorageLevel getSimpleIndexInputStorageLevel(Properties properties) {
    return StorageLevel.fromString(properties.getProperty(HoodieIndexConfig.SIMPLE_INDEX_INPUT_STORAGE_LEVEL));
}
Example 10
Source File: SparkCubingByLayer.java From kylin with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1") };

    SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId);
    //serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    JavaSparkContext sc = new JavaSparkContext(conf);
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));
    SparkUtil.modifySparkHadoopConfiguration(sc.sc(), AbstractHadoopJob
            .loadKylinConfigFromHdfs(new SerializableConfiguration(sc.hadoopConfiguration()), metaUrl)); // set dfs.replication and enable compress
    final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);
    final CubeDesc cubeDesc = cubeInstance.getDescriptor();
    final CubeSegment cubeSegment = cubeInstance.getSegmentById(segmentId);

    logger.info("RDD input path: {}", inputPath);
    logger.info("RDD Output path: {}", outputPath);

    final Job job = Job.getInstance(sConf.get());
    SparkUtil.setHadoopConfForCuboid(job, cubeSegment, metaUrl);

    int countMeasureIndex = 0;
    for (MeasureDesc measureDesc : cubeDesc.getMeasures()) {
        if (measureDesc.getFunction().isCount() == true) {
            break;
        } else {
            countMeasureIndex++;
        }
    }
    final CubeStatsReader cubeStatsReader = new CubeStatsReader(cubeSegment, envConfig);
    boolean[] needAggr = new boolean[cubeDesc.getMeasures().size()];
    boolean allNormalMeasure = true;
    for (int i = 0; i < cubeDesc.getMeasures().size(); i++) {
        needAggr[i] = !cubeDesc.getMeasures().get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid();
        allNormalMeasure = allNormalMeasure && needAggr[i];
    }
    logger.info("All measure are normal (agg on all cuboids) ? : " + allNormalMeasure);
    StorageLevel storageLevel = StorageLevel.fromString(envConfig.getSparkStorageLevel());

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    final JavaPairRDD<ByteArray, Object[]> encodedBaseRDD = SparkUtil
            .hiveRecordInputRDD(isSequenceFile, sc, inputPath, hiveTable)
            .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf));

    Long totalCount = 0L;
    if (envConfig.isSparkSanityCheckEnabled()) {
        totalCount = encodedBaseRDD.count();
    }

    final BaseCuboidReducerFunction2 baseCuboidReducerFunction = new BaseCuboidReducerFunction2(cubeName, metaUrl, sConf);
    BaseCuboidReducerFunction2 reducerFunction2 = baseCuboidReducerFunction;
    if (allNormalMeasure == false) {
        reducerFunction2 = new CuboidReducerFunction2(cubeName, metaUrl, sConf, needAggr);
    }

    final int totalLevels = cubeSegment.getCuboidScheduler().getBuildLevel();
    JavaPairRDD<ByteArray, Object[]>[] allRDDs = new JavaPairRDD[totalLevels + 1];
    int level = 0;
    int partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);

    // aggregate to calculate base cuboid
    allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);
    saveToHDFS(allRDDs[0], metaUrl, cubeName, cubeSegment, outputPath, 0, job, envConfig);

    PairFlatMapFunction flatMapFunction = new CuboidFlatMap(cubeName, segmentId, metaUrl, sConf);
    // aggregate to ND cuboids
    for (level = 1; level <= totalLevels; level++) {
        partition = SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig);
        allRDDs[level] = allRDDs[level - 1].flatMapToPair(flatMapFunction).reduceByKey(reducerFunction2, partition)
                .persist(storageLevel);
        allRDDs[level - 1].unpersist(false);
        if (envConfig.isSparkSanityCheckEnabled() == true) {
            sanityCheck(allRDDs[level], totalCount, level, cubeStatsReader, countMeasureIndex);
        }
        saveToHDFS(allRDDs[level], metaUrl, cubeName, cubeSegment, outputPath, level, job, envConfig);
    }
    allRDDs[totalLevels].unpersist(false);
    logger.info("Finished on calculating all level cuboids.");
    logger.info("HDFS: Number of bytes written=" + jobListener.metrics.getBytesWritten());
    //HadoopUtil.deleteHDFSMeta(metaUrl);
}
Example 11
Source File: Checkpoint.java From systemds with Apache License 2.0 | 3 votes |
/**
 * TODO change string parameter storage.level to StorageLevel as soon as we can assume
 * that Spark libraries are always available.
 *
 * @param input low-level operator
 * @param dt data type
 * @param vt value type
 * @param level storage level
 */
public Checkpoint(Lop input, DataType dt, ValueType vt, String level) {
    super(Lop.Type.Checkpoint, dt, vt);
    this.addInput(input);
    input.addOutput(this);

    _storageLevel = StorageLevel.fromString(level);
    lps.setProperties(inputs, ExecType.SPARK);
}
Example 12
Source File: Checkpoint.java From systemds with Apache License 2.0 | 3 votes |
/**
 * TODO change string parameter storage.level to StorageLevel as soon as we can assume
 * that Spark libraries are always available.
 *
 * @param input low-level operator
 * @param dt data type
 * @param vt value type
 * @param level storage level
 */
public Checkpoint(Lop input, DataType dt, ValueType vt, String level) {
    super(Lop.Type.Checkpoint, dt, vt);
    addInput(input);
    input.addOutput(this);

    _storageLevel = StorageLevel.fromString(level);
    lps.setProperties(inputs, ExecType.SPARK);
}