Java Code Examples for org.apache.spark.api.java.JavaPairRDD#saveAsHadoopFile()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#saveAsHadoopFile().
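Before the project examples, here is a minimal, self-contained sketch of the call itself (the local master, the output path, and the tiny word-count style data are illustrative assumptions, not taken from any project below). It builds a small JavaPairRDD of Hadoop writables and writes it as a SequenceFile through the classic mapred API:

import java.util.Arrays;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class SaveAsHadoopFileSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
            .setAppName("saveAsHadoopFile-sketch")
            .setMaster("local[*]"); //local master only for this sketch
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            //build a small pair RDD of Hadoop writables (hypothetical data)
            JavaPairRDD<Text, IntWritable> pairs = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>(new Text("a"), new IntWritable(1)),
                new Tuple2<>(new Text("b"), new IntWritable(2))));
            //saveAsHadoopFile(path, keyClass, valueClass, outputFormatClass) is an action:
            //it triggers the job and writes one part file per partition under the output path
            pairs.saveAsHadoopFile("/tmp/saveAsHadoopFile-demo",
                Text.class, IntWritable.class, SequenceFileOutputFormat.class);
        }
    }
}

Note that saveAsHadoopFile is an action, so the write happens immediately, and the output directory must not already exist.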
Example 1
Source File: SparkExecutionContext.java From systemds with Apache License 2.0
@SuppressWarnings("unchecked") public static long writeRDDtoHDFS( RDDObject rdd, String path, OutputInfo oinfo ) { JavaPairRDD<MatrixIndexes,MatrixBlock> lrdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd.getRDD(); //piggyback nnz maintenance on write LongAccumulator aNnz = getSparkContextStatic().sc().longAccumulator("nnz"); lrdd = lrdd.mapValues(new ComputeBinaryBlockNnzFunction(aNnz)); //save file is an action which also triggers nnz maintenance lrdd.saveAsHadoopFile(path, oinfo.outputKeyClass, oinfo.outputValueClass, oinfo.outputFormatClass); //return nnz aggregate of all blocks return aNnz.value(); }
Example 2
Source File: SparkExecutionContext.java From systemds with Apache License 2.0
@SuppressWarnings("unchecked") public static void writeFrameRDDtoHDFS( RDDObject rdd, String path, OutputInfo oinfo ) { JavaPairRDD<?, FrameBlock> lrdd = (JavaPairRDD<Long, FrameBlock>) rdd.getRDD(); //convert keys to writables if necessary if( oinfo == OutputInfo.BinaryBlockOutputInfo ) { lrdd = ((JavaPairRDD<Long, FrameBlock>)lrdd).mapToPair( new LongFrameToLongWritableFrameFunction()); oinfo = OutputInfo.BinaryBlockFrameOutputInfo; } //save file is an action which also triggers nnz maintenance lrdd.saveAsHadoopFile(path, oinfo.outputKeyClass, oinfo.outputValueClass, oinfo.outputFormatClass); }
Example 3
Source File: SparkExecutionContext.java From systemds with Apache License 2.0
@SuppressWarnings("unchecked") public static long writeMatrixRDDtoHDFS( RDDObject rdd, String path, FileFormat fmt ) { JavaPairRDD<MatrixIndexes,MatrixBlock> lrdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd.getRDD(); InputOutputInfo oinfo = InputOutputInfo.get(DataType.MATRIX, fmt); //piggyback nnz maintenance on write LongAccumulator aNnz = getSparkContextStatic().sc().longAccumulator("nnz"); lrdd = lrdd.mapValues(new ComputeBinaryBlockNnzFunction(aNnz)); //save file is an action which also triggers nnz maintenance lrdd.saveAsHadoopFile(path, oinfo.keyClass, oinfo.valueClass, oinfo.outputFormatClass); //return nnz aggregate of all blocks return aNnz.value(); }
Example 4
Source File: SparkExecutionContext.java From systemds with Apache License 2.0
@SuppressWarnings("unchecked") public static void writeFrameRDDtoHDFS( RDDObject rdd, String path, FileFormat fmt) { JavaPairRDD<?, FrameBlock> lrdd = (JavaPairRDD<Long, FrameBlock>) rdd.getRDD(); InputOutputInfo oinfo = InputOutputInfo.get(DataType.FRAME, fmt); //convert keys to writables if necessary if( fmt == FileFormat.BINARY ) { lrdd = ((JavaPairRDD<Long, FrameBlock>)lrdd).mapToPair( new LongFrameToLongWritableFrameFunction()); } //save file is an action which also triggers nnz maintenance lrdd.saveAsHadoopFile(path, oinfo.keyClass, oinfo.valueClass, oinfo.outputFormatClass); }
Example 5
Source File: SequenceFile.java From sparkResearch with Apache License 2.0
protected static void run(JavaSparkContext sparkContext) {
    //read a SequenceFile of (Text, IntWritable) pairs
    JavaPairRDD<Text, IntWritable> javaPairRDD =
        sparkContext.sequenceFile("url", Text.class, IntWritable.class);
    JavaPairRDD<String, Integer> pairRDD = javaPairRDD.mapToPair(new sequenceToConvert());
    //write the result back out as a Hadoop file
    pairRDD.saveAsHadoopFile("url", Text.class, IntWritable.class, SequenceFileOutputFormat.class);
}
Example 6
Source File: RDDConverterUtils.java From systemds with Apache License 2.0
/**
 * Converts a libsvm text input file into two binary block matrices for features
 * and labels, and saves these to the specified output files. This call also deletes
 * existing files at the specified output locations, as well as determines and
 * writes the meta data files of both output matrices.
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing
 * the libsvm input files in order to ensure consistency with Spark.
 *
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn,
    String pathX, String pathY, DataCharacteristics mcOutX)
{
    if( !mcOutX.dimsKnown() )
        throw new DMLRuntimeException("Matrix characteristics "
            + "required to convert sparse input representation.");
    try {
        //cleanup existing output files
        HDFSTool.deleteFileIfExistOnHDFS(pathX);
        HDFSTool.deleteFileIfExistOnHDFS(pathY);

        //convert libsvm to labeled points
        int numFeatures = (int) mcOutX.getCols();
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
        JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints =
            MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();

        //append row index and best-effort caching to avoid repeated text parsing
        JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint,Long> ilpoints =
            lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK());

        //extract labels and convert to binary block
        DataCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getBlocksize(), -1);
        LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes,MatrixBlock> out1 = ilpoints
            .mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
        int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
        out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
        out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        mc1.setNonZeros(aNnz1.value()); //update nnz after triggered save
        HDFSTool.writeMetaDataFile(pathY+".mtd", ValueType.FP64, mc1, OutputInfo.BinaryBlockOutputInfo);

        //extract data and convert to binary block
        DataCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getBlocksize(), -1);
        LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes,MatrixBlock> out2 = ilpoints
            .mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
        out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
        out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        mc2.setNonZeros(aNnz2.value()); //update nnz after triggered save
        HDFSTool.writeMetaDataFile(pathX+".mtd", ValueType.FP64, mc2, OutputInfo.BinaryBlockOutputInfo);

        //asynchronous cleanup of cached intermediates
        ilpoints.unpersist(false);
    }
    catch(IOException ex) {
        throw new DMLRuntimeException(ex);
    }
}
Example 7
Source File: RDDConverterUtils.java From systemds with Apache License 2.0
/**
 * Converts a libsvm text input file into two binary block matrices for features
 * and labels, and saves these to the specified output files. This call also deletes
 * existing files at the specified output locations, as well as determines and
 * writes the meta data files of both output matrices.
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing
 * the libsvm input files in order to ensure consistency with Spark.
 *
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn,
    String pathX, String pathY, DataCharacteristics mcOutX)
{
    if( !mcOutX.dimsKnown() )
        throw new DMLRuntimeException("Matrix characteristics "
            + "required to convert sparse input representation.");
    try {
        //cleanup existing output files
        HDFSTool.deleteFileIfExistOnHDFS(pathX);
        HDFSTool.deleteFileIfExistOnHDFS(pathY);

        //convert libsvm to labeled points
        int numFeatures = (int) mcOutX.getCols();
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
        JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints =
            MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();

        //append row index and best-effort caching to avoid repeated text parsing
        JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint,Long> ilpoints =
            lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK());

        //extract labels and convert to binary block
        DataCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getBlocksize(), -1);
        LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes,MatrixBlock> out1 = ilpoints
            .mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
        int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
        out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
        out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        mc1.setNonZeros(aNnz1.value()); //update nnz after triggered save
        HDFSTool.writeMetaDataFile(pathY+".mtd", ValueType.FP64, mc1, FileFormat.BINARY);

        //extract data and convert to binary block
        DataCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getBlocksize(), -1);
        LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes,MatrixBlock> out2 = ilpoints
            .mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
        out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
        out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        mc2.setNonZeros(aNnz2.value()); //update nnz after triggered save
        HDFSTool.writeMetaDataFile(pathX+".mtd", ValueType.FP64, mc2, FileFormat.BINARY);

        //asynchronous cleanup of cached intermediates
        ilpoints.unpersist(false);
    }
    catch(IOException ex) {
        throw new DMLRuntimeException(ex);
    }
}