Java Code Examples for org.apache.spark.api.java.JavaPairRDD#union()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#union().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
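For orientation, the snippet below is a minimal, self-contained sketch of what union() does: it concatenates two pair RDDs with the same key and value types, keeping duplicate keys and performing no shuffle. It is illustrative only; the application name, local master, and sample data are assumptions, not taken from any of the projects listed here.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class UnionExample {
    public static void main(String[] args) {
        // Local Spark context for demonstration purposes only (assumed setup).
        SparkConf conf = new SparkConf().setAppName("UnionExample").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaPairRDD<String, Integer> first = sc.parallelizePairs(
                    Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2)));
            JavaPairRDD<String, Integer> second = sc.parallelizePairs(
                    Arrays.asList(new Tuple2<>("b", 3), new Tuple2<>("c", 4)));

            // union() simply appends one RDD to the other: duplicate keys ("b")
            // are preserved and no shuffle or deduplication takes place.
            JavaPairRDD<String, Integer> combined = first.union(second);
            combined.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2()));
        }
    }
}

Many of the examples below follow this pattern: build an RDD incrementally (or guard against a missing input) and fold additional data in with union() before a subsequent aggregation.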
Example 1
Source File: ExampleBatchLayerUpdate.java From oryx with Apache License 2.0 | 6 votes |
@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp,
    JavaPairRDD<String,String> newData, JavaPairRDD<String,String> pastData,
    String modelDirString, TopicProducer<String,String> modelUpdateTopic) throws IOException {
  JavaPairRDD<String,String> allData = pastData == null ? newData : newData.union(pastData);
  String modelString;
  try {
    modelString = new ObjectMapper().writeValueAsString(countDistinctOtherWords(allData));
  } catch (JsonProcessingException jpe) {
    throw new IOException(jpe);
  }
  modelUpdateTopic.send("MODEL", modelString);
}
Example 2
Source File: AppendGAlignedSPInstruction.java From systemds with Apache License 2.0 | 6 votes |
@Override
public void processInstruction(ExecutionContext ec) {
  // general case append (map-extend, aggregate)
  SparkExecutionContext sec = (SparkExecutionContext) ec;
  checkBinaryAppendInputCharacteristics(sec, _cbind, false, true);
  DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());

  JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
  JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
  JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;

  // Simple changing of matrix indexes of RHS
  long shiftBy = _cbind ? mc1.getNumColBlocks() : mc1.getNumRowBlocks();
  out = in2.mapToPair(new ShiftColumnIndex(shiftBy, _cbind));
  out = in1.union( out );

  //put output RDD handle into symbol table
  updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
  sec.setRDDHandleForVariable(output.getName(), out);
  sec.addLineageRDD(output.getName(), input1.getName());
  sec.addLineageRDD(output.getName(), input2.getName());
}
Example 3
Source File: RDDConverterUtilsExt.java From systemds with Apache License 2.0 | 6 votes |
public static JavaPairRDD<MatrixIndexes, MatrixBlock> coordinateMatrixToBinaryBlock(JavaSparkContext sc,
    CoordinateMatrix input, DataCharacteristics mcIn, boolean outputEmptyBlocks) {
  //convert matrix entry rdd to binary block rdd (w/ partial blocks)
  JavaPairRDD<MatrixIndexes, MatrixBlock> out = input.entries().toJavaRDD()
      .mapPartitionsToPair(new MatrixEntryToBinaryBlockFunction(mcIn));

  //inject empty blocks (if necessary)
  if( outputEmptyBlocks && mcIn.mightHaveEmptyBlocks() ) {
    out = out.union( SparkUtils.getEmptyBlockRDD(sc, mcIn) );
  }

  //aggregate partial matrix blocks
  out = RDDAggregateUtils.mergeByKey(out, false);

  return out;
}
Example 4
Source File: RDDConverterUtils.java From systemds with Apache License 2.0 | 6 votes |
public static JavaPairRDD<MatrixIndexes, MatrixBlock> textCellToBinaryBlock(JavaSparkContext sc,
    JavaPairRDD<LongWritable, Text> input, DataCharacteristics mcOut,
    boolean outputEmptyBlocks, FileFormatPropertiesMM mmProps) {
  //convert textcell rdd to binary block rdd (w/ partial blocks)
  JavaPairRDD<MatrixIndexes, MatrixBlock> out = input.values()
      .mapPartitionsToPair(new TextToBinaryBlockFunction(mcOut, mmProps));

  //inject empty blocks (if necessary)
  if( outputEmptyBlocks && mcOut.mightHaveEmptyBlocks() ) {
    out = out.union( SparkUtils.getEmptyBlockRDD(sc, mcOut) );
  }

  //aggregate partial matrix blocks
  out = RDDAggregateUtils.mergeByKey(out, false);

  return out;
}
Example 5
Source File: RDDConverterUtils.java From systemds with Apache License 2.0 | 6 votes |
public static JavaPairRDD<MatrixIndexes, MatrixBlock> binaryCellToBinaryBlock(JavaSparkContext sc,
    JavaPairRDD<MatrixIndexes, MatrixCell> input, DataCharacteristics mcOut, boolean outputEmptyBlocks) {
  //convert binarycell rdd to binary block rdd (w/ partial blocks)
  JavaPairRDD<MatrixIndexes, MatrixBlock> out = input
      .mapPartitionsToPair(new BinaryCellToBinaryBlockFunction(mcOut));

  //inject empty blocks (if necessary)
  if( outputEmptyBlocks && mcOut.mightHaveEmptyBlocks() ) {
    out = out.union( SparkUtils.getEmptyBlockRDD(sc, mcOut) );
  }

  //aggregate partial matrix blocks
  out = RDDAggregateUtils.mergeByKey(out, false);

  return out;
}
Example 6
Source File: FrameAppendRSPInstruction.java From systemds with Apache License 2.0 | 5 votes |
@Override
public void processInstruction(ExecutionContext ec) {
  SparkExecutionContext sec = (SparkExecutionContext) ec;

  JavaPairRDD<Long,FrameBlock> in1 = sec.getFrameBinaryBlockRDDHandleForVariable( input1.getName() );
  JavaPairRDD<Long,FrameBlock> in2 = sec.getFrameBinaryBlockRDDHandleForVariable( input2.getName() );
  JavaPairRDD<Long,FrameBlock> out = null;
  long leftRows = sec.getDataCharacteristics(input1.getName()).getRows();

  if(_cbind) {
    JavaPairRDD<Long,FrameBlock> in1Aligned = in1.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
    in1Aligned = FrameRDDAggregateUtils.mergeByKey(in1Aligned);
    JavaPairRDD<Long,FrameBlock> in2Aligned = in2.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
    in2Aligned = FrameRDDAggregateUtils.mergeByKey(in2Aligned);
    out = in1Aligned.join(in2Aligned).mapValues(new ReduceSideColumnsFunction(_cbind));
  }
  else { //rbind
    JavaPairRDD<Long,FrameBlock> right = in2.mapToPair( new ReduceSideAppendRowsFunction(leftRows));
    out = in1.union(right);
  }

  //put output RDD handle into symbol table
  updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
  sec.setRDDHandleForVariable(output.getName(), out);
  sec.addLineageRDD(output.getName(), input1.getName());
  sec.addLineageRDD(output.getName(), input2.getName());

  //update schema of output with merged input schemas
  sec.getFrameObject(output.getName()).setSchema(
    sec.getFrameObject(input1.getName()).mergeSchemas(
    sec.getFrameObject(input2.getName())));
}
Example 7
Source File: SparkUHCDictionary.java From kylin with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
  String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
  String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
  String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
  String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
  String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
  String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

  Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
      Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

  SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
  //serialization conf
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
  conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
  conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

  KylinSparkJobListener jobListener = new KylinSparkJobListener();
  try (JavaSparkContext sc = new JavaSparkContext(conf)) {
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

    Configuration hadoopConf = sc.hadoopConfiguration();
    hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

    final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
    KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    CubeManager cubeMgr = CubeManager.getInstance(config);
    CubeInstance cube = cubeMgr.getCube(cubeName);
    final Job job = Job.getInstance(sConf.get());

    // calculate source record bytes size
    final LongAccumulator bytesWritten = sc.sc().longAccumulator();
    String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

    List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
    int reducerCount = uhcColumns.size();
    if (reducerCount == 0) {
      return;
    }

    logger.info("RDD Output path: {}", outputPath);
    logger.info("getTotalReducerNum: {}", reducerCount);
    logger.info("counter path {}", counterPath);

    JavaPairRDD<String, String> wholeSequenceFileNames = null;
    for (TblColRef tblColRef : uhcColumns) {
      String columnPath = inputPath + "/" + tblColRef.getIdentity();
      if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
        continue;
      }
      if (wholeSequenceFileNames == null) {
        wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
      } else {
        wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
      }
    }

    if (wholeSequenceFileNames == null) {
      logger.error("There're no sequence files at " + inputPath + " !");
      return;
    }

    JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
        .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
        .filter(tuple -> tuple._1 != -1)
        .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
        .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
        NullWritable.class, ArrayPrimitiveWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);

    //prevent creating a zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
    multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

    logger.info("Map input records={}", reducerCount);
    logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

    Map<String, String> counterMap = Maps.newHashMap();
    counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
    counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

    // save counter to hdfs
    HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
    HadoopUtil.deleteHDFSMeta(metaUrl);
  }
}
Example 8
Source File: DataVecSparkUtil.java From deeplearning4j with Apache License 2.0 | 4 votes |
/**
 * This is a convenience method to combine data from separate files together (intended to write to a sequence file, using
 * {@link org.apache.spark.api.java.JavaPairRDD#saveAsNewAPIHadoopFile(String, Class, Class, Class)})<br>
 * A typical use case is to combine input and label data from different files, for later parsing by a RecordReader
 * or SequenceRecordReader. A typical use case is as follows:<br>
 * Given two paths (directories), combine the files in these two directories into pairs.<br>
 * Then, for each pair of files, convert the file contents into a {@link BytesPairWritable}, which also contains
 * the original file paths of the files.<br>
 * The assumptions are as follows:<br>
 * - For every file in the first directory, there is an equivalent file in the second directory (i.e., same key)<br>
 * - The pairing of files can be done based on the paths of the files; paths are mapped to a key using a {@link PathToKeyConverter};
 *   keys are then matched to give pairs of files<br>
 * <br><br>
 * <b>Example usage</b>: to combine all files in directory {@code dir1} with equivalent files in {@code dir2}, by file name:
 * <pre>
 * <code>JavaSparkContext sc = ...;
 * String path1 = "/dir1";
 * String path2 = "/dir2";
 * PathToKeyConverter pathConverter = new PathToKeyConverterFilename();
 * JavaPairRDD<Text,BytesPairWritable> toWrite = DataVecSparkUtil.combineFilesForSequenceFile(sc, path1, path2, pathConverter, pathConverter);
 * String outputPath = "/my/output/path";
 * toWrite.saveAsNewAPIHadoopFile(outputPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);
 * </code>
 * </pre>
 * Result: the file contents are aggregated (pairwise) and written to a Hadoop sequence file at /my/output/path
 *
 * @param sc         Spark context
 * @param path1      First directory (passed to JavaSparkContext.binaryFiles(path1))
 * @param path2      Second directory (passed to JavaSparkContext.binaryFiles(path2))
 * @param converter1 Converter, to convert file paths in the first directory to a key (to allow files to be matched/paired by key)
 * @param converter2 As above, for the second directory
 * @return Paired file contents, as {@code Text} keys and {@link BytesPairWritable} values
 */
public static JavaPairRDD<Text, BytesPairWritable> combineFilesForSequenceFile(JavaSparkContext sc, String path1,
    String path2, PathToKeyConverter converter1, PathToKeyConverter converter2) {
  JavaPairRDD<String, PortableDataStream> first = sc.binaryFiles(path1);
  JavaPairRDD<String, PortableDataStream> second = sc.binaryFiles(path2);

  //Now: process keys (paths) so that they can be merged
  JavaPairRDD<String, Tuple3<String, Integer, PortableDataStream>> first2 =
      first.mapToPair(new PathToKeyFunction(0, converter1));
  JavaPairRDD<String, Tuple3<String, Integer, PortableDataStream>> second2 =
      second.mapToPair(new PathToKeyFunction(1, converter2));
  JavaPairRDD<String, Tuple3<String, Integer, PortableDataStream>> merged = first2.union(second2);

  //Combine into pairs, and prepare for writing
  JavaPairRDD<Text, BytesPairWritable> toWrite = merged.groupByKey().mapToPair(new MapToBytesPairWritableFunction());

  return toWrite;
}
Example 9
Source File: CtableSPInstruction.java From systemds with Apache License 2.0 | 4 votes |
@Override
public void processInstruction(ExecutionContext ec) {
  SparkExecutionContext sec = (SparkExecutionContext) ec;
  Ctable.OperationTypes ctableOp = Ctable.findCtableOperationByInputDataTypes(
    input1.getDataType(), input2.getDataType(), input3.getDataType());
  ctableOp = _isExpand ? Ctable.OperationTypes.CTABLE_EXPAND_SCALAR_WEIGHT : ctableOp;

  //get input rdd handle
  JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
  JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = !ctableOp.hasSecondInput() ? null :
    sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
  JavaPairRDD<MatrixIndexes,MatrixBlock> in3 = null;
  double s2 = -1, s3 = -1; //scalars

  DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
  DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());

  // handle known/unknown dimensions
  long dim1 = (_dim1Literal ? (long) Double.parseDouble(_outDim1) :
    (sec.getScalarInput(_outDim1, ValueType.FP64, false)).getLongValue());
  long dim2 = (_dim2Literal ? (long) Double.parseDouble(_outDim2) :
    (sec.getScalarInput(_outDim2, ValueType.FP64, false)).getLongValue());
  if( dim1 == -1 && dim2 == -1 ) {
    //note: if we need to determine the dimensions, we do so before
    //creating cells to avoid unnecessary caching, repeated joins, etc.
    dim1 = (long) RDDAggregateUtils.max(in1);
    dim2 = ctableOp.hasSecondInput() ? (long) RDDAggregateUtils.max(in2) :
      sec.getScalarInput(input3).getLongValue();
  }
  mcOut.set(dim1, dim2, mc1.getBlocksize(), mc1.getBlocksize());
  mcOut.setNonZerosBound(mc1.getRows());

  //compute preferred degree of parallelism
  int numParts = Math.max(4 * (mc1.dimsKnown() ?
    SparkUtils.getNumPreferredPartitions(mc1) : in1.getNumPartitions()),
    SparkUtils.getNumPreferredPartitions(mcOut));

  JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
  switch(ctableOp) {
    case CTABLE_TRANSFORM: //(VECTOR)
      // F=ctable(A,B,W)
      in3 = sec.getBinaryMatrixBlockRDDHandleForVariable( input3.getName() );
      out = in1.join(in2, numParts).join(in3, numParts)
        .mapValues(new MapJoinSignature3())
        .mapPartitionsToPair(new CTableFunction(ctableOp, s2, s3, _ignoreZeros, mcOut));
      break;
    case CTABLE_EXPAND_SCALAR_WEIGHT: //(VECTOR)
    case CTABLE_TRANSFORM_SCALAR_WEIGHT: //(VECTOR/MATRIX)
      // F = ctable(A,B) or F = ctable(A,B,1)
      s3 = sec.getScalarInput(input3).getDoubleValue();
      out = in1.join(in2, numParts).mapValues(new MapJoinSignature2())
        .mapPartitionsToPair(new CTableFunction(ctableOp, s2, s3, _ignoreZeros, mcOut));
      break;
    case CTABLE_TRANSFORM_HISTOGRAM: //(VECTOR)
      // F=ctable(A,1) or F = ctable(A,1,1)
      s2 = sec.getScalarInput(input2).getDoubleValue();
      s3 = sec.getScalarInput(input3).getDoubleValue();
      out = in1.mapValues(new MapJoinSignature1())
        .mapPartitionsToPair(new CTableFunction(ctableOp, s2, s3, _ignoreZeros, mcOut));
      break;
    case CTABLE_TRANSFORM_WEIGHTED_HISTOGRAM: //(VECTOR)
      // F=ctable(A,1,W)
      in3 = sec.getBinaryMatrixBlockRDDHandleForVariable( input3.getName() );
      s2 = sec.getScalarInput(input2).getDoubleValue();
      out = in1.join(in3, numParts).mapValues(new MapJoinSignature2())
        .mapPartitionsToPair(new CTableFunction(ctableOp, s2, s3, _ignoreZeros, mcOut));
      break;
    default:
      throw new DMLRuntimeException("Encountered an invalid ctable operation ("+ctableOp+") while executing instruction: " + this.toString());
  }

  //perform fused aggregation and reblock
  out = out.union(SparkUtils.getEmptyBlockRDD(sec.getSparkContext(), mcOut));
  out = RDDAggregateUtils.sumByKeyStable(out, numParts, false);

  //store output rdd handle
  sec.setRDDHandleForVariable(output.getName(), out);
  sec.addLineageRDD(output.getName(), input1.getName());
  if( ctableOp.hasSecondInput() )
    sec.addLineageRDD(output.getName(), input2.getName());
  if( ctableOp.hasThirdInput() )
    sec.addLineageRDD(output.getName(), input3.getName());
}
Example 10
Source File: PMapmmSPInstruction.java From systemds with Apache License 2.0 | 4 votes |
@Override
public void processInstruction(ExecutionContext ec) {
  SparkExecutionContext sec = (SparkExecutionContext) ec;

  //get inputs
  JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
  JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
  DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());

  // This avoids errors such as java.lang.UnsupportedOperationException:
  // Cannot change storage level of an RDD after it was already assigned a level.
  // Ideally, we should ensure that we do not redundantly call persist on the same RDD.
  StorageLevel pmapmmStorageLevel = StorageLevel.MEMORY_AND_DISK();

  //cache right hand side because accessed many times
  in2 = in2.repartition(sec.getSparkContext().defaultParallelism())
    .persist(pmapmmStorageLevel);

  JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
  for( int i=0; i<mc1.getRows(); i+=NUM_ROWBLOCKS*mc1.getBlocksize() ) {
    //create broadcast for rdd partition
    JavaPairRDD<MatrixIndexes,MatrixBlock> rdd = in1
      .filter(new IsBlockInRange(i+1, i+NUM_ROWBLOCKS*mc1.getBlocksize(), 1, mc1.getCols(), mc1))
      .mapToPair(new PMapMMRebaseBlocksFunction(i/mc1.getBlocksize()));
    int rlen = (int)Math.min(mc1.getRows()-i, NUM_ROWBLOCKS*mc1.getBlocksize());
    PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(rdd, rlen, (int)mc1.getCols(), mc1.getBlocksize(), -1L);
    Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);

    //matrix multiplication
    JavaPairRDD<MatrixIndexes,MatrixBlock> rdd2 = in2
      .flatMapToPair(new PMapMMFunction(bpmb, i/mc1.getBlocksize()));
    rdd2 = RDDAggregateUtils.sumByKeyStable(rdd2, false);
    rdd2.persist(pmapmmStorageLevel)
        .count();
    bpmb.unpersist(false);

    if( out == null )
      out = rdd2;
    else
      out = out.union(rdd2);
  }

  //cache final result
  out = out.persist(pmapmmStorageLevel);
  out.count();

  //put output RDD handle into symbol table
  sec.setRDDHandleForVariable(output.getName(), out);
  sec.addLineageRDD(output.getName(), input1.getName());
  sec.addLineageRDD(output.getName(), input2.getName());

  //update output statistics if not inferred
  updateBinaryMMOutputDataCharacteristics(sec, true);
}
Example 11
Source File: RP_DBSCAN.java From RP-DBSCAN with Apache License 2.0 | 4 votes |
/**
 * Phase III : post-processing for RP-DBSCAN
 * Phase III-1 (Progressive Graph Merging) and Phase III-2 (Point Labeling)
 */
public void phaseIII() {
  /**
   * Phase III-1: Progressive Graph Merging
   */
  // Merge subgraphs into the global cell graph through the following parallel procedures:
  // Single Merger, Edge Type Detection and Edge Reduction.
  int curPartitionSize = Conf.numOfPartitions;
  while(curPartitionSize != 1) {
    curPartitionSize = curPartitionSize/2;
    edgeSet = edgeSet.mapPartitionsToPair(new Methods.BuildMST(conf, corePaths, curPartitionSize))
      .repartition(curPartitionSize);
  }
  List<Tuple2<Integer, Integer>> result = edgeSet.mapPartitionsToPair(new Methods.FinalPhase(conf, corePaths)).collect();

  // Count the number of clusters in the global cell graph.
  numOfClusters = result.get(0)._2;

  /**
   * Phase III-2: Point Labeling
   */
  //Assign border points to the proper clusters (partial condition of Theorem 3.5).
  JavaPairRDD<Integer, ApproximatedPoint> borderPts = dataset
    .flatMapToPair(new Methods.EmitConnectedCoreCellsFromBorderCell(conf, Conf.numOfPartitions))
    .groupByKey()
    .flatMapToPair(new Methods.AssignBorderPointToCluster(Conf.dim, Conf.epsilon, conf, Conf.pairOutputPath));

  //Assign core points to the proper clusters (full condition of Theorem 3.5).
  JavaPairRDD<Integer, ApproximatedPoint> corePts = dataset
    .mapPartitionsToPair(new Methods.AssignCorePointToCluster(conf, Conf.pairOutputPath));

  //Point labeling algorithm 1: faster than algorithm 2, but not scalable.
  //If an out-of-memory error occurs during the labeling procedure, use algorithm 2 below instead.
  //union the two results.
  JavaPairRDD<Integer, ApproximatedPoint> assignedResult = borderPts.union(corePts);

  //count the number of points in each cluster.
  numOfPtsInCluster = assignedResult.mapPartitionsToPair(new Methods.CountForEachCluster())
    .reduceByKey(new Methods.AggregateCount()).collect();

  /*
  // Point labeling algorithm 2: scalable, but slower than algorithm 1.
  List<Tuple2<Integer, Long>> borderPtsList = borderPts.mapPartitionsToPair(new Methods.CountForEachCluster())
    .reduceByKey(new Methods.AggregateCount()).collect();
  List<Tuple2<Integer, Long>> corePtsList = corePts.mapPartitionsToPair(new Methods.CountForEachCluster())
    .reduceByKey(new Methods.AggregateCount()).collect();
  HashMap<Integer, Long> numOfPtsInCluster = new HashMap<Integer, Long>();
  for(Tuple2<Integer, Long> core : corePtsList)
    numOfPtsInCluster.put(core._1, core._2);
  for(Tuple2<Integer, Long> border : borderPtsList)
    numOfPtsInCluster.put( border._1 , numOfPtsInCluster.get(border._1)+border._2);
  for(Entry<Integer, Long> entry : numOfPtsInCluster.entrySet())
    System.out.println("CLUSTER ["+(entry.getKey()+1)+"] : "+ entry.getValue());
  */
}