Java Code Examples for org.apache.spark.api.java.JavaPairRDD#getNumPartitions()
The following examples show how to use
org.apache.spark.api.java.JavaPairRDD#getNumPartitions().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: BinarySPInstruction.java From systemds with Apache License 2.0 | 5 votes |
/** * Common binary tensor-tensor process instruction * * @param ec execution context */ protected void processTensorTensorBinaryInstruction(ExecutionContext ec) { SparkExecutionContext sec = (SparkExecutionContext)ec; //sanity check dimensions checkTensorTensorBinaryCharacteristics(sec); updateBinaryTensorOutputDataCharacteristics(sec); // Get input RDDs JavaPairRDD<TensorIndexes, TensorBlock> in1 = sec.getBinaryTensorBlockRDDHandleForVariable(input1.getName()); JavaPairRDD<TensorIndexes, TensorBlock> in2 = sec.getBinaryTensorBlockRDDHandleForVariable(input2.getName()); DataCharacteristics tc1 = sec.getDataCharacteristics(input1.getName()); DataCharacteristics tc2 = sec.getDataCharacteristics(input2.getName()); DataCharacteristics dcOut = sec.getDataCharacteristics(output.getName()); BinaryOperator bop = (BinaryOperator) _optr; // TODO blocking scheme for matrices with mismatching number of dimensions if (tc2.getNumDims() < tc1.getNumDims()) in2 = in2.flatMapToPair(new ReblockTensorFunction(tc1.getNumDims(), tc1.getBlocksize())); for (int i = 0; i < tc1.getNumDims(); i++) { long numReps = getNumDimReplicas(tc1, tc2, i); if (numReps > 1) in2 = in2.flatMapToPair(new ReplicateTensorFunction(i, numReps)); } int numPrefPart = SparkUtils.isHashPartitioned(in1) ? in1.getNumPartitions() : SparkUtils.isHashPartitioned(in2) ? in2.getNumPartitions() : Math.min(in1.getNumPartitions() + in2.getNumPartitions(), 2 * SparkUtils.getNumPreferredPartitions(dcOut)); //execute binary operation JavaPairRDD<TensorIndexes, TensorBlock> out = in1 .join(in2, numPrefPart) .mapValues(new TensorTensorBinaryOpFunction(bop)); //set output RDD sec.setRDDHandleForVariable(output.getName(), out); sec.addLineageRDD(output.getName(), input1.getName()); sec.addLineageRDD(output.getName(), input2.getName()); }
Example 2
Source File: BinarySPInstruction.java From systemds with Apache License 2.0 | 5 votes |
/** * Common binary tensor-tensor process instruction * * @param ec execution context */ protected void processTensorTensorBinaryInstruction(ExecutionContext ec) { SparkExecutionContext sec = (SparkExecutionContext)ec; //sanity check dimensions checkTensorTensorBinaryCharacteristics(sec); updateBinaryTensorOutputDataCharacteristics(sec); // Get input RDDs JavaPairRDD<TensorIndexes, TensorBlock> in1 = sec.getBinaryTensorBlockRDDHandleForVariable(input1.getName()); JavaPairRDD<TensorIndexes, TensorBlock> in2 = sec.getBinaryTensorBlockRDDHandleForVariable(input2.getName()); DataCharacteristics tc1 = sec.getDataCharacteristics(input1.getName()); DataCharacteristics tc2 = sec.getDataCharacteristics(input2.getName()); DataCharacteristics dcOut = sec.getDataCharacteristics(output.getName()); BinaryOperator bop = (BinaryOperator) _optr; // TODO blocking scheme for matrices with mismatching number of dimensions if (tc2.getNumDims() < tc1.getNumDims()) in2 = in2.flatMapToPair(new ReblockTensorFunction(tc1.getNumDims(), tc1.getBlocksize())); for (int i = 0; i < tc1.getNumDims(); i++) { long numReps = getNumDimReplicas(tc1, tc2, i); if (numReps > 1) in2 = in2.flatMapToPair(new ReplicateTensorFunction(i, numReps)); } int numPrefPart = SparkUtils.isHashPartitioned(in1) ? in1.getNumPartitions() : SparkUtils.isHashPartitioned(in2) ? in2.getNumPartitions() : Math.min(in1.getNumPartitions() + in2.getNumPartitions(), 2 * SparkUtils.getNumPreferredPartitions(dcOut)); //execute binary operation JavaPairRDD<TensorIndexes, TensorBlock> out = in1 .join(in2, numPrefPart) .mapValues(new TensorTensorBinaryOpFunction(bop)); //set output RDD sec.setRDDHandleForVariable(output.getName(), out); sec.addLineageRDD(output.getName(), input1.getName()); sec.addLineageRDD(output.getName(), input2.getName()); }
Example 3
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 4 votes |
/**
 * Repartitions the RDD behind a matrix variable, persists it, and swaps the
 * matrix object's RDD handle for the persisted one.
 *
 * <p>Nothing happens when the matrix does not exceed the caching threshold.
 * Otherwise the binary-block RDD is shuffled via
 * {@code RDDAggregateUtils.mergeByKey}, optionally converted to CSR sparse
 * blocks, persisted at {@code Checkpoint.DEFAULT_STORAGE_LEVEL} and
 * materialized with {@code count()}.
 *
 * @param var name of the matrix variable to repartition and cache
 */
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject( String var ) {
    MatrixObject matrix = getMatrixObject(var);
    DataCharacteristics inputDc = matrix.getDataCharacteristics();

    // double check size to avoid unnecessary spark context creation
    boolean exceedsThreshold = OptimizerUtils.exceedsCachingThreshold(
        matrix.getNumColumns(), OptimizerUtils.estimateSizeExactSparsity(inputDc));
    if( !exceedsThreshold ) {
        return;
    }

    // get input rdd and default storage level
    JavaPairRDD<MatrixIndexes,MatrixBlock> source = (JavaPairRDD<MatrixIndexes,MatrixBlock>)
        getRDDHandleForMatrixObject(matrix, InputInfo.BinaryBlockInputInfo);

    // avoid unnecessary caching of input in order to reduce memory pressure:
    // if the input is only marked for caching but not cached yet, read from
    // its first lineage child instead
    boolean bypassPendingCache = matrix.getRDDHandle().allowsShortCircuitRead()
        && isRDDMarkedForCaching(source.id())
        && !isRDDCached(source.id());
    if( bypassPendingCache ) {
        RDDObject firstChild = (RDDObject) matrix.getRDDHandle().getLineageChilds().get(0);
        source = (JavaPairRDD<MatrixIndexes,MatrixBlock>) firstChild.getRDD();

        // investigate issue of unnecessarily large number of partitions
        int preferredPartitions = SparkUtils.getNumPreferredPartitions(inputDc, source);
        if( preferredPartitions < source.getNumPartitions() ) {
            source = source.coalesce(preferredPartitions);
        }
    }

    // repartition rdd (force creation of shuffled rdd via merge), note: without
    // deep copy albeit executed on the original data, because there will be no
    // merge, i.e., no key duplicates
    JavaPairRDD<MatrixIndexes,MatrixBlock> repartitioned =
        RDDAggregateUtils.mergeByKey(source, false);

    // convert mcsr into memory-efficient csr if potentially sparse
    if( OptimizerUtils.checkSparseBlockCSRConversion(inputDc) ) {
        repartitioned = repartitioned.mapValues(
            new CreateSparseBlockFunction(SparseBlock.Type.CSR));
    }

    // persist rdd in default storage level and trigger caching right away
    // to prevent contention
    repartitioned.persist(Checkpoint.DEFAULT_STORAGE_LEVEL)
        .count();

    // create new rdd handle, in-place of current matrix object
    RDDObject previousHandle = matrix.getRDDHandle(); // guaranteed to exist (see above)
    RDDObject cachedHandle = new RDDObject(repartitioned);
    cachedHandle.setCheckpointRDD(true);              // mark as checkpointed
    cachedHandle.addLineageChild(previousHandle);     // keep lineage to prevent cycles on cleanup
    matrix.setRDDHandle(cachedHandle);
}
Example 4
Source File: SparkUtils.java From systemds with Apache License 2.0 | 4 votes |
/**
 * Returns the number of partitions to use for data with the given
 * characteristics.
 *
 * <p>If the dimensions are not known and an RDD is supplied, the RDD's
 * current partition count is returned; in every other case the result of
 * {@link #getNumPreferredPartitions(DataCharacteristics)} is returned.
 *
 * @param dc data characteristics
 * @param in RDD used when the dimensions are unknown; may be {@code null}
 * @return the partition count
 */
public static int getNumPreferredPartitions(DataCharacteristics dc, JavaPairRDD<?,?> in) {
    boolean dimsKnown = dc.dimsKnown(true);
    return (dimsKnown || in == null)
        ? getNumPreferredPartitions(dc)
        : in.getNumPartitions();
}
Example 5
Source File: BinarySPInstruction.java From systemds with Apache License 2.0 | 4 votes |
/** * Common binary matrix-matrix process instruction * * @param ec execution context */ protected void processMatrixMatrixBinaryInstruction(ExecutionContext ec) { SparkExecutionContext sec = (SparkExecutionContext)ec; //sanity check dimensions checkMatrixMatrixBinaryCharacteristics(sec); updateBinaryOutputDataCharacteristics(sec); // Get input RDDs JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName()); JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable(input2.getName()); DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName()); DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName()); DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName()); BinaryOperator bop = (BinaryOperator) _optr; //vector replication if required (mv or outer operations) boolean rowvector = (mc2.getRows()==1 && mc1.getRows()>1); long numRepLeft = getNumReplicas(mc1, mc2, true); long numRepRight = getNumReplicas(mc1, mc2, false); if( numRepLeft > 1 ) in1 = in1.flatMapToPair(new ReplicateVectorFunction(false, numRepLeft )); if( numRepRight > 1 ) in2 = in2.flatMapToPair(new ReplicateVectorFunction(rowvector, numRepRight)); int numPrefPart = SparkUtils.isHashPartitioned(in1) ? in1.getNumPartitions() : SparkUtils.isHashPartitioned(in2) ? in2.getNumPartitions() : Math.min(in1.getNumPartitions() + in2.getNumPartitions(), 2 * SparkUtils.getNumPreferredPartitions(mcOut)); //execute binary operation JavaPairRDD<MatrixIndexes,MatrixBlock> out = in1 .join(in2, numPrefPart) .mapValues(new MatrixMatrixBinaryOpFunction(bop)); //set output RDD sec.setRDDHandleForVariable(output.getName(), out); sec.addLineageRDD(output.getName(), input1.getName()); sec.addLineageRDD(output.getName(), input2.getName()); }
Example 6
Source File: UUIDPartitioner.java From envelope with Apache License 2.0 | 4 votes |
/**
 * Records the partition count of the given RDD in {@code numPartitions}.
 *
 * @param rdd the pair RDD whose partition count is captured
 */
@Override
public void configureRDD(JavaPairRDD<Row, Row> rdd) {
    int partitionCount = rdd.getNumPartitions();
    this.numPartitions = partitionCount;
}
Example 7
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 4 votes |
/**
 * Repartitions the RDD behind a matrix variable, persists it, and swaps the
 * matrix object's RDD handle for the persisted one.
 *
 * <p>Nothing happens when the matrix does not exceed the caching threshold.
 * Otherwise the binary-format RDD is shuffled via
 * {@code RDDAggregateUtils.mergeByKey}, optionally converted to CSR sparse
 * blocks, persisted at {@code Checkpoint.DEFAULT_STORAGE_LEVEL} and
 * materialized with {@code count()}.
 *
 * @param var name of the matrix variable to repartition and cache
 */
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject( String var ) {
    MatrixObject matrix = getMatrixObject(var);
    DataCharacteristics inputDc = matrix.getDataCharacteristics();

    // double check size to avoid unnecessary spark context creation
    boolean exceedsThreshold = OptimizerUtils.exceedsCachingThreshold(
        matrix.getNumColumns(), OptimizerUtils.estimateSizeExactSparsity(inputDc));
    if( !exceedsThreshold ) {
        return;
    }

    // get input rdd and default storage level
    JavaPairRDD<MatrixIndexes,MatrixBlock> source = (JavaPairRDD<MatrixIndexes,MatrixBlock>)
        getRDDHandleForMatrixObject(matrix, FileFormat.BINARY);

    // avoid unnecessary caching of input in order to reduce memory pressure:
    // if the input is only marked for caching but not cached yet, read from
    // its first lineage child instead
    boolean bypassPendingCache = matrix.getRDDHandle().allowsShortCircuitRead()
        && isRDDMarkedForCaching(source.id())
        && !isRDDCached(source.id());
    if( bypassPendingCache ) {
        RDDObject firstChild = (RDDObject) matrix.getRDDHandle().getLineageChilds().get(0);
        source = (JavaPairRDD<MatrixIndexes,MatrixBlock>) firstChild.getRDD();

        // investigate issue of unnecessarily large number of partitions
        int preferredPartitions = SparkUtils.getNumPreferredPartitions(inputDc, source);
        if( preferredPartitions < source.getNumPartitions() ) {
            source = source.coalesce(preferredPartitions);
        }
    }

    // repartition rdd (force creation of shuffled rdd via merge), note: without
    // deep copy albeit executed on the original data, because there will be no
    // merge, i.e., no key duplicates
    JavaPairRDD<MatrixIndexes,MatrixBlock> repartitioned =
        RDDAggregateUtils.mergeByKey(source, false);

    // convert mcsr into memory-efficient csr if potentially sparse
    if( OptimizerUtils.checkSparseBlockCSRConversion(inputDc) ) {
        repartitioned = repartitioned.mapValues(
            new CreateSparseBlockFunction(SparseBlock.Type.CSR));
    }

    // persist rdd in default storage level and trigger caching right away
    // to prevent contention
    repartitioned.persist(Checkpoint.DEFAULT_STORAGE_LEVEL)
        .count();

    // create new rdd handle, in-place of current matrix object
    RDDObject previousHandle = matrix.getRDDHandle(); // guaranteed to exist (see above)
    RDDObject cachedHandle = new RDDObject(repartitioned);
    cachedHandle.setCheckpointRDD(true);              // mark as checkpointed
    cachedHandle.addLineageChild(previousHandle);     // keep lineage to prevent cycles on cleanup
    matrix.setRDDHandle(cachedHandle);
}
Example 8
Source File: SparkUtils.java From systemds with Apache License 2.0 | 4 votes |
/**
 * Returns the number of partitions to use for data with the given
 * characteristics.
 *
 * <p>If the dimensions are not known and an RDD is supplied, the RDD's
 * current partition count is returned; in every other case the result of
 * {@link #getNumPreferredPartitions(DataCharacteristics)} is returned.
 *
 * @param dc data characteristics
 * @param in RDD used when the dimensions are unknown; may be {@code null}
 * @return the partition count
 */
public static int getNumPreferredPartitions(DataCharacteristics dc, JavaPairRDD<?,?> in) {
    boolean dimsKnown = dc.dimsKnown(true);
    return (dimsKnown || in == null)
        ? getNumPreferredPartitions(dc)
        : in.getNumPartitions();
}
Example 9
Source File: BinarySPInstruction.java From systemds with Apache License 2.0 | 4 votes |
/** * Common binary matrix-matrix process instruction * * @param ec execution context */ protected void processMatrixMatrixBinaryInstruction(ExecutionContext ec) { SparkExecutionContext sec = (SparkExecutionContext)ec; //sanity check dimensions checkMatrixMatrixBinaryCharacteristics(sec); updateBinaryOutputDataCharacteristics(sec); // Get input RDDs JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName()); JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable(input2.getName()); DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName()); DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName()); DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName()); BinaryOperator bop = (BinaryOperator) _optr; //vector replication if required (mv or outer operations) boolean rowvector = (mc2.getRows()==1 && mc1.getRows()>1); long numRepLeft = getNumReplicas(mc1, mc2, true); long numRepRight = getNumReplicas(mc1, mc2, false); if( numRepLeft > 1 ) in1 = in1.flatMapToPair(new ReplicateVectorFunction(false, numRepLeft )); if( numRepRight > 1 ) in2 = in2.flatMapToPair(new ReplicateVectorFunction(rowvector, numRepRight)); int numPrefPart = SparkUtils.isHashPartitioned(in1) ? in1.getNumPartitions() : SparkUtils.isHashPartitioned(in2) ? in2.getNumPartitions() : Math.min(in1.getNumPartitions() + in2.getNumPartitions(), 2 * SparkUtils.getNumPreferredPartitions(mcOut)); //execute binary operation JavaPairRDD<MatrixIndexes,MatrixBlock> out = in1 .join(in2, numPrefPart) .mapValues(new MatrixMatrixBinaryOpFunction(bop)); //set output RDD sec.setRDDHandleForVariable(output.getName(), out); sec.addLineageRDD(output.getName(), input1.getName()); sec.addLineageRDD(output.getName(), input2.getName()); }
Example 10
Source File: HashingBalancedPartitionerTest.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test public void hashPartitionerBalancesAtScale() { LinearCongruentialGenerator r = new LinearCongruentialGenerator(10000); List<String> elements = new ArrayList<String>(); for (int i = 0; i < 10000; i++) { // The red occur towards the end if (r.nextDouble() < ((double) i / 10000D)) elements.add("red"); // The blue occur towards the front if (r.nextDouble() < (1 - (double) i / 10000D)) elements.add("blue"); } Integer countRed = 0; Integer countBlue = 0; for (String elem : elements) { if (elem.equals("red")) countRed++; else countBlue++; } JavaRDD<String> rdd = sc.parallelize(elements); JavaPairRDD<Tuple2<Long, Integer>, String> indexedRDD = rdd.zipWithUniqueId() .mapToPair(new PairFunction<Tuple2<String, Long>, Tuple2<Long, Integer>, String>() { @Override public Tuple2<Tuple2<Long, Integer>, String> call(Tuple2<String, Long> stringLongTuple2) throws Exception { Integer elemClass = stringLongTuple2._1().equals("red") ? 0 : 1; return new Tuple2<Tuple2<Long, Integer>, String>( new Tuple2<Long, Integer>(stringLongTuple2._2(), elemClass), stringLongTuple2._1()); } }); Integer numPartitions = indexedRDD.getNumPartitions(); // rdd and indexedRDD have the same partition distribution List<Tuple2<Integer, Integer>> partitionTuples = rdd.mapPartitionsWithIndex(new CountRedBluePartitionsFunction(), true).collect(); List<Double> redWeights = new ArrayList<Double>(); List<Double> blueWeights = new ArrayList<Double>(); Float avgRed = (float) countRed / numPartitions; Float avgBlue = (float) countBlue / numPartitions; for (int i = 0; i < partitionTuples.size(); i++) { Tuple2<Integer, Integer> counts = partitionTuples.get(i); redWeights.add((double) counts._1() / avgRed); blueWeights.add((double) counts._2() / avgBlue); } List<List<Double>> partitionWeights = Arrays.asList(redWeights, blueWeights); HashingBalancedPartitioner hbp = new HashingBalancedPartitioner(partitionWeights); List<Tuple2<Tuple2<Long, Integer>, String>> testList = indexedRDD.collect(); int[][] 
colorCountsByPartition = new int[numPartitions][2]; for (final Tuple2<Tuple2<Long, Integer>, String> val : testList) { Integer partition = hbp.getPartition(val._1()); if (val._2().equals("red")) colorCountsByPartition[partition][0] += 1; else colorCountsByPartition[partition][1] += 1; } // for (int i = 0; i < numPartitions; i++) { // System.out.println(Arrays.toString(colorCountsByPartition[i])); // } // // System.out.println("Ideal red # per partition: " + avgRed); // System.out.println("Ideal blue # per partition: " + avgBlue); for (int i = 0; i < numPartitions; i++) { // avg red per partition : 2.33 assertTrue(colorCountsByPartition[i][0] >= Math.round(avgRed * .99) && colorCountsByPartition[i][0] < Math.round(avgRed * 1.01) + 1); // avg blue per partition : 3.33 assertTrue(colorCountsByPartition[i][1] >= Math.round(avgBlue * .99) && colorCountsByPartition[i][1] < Math.round(avgBlue * 1.01) + 1); } }