Java Code Examples for org.apache.spark.api.java.JavaPairRDD#flatMapToPair()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#flatMapToPair().
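Before the examples, here is a minimal, hedged sketch of the method itself: flatMapToPair transforms each (key, value) pair of a JavaPairRDD into zero or more output pairs, returned as an iterator (an Iterator since Spark 2.0; earlier versions expected an Iterable). The data and names in this snippet are hypothetical, chosen only to illustrate the call shape used throughout the examples below.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class FlatMapToPairSketch {
    public static void main(String[] args) {
        // assumption: a throwaway local context, just for this sketch
        JavaSparkContext sc = new JavaSparkContext("local[2]", "flatMapToPairSketch");

        // hypothetical input: (word, count) pairs
        JavaPairRDD<String, Integer> counts = sc.parallelizePairs(
            Arrays.asList(new Tuple2<>("ab", 1), new Tuple2<>("cd", 2)));

        // expand each (word, count) pair into one (character, count) pair per
        // character; returning an empty iterator would drop the input pair
        JavaPairRDD<String, Integer> chars = counts.flatMapToPair(pair -> {
            List<Tuple2<String, Integer>> out = new ArrayList<>();
            for (char c : pair._1().toCharArray())
                out.add(new Tuple2<>(String.valueOf(c), pair._2()));
            return out.iterator();
        });

        System.out.println(chars.collect()); // [(a,1), (b,1), (c,2), (d,2)]
        sc.stop();
    }
}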
Example 1
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<Long, FrameBlock> matrixBlockToBinaryBlockLongIndex(JavaSparkContext sc,
    JavaPairRDD<MatrixIndexes, MatrixBlock> input, DataCharacteristics dcIn)
{
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = input;
    DataCharacteristics mc = new MatrixCharacteristics(dcIn);

    //reblock matrix blocks if required (multiple column blocks)
    if( dcIn.getCols() > dcIn.getBlocksize() ) {
        //split matrix blocks into extended matrix blocks
        in = in.flatMapToPair(new MatrixFrameReblockFunction(dcIn));
        mc.setBlocksize(MatrixFrameReblockFunction.computeBlockSize(mc));

        //shuffle matrix blocks (instead of frame blocks) in order to exploit
        //sparse formats (for sparse or wide matrices) during shuffle
        in = RDDAggregateUtils.mergeByKey(in, false);
    }

    //convert individual matrix blocks to frame blocks (w/o shuffle)
    return in.mapToPair(new MatrixToFrameBlockFunction(mc));
}
Example 2
Source File: PmmSPInstruction.java From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    String rddVar = (_type==CacheType.LEFT) ? input2.getName() : input1.getName();
    String bcastVar = (_type==CacheType.LEFT) ? input1.getName() : input2.getName();
    DataCharacteristics mc = sec.getDataCharacteristics(output.getName());
    long rlen = sec.getScalarInput(_nrow).getLongValue();

    //get inputs
    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( rddVar );
    PartitionedBroadcast<MatrixBlock> in2 = sec.getBroadcastForVariable( bcastVar );

    //execute pmm instruction
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = in1
        .flatMapToPair( new RDDPMMFunction(_type, in2, rlen, mc.getBlocksize()) );
    out = RDDAggregateUtils.sumByKeyStable(out, false);

    //put output RDD handle into symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), rddVar);
    sec.addLineageBroadcast(output.getName(), bcastVar);

    //update output statistics if not inferred
    updateBinaryMMOutputDataCharacteristics(sec, false);
}
Example 3
Source File: RDDSortUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortIndexesByVals( JavaPairRDD<MatrixIndexes, MatrixBlock> in,
    boolean asc, long rlen, long clen, int blen )
{
    //create value-index rdd from inputs
    JavaPairRDD<ValuesIndexPair, double[]> dvals = in
        .flatMapToPair(new ExtractDoubleValuesWithIndexFunction2(blen));

    //sort (creates sorted range per partition)
    int numPartitions = SparkUtils.getNumPreferredPartitions(
        new MatrixCharacteristics(rlen, clen+1, blen, blen));
    JavaRDD<ValuesIndexPair> sdvals = dvals
        .sortByKey(new IndexComparator2(asc), true, numPartitions)
        .keys(); //workaround for index comparator

    //create binary block output
    JavaPairRDD<MatrixIndexes, MatrixBlock> ret = sdvals
        .zipWithIndex()
        .mapPartitionsToPair(new ConvertToBinaryBlockFunction6(rlen, blen));
    ret = RDDAggregateUtils.mergeByKey(ret, false);

    return ret;
}
Example 4
Source File: RDDSortUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortDataByVal( JavaPairRDD<MatrixIndexes, MatrixBlock> val,
    JavaPairRDD<MatrixIndexes, MatrixBlock> data, boolean asc, long rlen, long clen, int blen )
{
    //create value-index rdd from inputs
    JavaPairRDD<ValueIndexPair, Double> dvals = val
        .flatMapToPair(new ExtractDoubleValuesWithIndexFunction(blen));

    //sort (creates sorted range per partition)
    long hdfsBlocksize = InfrastructureAnalyzer.getHDFSBlockSize();
    int numPartitions = (int)Math.ceil(((double)rlen*16)/hdfsBlocksize);
    JavaRDD<ValueIndexPair> sdvals = dvals
        .sortByKey(new IndexComparator(asc), true, numPartitions)
        .keys(); //workaround for index comparator

    //create target indexes by original index
    JavaPairRDD<MatrixIndexes, MatrixBlock> ixmap = sdvals
        .zipWithIndex()
        .mapToPair(new ExtractIndexFunction())
        .sortByKey()
        .mapPartitionsToPair(new ConvertToBinaryBlockFunction4(rlen, blen));
    ixmap = RDDAggregateUtils.mergeByKey(ixmap, false);

    //actual data sort
    return sortDataByIx(data, ixmap, rlen, clen, blen);
}
Example 5
Source File: RDDSortUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortDataByVals( JavaPairRDD<MatrixIndexes, MatrixBlock> val,
    JavaPairRDD<MatrixIndexes, MatrixBlock> data, boolean asc, long rlen, long clen, long clen2, int blen )
{
    //create value-index rdd from inputs
    JavaPairRDD<ValuesIndexPair, double[]> dvals = val
        .flatMapToPair(new ExtractDoubleValuesWithIndexFunction2(blen));

    //sort (creates sorted range per partition)
    int numPartitions = SparkUtils.getNumPreferredPartitions(
        new MatrixCharacteristics(rlen, clen2+1, blen, blen));
    JavaRDD<ValuesIndexPair> sdvals = dvals
        .sortByKey(new IndexComparator2(asc), true, numPartitions)
        .keys(); //workaround for index comparator

    //create target indexes by original index
    JavaPairRDD<MatrixIndexes, MatrixBlock> ixmap = sdvals
        .zipWithIndex()
        .mapToPair(new ExtractIndexFunction2())
        .sortByKey()
        .mapPartitionsToPair(new ConvertToBinaryBlockFunction4(rlen, blen));
    ixmap = RDDAggregateUtils.mergeByKey(ixmap, false);

    //actual data sort
    return sortDataByIx(data, ixmap, rlen, clen, blen);
}
Example 6
Source File: StructureToPolymerChainsTest.java From mmtf-spark with Apache License 2.0
@Test
public void test() {
    List<String> pdbIds = Arrays.asList("1STP","4HHB","1JLP","5X6H","5L2G","2MK1");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);

    // 1STP: 1 L-protein chain
    // 4HHB: 4 polymer chains
    // 1JLP: 1 L-protein chain with non-polymer capping group (NH2)
    // 5X6H: 1 L-protein and 1 DNA chain
    // 5L2G: 2 DNA chains
    // 2MK1: 0 polymer chains
    // --------------------
    // tot: 10 chains
    JavaPairRDD<String, StructureDataInterface> polymers = pdb.flatMapToPair(new StructureToPolymerChains());
    assertEquals(10, polymers.count());
}
Example 7
Source File: AppendGSPInstruction.java From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
    // general case append (map-extend, aggregate)
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    checkBinaryAppendInputCharacteristics(sec, _cbind, false, false);
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
    DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());

    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;

    // General case: this append needs shifting and merging and hence incurs a large performance hit.
    JavaPairRDD<MatrixIndexes,MatrixBlock> shifted_in2 = in2
        .flatMapToPair(new ShiftMatrix(mc1, mc2, _cbind));
    out = in1.cogroup(shifted_in2)
        .mapToPair(new MergeWithShiftedBlocks(mc1, mc2, _cbind));

    //put output RDD handle into symbol table
    updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
}
Example 8
Source File: MmtfImporterTest.java From mmtf-spark with Apache License 2.0
@Test
public void test5() throws IOException {
    List<String> pdbIds = Arrays.asList("3SP5");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfImporter.downloadPdbRedo(pdbIds, sc);
    assertEquals(1, pdb.count());

    pdb = pdb.flatMapToPair(new StructureToPolymerChains());
    assertEquals(2, pdb.count());
}
Example 9
Source File: MatrixAppendMSPInstruction.java From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
    // map-only append (rhs must be vector and fit in mapper mem)
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    checkBinaryAppendInputCharacteristics(sec, _cbind, false, false);
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
    DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
    int blen = mc1.getBlocksize();

    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    PartitionedBroadcast<MatrixBlock> in2 = sec.getBroadcastForVariable( input2.getName() );
    long off = sec.getScalarInput(_offset).getLongValue();

    //execute map-append operations (partitioning preserving if #in-blocks = #out-blocks)
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
    if( preservesPartitioning(mc1, mc2, _cbind) ) {
        out = in1.mapPartitionsToPair(
            new MapSideAppendPartitionFunction(in2, _cbind, off, blen), true);
    }
    else {
        out = in1.flatMapToPair(
            new MapSideAppendFunction(in2, _cbind, off, blen));
    }

    //put output RDD handle into symbol table
    updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageBroadcast(output.getName(), input2.getName());
}
Example 10
Source File: MarkDuplicatesSparkUtils.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Primary landing point for MarkDuplicatesSparkRecords:
 *  - Handles separating out hashed keys into groups by start position/readgroup
 *  - Further separates out MarkDuplicatesSparkRecord by their record objects
 *  - Farms out to methods which handle each group
 *  - Collects the results and returns an iterator
 */
@SuppressWarnings("unchecked")
private static JavaPairRDD<IndexPair<String>, Integer> markDuplicateRecords(final JavaPairRDD<ReadsKey, Iterable<MarkDuplicatesSparkRecord>> keyedPairs,
                                                                            final OpticalDuplicateFinder finder, final boolean markOpticalDups) {
    return keyedPairs.flatMapToPair(keyedPair -> {
        Iterable<MarkDuplicatesSparkRecord> pairGroups = keyedPair._2();

        final List<Tuple2<IndexPair<String>, Integer>> nonDuplicates = Lists.newArrayList();

        final Map<MarkDuplicatesSparkRecord.Type, List<MarkDuplicatesSparkRecord>> stratifiedByType = splitByType(pairGroups);

        // Each key corresponds to either fragments or paired ends, not a mixture of both.
        final List<MarkDuplicatesSparkRecord> emptyFragments = stratifiedByType.get(MarkDuplicatesSparkRecord.Type.EMPTY_FRAGMENT);
        final List<MarkDuplicatesSparkRecord> fragments = stratifiedByType.get(MarkDuplicatesSparkRecord.Type.FRAGMENT);
        final List<Pair> pairs = (List<Pair>)(List)stratifiedByType.get(MarkDuplicatesSparkRecord.Type.PAIR);
        final List<MarkDuplicatesSparkRecord> passthroughs = stratifiedByType.get(MarkDuplicatesSparkRecord.Type.PASSTHROUGH);

        //empty MarkDuplicatesSparkRecords signify that a pair has a mate somewhere else
        // If there are any non-fragment placeholders at this site, mark everything as duplicates, otherwise compute the best score
        if (Utils.isNonEmpty(fragments) && !Utils.isNonEmpty(emptyFragments)) {
            final Tuple2<IndexPair<String>, Integer> bestFragment = handleFragments(fragments, finder);
            nonDuplicates.add(bestFragment);
        }

        if (Utils.isNonEmpty(pairs)) {
            nonDuplicates.addAll(handlePairs(pairs, finder, markOpticalDups));
        }

        if (Utils.isNonEmpty(passthroughs)) {
            nonDuplicates.addAll(handlePassthroughs(passthroughs));
        }

        return nonDuplicates.iterator();
    });
}
Example 11
Source File: BinarySPInstruction.java From systemds with Apache License 2.0
/**
 * Common binary tensor-tensor process instruction
 *
 * @param ec execution context
 */
protected void processTensorTensorBinaryInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //sanity check dimensions
    checkTensorTensorBinaryCharacteristics(sec);
    updateBinaryTensorOutputDataCharacteristics(sec);

    // Get input RDDs
    JavaPairRDD<TensorIndexes, TensorBlock> in1 = sec.getBinaryTensorBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<TensorIndexes, TensorBlock> in2 = sec.getBinaryTensorBlockRDDHandleForVariable(input2.getName());
    DataCharacteristics tc1 = sec.getDataCharacteristics(input1.getName());
    DataCharacteristics tc2 = sec.getDataCharacteristics(input2.getName());
    DataCharacteristics dcOut = sec.getDataCharacteristics(output.getName());

    BinaryOperator bop = (BinaryOperator) _optr;

    // TODO blocking scheme for matrices with mismatching number of dimensions
    if (tc2.getNumDims() < tc1.getNumDims())
        in2 = in2.flatMapToPair(new ReblockTensorFunction(tc1.getNumDims(), tc1.getBlocksize()));
    for (int i = 0; i < tc1.getNumDims(); i++) {
        long numReps = getNumDimReplicas(tc1, tc2, i);
        if (numReps > 1)
            in2 = in2.flatMapToPair(new ReplicateTensorFunction(i, numReps));
    }
    int numPrefPart = SparkUtils.isHashPartitioned(in1) ? in1.getNumPartitions() :
        SparkUtils.isHashPartitioned(in2) ? in2.getNumPartitions() :
        Math.min(in1.getNumPartitions() + in2.getNumPartitions(),
            2 * SparkUtils.getNumPreferredPartitions(dcOut));

    //execute binary operation
    JavaPairRDD<TensorIndexes, TensorBlock> out = in1
        .join(in2, numPrefPart)
        .mapValues(new TensorTensorBinaryOpFunction(bop));

    //set output RDD
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
}
Example 12
Source File: BinarySPInstruction.java From systemds with Apache License 2.0
protected void processMatrixBVectorBinaryInstruction(ExecutionContext ec, VectorType vtype) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //sanity check dimensions
    checkMatrixMatrixBinaryCharacteristics(sec);

    //get input RDDs
    String rddVar = input1.getName();
    String bcastVar = input2.getName();
    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( rddVar );
    PartitionedBroadcast<MatrixBlock> in2 = sec.getBroadcastForVariable( bcastVar );
    DataCharacteristics mc1 = sec.getDataCharacteristics(rddVar);
    DataCharacteristics mc2 = sec.getDataCharacteristics(bcastVar);

    BinaryOperator bop = (BinaryOperator) _optr;
    boolean isOuter = (mc1.getRows()>1 && mc1.getCols()==1 && mc2.getRows()==1 && mc2.getCols()>1);

    //execute map binary operation
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
    if( isOuter ) {
        out = in1.flatMapToPair(new OuterVectorBinaryOpFunction(bop, in2));
    }
    else { //default
        //note: we use mapPartitions in order to preserve partitioning information for
        //binary mv operations where the keys are guaranteed not to change; the reason
        //why we cannot use mapValues is the need for broadcast key lookups.
        //alternative: out = in1.mapToPair(new MatrixVectorBinaryOpFunction(bop, in2, vtype));
        out = in1.mapPartitionsToPair(
            new MatrixVectorBinaryOpPartitionFunction(bop, in2, vtype), true);
    }

    //set output RDD
    updateBinaryOutputDataCharacteristics(sec);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), rddVar);
    sec.addLineageBroadcast(output.getName(), bcastVar);
}
Example 13
Source File: MmtfImporterTest.java From mmtf-spark with Apache License 2.0
@Test
public void test4() throws IOException {
    Path p = Paths.get("./src/main/resources/files/test");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfImporter.importMmcifFiles(p.toString(), sc);
    assertTrue(pdb.count() == 1);

    pdb = pdb.flatMapToPair(new StructureToPolymerChains());
    assertEquals(8, pdb.count());
}
Example 14
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlockToMatrixBlock(JavaPairRDD<Long,FrameBlock> input,
    DataCharacteristics mcIn, DataCharacteristics mcOut)
{
    //convert binary block to matrix block
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = input
        .flatMapToPair(new BinaryBlockToMatrixBlockFunction(mcIn, mcOut));

    //aggregate partial matrix blocks
    return RDDAggregateUtils.mergeByKey(out, false);
}
Example 15
Source File: RmmSPInstruction.java From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //get input rdds
    DataCharacteristics mc1 = sec.getDataCharacteristics( input1.getName() );
    DataCharacteristics mc2 = sec.getDataCharacteristics( input2.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
    DataCharacteristics mcOut = updateBinaryMMOutputDataCharacteristics(sec, true);

    //execute Spark RMM instruction
    //step 1: prepare join keys (w/ shallow replication), i/j/k
    JavaPairRDD<TripleIndexes,MatrixBlock> tmp1 = in1.flatMapToPair(
        new RmmReplicateFunction(mc2.getCols(), mc2.getBlocksize(), true));
    JavaPairRDD<TripleIndexes,MatrixBlock> tmp2 = in2.flatMapToPair(
        new RmmReplicateFunction(mc1.getRows(), mc1.getBlocksize(), false));

    //step 2: join prepared datasets, multiply, and aggregate
    int numPartJoin = Math.max(getNumJoinPartitions(mc1, mc2),
        SparkExecutionContext.getDefaultParallelism(true));
    int numPartOut = SparkUtils.getNumPreferredPartitions(mcOut);
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = tmp1
        .join( tmp2, numPartJoin )               //join by result block
        .mapToPair( new RmmMultiplyFunction() ); //do matrix multiplication
    out = RDDAggregateUtils.sumByKeyStable(out,  //aggregation per result block
        numPartOut, false);

    //put output block into symbol table (no lineage because single block)
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
}
Example 16
Source File: StructureToCathDomainsTest.java From mmtf-spark with Apache License 2.0
@Test
public void test1() throws IOException {
    List<String> pdbIds = Arrays.asList("1HV4");
//  List<String> pdbIds = Arrays.asList("1STP","4HHB","1JLP","5X6H","5L2G","2MK1");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);

    String baseUrl = "ftp://orengoftp.biochem.ucl.ac.uk/cath/releases/daily-release/newest/cath-b-newest-all.gz";
//  System.out.println(hmap.get("1HV4A"));

    JavaPairRDD<String, StructureDataInterface> cathDomains = pdb.flatMapToPair(new StructureToCathDomains(baseUrl));

    Map<String, ArrayList<String>> hmap = StructureToCathDomains.loadCathDomains(baseUrl);
    String[] bound = hmap.get("1HV4A").get(0).split(":")[0].split("-");
    int[] cath = cathDomains.first()._2.getGroupIds();

    assertEquals(Integer.parseInt(bound[0]), cath[0]);
    assertEquals(Integer.parseInt(bound[1]), cath[cath.length-1]);
    assertEquals(Integer.parseInt(bound[1]) - Integer.parseInt(bound[0]) + 1, cathDomains.first()._2.getNumGroups());
    assertEquals(8, cathDomains.count());
}
Example 17
Source File: BinarySPInstruction.java From systemds with Apache License 2.0
/**
 * Common binary matrix-matrix process instruction
 *
 * @param ec execution context
 */
protected void processMatrixMatrixBinaryInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //sanity check dimensions
    checkMatrixMatrixBinaryCharacteristics(sec);
    updateBinaryOutputDataCharacteristics(sec);

    // Get input RDDs
    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable(input2.getName());
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
    DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
    DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());

    BinaryOperator bop = (BinaryOperator) _optr;

    //vector replication if required (mv or outer operations)
    boolean rowvector = (mc2.getRows()==1 && mc1.getRows()>1);
    long numRepLeft = getNumReplicas(mc1, mc2, true);
    long numRepRight = getNumReplicas(mc1, mc2, false);
    if( numRepLeft > 1 )
        in1 = in1.flatMapToPair(new ReplicateVectorFunction(false, numRepLeft));
    if( numRepRight > 1 )
        in2 = in2.flatMapToPair(new ReplicateVectorFunction(rowvector, numRepRight));
    int numPrefPart = SparkUtils.isHashPartitioned(in1) ? in1.getNumPartitions() :
        SparkUtils.isHashPartitioned(in2) ? in2.getNumPartitions() :
        Math.min(in1.getNumPartitions() + in2.getNumPartitions(),
            2 * SparkUtils.getNumPreferredPartitions(mcOut));

    //execute binary operation
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = in1
        .join(in2, numPrefPart)
        .mapValues(new MatrixMatrixBinaryOpFunction(bop));

    //set output RDD
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
}
Example 18
Source File: StructureToCathDomainsTest.java From mmtf-spark with Apache License 2.0
@Test
public void test2() throws IOException {
    List<String> pdbIds = Arrays.asList("1STP");
//  List<String> pdbIds = Arrays.asList("1STP","4HHB","1JLP","5X6H","5L2G","2MK1");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);

    String baseUrl = "ftp://orengoftp.biochem.ucl.ac.uk/cath/releases/daily-release/newest/cath-b-newest-all.gz";
//  System.out.println(hmap.get("1STPA"));

    JavaPairRDD<String, StructureDataInterface> cathDomains = pdb.flatMapToPair(new StructureToCathDomains(baseUrl));

    Map<String, ArrayList<String>> hmap = StructureToCathDomains.loadCathDomains(baseUrl);
    String[] bound = hmap.get("1STPA").get(0).split(":")[0].split("-");
    int[] cath = cathDomains.first()._2.getGroupIds();

    assertEquals(Integer.parseInt(bound[0]), cath[0]);
    assertEquals(Integer.parseInt(bound[1]), cath[cath.length-1]);
    assertEquals(Integer.parseInt(bound[1]) - Integer.parseInt(bound[0]) + 1, cathDomains.first()._2.getNumGroups());
    assertEquals(1, cathDomains.count());
}
Example 19
Source File: PMapmmSPInstruction.java From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //get inputs
    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());

    // This avoids errors such as java.lang.UnsupportedOperationException:
    // Cannot change storage level of an RDD after it was already assigned a level.
    // Ideally, we should ensure that we do not redundantly call persist on the same RDD.
    StorageLevel pmapmmStorageLevel = StorageLevel.MEMORY_AND_DISK();

    //cache right hand side because accessed many times
    in2 = in2.repartition(sec.getSparkContext().defaultParallelism())
        .persist(pmapmmStorageLevel);

    JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
    for( int i=0; i<mc1.getRows(); i+=NUM_ROWBLOCKS*mc1.getBlocksize() ) {
        //create broadcast for rdd partition
        JavaPairRDD<MatrixIndexes,MatrixBlock> rdd = in1
            .filter(new IsBlockInRange(i+1, i+NUM_ROWBLOCKS*mc1.getBlocksize(), 1, mc1.getCols(), mc1))
            .mapToPair(new PMapMMRebaseBlocksFunction(i/mc1.getBlocksize()));

        int rlen = (int)Math.min(mc1.getRows()-i, NUM_ROWBLOCKS*mc1.getBlocksize());
        PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(rdd, rlen, (int)mc1.getCols(), mc1.getBlocksize(), -1L);
        Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);

        //matrix multiplication
        JavaPairRDD<MatrixIndexes,MatrixBlock> rdd2 = in2
            .flatMapToPair(new PMapMMFunction(bpmb, i/mc1.getBlocksize()));
        rdd2 = RDDAggregateUtils.sumByKeyStable(rdd2, false);
        rdd2.persist(pmapmmStorageLevel)
            .count();
        bpmb.unpersist(false);

        if( out == null )
            out = rdd2;
        else
            out = out.union(rdd2);
    }

    //cache final result
    out = out.persist(pmapmmStorageLevel);
    out.count();

    //put output RDD handle into symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());

    //update output statistics if not inferred
    updateBinaryMMOutputDataCharacteristics(sec, true);
}