Java Code Examples for org.apache.spark.api.java.JavaPairRDD#flatMapToPair()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#flatMapToPair().
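Before the examples, here is a minimal, hedged sketch of the method itself: flatMapToPair transforms each (key, value) pair of a JavaPairRDD into zero or more output pairs, returned as an iterator (an Iterator since Spark 2.0; earlier versions expected an Iterable). The data and names in this snippet are hypothetical, chosen only to illustrate the call shape used throughout the examples below.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class FlatMapToPairSketch {
    public static void main(String[] args) {
        // assumption: a throwaway local context, just for this sketch
        JavaSparkContext sc = new JavaSparkContext("local[2]", "flatMapToPairSketch");

        // hypothetical input: (word, count) pairs
        JavaPairRDD<String, Integer> counts = sc.parallelizePairs(
            Arrays.asList(new Tuple2<>("ab", 1), new Tuple2<>("cd", 2)));

        // expand each (word, count) pair into one (character, count) pair per
        // character; returning an empty iterator would drop the input pair
        JavaPairRDD<String, Integer> chars = counts.flatMapToPair(pair -> {
            List<Tuple2<String, Integer>> out = new ArrayList<>();
            for (char c : pair._1().toCharArray())
                out.add(new Tuple2<>(String.valueOf(c), pair._2()));
            return out.iterator();
        });

        System.out.println(chars.collect()); // [(a,1), (b,1), (c,2), (d,2)]
        sc.stop();
    }
}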
Example 1
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<Long, FrameBlock> matrixBlockToBinaryBlockLongIndex(JavaSparkContext sc,
    JavaPairRDD<MatrixIndexes, MatrixBlock> input, DataCharacteristics dcIn)
{
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = input;
    DataCharacteristics mc = new MatrixCharacteristics(dcIn);

    //reblock matrix blocks if required (multiple column blocks)
    if( dcIn.getCols() > dcIn.getBlocksize() ) {
        //split matrix blocks into extended matrix blocks
        in = in.flatMapToPair(new MatrixFrameReblockFunction(dcIn));
        mc.setBlocksize(MatrixFrameReblockFunction.computeBlockSize(mc));

        //shuffle matrix blocks (instead of frame blocks) in order to exploit
        //sparse formats (for sparse or wide matrices) during shuffle
        in = RDDAggregateUtils.mergeByKey(in, false);
    }

    //convert individual matrix blocks to frame blocks (w/o shuffle)
    return in.mapToPair(new MatrixToFrameBlockFunction(mc));
}
Example 2
Source File: PmmSPInstruction.java From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    String rddVar = (_type==CacheType.LEFT) ? input2.getName() : input1.getName();
    String bcastVar = (_type==CacheType.LEFT) ? input1.getName() : input2.getName();
    DataCharacteristics mc = sec.getDataCharacteristics(output.getName());
    long rlen = sec.getScalarInput(_nrow).getLongValue();

    //get inputs
    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( rddVar );
    PartitionedBroadcast<MatrixBlock> in2 = sec.getBroadcastForVariable( bcastVar );

    //execute pmm instruction
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = in1
        .flatMapToPair( new RDDPMMFunction(_type, in2, rlen, mc.getBlocksize()) );
    out = RDDAggregateUtils.sumByKeyStable(out, false);

    //put output RDD handle into symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), rddVar);
    sec.addLineageBroadcast(output.getName(), bcastVar);

    //update output statistics if not inferred
    updateBinaryMMOutputDataCharacteristics(sec, false);
}
Example 3
Source File: RDDSortUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortIndexesByVals( JavaPairRDD<MatrixIndexes, MatrixBlock> in,
    boolean asc, long rlen, long clen, int blen )
{
    //create value-index rdd from inputs
    JavaPairRDD<ValuesIndexPair, double[]> dvals = in
        .flatMapToPair(new ExtractDoubleValuesWithIndexFunction2(blen));

    //sort (creates sorted range per partition)
    int numPartitions = SparkUtils.getNumPreferredPartitions(
        new MatrixCharacteristics(rlen, clen+1, blen, blen));
    JavaRDD<ValuesIndexPair> sdvals = dvals
        .sortByKey(new IndexComparator2(asc), true, numPartitions)
        .keys(); //workaround for index comparator

    //create binary block output
    JavaPairRDD<MatrixIndexes, MatrixBlock> ret = sdvals
        .zipWithIndex()
        .mapPartitionsToPair(new ConvertToBinaryBlockFunction6(rlen, blen));
    ret = RDDAggregateUtils.mergeByKey(ret, false);

    return ret;
}
Example 4
Source File: RDDSortUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortDataByVal( JavaPairRDD<MatrixIndexes, MatrixBlock> val,
    JavaPairRDD<MatrixIndexes, MatrixBlock> data, boolean asc, long rlen, long clen, int blen )
{
    //create value-index rdd from inputs
    JavaPairRDD<ValueIndexPair, Double> dvals = val
        .flatMapToPair(new ExtractDoubleValuesWithIndexFunction(blen));

    //sort (creates sorted range per partition)
    long hdfsBlocksize = InfrastructureAnalyzer.getHDFSBlockSize();
    int numPartitions = (int)Math.ceil(((double)rlen*16)/hdfsBlocksize);
    JavaRDD<ValueIndexPair> sdvals = dvals
        .sortByKey(new IndexComparator(asc), true, numPartitions)
        .keys(); //workaround for index comparator

    //create target indexes by original index
    JavaPairRDD<MatrixIndexes, MatrixBlock> ixmap = sdvals
        .zipWithIndex()
        .mapToPair(new ExtractIndexFunction())
        .sortByKey()
        .mapPartitionsToPair(new ConvertToBinaryBlockFunction4(rlen, blen));
    ixmap = RDDAggregateUtils.mergeByKey(ixmap, false);

    //actual data sort
    return sortDataByIx(data, ixmap, rlen, clen, blen);
}
Example 5
Source File: RDDSortUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortDataByVals( JavaPairRDD<MatrixIndexes, MatrixBlock> val,
    JavaPairRDD<MatrixIndexes, MatrixBlock> data, boolean asc, long rlen, long clen, long clen2, int blen )
{
    //create value-index rdd from inputs
    JavaPairRDD<ValuesIndexPair, double[]> dvals = val
        .flatMapToPair(new ExtractDoubleValuesWithIndexFunction2(blen));

    //sort (creates sorted range per partition)
    int numPartitions = SparkUtils.getNumPreferredPartitions(
        new MatrixCharacteristics(rlen, clen2+1, blen, blen));
    JavaRDD<ValuesIndexPair> sdvals = dvals
        .sortByKey(new IndexComparator2(asc), true, numPartitions)
        .keys(); //workaround for index comparator

    //create target indexes by original index
    JavaPairRDD<MatrixIndexes, MatrixBlock> ixmap = sdvals
        .zipWithIndex()
        .mapToPair(new ExtractIndexFunction2())
        .sortByKey()
        .mapPartitionsToPair(new ConvertToBinaryBlockFunction4(rlen, blen));
    ixmap = RDDAggregateUtils.mergeByKey(ixmap, false);

    //actual data sort
    return sortDataByIx(data, ixmap, rlen, clen, blen);
}
Example 6
Source File: StructureToPolymerChainsTest.java From mmtf-spark with Apache License 2.0
@Test
public void test() {
    List<String> pdbIds = Arrays.asList("1STP","4HHB","1JLP","5X6H","5L2G","2MK1");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);

    // 1STP: 1 L-protein chain
    // 4HHB: 4 polymer chains
    // 1JLP: 1 L-protein chain with non-polymer capping group (NH2)
    // 5X6H: 1 L-protein and 1 DNA chain
    // 5L2G: 2 DNA chains
    // 2MK1: 0 polymer chains
    // --------------------
    // tot: 10 chains
    JavaPairRDD<String, StructureDataInterface> polymers = pdb.flatMapToPair(new StructureToPolymerChains());
    assertEquals(10, polymers.count());
}
Example 7
Source File: AppendGSPInstruction.java From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
    // general case append (map-extend, aggregate)
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    checkBinaryAppendInputCharacteristics(sec, _cbind, false, false);
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
    DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());

    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;

    // General case: this append needs shifting and merging and hence incurs a large performance hit.
    JavaPairRDD<MatrixIndexes,MatrixBlock> shifted_in2 = in2
        .flatMapToPair(new ShiftMatrix(mc1, mc2, _cbind));
    out = in1.cogroup(shifted_in2)
        .mapToPair(new MergeWithShiftedBlocks(mc1, mc2, _cbind));

    //put output RDD handle into symbol table
    updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
}
Example 8
Source File: MmtfImporterTest.java From mmtf-spark with Apache License 2.0
@Test
public void test5() throws IOException {
    List<String> pdbIds = Arrays.asList("3SP5");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfImporter.downloadPdbRedo(pdbIds, sc);
    assertEquals(1, pdb.count());

    pdb = pdb.flatMapToPair(new StructureToPolymerChains());
    assertEquals(2, pdb.count());
}
Example 9
Source File: MatrixAppendMSPInstruction.java From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
    // map-only append (rhs must be vector and fit in mapper mem)
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    checkBinaryAppendInputCharacteristics(sec, _cbind, false, false);
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
    DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
    int blen = mc1.getBlocksize();

    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    PartitionedBroadcast<MatrixBlock> in2 = sec.getBroadcastForVariable( input2.getName() );
    long off = sec.getScalarInput(_offset).getLongValue();

    //execute map-append operations (partitioning preserving if #in-blocks = #out-blocks)
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
    if( preservesPartitioning(mc1, mc2, _cbind) ) {
        out = in1.mapPartitionsToPair(
            new MapSideAppendPartitionFunction(in2, _cbind, off, blen), true);
    }
    else {
        out = in1.flatMapToPair(
            new MapSideAppendFunction(in2, _cbind, off, blen));
    }

    //put output RDD handle into symbol table
    updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageBroadcast(output.getName(), input2.getName());
}
Example 10
Source File: MarkDuplicatesSparkUtils.java From gatk with BSD 3-Clause "New" or "Revised" License
/**
 * Primary landing point for MarkDuplicatesSparkRecords:
 *  - Handles separating out hashed keys into groups by start position/readgroup
 *  - Further separates out MarkDuplicatesSparkRecord by their record objects
 *  - Farms out to methods which handle each group
 *  - Collects the results and returns an iterator
 */
@SuppressWarnings("unchecked")
private static JavaPairRDD<IndexPair<String>, Integer> markDuplicateRecords(final JavaPairRDD<ReadsKey, Iterable<MarkDuplicatesSparkRecord>> keyedPairs,
                                                                            final OpticalDuplicateFinder finder, final boolean markOpticalDups) {
    return keyedPairs.flatMapToPair(keyedPair -> {
        Iterable<MarkDuplicatesSparkRecord> pairGroups = keyedPair._2();

        final List<Tuple2<IndexPair<String>, Integer>> nonDuplicates = Lists.newArrayList();

        final Map<MarkDuplicatesSparkRecord.Type, List<MarkDuplicatesSparkRecord>> stratifiedByType = splitByType(pairGroups);

        // Each key corresponds to either fragments or paired ends, not a mixture of both.
        final List<MarkDuplicatesSparkRecord> emptyFragments = stratifiedByType.get(MarkDuplicatesSparkRecord.Type.EMPTY_FRAGMENT);
        final List<MarkDuplicatesSparkRecord> fragments = stratifiedByType.get(MarkDuplicatesSparkRecord.Type.FRAGMENT);
        final List<Pair> pairs = (List<Pair>)(List)stratifiedByType.get(MarkDuplicatesSparkRecord.Type.PAIR);
        final List<MarkDuplicatesSparkRecord> passthroughs = stratifiedByType.get(MarkDuplicatesSparkRecord.Type.PASSTHROUGH);

        //empty MarkDuplicatesSparkRecords signify that a pair has a mate somewhere else
        // If there are any non-fragment placeholders at this site, mark everything as duplicates, otherwise compute the best score
        if (Utils.isNonEmpty(fragments) && !Utils.isNonEmpty(emptyFragments)) {
            final Tuple2<IndexPair<String>, Integer> bestFragment = handleFragments(fragments, finder);
            nonDuplicates.add(bestFragment);
        }

        if (Utils.isNonEmpty(pairs)) {
            nonDuplicates.addAll(handlePairs(pairs, finder, markOpticalDups));
        }

        if (Utils.isNonEmpty(passthroughs)) {
            nonDuplicates.addAll(handlePassthroughs(passthroughs));
        }

        return nonDuplicates.iterator();
    });
}
Example 11
Source File: BinarySPInstruction.java From systemds with Apache License 2.0
/**
 * Common binary tensor-tensor process instruction
 *
 * @param ec execution context
 */
protected void processTensorTensorBinaryInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //sanity check dimensions
    checkTensorTensorBinaryCharacteristics(sec);
    updateBinaryTensorOutputDataCharacteristics(sec);

    // Get input RDDs
    JavaPairRDD<TensorIndexes, TensorBlock> in1 = sec.getBinaryTensorBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<TensorIndexes, TensorBlock> in2 = sec.getBinaryTensorBlockRDDHandleForVariable(input2.getName());
    DataCharacteristics tc1 = sec.getDataCharacteristics(input1.getName());
    DataCharacteristics tc2 = sec.getDataCharacteristics(input2.getName());
    DataCharacteristics dcOut = sec.getDataCharacteristics(output.getName());

    BinaryOperator bop = (BinaryOperator) _optr;

    // TODO blocking scheme for matrices with mismatching number of dimensions
    if (tc2.getNumDims() < tc1.getNumDims())
        in2 = in2.flatMapToPair(new ReblockTensorFunction(tc1.getNumDims(), tc1.getBlocksize()));
    for (int i = 0; i < tc1.getNumDims(); i++) {
        long numReps = getNumDimReplicas(tc1, tc2, i);
        if (numReps > 1)
            in2 = in2.flatMapToPair(new ReplicateTensorFunction(i, numReps));
    }
    int numPrefPart = SparkUtils.isHashPartitioned(in1) ? in1.getNumPartitions() :
        SparkUtils.isHashPartitioned(in2) ? in2.getNumPartitions() :
        Math.min(in1.getNumPartitions() + in2.getNumPartitions(),
            2 * SparkUtils.getNumPreferredPartitions(dcOut));

    //execute binary operation
    JavaPairRDD<TensorIndexes, TensorBlock> out = in1
        .join(in2, numPrefPart)
        .mapValues(new TensorTensorBinaryOpFunction(bop));

    //set output RDD
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
}
Example 12
Source File: BinarySPInstruction.java From systemds with Apache License 2.0
protected void processMatrixBVectorBinaryInstruction(ExecutionContext ec, VectorType vtype) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //sanity check dimensions
    checkMatrixMatrixBinaryCharacteristics(sec);

    //get input RDDs
    String rddVar = input1.getName();
    String bcastVar = input2.getName();
    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( rddVar );
    PartitionedBroadcast<MatrixBlock> in2 = sec.getBroadcastForVariable( bcastVar );
    DataCharacteristics mc1 = sec.getDataCharacteristics(rddVar);
    DataCharacteristics mc2 = sec.getDataCharacteristics(bcastVar);

    BinaryOperator bop = (BinaryOperator) _optr;
    boolean isOuter = (mc1.getRows()>1 && mc1.getCols()==1 && mc2.getRows()==1 && mc2.getCols()>1);

    //execute map binary operation
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
    if( isOuter ) {
        out = in1.flatMapToPair(new OuterVectorBinaryOpFunction(bop, in2));
    }
    else { //default
        //note: we use mapPartitions in order to preserve partitioning information for
        //binary mv operations where the keys are guaranteed not to change; the reason
        //why we cannot use mapValues is the need for broadcast key lookups.
        //alternative: out = in1.mapToPair(new MatrixVectorBinaryOpFunction(bop, in2, vtype));
        out = in1.mapPartitionsToPair(
            new MatrixVectorBinaryOpPartitionFunction(bop, in2, vtype), true);
    }

    //set output RDD
    updateBinaryOutputDataCharacteristics(sec);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), rddVar);
    sec.addLineageBroadcast(output.getName(), bcastVar);
}
Example 13
Source File: MmtfImporterTest.java From mmtf-spark with Apache License 2.0
@Test
public void test4() throws IOException {
    Path p = Paths.get("./src/main/resources/files/test");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfImporter.importMmcifFiles(p.toString(), sc);
    assertTrue(pdb.count() == 1);

    pdb = pdb.flatMapToPair(new StructureToPolymerChains());
    assertEquals(8, pdb.count());
}
Example 14
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlockToMatrixBlock(JavaPairRDD<Long,FrameBlock> input,
    DataCharacteristics mcIn, DataCharacteristics mcOut)
{
    //convert binary block to matrix block
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = input
        .flatMapToPair(new BinaryBlockToMatrixBlockFunction(mcIn, mcOut));

    //aggregate partial matrix blocks
    return RDDAggregateUtils.mergeByKey(out, false);
}
Example 15
Source File: RmmSPInstruction.java From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //get input rdds
    DataCharacteristics mc1 = sec.getDataCharacteristics( input1.getName() );
    DataCharacteristics mc2 = sec.getDataCharacteristics( input2.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
    DataCharacteristics mcOut = updateBinaryMMOutputDataCharacteristics(sec, true);

    //execute Spark RMM instruction
    //step 1: prepare join keys (w/ shallow replication), i/j/k
    JavaPairRDD<TripleIndexes,MatrixBlock> tmp1 = in1.flatMapToPair(
        new RmmReplicateFunction(mc2.getCols(), mc2.getBlocksize(), true));
    JavaPairRDD<TripleIndexes,MatrixBlock> tmp2 = in2.flatMapToPair(
        new RmmReplicateFunction(mc1.getRows(), mc1.getBlocksize(), false));

    //step 2: join prepared datasets, multiply, and aggregate
    int numPartJoin = Math.max(getNumJoinPartitions(mc1, mc2),
        SparkExecutionContext.getDefaultParallelism(true));
    int numPartOut = SparkUtils.getNumPreferredPartitions(mcOut);
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = tmp1
        .join( tmp2, numPartJoin )               //join by result block
        .mapToPair( new RmmMultiplyFunction() ); //do matrix multiplication
    out = RDDAggregateUtils.sumByKeyStable(out,  //aggregation per result block
        numPartOut, false);

    //put output block into symbol table (no lineage because single block)
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
}
Example 16
Source File: StructureToCathDomainsTest.java From mmtf-spark with Apache License 2.0
@Test
public void test1() throws IOException {
    List<String> pdbIds = Arrays.asList("1HV4");
//  List<String> pdbIds = Arrays.asList("1STP","4HHB","1JLP","5X6H","5L2G","2MK1");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);

    String baseUrl = "ftp://orengoftp.biochem.ucl.ac.uk/cath/releases/daily-release/newest/cath-b-newest-all.gz";
//  System.out.println(hmap.get("1HV4A"));

    JavaPairRDD<String, StructureDataInterface> cathDomains = pdb.flatMapToPair(new StructureToCathDomains(baseUrl));

    Map<String, ArrayList<String>> hmap = StructureToCathDomains.loadCathDomains(baseUrl);
    String[] bound = hmap.get("1HV4A").get(0).split(":")[0].split("-");
    int[] cath = cathDomains.first()._2.getGroupIds();

    assertEquals(Integer.parseInt(bound[0]), cath[0]);
    assertEquals(Integer.parseInt(bound[1]), cath[cath.length-1]);
    assertEquals(Integer.parseInt(bound[1]) - Integer.parseInt(bound[0]) + 1, cathDomains.first()._2.getNumGroups());
    assertEquals(8, cathDomains.count());
}
Example 17
Source File: BinarySPInstruction.java From systemds with Apache License 2.0
/**
 * Common binary matrix-matrix process instruction
 *
 * @param ec execution context
 */
protected void processMatrixMatrixBinaryInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //sanity check dimensions
    checkMatrixMatrixBinaryCharacteristics(sec);
    updateBinaryOutputDataCharacteristics(sec);

    // Get input RDDs
    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable(input2.getName());
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
    DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
    DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());

    BinaryOperator bop = (BinaryOperator) _optr;

    //vector replication if required (mv or outer operations)
    boolean rowvector = (mc2.getRows()==1 && mc1.getRows()>1);
    long numRepLeft = getNumReplicas(mc1, mc2, true);
    long numRepRight = getNumReplicas(mc1, mc2, false);
    if( numRepLeft > 1 )
        in1 = in1.flatMapToPair(new ReplicateVectorFunction(false, numRepLeft));
    if( numRepRight > 1 )
        in2 = in2.flatMapToPair(new ReplicateVectorFunction(rowvector, numRepRight));
    int numPrefPart = SparkUtils.isHashPartitioned(in1) ? in1.getNumPartitions() :
        SparkUtils.isHashPartitioned(in2) ? in2.getNumPartitions() :
        Math.min(in1.getNumPartitions() + in2.getNumPartitions(),
            2 * SparkUtils.getNumPreferredPartitions(mcOut));

    //execute binary operation
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = in1
        .join(in2, numPrefPart)
        .mapValues(new MatrixMatrixBinaryOpFunction(bop));

    //set output RDD
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
}
Example 18
Source File: StructureToCathDomainsTest.java From mmtf-spark with Apache License 2.0
@Test
public void test2() throws IOException {
    List<String> pdbIds = Arrays.asList("1STP");
//  List<String> pdbIds = Arrays.asList("1STP","4HHB","1JLP","5X6H","5L2G","2MK1");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc);

    String baseUrl = "ftp://orengoftp.biochem.ucl.ac.uk/cath/releases/daily-release/newest/cath-b-newest-all.gz";
//  System.out.println(hmap.get("1STPA"));

    JavaPairRDD<String, StructureDataInterface> cathDomains = pdb.flatMapToPair(new StructureToCathDomains(baseUrl));

    Map<String, ArrayList<String>> hmap = StructureToCathDomains.loadCathDomains(baseUrl);
    String[] bound = hmap.get("1STPA").get(0).split(":")[0].split("-");
    int[] cath = cathDomains.first()._2.getGroupIds();

    assertEquals(Integer.parseInt(bound[0]), cath[0]);
    assertEquals(Integer.parseInt(bound[1]), cath[cath.length-1]);
    assertEquals(Integer.parseInt(bound[1]) - Integer.parseInt(bound[0]) + 1, cathDomains.first()._2.getNumGroups());
    assertEquals(1, cathDomains.count());
}
Example 19
Source File: PMapmmSPInstruction.java From systemds with Apache License 2.0
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //get inputs
    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());

    // This avoids errors such as java.lang.UnsupportedOperationException:
    // Cannot change storage level of an RDD after it was already assigned a level.
    // Ideally, we should ensure that we do not redundantly call persist on the same RDD.
    StorageLevel pmapmmStorageLevel = StorageLevel.MEMORY_AND_DISK();

    //cache right hand side because accessed many times
    in2 = in2.repartition(sec.getSparkContext().defaultParallelism())
        .persist(pmapmmStorageLevel);

    JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
    for( int i=0; i<mc1.getRows(); i+=NUM_ROWBLOCKS*mc1.getBlocksize() ) {
        //create broadcast for rdd partition
        JavaPairRDD<MatrixIndexes,MatrixBlock> rdd = in1
            .filter(new IsBlockInRange(i+1, i+NUM_ROWBLOCKS*mc1.getBlocksize(), 1, mc1.getCols(), mc1))
            .mapToPair(new PMapMMRebaseBlocksFunction(i/mc1.getBlocksize()));

        int rlen = (int)Math.min(mc1.getRows()-i, NUM_ROWBLOCKS*mc1.getBlocksize());
        PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(rdd, rlen, (int)mc1.getCols(), mc1.getBlocksize(), -1L);
        Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);

        //matrix multiplication
        JavaPairRDD<MatrixIndexes,MatrixBlock> rdd2 = in2
            .flatMapToPair(new PMapMMFunction(bpmb, i/mc1.getBlocksize()));
        rdd2 = RDDAggregateUtils.sumByKeyStable(rdd2, false);
        rdd2.persist(pmapmmStorageLevel)
            .count();
        bpmb.unpersist(false);

        if( out == null )
            out = rdd2;
        else
            out = out.union(rdd2);
    }

    //cache final result
    out = out.persist(pmapmmStorageLevel);
    out.count();

    //put output RDD handle into symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());

    //update output statistics if not inferred
    updateBinaryMMOutputDataCharacteristics(sec, true);
}