org.apache.spark.api.java.JavaPairRDD#mapValues

Source File: RDDAggregateUtils.java From systemds with Apache License 2.0

6 votes

/**
 * Aggregates all blocks sharing a key with the given aggregate operator. Correction
 * blocks are passed along with the partial aggregates to obtain a stable result and
 * are removed again before the result is returned.
 * 
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 * @param aop aggregate operator handed to the value and combiner merge functions
 * @param numPartitions number of partitions handed to {@code combineByKey}
 * @param deepCopyCombiner flag handed to the create-combiner function
 * @return aggregated matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> aggByKeyStable(
		JavaPairRDD<MatrixIndexes, MatrixBlock> in, AggregateOperator aop,
		int numPartitions, boolean deepCopyCombiner )
{
	//per-key aggregation over (value, correction) pairs
	JavaPairRDD<MatrixIndexes, CorrMatrixBlock> withCorr = in.combineByKey(
		new CreateCorrBlockCombinerFunction(deepCopyCombiner),
		new MergeAggBlockValueFunction(aop),
		new MergeAggBlockCombinerFunction(aop),
		numPartitions );
	
	//strip off the correction blocks and return the plain aggregates
	return withCorr.mapValues( new ExtractMatrixBlock() );
}

Source File: BinUaggChainSPInstruction.java From systemds with Apache License 2.0

6 votes

/**
 * Applies the binary/unary-aggregate chain function to every block of the input
 * matrix and registers the result as the output variable.
 * 
 * @param ec execution context, expected to be a {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	final String inName = input1.getName();
	final String outName = output.getName();
	
	//block-wise application via mapValues on the input RDD
	JavaPairRDD<MatrixIndexes,MatrixBlock> result = sec
		.getBinaryMatrixBlockRDDHandleForVariable( inName )
		.mapValues(new RDDBinUaggChainFunction(_bOp, _uaggOp));
	
	//bookkeeping: output characteristics, RDD handle, and lineage to the input
	updateUnaryOutputDataCharacteristics(sec);
	sec.setRDDHandleForVariable(outName, result);
	sec.addLineageRDD(outName, inName);
}

Source File: RDDAggregateUtils.java From systemds with Apache License 2.0

6 votes

/**
 * Sums all blocks sharing a key. Correction blocks are passed along with the
 * partial sums to obtain a stable result and are removed again before the
 * result is returned.
 * 
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 * @param numPartitions number of partitions handed to {@code combineByKey}
 * @param deepCopyCombiner flag handed to the create-combiner and both merge functions
 * @return summed matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sumByKeyStable(
		JavaPairRDD<MatrixIndexes, MatrixBlock> in, int numPartitions, boolean deepCopyCombiner)
{
	//per-key sum over (value, correction) pairs
	JavaPairRDD<MatrixIndexes, CorrMatrixBlock> withCorr = in.combineByKey(
		new CreateCorrBlockCombinerFunction(deepCopyCombiner),
		new MergeSumBlockValueFunction(deepCopyCombiner),
		new MergeSumBlockCombinerFunction(deepCopyCombiner),
		numPartitions );
	
	//strip off the correction blocks and return the plain sums
	return withCorr.mapValues( new ExtractMatrixBlock() );
}

Source File: RDDAggregateUtils.java From systemds with Apache License 2.0

6 votes

/**
 * Aggregates all blocks sharing a key with the given aggregate operator. Correction
 * blocks are passed along with the partial aggregates to obtain a stable result and
 * are removed again before the result is returned.
 * 
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 * @param aop aggregate operator handed to the value and combiner merge functions
 * @param numPartitions number of partitions handed to {@code combineByKey}
 * @param deepCopyCombiner flag handed to the create-combiner function
 * @return aggregated matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> aggByKeyStable(
		JavaPairRDD<MatrixIndexes, MatrixBlock> in, AggregateOperator aop,
		int numPartitions, boolean deepCopyCombiner )
{
	//per-key aggregation over (value, correction) pairs
	JavaPairRDD<MatrixIndexes, CorrMatrixBlock> withCorr = in.combineByKey(
		new CreateCorrBlockCombinerFunction(deepCopyCombiner),
		new MergeAggBlockValueFunction(aop),
		new MergeAggBlockCombinerFunction(aop),
		numPartitions );
	
	//strip off the correction blocks and return the plain aggregates
	return withCorr.mapValues( new ExtractMatrixBlock() );
}

Source File: BinUaggChainSPInstruction.java From systemds with Apache License 2.0

6 votes

/**
 * Applies the binary/unary-aggregate chain function to every block of the input
 * matrix and registers the result as the output variable.
 * 
 * @param ec execution context, expected to be a {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	final String inName = input1.getName();
	final String outName = output.getName();
	
	//block-wise application via mapValues on the input RDD
	JavaPairRDD<MatrixIndexes,MatrixBlock> result = sec
		.getBinaryMatrixBlockRDDHandleForVariable( inName )
		.mapValues(new RDDBinUaggChainFunction(_bOp, _uaggOp));
	
	//bookkeeping: output characteristics, RDD handle, and lineage to the input
	updateUnaryOutputDataCharacteristics(sec);
	sec.setRDDHandleForVariable(outName, result);
	sec.addLineageRDD(outName, inName);
}

Source File: UnaryMatrixSPInstruction.java From systemds with Apache License 2.0

6 votes

/**
 * Applies the instruction's unary operator to every block of the input matrix
 * and registers the result as the output variable.
 * 
 * @param ec execution context, expected to be a {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	final String inName = input1.getName();
	final String outName = output.getName();
	
	//fetch the input blocks
	JavaPairRDD<MatrixIndexes,MatrixBlock> blocks =
		sec.getBinaryMatrixBlockRDDHandleForVariable( inName );
	
	//block-wise application of the unary operator
	UnaryOperator unaryOp = (UnaryOperator) _optr;
	JavaPairRDD<MatrixIndexes,MatrixBlock> result =
		blocks.mapValues(new RDDMatrixBuiltinUnaryOp(unaryOp));
	
	//bookkeeping: output characteristics, RDD handle, and lineage to the input
	updateUnaryOutputDataCharacteristics(sec);
	sec.setRDDHandleForVariable(outName, result);
	sec.addLineageRDD(outName, inName);
}

Source File: CumulativeOffsetSPInstruction.java From systemds with Apache License 2.0

5 votes

/**
 * Pairs every data block of the first input with offsets derived from the second
 * input (either via a broadcast lookup or via a join) and then applies the
 * cumulative offset function to each pair.
 * 
 * @param ec execution context, expected to be a {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	DataCharacteristics dataDc = sec.getDataCharacteristics(input1.getName());
	DataCharacteristics aggDc = sec.getDataCharacteristics(input2.getName());
	final long rlen = aggDc.getRows();
	final int blen = aggDc.getBlocksize();
	
	//data blocks; the broadcast path is only taken if the data is not hash partitioned
	JavaPairRDD<MatrixIndexes,MatrixBlock> inData =
		sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());
	final boolean broadcast = _broadcast && !SparkUtils.isHashPartitioned(inData);
	
	JavaPairRDD<MatrixIndexes,Tuple2<MatrixBlock,MatrixBlock>> joined;
	if( !broadcast ) {
		//split the offsets into per-block pieces and join them with the data
		JavaPairRDD<MatrixIndexes,MatrixBlock> offsets = sec
			.getBinaryMatrixBlockRDDHandleForVariable(input2.getName())
			.flatMapToPair(new RDDCumSplitFunction(_initValue, rlen, blen));
		joined = inData.join(offsets);
	}
	else {
		//look the offsets up in the broadcast variable for each data block
		PartitionedBroadcast<MatrixBlock> inAgg = sec.getBroadcastForVariable(input2.getName());
		joined = inData.mapToPair(new RDDCumSplitLookupFunction(inAgg,_initValue, rlen, blen));
	}
	
	//apply the cumulative operation with offsets on each (data, offset) pair
	JavaPairRDD<MatrixIndexes,MatrixBlock> out =
		joined.mapValues(new RDDCumOffsetFunction(_uop, _cumsumprod));
	
	//output characteristics: explicit single-column shape for cumsumprod, default otherwise
	if( !_cumsumprod ) {
		updateUnaryOutputDataCharacteristics(sec);
	}
	else {
		DataCharacteristics outDc = sec.getDataCharacteristics(output.getName());
		outDc.set(dataDc.getRows(), 1, dataDc.getBlocksize(), dataDc.getBlocksize());
	}
	
	//register output handle and lineage to both inputs
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineage(output.getName(), input2.getName(), broadcast);
}

Source File: BinaryFrameFrameSPInstruction.java From systemds with Apache License 2.0

5 votes

/**
 * Maps every frame block of the first input through {@code isCorrectbySchema},
 * parameterized with the frame held by the second input, and registers the
 * result as the output variable.
 * 
 * @param ec execution context, expected to be a {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	
	//frame blocks of the first input
	JavaPairRDD<Long, FrameBlock> frameBlocks =
		sec.getFrameBinaryBlockRDDHandleForVariable(input1.getName());
	
	//schema frame of the second input, wrapped into a broadcast variable
	FrameBlock schemaFrame = sec.getFrameInput(input2.getName());
	Broadcast<FrameBlock> schemaBc = sec.getSparkContext().broadcast(schemaFrame);
	
	//NOTE(review): the broadcast is dereferenced here and its value handed to the
	//function; confirm whether passing the broadcast handle itself was intended
	JavaPairRDD<Long, FrameBlock> result =
		frameBlocks.mapValues(new isCorrectbySchema(schemaBc.getValue()));
	
	//release the pinned input frame
	sec.releaseFrameInput(input2.getName());
	
	//register output handle and lineage to the first input
	sec.setRDDHandleForVariable(output.getName(), result);
	sec.addLineageRDD(output.getName(), input1.getName());
}

Source File: CompressionSPInstruction.java From systemds with Apache License 2.0

5 votes

/**
 * Applies {@code CompressionFunction} to every block of the input matrix and
 * registers the result as the output variable.
 * 
 * @param ec execution context, expected to be a {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext) ec;

	// get input rdd handle
	JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());

	// execute compression block-wise
	JavaPairRDD<MatrixIndexes, MatrixBlock> out = in.mapValues(new CompressionFunction());

	// set outputs; the lineage is recorded as (output, input), i.e., the output
	// depends on the input, consistent with the other Spark instructions
	// (the arguments were previously passed in reversed order)
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
}

Source File: SpoofSPInstruction.java From systemds with Apache License 2.0

5 votes

/**
 * Builds a single RDD that carries, per block index, the blocks of the main input
 * and of all non-broadcast matrix side inputs. Side inputs with fewer row or column
 * blocks than the main input are replicated before they are joined.
 * 
 * @param sec spark execution context used to obtain RDD handles and data characteristics
 * @param inputs instruction operands
 * @param bcVect per-operand flags; flagged operands are skipped here
 * @param outer if true, the operand at position 2 is replicated via {@code ReplicateRightFactorFunction}
 * @return joined inputs as {@code JavaPairRDD<MatrixIndexes,MatrixBlock[]>}
 */
private static JavaPairRDD<MatrixIndexes, MatrixBlock[]> createJoinedInputRDD(SparkExecutionContext sec, CPOperand[] inputs, boolean[] bcVect, boolean outer) {
	//the main input seeds the joined result
	final int mainIx = getMainInputIndex(inputs, bcVect);
	DataCharacteristics mainDc = sec.getDataCharacteristics(inputs[mainIx].getName());
	JavaPairRDD<MatrixIndexes, MatrixBlock> mainRdd =
		sec.getBinaryMatrixBlockRDDHandleForVariable(inputs[mainIx].getName());
	JavaPairRDD<MatrixIndexes, MatrixBlock[]> joined = mainRdd.mapValues(new MapInputSignature());
	
	for( int pos = 0; pos < inputs.length; pos++ ) {
		//skip the main input, non-matrix operands, and flagged (broadcast) operands
		if( pos == mainIx || !inputs[pos].getDataType().isMatrix() || bcVect[pos] )
			continue;
		
		//obtain the side input and its characteristics
		final String sideName = inputs[pos].getName();
		JavaPairRDD<MatrixIndexes, MatrixBlock> side =
			sec.getBinaryMatrixBlockRDDHandleForVariable(sideName);
		DataCharacteristics sideDc = sec.getDataCharacteristics(sideName);
		
		//replicate side blocks if their blocking does not match the main input
		if( outer && pos == 2 ) {
			side = side.flatMapToPair(
				new ReplicateRightFactorFunction(mainDc.getRows(), mainDc.getBlocksize()));
		}
		else if( mainDc.getNumRowBlocks() > sideDc.getNumRowBlocks() ) {
			side = side.flatMapToPair(
				new ReplicateBlockFunction(mainDc.getRows(), mainDc.getBlocksize(), false));
		}
		else if( mainDc.getNumColBlocks() > sideDc.getNumColBlocks() ) {
			side = side.flatMapToPair(
				new ReplicateBlockFunction(mainDc.getCols(), mainDc.getBlocksize(), true));
		}
		
		//join with the accumulated result and fold the pair back into one signature
		joined = joined.join(side).mapValues(new MapJoinSignature());
	}
	
	return joined;
}

Source File: SparkUtils.java From systemds with Apache License 2.0

5 votes

/**
 * Returns a copy of the given matrix RDD that preserves its partitioning. A shallow
 * copy passes indexes and blocks through via {@code mapValues}; a deep copy also
 * needs access to the keys and therefore runs as a partitioning-preserving
 * {@code mapPartitionsToPair}.
 * 
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 * @param deep if true, perform deep copy
 * @return matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes,MatrixBlock> copyBinaryBlockMatrix(
		JavaPairRDD<MatrixIndexes,MatrixBlock> in, boolean deep) 
{
	if( deep ) {
		//key access required, hence mappartitions with preserved partitioning
		return in.mapPartitionsToPair(new CopyMatrixBlockPairFunction(true), true);
	}
	
	//pass-through of indexes and blocks
	return in.mapValues(new CopyMatrixBlockFunction(false));
}

Source File: CumulativeOffsetSPInstruction.java From systemds with Apache License 2.0

5 votes

/**
 * Pairs every data block of the first input with offsets derived from the second
 * input (either via a broadcast lookup or via a join) and then applies the
 * cumulative offset function to each pair.
 * 
 * @param ec execution context, expected to be a {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	DataCharacteristics dataDc = sec.getDataCharacteristics(input1.getName());
	DataCharacteristics aggDc = sec.getDataCharacteristics(input2.getName());
	final long rlen = aggDc.getRows();
	final int blen = aggDc.getBlocksize();
	
	//data blocks; the broadcast path is only taken if the data is not hash partitioned
	JavaPairRDD<MatrixIndexes,MatrixBlock> inData =
		sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());
	final boolean broadcast = _broadcast && !SparkUtils.isHashPartitioned(inData);
	
	JavaPairRDD<MatrixIndexes,Tuple2<MatrixBlock,MatrixBlock>> joined;
	if( !broadcast ) {
		//split the offsets into per-block pieces and join them with the data
		JavaPairRDD<MatrixIndexes,MatrixBlock> offsets = sec
			.getBinaryMatrixBlockRDDHandleForVariable(input2.getName())
			.flatMapToPair(new RDDCumSplitFunction(_initValue, rlen, blen));
		joined = inData.join(offsets);
	}
	else {
		//look the offsets up in the broadcast variable for each data block
		PartitionedBroadcast<MatrixBlock> inAgg = sec.getBroadcastForVariable(input2.getName());
		joined = inData.mapToPair(new RDDCumSplitLookupFunction(inAgg,_initValue, rlen, blen));
	}
	
	//apply the cumulative operation with offsets on each (data, offset) pair
	JavaPairRDD<MatrixIndexes,MatrixBlock> out =
		joined.mapValues(new RDDCumOffsetFunction(_uop, _cumsumprod));
	
	//output characteristics: explicit single-column shape for cumsumprod, default otherwise
	if( !_cumsumprod ) {
		updateUnaryOutputDataCharacteristics(sec);
	}
	else {
		DataCharacteristics outDc = sec.getDataCharacteristics(output.getName());
		outDc.set(dataDc.getRows(), 1, dataDc.getBlocksize(), dataDc.getBlocksize());
	}
	
	//register output handle and lineage to both inputs
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
	sec.addLineage(output.getName(), input2.getName(), broadcast);
}

Source File: SparkExecutionContext.java From systemds with Apache License 2.0

4 votes

/**
 * Repartitions the RDD behind the given matrix variable via a merge-by-key,
 * optionally converts its blocks to CSR, persists and materializes it, and
 * finally replaces the matrix object's RDD handle with the new checkpoint RDD.
 * Returns without any effect if the matrix does not exceed the caching threshold.
 * 
 * @param var name of the matrix variable
 */
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject( String var ) {
	MatrixObject mo = getMatrixObject(var);
	DataCharacteristics dcIn = mo.getDataCharacteristics();

	//re-check the size up front to avoid unnecessary spark context creation
	boolean aboveThreshold = OptimizerUtils.exceedsCachingThreshold(
		mo.getNumColumns(), OptimizerUtils.estimateSizeExactSparsity(dcIn));
	if( !aboveThreshold )
		return;

	//obtain the input rdd of the matrix object
	JavaPairRDD<MatrixIndexes,MatrixBlock> source = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
		getRDDHandleForMatrixObject(mo, FileFormat.BINARY);

	//bypass an input that is marked for caching but not yet cached
	//(reduces memory pressure) by reading from its first lineage child instead
	if( mo.getRDDHandle().allowsShortCircuitRead()
		&& isRDDMarkedForCaching(source.id()) && !isRDDCached(source.id()) ) {
		RDDObject firstChild = (RDDObject) mo.getRDDHandle().getLineageChilds().get(0);
		source = (JavaPairRDD<MatrixIndexes,MatrixBlock>) firstChild.getRDD();

		//reduce an unnecessarily large number of partitions
		int preferred = SparkUtils.getNumPreferredPartitions(dcIn, source);
		if( preferred < source.getNumPartitions() )
			source = source.coalesce( preferred );
	}

	//repartition (merge forces a shuffled rdd); no deep copy although executed on
	//the original data, because without key duplicates nothing is actually merged
	JavaPairRDD<MatrixIndexes,MatrixBlock> cached = RDDAggregateUtils.mergeByKey(source, false);

	//potentially sparse data: convert mcsr into memory-efficient csr
	if( OptimizerUtils.checkSparseBlockCSRConversion(dcIn) )
		cached = cached.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));

	//persist in default storage level and trigger caching right away to prevent contention
	cached.persist( Checkpoint.DEFAULT_STORAGE_LEVEL )
		.count();

	//swap in a new checkpoint handle for the matrix object
	RDDObject previous = mo.getRDDHandle();    //guaranteed to exist (see above)
	RDDObject checkpoint = new RDDObject(cached);
	checkpoint.setCheckpointRDD(true);
	checkpoint.addLineageChild(previous);      //keep lineage to prevent cycles on cleanup
	mo.setRDDHandle(checkpoint);
}

Source File: AggregateUnarySPInstruction.java From systemds with Apache License 2.0

4 votes

/**
 * Executes the unary aggregate on a tensor input. A single-block aggregate is
 * computed and stored as a 1x1 tensor output; an aggregate without block
 * aggregation is applied block-wise and registered as an RDD; multi-block
 * aggregation is not supported for tensors and raises an exception.
 * 
 * @param ec execution context, expected to be a {@code SparkExecutionContext}
 */
private void processTensorAggregate(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;

	//get input
	// TODO support DataTensor
	JavaPairRDD<TensorIndexes, TensorBlock> in = sec.getBinaryTensorBlockRDDHandleForVariable( input1.getName() );
	JavaPairRDD<TensorIndexes, TensorBlock> out = in;

	// TODO: filter input blocks for trace
	AggregateUnaryOperator auop = (AggregateUnaryOperator)_optr;
	AggregateOperator aggop = _aop;

	if( _aggtype == SparkAggType.SINGLE_BLOCK ) {
		//aggregate each block and then combine all partial results into one block
		// TODO filter non empty blocks if sparse safe
		JavaRDD<TensorBlock> partials = out.map(new RDDUTensorAggFunction2(auop));
		TensorBlock total = RDDAggregateUtils.aggStableTensor(partials, aggop);

		//copy cell (0,0) into a fresh 1x1 tensor and store it in the symbol table
		//(no lineage because single block; data characteristics maintained implicitly)
		// TODO generalize to drop depending on location of correction
		// TODO support DataTensor
		TensorBlock scalar = new TensorBlock(total.getValueType(), new int[]{1, 1});
		scalar.set(0, 0, total.get(0, 0));
		sec.setTensorOutput(output.getName(), scalar);
		return;
	}

	//MULTI_BLOCK or NONE
	if( _aggtype == SparkAggType.NONE ) {
		//no block aggregation: always drop the correction and use a
		//partitioning-preserving mapvalues
		out = out.mapValues(new RDDUTensorAggValueFunction(auop));
	}
	else if( _aggtype == SparkAggType.MULTI_BLOCK ) {
		// TODO MULTI_BLOCK
		throw new DMLRuntimeException("Multi block spark aggregations are not supported for tensors yet.");
		/*
		//in case of multi-block aggregation, we always keep the correction
		out = out.mapToPair(new RDDUTensorAggFunction(auop, dc.getBlocksize(), dc.getBlocksize()));
		out = RDDAggregateUtils.aggByKeyStable(out, aggop, false);

		//drop correction after aggregation if required (aggbykey creates
		//partitioning, drop correction via partitioning-preserving mapvalues)
		if( auop.aggOp.correctionExists )
			out = out.mapValues( new AggregateDropCorrectionFunction(aggop) );
		 */
	}

	//register output characteristics, RDD handle, and lineage to the input
	updateUnaryAggOutputDataCharacteristics(sec, auop.indexFn);
	sec.setRDDHandleForVariable(output.getName(), out);
	sec.addLineageRDD(output.getName(), input1.getName());
}

Source File: Basic.java From learning-spark-with-java with MIT License

4 votes

/**
 * Walks through basic pair-RDD operations on a small in-memory data set:
 * collecting as a map, reduceByKey, foldByKey, combineByKey, and mapValues,
 * printing the result of each step to standard output.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
  SparkSession spark = SparkSession
      .builder()
      .appName("Pairs-Basic")
      .master("local[4]")
      .getOrCreate();

  JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

  List<Tuple2<String, Integer>> pairs =
      Arrays.asList(
          new Tuple2<>("1",9), new Tuple2<>("1",2), new Tuple2<>("1",1),
          new Tuple2<>("2",3), new Tuple2<>("2",4), new Tuple2<>("3",1),
          new Tuple2<>("3",5), new Tuple2<>("6",2), new Tuple2<>("6",1),
          new Tuple2<>("6",4), new Tuple2<>("8",1));

  // a pair RDD distributed over four partitions
  JavaPairRDD<String, Integer> pairsRDD = sc.parallelizePairs(pairs, 4);

  System.out.println("*** the original pairs");
  pairsRDD.foreach(i -> System.out.println(i));

  //
  // Pairs can be collected as a Map, but this only works well if the
  // keys are unique. Here they aren't, so an arbitrary value is chosen for each key:
  //
  Map<String, Integer> pairsAsMap = pairsRDD.collectAsMap();
  System.out.println("*** the pretty useless map");
  System.out.println(pairsAsMap);

  // Let's say we just want the pair with the minimum value for each key.
  // We can use one of the handy methods in PairRDDFunctions. To reduce, we need
  // only supply a single function to combine all the values for each key -- the result
  // has to have the same type as the values.
  JavaPairRDD<String, Integer> reducedRDD = pairsRDD.reduceByKey(Math::min);

  System.out.println("*** the reduced pairs");
  reducedRDD.foreach(i -> System.out.println(i));

  // the reduced pairs have unique keys so collecting to a map works a lot better
  Map<String, Integer> reducedAsMap = reducedRDD.collectAsMap();
  System.out.println("*** the reduced pairs as a map");
  System.out.println(reducedAsMap);

  // Folding is a little more general: we get to specify the identity value,
  // say 0 for adding and 1 for multiplying.
  JavaPairRDD<String, Integer> foldedRDD =
      pairsRDD.foldByKey(1, (x, y) -> x * y);

  System.out.println("*** the folded pairs");
  foldedRDD.foreach(i -> System.out.println(i));

  // Combining is more general: you can produce values of a different type, which is very powerful.
  // You need to provide three functions: the first converts an individual value to the new type, the second
  // incorporates an additional value into the result, and the third combines intermediate results, which is
  // used by execution to avoid excessive communication between partitions. The first function is applied to the
  // first value seen for a key within a partition and the second is used for each additional value of that key
  // in the partition.
  // Below is a pretty classical example of its use: compute a per-key average by first computing the sum and count
  // for each key and then dividing.
  JavaPairRDD<String, Tuple2<Integer, Integer>> combinedRDD =
      pairsRDD.combineByKey(
          value -> new Tuple2<>(value, 1),
          (sumAndCount, value) -> new Tuple2<>(sumAndCount._1() + value, sumAndCount._2() + 1),
          (sumAndCount1, sumAndCount2) ->
              new Tuple2<>(sumAndCount1._1() + sumAndCount2._1(), sumAndCount1._2() + sumAndCount2._2())
      );

  // divide sum by count per key; the cast forces floating-point division
  JavaPairRDD<String, Double> averageRDD =
      combinedRDD.mapValues(sumAndCount -> (double) sumAndCount._1() / sumAndCount._2());

  System.out.println("*** the average pairs");
  averageRDD.foreach(i -> System.out.println(i));

  // The dividing could be done just by calling map, but in Java this requires a lot of conversion between the
  // two kinds of RDD and ends up *VERY* cumbersome.
  JavaRDD<Tuple2<String, Tuple2<Integer, Integer>>> tupleCombinedRDD =
      JavaRDD.fromRDD(combinedRDD.rdd(), combinedRDD.classTag());
  JavaRDD<Tuple2<String, Double>> tupleDividedRDD = tupleCombinedRDD.map(keyAndsumAndCount ->
      new Tuple2<>(keyAndsumAndCount._1(), (double) keyAndsumAndCount._2()._1() / keyAndsumAndCount._2()._2()));
  JavaPairRDD<String, Double> averageRDDtheHardWay = JavaPairRDD.fromJavaRDD(tupleDividedRDD);

  // remember these won't necessarily come out in the same order, so they may not obviously be
  // the same as above
  System.out.println("*** the average pairs the hard way");
  averageRDDtheHardWay.foreach(i -> System.out.println(i));

  spark.stop();
}

Source File: TernarySPInstruction.java From systemds with Apache License 2.0

4 votes

/**
 * Executes a ternary operation whose three operands are each either a matrix or
 * a scalar. Matrix operands are read as RDDs and joined by block index; scalar
 * operands are wrapped into matrix blocks and handed to the ternary function.
 * 
 * @param ec execution context, expected to be a {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	final boolean mat1 = input1.isMatrix();
	final boolean mat2 = input2.isMatrix();
	final boolean mat3 = input3.isMatrix();
	
	//matrix operands: RDD handles (null for scalar operands)
	JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = null, in2 = null, in3 = null;
	if( mat1 )
		in1 = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());
	if( mat2 )
		in2 = sec.getBinaryMatrixBlockRDDHandleForVariable(input2.getName());
	if( mat3 )
		in3 = sec.getBinaryMatrixBlockRDDHandleForVariable(input3.getName());
	
	//scalar operands: wrapped into matrix blocks (null for matrix operands)
	MatrixBlock m1 = null, m2 = null, m3 = null;
	if( !mat1 )
		m1 = new MatrixBlock(ec.getScalarInput(input1).getDoubleValue());
	if( !mat2 )
		m2 = new MatrixBlock(ec.getScalarInput(input2).getDoubleValue());
	if( !mat3 )
		m3 = new MatrixBlock(ec.getScalarInput(input3).getDoubleValue());
	
	TernaryOperator op = (TernaryOperator) _optr;
	
	//dispatch on the matrix/scalar pattern, encoded as bits (input1,input2,input3)
	final int pattern = (mat1 ? 4 : 0) | (mat2 ? 2 : 0) | (mat3 ? 1 : 0);
	JavaPairRDD<MatrixIndexes,MatrixBlock> out;
	switch( pattern ) {
		case 4: //matrix-scalar-scalar
			out = in1.mapValues(new TernaryFunctionMSS(op, m1, m2, m3));
			break;
		case 2: //scalar-matrix-scalar
			out = in2.mapValues(new TernaryFunctionSMS(op, m1, m2, m3));
			break;
		case 1: //scalar-scalar-matrix
			out = in3.mapValues(new TernaryFunctionSSM(op, m1, m2, m3));
			break;
		case 6: //matrix-matrix-scalar
			out = in1.join(in2).mapValues(new TernaryFunctionMMS(op, m1, m2, m3));
			break;
		case 5: //matrix-scalar-matrix
			out = in1.join(in3).mapValues(new TernaryFunctionMSM(op, m1, m2, m3));
			break;
		case 3: //scalar-matrix-matrix
			out = in2.join(in3).mapValues(new TernaryFunctionSMM(op, m1, m2, m3));
			break;
		default: //all remaining patterns are handled as all matrices
			out = in1.join(in2).join(in3).mapValues(new TernaryFunctionMMM(op, m1, m2, m3));
	}
	
	//register output characteristics, RDD handle, and lineage to each matrix input
	updateTernaryOutputDataCharacteristics(sec);
	sec.setRDDHandleForVariable(output.getName(), out);
	if( mat1 )
		sec.addLineageRDD(output.getName(), input1.getName());
	if( mat2 )
		sec.addLineageRDD(output.getName(), input2.getName());
	if( mat3 )
		sec.addLineageRDD(output.getName(), input3.getName());
}

Source File: PageRankSpark.java From graphify with Apache License 2.0

4 votes

/**
 * Runs an iterative PageRank computation over a whitespace-separated edge list
 * and prints the resulting rank of every URL to standard output.
 *
 * @param args {@code args[0]} is the input file, {@code args[1]} the number of iterations
 * @throws Exception if the job fails
 * @throws NumberFormatException if {@code args[1]} is not a valid integer
 */
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    // Parse the iteration count once, before the Spark context is created, so that
    // an invalid argument fails fast and is not re-parsed on every loop iteration.
    final int iterations = Integer.parseInt(args[1]);

    SparkConf sparkConf = new SparkConf().setAppName("Graphify");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    try {
        JavaRDD<String> lines = ctx.textFile(args[0], 1);

        // Loads all URLs from input file and initialize their neighbors.
        JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(s -> {
            String[] parts = SPACES.split(s);
            return new Tuple2<>(parts[0], parts[1]);
        }).distinct().groupByKey().cache();

        // Initializes the rank of every URL that has outgoing links to one.
        JavaPairRDD<String, Double> ranks = links.mapValues(rs -> 1.0);

        // Calculates and updates URL ranks continuously using PageRank algorithm.
        for (int current = 0; current < iterations; current++) {
            // Calculates URL contributions to the rank of other URLs.
            JavaPairRDD<String, Double> contribs = links.join(ranks).values()
                    .flatMapToPair(s -> {
                        int urlCount = Iterables.size(s._1());
                        List<Tuple2<String, Double>> results = new ArrayList<>();
                        for (String n : s._1()) {
                            results.add(new Tuple2<>(n, s._2() / urlCount));
                        }
                        return results;
                    });
            // Re-calculates URL ranks based on neighbor contributions.
            ranks = contribs.reduceByKey(new Sum()).mapValues(sum -> 0.15 + sum * 0.85);
        }

        // Collects all URL ranks and dump them to console.
        List<Tuple2<String, Double>> output = ranks.collect();
        for (Tuple2<?,?> tuple : output) {
            System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
        }
    } finally {
        // Always release the Spark context, even if the job fails.
        ctx.stop();
    }
}

Source File: TernarySPInstruction.java From systemds with Apache License 2.0

4 votes

/**
 * Executes a ternary operation whose three operands are each either a matrix or
 * a scalar. Matrix operands are read as RDDs and joined by block index; scalar
 * operands are wrapped into matrix blocks and handed to the ternary function.
 * 
 * @param ec execution context, expected to be a {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
	SparkExecutionContext sec = (SparkExecutionContext)ec;
	final boolean mat1 = input1.isMatrix();
	final boolean mat2 = input2.isMatrix();
	final boolean mat3 = input3.isMatrix();
	
	//matrix operands: RDD handles (null for scalar operands)
	JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = null, in2 = null, in3 = null;
	if( mat1 )
		in1 = sec.getBinaryMatrixBlockRDDHandleForVariable(input1.getName());
	if( mat2 )
		in2 = sec.getBinaryMatrixBlockRDDHandleForVariable(input2.getName());
	if( mat3 )
		in3 = sec.getBinaryMatrixBlockRDDHandleForVariable(input3.getName());
	
	//scalar operands: wrapped into matrix blocks (null for matrix operands)
	MatrixBlock m1 = null, m2 = null, m3 = null;
	if( !mat1 )
		m1 = new MatrixBlock(ec.getScalarInput(input1).getDoubleValue());
	if( !mat2 )
		m2 = new MatrixBlock(ec.getScalarInput(input2).getDoubleValue());
	if( !mat3 )
		m3 = new MatrixBlock(ec.getScalarInput(input3).getDoubleValue());
	
	TernaryOperator op = (TernaryOperator) _optr;
	
	//dispatch on the matrix/scalar pattern, encoded as bits (input1,input2,input3)
	final int pattern = (mat1 ? 4 : 0) | (mat2 ? 2 : 0) | (mat3 ? 1 : 0);
	JavaPairRDD<MatrixIndexes,MatrixBlock> out;
	switch( pattern ) {
		case 4: //matrix-scalar-scalar
			out = in1.mapValues(new TernaryFunctionMSS(op, m1, m2, m3));
			break;
		case 2: //scalar-matrix-scalar
			out = in2.mapValues(new TernaryFunctionSMS(op, m1, m2, m3));
			break;
		case 1: //scalar-scalar-matrix
			out = in3.mapValues(new TernaryFunctionSSM(op, m1, m2, m3));
			break;
		case 6: //matrix-matrix-scalar
			out = in1.join(in2).mapValues(new TernaryFunctionMMS(op, m1, m2, m3));
			break;
		case 5: //matrix-scalar-matrix
			out = in1.join(in3).mapValues(new TernaryFunctionMSM(op, m1, m2, m3));
			break;
		case 3: //scalar-matrix-matrix
			out = in2.join(in3).mapValues(new TernaryFunctionSMM(op, m1, m2, m3));
			break;
		default: //all remaining patterns are handled as all matrices
			out = in1.join(in2).join(in3).mapValues(new TernaryFunctionMMM(op, m1, m2, m3));
	}
	
	//register output characteristics, RDD handle, and lineage to each matrix input
	updateTernaryOutputDataCharacteristics(sec);
	sec.setRDDHandleForVariable(output.getName(), out);
	if( mat1 )
		sec.addLineageRDD(output.getName(), input1.getName());
	if( mat2 )
		sec.addLineageRDD(output.getName(), input2.getName());
	if( mat3 )
		sec.addLineageRDD(output.getName(), input3.getName());
}

Source File: SparkExecutionContext.java From systemds with Apache License 2.0

4 votes

/**
 * Repartitions the RDD behind the given matrix variable via a merge-by-key,
 * optionally converts its blocks to CSR, persists and materializes it, and
 * finally replaces the matrix object's RDD handle with the new checkpoint RDD.
 * Returns without any effect if the matrix does not exceed the caching threshold.
 * 
 * @param var name of the matrix variable
 */
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject( String var ) {
	MatrixObject mo = getMatrixObject(var);
	DataCharacteristics dcIn = mo.getDataCharacteristics();

	//re-check the size up front to avoid unnecessary spark context creation
	boolean aboveThreshold = OptimizerUtils.exceedsCachingThreshold(
		mo.getNumColumns(), OptimizerUtils.estimateSizeExactSparsity(dcIn));
	if( !aboveThreshold )
		return;

	//obtain the input rdd of the matrix object
	JavaPairRDD<MatrixIndexes,MatrixBlock> source = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
		getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);

	//bypass an input that is marked for caching but not yet cached
	//(reduces memory pressure) by reading from its first lineage child instead
	if( mo.getRDDHandle().allowsShortCircuitRead()
		&& isRDDMarkedForCaching(source.id()) && !isRDDCached(source.id()) ) {
		RDDObject firstChild = (RDDObject) mo.getRDDHandle().getLineageChilds().get(0);
		source = (JavaPairRDD<MatrixIndexes,MatrixBlock>) firstChild.getRDD();

		//reduce an unnecessarily large number of partitions
		int preferred = SparkUtils.getNumPreferredPartitions(dcIn, source);
		if( preferred < source.getNumPartitions() )
			source = source.coalesce( preferred );
	}

	//repartition (merge forces a shuffled rdd); no deep copy although executed on
	//the original data, because without key duplicates nothing is actually merged
	JavaPairRDD<MatrixIndexes,MatrixBlock> cached = RDDAggregateUtils.mergeByKey(source, false);

	//potentially sparse data: convert mcsr into memory-efficient csr
	if( OptimizerUtils.checkSparseBlockCSRConversion(dcIn) )
		cached = cached.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));

	//persist in default storage level and trigger caching right away to prevent contention
	cached.persist( Checkpoint.DEFAULT_STORAGE_LEVEL )
		.count();

	//swap in a new checkpoint handle for the matrix object
	RDDObject previous = mo.getRDDHandle();    //guaranteed to exist (see above)
	RDDObject checkpoint = new RDDObject(cached);
	checkpoint.setCheckpointRDD(true);
	checkpoint.addLineageChild(previous);      //keep lineage to prevent cycles on cleanup
	mo.setRDDHandle(checkpoint);
}

Java Code Examples for org.apache.spark.api.java.JavaPairRDD#mapValues()