Java Code Examples for org.apache.spark.api.java.JavaPairRDD#mapPartitionsToPair()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#mapPartitionsToPair().
The original project and source file are noted above each example.
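For orientation, here is a minimal, self-contained sketch of the API itself (an illustration written for this page, not taken from any of the projects below; Spark 2.x+ signature): mapPartitionsToPair hands the function one Iterator over a whole partition and expects an Iterator of key-value Tuple2 objects back, so per-partition setup work runs once per partition rather than once per element.

// Minimal usage sketch (illustration only; not from any project listed below).
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class MapPartitionsToPairSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "mapPartitionsToPair-sketch");
        JavaRDD<String> lines = sc.parallelize(Arrays.asList("a b", "c d", "a d"), 2);

        // Emit (word, 1) pairs once per partition instead of once per element;
        // the buffer below is created once per partition and reused for all its lines.
        JavaPairRDD<String, Integer> pairs = lines.mapPartitionsToPair(it -> {
            List<Tuple2<String, Integer>> out = new ArrayList<>(); // per-partition buffer
            while (it.hasNext())
                for (String w : it.next().split(" "))
                    out.add(new Tuple2<>(w, 1));
            return out.iterator();
        });

        pairs.reduceByKey(Integer::sum).collect()
            .forEach(t -> System.out.println(t._1() + " -> " + t._2()));
        sc.stop();
    }
}

A second overload, mapPartitionsToPair(fn, preservesPartitioning), additionally promises Spark that the keys are unchanged; several of the examples below pass true to keep an existing partitioner.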
Example 1
Source File: RDDSortUtils.java, from systemds (Apache License 2.0)
/**
 * This function collects and sorts value column in memory and then broadcasts it.
 *
 * @param val value as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param data data as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param asc if true, sort ascending
 * @param rlen number of rows
 * @param clen number of columns
 * @param blen block length
 * @param sec spark execution context
 * @param r_op reorg operator
 * @return data as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortDataByValMemSort(
    JavaPairRDD<MatrixIndexes, MatrixBlock> val, JavaPairRDD<MatrixIndexes, MatrixBlock> data,
    boolean asc, long rlen, long clen, int blen, SparkExecutionContext sec, ReorgOperator r_op)
{
    //collect orderby column for in-memory sorting
    MatrixBlock inMatBlock = SparkExecutionContext
        .toMatrixBlock(val, (int)rlen, 1, blen, -1);

    //in-memory sort operation (w/ index return: source index in target position)
    ReorgOperator lrop = new ReorgOperator(new SortIndex(1, !asc, true));
    MatrixBlock sortedIx = inMatBlock.reorgOperations(lrop, new MatrixBlock(), -1, -1, -1);

    //flip sort indices from <source ix in target pos> to <target ix in source pos>
    MatrixBlock sortedIxSrc = new MatrixBlock(sortedIx.getNumRows(), 1, false);
    for (int i=0; i < sortedIx.getNumRows(); i++)
        sortedIxSrc.quickSetValue((int)sortedIx.quickGetValue(i,0)-1, 0, i+1);

    //broadcast index vector
    PartitionedBlock<MatrixBlock> pmb = new PartitionedBlock<>(sortedIxSrc, blen);
    Broadcast<PartitionedBlock<MatrixBlock>> _pmb = sec.getSparkContext().broadcast(pmb);

    //sort data with broadcast index vector
    JavaPairRDD<MatrixIndexes, RowMatrixBlock> ret = data
        .mapPartitionsToPair(new ShuffleMatrixBlockRowsInMemFunction(rlen, blen, _pmb));
    return RDDAggregateUtils.mergeRowsByKey(ret);
}
Example 2
Source File: TransformTranslator.java, from beam (Apache License 2.0)
private static <K, V, OutputT> JavaPairRDD<TupleTag<?>, WindowedValue<?>> statefulParDoTransform(
    KvCoder<K, V> kvCoder,
    Coder<? extends BoundedWindow> windowCoder,
    JavaRDD<WindowedValue<KV<K, V>>> kvInRDD,
    Partitioner partitioner,
    MultiDoFnFunction<KV<K, V>, OutputT> doFnFunction,
    boolean requiresSortedInput) {
  Coder<K> keyCoder = kvCoder.getKeyCoder();
  final WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(kvCoder.getValueCoder(), windowCoder);

  if (!requiresSortedInput) {
    return GroupCombineFunctions.groupByKeyOnly(kvInRDD, keyCoder, wvCoder, partitioner)
        .map(
            input -> {
              final K key = input.getKey();
              Iterable<WindowedValue<V>> value = input.getValue();
              return FluentIterable.from(value)
                  .transform(
                      windowedValue ->
                          windowedValue.withValue(KV.of(key, windowedValue.getValue())))
                  .iterator();
            })
        .flatMapToPair(doFnFunction);
  }

  JavaPairRDD<ByteArray, byte[]> pairRDD =
      kvInRDD
          .map(new ReifyTimestampsAndWindowsFunction<>())
          .mapToPair(TranslationUtils.toPairFunction())
          .mapToPair(
              CoderHelpers.toByteFunctionWithTs(keyCoder, wvCoder, in -> in._2().getTimestamp()));

  JavaPairRDD<ByteArray, byte[]> sorted =
      pairRDD.repartitionAndSortWithinPartitions(keyPrefixPartitionerFrom(partitioner));

  return sorted.mapPartitionsToPair(wrapDoFnFromSortedRDD(doFnFunction, keyCoder, wvCoder));
}
Example 3
Source File: SparkUtils.java, from systemds (Apache License 2.0)
/**
 * Creates a partitioning-preserving copy of the input matrix RDD. If a deep copy is
 * requested, indexes and values are copied, otherwise they are simply passed through.
 *
 * @param in matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 * @param deep if true, perform deep copy
 * @return matrix as {@code JavaPairRDD<MatrixIndexes,MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes,MatrixBlock> copyBinaryBlockMatrix(
    JavaPairRDD<MatrixIndexes,MatrixBlock> in, boolean deep)
{
    if( !deep ) //pass through of indexes and blocks
        return in.mapValues(new CopyMatrixBlockFunction(false));
    else //requires key access, so use mappartitions
        return in.mapPartitionsToPair(new CopyMatrixBlockPairFunction(deep), true);
}
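The example above uses the two-argument overload with preservesPartitioning set to true. As a minimal sketch of when that flag is safe (a hypothetical helper written for this page, not part of SystemDS), the function must leave the keys untouched:

// Sketch only: passing true as the second argument promises Spark that keys are
// unchanged, so an existing partitioner is kept instead of being discarded.
import java.util.ArrayList;
import java.util.List;

import org.apache.spark.api.java.JavaPairRDD;

import scala.Tuple2;

public class PreservePartitioningSketch {
    /** Doubles every value while leaving keys, and therefore the partitioning, untouched. */
    public static JavaPairRDD<Long, Double> doubleValues(JavaPairRDD<Long, Double> in) {
        return in.mapPartitionsToPair(it -> {
            List<Tuple2<Long, Double>> out = new ArrayList<>();
            while (it.hasNext()) {
                Tuple2<Long, Double> t = it.next();
                out.add(new Tuple2<>(t._1(), t._2() * 2.0)); // same key, new value
            }
            return out.iterator();
        }, true); // true => preservesPartitioning
    }
}

If the function changed keys while still claiming preservesPartitioning, downstream operations that rely on the retained partitioner could silently look up data in the wrong partitions.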
Example 4
Source File: FrameRDDConverterUtils.java, from systemds (Apache License 2.0)
public static JavaPairRDD<Long, FrameBlock> csvToBinaryBlock(JavaSparkContext sc,
    JavaPairRDD<LongWritable, Text> input, DataCharacteristics mc, ValueType[] schema,
    boolean hasHeader, String delim, boolean fill, double fillValue)
{
    //determine unknown dimensions and sparsity if required
    if( !mc.dimsKnown() ) { //nnz irrelevant here
        JavaRDD<String> tmp = input.values()
            .map(new TextToStringFunction());
        String tmpStr = tmp.first();
        boolean metaHeader = tmpStr.startsWith(TfUtils.TXMTD_MVPREFIX)
            || tmpStr.startsWith(TfUtils.TXMTD_NDPREFIX);
        tmpStr = (metaHeader) ? tmpStr.substring(tmpStr.indexOf(delim)+1) : tmpStr;
        long rlen = tmp.count() - (hasHeader ? 1 : 0) - (metaHeader ? 2 : 0);
        long clen = IOUtilFunctions.splitCSV(tmpStr, delim).length;
        mc.set(rlen, clen, mc.getBlocksize(), -1);
    }

    //prepare csv w/ row indexes (sorted by filenames)
    JavaPairRDD<Text,Long> prepinput = input.values()
        .zipWithIndex(); //zip row index

    //prepare default schema if needed
    if( schema == null || schema.length==1 )
        schema = UtilFunctions.nCopies((int)mc.getCols(), ValueType.STRING);

    //convert csv rdd to binary block rdd (w/ partial blocks)
    JavaPairRDD<Long, FrameBlock> out = prepinput.mapPartitionsToPair(
        new CSVToBinaryBlockFunction(mc, schema, hasHeader, delim));

    return out;
}
Example 5
Source File: GraknSparkExecutor.java, from grakn (GNU Affero General Public License v3.0)
public static <K, V> JavaPairRDD<K, V> executeMap(
        final JavaPairRDD<Object, VertexWritable> graphRDD, final MapReduce<K, V, ?, ?, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    JavaPairRDD<K, V> mapRDD = graphRDD.mapPartitionsToPair(partitionIterator -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        return new MapIterator<>(MapReduce.<MapReduce<K, V, ?, ?, ?>>createMapReduce(
                HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getMapKeySort().isPresent()) {
        mapRDD = mapRDD.sortByKey(mapReduce.getMapKeySort().get(), true, 1);
    }
    return mapRDD;
}
Example 6
Source File: MatrixAppendMSPInstruction.java, from systemds (Apache License 2.0)
@Override
public void processInstruction(ExecutionContext ec) {
    // map-only append (rhs must be vector and fit in mapper mem)
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    checkBinaryAppendInputCharacteristics(sec, _cbind, false, false);
    DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
    DataCharacteristics mc2 = sec.getDataCharacteristics(input2.getName());
    int blen = mc1.getBlocksize();

    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
    PartitionedBroadcast<MatrixBlock> in2 = sec.getBroadcastForVariable( input2.getName() );
    long off = sec.getScalarInput( _offset).getLongValue();

    //execute map-append operations (partitioning preserving if #in-blocks = #out-blocks)
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
    if( preservesPartitioning(mc1, mc2, _cbind) ) {
        out = in1.mapPartitionsToPair(
            new MapSideAppendPartitionFunction(in2, _cbind, off, blen), true);
    }
    else {
        out = in1.flatMapToPair(
            new MapSideAppendFunction(in2, _cbind, off, blen));
    }

    //put output RDD handle into symbol table
    updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageBroadcast(output.getName(), input2.getName());
}
Example 7
Source File: BinarySPInstruction.java, from systemds (Apache License 2.0)
protected void processMatrixBVectorBinaryInstruction(ExecutionContext ec, VectorType vtype) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;

    //sanity check dimensions
    checkMatrixMatrixBinaryCharacteristics(sec);

    //get input RDDs
    String rddVar = input1.getName();
    String bcastVar = input2.getName();
    JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( rddVar );
    PartitionedBroadcast<MatrixBlock> in2 = sec.getBroadcastForVariable( bcastVar );
    DataCharacteristics mc1 = sec.getDataCharacteristics(rddVar);
    DataCharacteristics mc2 = sec.getDataCharacteristics(bcastVar);

    BinaryOperator bop = (BinaryOperator) _optr;
    boolean isOuter = (mc1.getRows()>1 && mc1.getCols()==1 && mc2.getRows()==1 && mc2.getCols()>1);

    //execute map binary operation
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
    if( isOuter ) {
        out = in1.flatMapToPair(new OuterVectorBinaryOpFunction(bop, in2));
    }
    else { //default
        //note: we use mappartition in order to preserve partitioning information for
        //binary mv operations where the keys are guaranteed not to change, the reason
        //why we cannot use mapValues is the need for broadcast key lookups.
        //alternative: out = in1.mapToPair(new MatrixVectorBinaryOpFunction(bop, in2, vtype));
        out = in1.mapPartitionsToPair(
            new MatrixVectorBinaryOpPartitionFunction(bop, in2, vtype), true);
    }

    //set output RDD
    updateBinaryOutputDataCharacteristics(sec);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), rddVar);
    sec.addLineageBroadcast(output.getName(), bcastVar);
}
Example 8
Source File: BinarySPInstruction.java, from systemds (Apache License 2.0)
protected void processTensorTensorBroadcastBinaryInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    //sanity check dimensions
    checkTensorTensorBinaryCharacteristics(sec);

    //get input RDDs
    String rddVar = input1.getName();
    String bcastVar = input2.getName();
    JavaPairRDD<TensorIndexes, TensorBlock> in1 = sec.getBinaryTensorBlockRDDHandleForVariable(rddVar);
    DataCharacteristics dc1 = sec.getDataCharacteristics(rddVar);
    DataCharacteristics dc2 = sec.getDataCharacteristics(bcastVar).setBlocksize(dc1.getBlocksize());
    PartitionedBroadcast<TensorBlock> in2 = sec.getBroadcastForTensorVariable(bcastVar);

    BinaryOperator bop = (BinaryOperator) _optr;
    boolean[] replicateDim = new boolean[dc2.getNumDims()];
    for (int i = 0; i < replicateDim.length; i++)
        replicateDim[i] = dc2.getDim(i) == 1;

    //execute map binary operation
    JavaPairRDD<TensorIndexes, TensorBlock> out;
    // TODO less dims broadcast variable
    out = in1.mapPartitionsToPair(
        new TensorTensorBinaryOpPartitionFunction(bop, in2, replicateDim), true);

    //set output RDD
    updateBinaryTensorOutputDataCharacteristics(sec);
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), rddVar);
    sec.addLineageBroadcast(output.getName(), bcastVar);
}
Example 9
Source File: RDDConverterUtils.java, from systemds (Apache License 2.0)
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
    Dataset<Row> df, DataCharacteristics mc, boolean containsID, boolean isVector)
{
    //determine unknown dimensions and sparsity if required
    if( !mc.dimsKnown(true) ) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, containsID, isVector));
        long rlen = tmp.count();
        long clen = !isVector ? df.columns().length - (containsID?1:0) :
            ((Vector) tmp.first().get(containsID?1:0)).size();
        long nnz = UtilFunctions.toLong(aNnz.value());
        mc.set(rlen, clen, mc.getBlocksize(), nnz);
    }

    //ensure valid blocksizes
    if( mc.getBlocksize()<=1 )
        mc.setBlocksize(ConfigurationManager.getBlocksize());

    //construct or reuse row ids
    JavaPairRDD<Row, Long> prepinput = containsID ?
        df.javaRDD().mapToPair(new DataFrameExtractIDFunction(
            df.schema().fieldIndex(DF_ID_COLUMN))) :
        df.javaRDD().zipWithIndex(); //zip row index

    //convert csv rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(
        new DataFrameToBinaryBlockFunction(mc, sparse, containsID, isVector));

    //aggregate partial matrix blocks (w/ preferred number of output
    //partitions as the data is likely smaller in binary block format,
    //but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
Example 10
Source File: SparkExecutor.java, from tinkerpop (Apache License 2.0)
public static JavaPairRDD<Object, VertexWritable> applyGraphFilter(final JavaPairRDD<Object, VertexWritable> graphRDD,
        final GraphFilter graphFilter) {
    return graphRDD.mapPartitionsToPair(partitionIterator -> {
        final GraphFilter gFilter = graphFilter.clone();
        return IteratorUtils.filter(partitionIterator,
                tuple -> (tuple._2().get().applyGraphFilter(gFilter)).isPresent());
    }, true);
}
Example 11
Source File: GraknSparkExecutor.java, from grakn (GNU Affero General Public License v3.0)
public static JavaPairRDD<Object, VertexWritable> applyGraphFilter(final JavaPairRDD<Object, VertexWritable> graphRDD,
        final GraphFilter graphFilter) {
    return graphRDD.mapPartitionsToPair(partitionIterator -> {
        final GraphFilter gFilter = graphFilter.clone();
        return IteratorUtils.filter(partitionIterator,
                tuple -> (tuple._2().get().applyGraphFilter(gFilter)).isPresent());
    }, true);
}
Example 12
Source File: DnnSPInstruction.java, from systemds (Apache License 2.0)
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext)ec;
    if(instOpcode.equalsIgnoreCase("conv2d") || instOpcode.equalsIgnoreCase("conv2d_bias_add")
        || instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
        String rddVar = input1.getName();
        int numRowsPerBlock = 1;
        JavaPairRDD<MatrixIndexes,MatrixBlock> inputRDD = reblockAsRectangularMatrices(sec, rddVar, numRowsPerBlock);
        DataCharacteristics mcRdd = sec.getDataCharacteristics(rddVar);

        // ------------------------------------
        // TODO: Handle large filters > 2G
        Broadcast<MatrixBlock> filterBroadcast = null;
        Broadcast<MatrixBlock> biasBroadcast = null;
        if(instOpcode.equalsIgnoreCase("conv2d")) {
            filterBroadcast = getBroadcast(sec, _in2.getName());
        }
        else if(instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
            filterBroadcast = getBroadcast(sec, _in3.getName());
            biasBroadcast = getBroadcast(sec, _in2.getName());
        }
        // ------------------------------------

        int pad_h = getScalarInput(ec, _padding, 0);
        int pad_w = getScalarInput(ec, _padding, 1);
        int stride_h = getScalarInput(ec, _stride, 0);
        int stride_w = getScalarInput(ec, _stride, 1);

        // int N = getScalarInput(ec, _input_shape, 0);
        int C = getScalarInput(ec, _input_shape, 1);
        int H = getScalarInput(ec, _input_shape, 2);
        int W = getScalarInput(ec, _input_shape, 3);

        int K = getScalarInput(ec, _filter_shape, 0);
        int R = getScalarInput(ec, _filter_shape, 2);
        int S = getScalarInput(ec, _filter_shape, 3);
        int P = (int) DnnUtils.getP(H, R, stride_h, pad_h);
        int Q = (int) DnnUtils.getQ(W, S, stride_w, pad_w);

        DnnParameters params = new DnnParameters(numRowsPerBlock, C, H, W, K, R, S,
            stride_h, stride_w, pad_h, pad_w, 1);
        boolean enableNativeBLAS = NativeHelper.isNativeLibraryLoaded();
        JavaPairRDD<MatrixIndexes,MatrixBlock> out = inputRDD.mapPartitionsToPair(
            new RDDConv2dMapMMFunction(filterBroadcast, params, instOpcode, biasBroadcast,
                mcRdd.getRows(), enableNativeBLAS), true);

        //put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), rddVar);

        long nnz = -1; // TODO: Handle nnz
        long numCols = ((long)K)*((long)P)*Q;
        if(instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
            numCols = ((long)C)*((long)P)*Q;
        }
        if(numCols > Integer.MAX_VALUE) {
            throw new DMLRuntimeException("The current operator doesnot support large outputs.");
        }
        sec.setMetaData(output.getName(),
            new MetaDataFormat(new MatrixCharacteristics(mcRdd.getRows(), numCols, numRowsPerBlock, nnz), FileFormat.BINARY));
    }
    else {
        throw new DMLRuntimeException("Not implemented: " + instOpcode);
    }
}
Example 13
Source File: RDDConverterUtils.java, from systemds (Apache License 2.0)
/**
 * Converts a libsvm text input file into two binary block matrices for features
 * and labels, and saves these to the specified output files. This call also deletes
 * existing files at the specified output locations, as well as determines and
 * writes the meta data files of both output matrices.
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing
 * the libsvm input files in order to ensure consistency with Spark.
 *
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn,
    String pathX, String pathY, DataCharacteristics mcOutX)
{
    if( !mcOutX.dimsKnown() )
        throw new DMLRuntimeException("Matrix characteristics "
            + "required to convert sparse input representation.");
    try {
        //cleanup existing output files
        HDFSTool.deleteFileIfExistOnHDFS(pathX);
        HDFSTool.deleteFileIfExistOnHDFS(pathY);

        //convert libsvm to labeled points
        int numFeatures = (int) mcOutX.getCols();
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
        JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints =
            MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();

        //append row index and best-effort caching to avoid repeated text parsing
        JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint,Long> ilpoints =
            lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK());

        //extract labels and convert to binary block
        DataCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getBlocksize(), -1);
        LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes,MatrixBlock> out1 = ilpoints
            .mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
        int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
        out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
        out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        mc1.setNonZeros(aNnz1.value()); //update nnz after triggered save
        HDFSTool.writeMetaDataFile(pathY+".mtd", ValueType.FP64, mc1, FileFormat.BINARY);

        //extract data and convert to binary block
        DataCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getBlocksize(), -1);
        LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
        JavaPairRDD<MatrixIndexes,MatrixBlock> out2 = ilpoints
            .mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
        out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
        out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
        mc2.setNonZeros(aNnz2.value()); //update nnz after triggered save
        HDFSTool.writeMetaDataFile(pathX+".mtd", ValueType.FP64, mc2, FileFormat.BINARY);

        //asynchronous cleanup of cached intermediates
        ilpoints.unpersist(false);
    }
    catch(IOException ex) {
        throw new DMLRuntimeException(ex);
    }
}
Example 14
Source File: SparkComputationGraph.java, from deeplearning4j (Apache License 2.0)
/**
 * Feed-forward the specified data, with the given keys. i.e., get the network output/predictions
 * for the specified data
 *
 * @param featuresData Features data to feed through the network
 * @param batchSize    Batch size to use when doing feed forward operations
 * @param <K>          Type of data for key - may be anything
 * @return Network output given the input, by key
 */
public <K> JavaPairRDD<K, INDArray[]> feedForwardWithKey(JavaPairRDD<K, INDArray[]> featuresData, int batchSize) {
    return featuresData.mapPartitionsToPair(new GraphFeedForwardWithKeyFunction<K>(
            sc.broadcast(network.params()), sc.broadcast(conf.toJson()), batchSize));
}
Example 15
Source File: SparkDl4jMultiLayer.java, from deeplearning4j (Apache License 2.0)
/**
 * Score the examples individually, using a specified batch size. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately<br>
 * Note: The provided JavaPairRDD has a key that is associated with each example and returned score.<br>
 * <b>Note:</b> The DataSet objects passed in must have exactly one example in them (otherwise: can't have a 1:1
 * association between keys and data sets to score)
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @param <K>                        Key type
 * @return A {@code JavaPairRDD<K,Double>} containing the scores of each example
 * @see MultiLayerNetwork#scoreExamples(DataSet, boolean)
 */
public <K> JavaPairRDD<K, Double> scoreExamples(JavaPairRDD<K, DataSet> data, boolean includeRegularizationTerms,
        int batchSize) {
    return data.mapPartitionsToPair(new ScoreExamplesWithKeyFunction<K>(sc.broadcast(network.params()),
            sc.broadcast(conf.toJson()), includeRegularizationTerms, batchSize));
}
Example 16
Source File: SparkComputationGraph.java, from deeplearning4j (Apache License 2.0)
/**
 * Score the examples individually, using a specified batch size. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately<br>
 * Note: The provided JavaPairRDD has a key that is associated with each example and returned score.<br>
 * <b>Note:</b> The DataSet objects passed in must have exactly one example in them (otherwise: can't have a 1:1
 * association between keys and data sets to score)
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @param <K>                        Key type
 * @return A {@code JavaPairRDD<K,Double>} containing the scores of each example
 * @see MultiLayerNetwork#scoreExamples(DataSet, boolean)
 */
public <K> JavaPairRDD<K, Double> scoreExamplesMultiDataSet(JavaPairRDD<K, MultiDataSet> data,
        boolean includeRegularizationTerms, int batchSize) {
    return data.mapPartitionsToPair(new ScoreExamplesWithKeyFunction<K>(sc.broadcast(network.params()),
            sc.broadcast(conf.toJson()), includeRegularizationTerms, batchSize));
}