Java Code Examples for org.apache.spark.api.java.JavaPairRDD#union()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#union().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
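For orientation, the snippet below is a minimal, self-contained sketch of what union() does: it concatenates two pair RDDs with the same key and value types, keeping duplicate keys and performing no shuffle. It is illustrative only; the application name, local master, and sample data are assumptions, not taken from any of the projects listed here.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class UnionExample {
    public static void main(String[] args) {
        // Local Spark context for demonstration purposes only (assumed setup).
        SparkConf conf = new SparkConf().setAppName("UnionExample").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaPairRDD<String, Integer> first = sc.parallelizePairs(
                    Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2)));
            JavaPairRDD<String, Integer> second = sc.parallelizePairs(
                    Arrays.asList(new Tuple2<>("b", 3), new Tuple2<>("c", 4)));

            // union() simply appends one RDD to the other: duplicate keys ("b")
            // are preserved and no shuffle or deduplication takes place.
            JavaPairRDD<String, Integer> combined = first.union(second);
            combined.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2()));
        }
    }
}

Many of the examples below follow this pattern: build an RDD incrementally (or guard against a missing input) and fold additional data in with union() before a subsequent aggregation.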
Example 1
Source File: ExampleBatchLayerUpdate.java From oryx with Apache License 2.0 | 6 votes |
@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp,
    JavaPairRDD<String,String> newData, JavaPairRDD<String,String> pastData,
    String modelDirString, TopicProducer<String,String> modelUpdateTopic) throws IOException {
  JavaPairRDD<String,String> allData = pastData == null ? newData : newData.union(pastData);
  String modelString;
  try {
    modelString = new ObjectMapper().writeValueAsString(countDistinctOtherWords(allData));
  } catch (JsonProcessingException jpe) {
    throw new IOException(jpe);
  }
  modelUpdateTopic.send("MODEL", modelString);
}
Example 2
Source File: AppendGAlignedSPInstruction.java From systemds with Apache License 2.0 | 6 votes |
@Override
public void processInstruction(ExecutionContext ec) {
  // general case append (map-extend, aggregate)
  SparkExecutionContext sec = (SparkExecutionContext) ec;
  checkBinaryAppendInputCharacteristics(sec, _cbind, false, true);
  DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());

  JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
  JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
  JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;

  // Simple changing of matrix indexes of RHS
  long shiftBy = _cbind ? mc1.getNumColBlocks() : mc1.getNumRowBlocks();
  out = in2.mapToPair(new ShiftColumnIndex(shiftBy, _cbind));
  out = in1.union( out );

  //put output RDD handle into symbol table
  updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
  sec.setRDDHandleForVariable(output.getName(), out);
  sec.addLineageRDD(output.getName(), input1.getName());
  sec.addLineageRDD(output.getName(), input2.getName());
}
Example 3
Source File: RDDConverterUtilsExt.java From systemds with Apache License 2.0 | 6 votes |
public static JavaPairRDD<MatrixIndexes, MatrixBlock> coordinateMatrixToBinaryBlock(JavaSparkContext sc,
    CoordinateMatrix input, DataCharacteristics mcIn, boolean outputEmptyBlocks) {
  //convert matrix entry rdd to binary block rdd (w/ partial blocks)
  JavaPairRDD<MatrixIndexes, MatrixBlock> out = input.entries().toJavaRDD()
      .mapPartitionsToPair(new MatrixEntryToBinaryBlockFunction(mcIn));

  //inject empty blocks (if necessary)
  if( outputEmptyBlocks && mcIn.mightHaveEmptyBlocks() ) {
    out = out.union( SparkUtils.getEmptyBlockRDD(sc, mcIn) );
  }

  //aggregate partial matrix blocks
  out = RDDAggregateUtils.mergeByKey(out, false);

  return out;
}
Example 4
Source File: RDDConverterUtils.java From systemds with Apache License 2.0 | 6 votes |
public static JavaPairRDD<MatrixIndexes, MatrixBlock> textCellToBinaryBlock(JavaSparkContext sc,
    JavaPairRDD<LongWritable, Text> input, DataCharacteristics mcOut,
    boolean outputEmptyBlocks, FileFormatPropertiesMM mmProps) {
  //convert textcell rdd to binary block rdd (w/ partial blocks)
  JavaPairRDD<MatrixIndexes, MatrixBlock> out = input.values()
      .mapPartitionsToPair(new TextToBinaryBlockFunction(mcOut, mmProps));

  //inject empty blocks (if necessary)
  if( outputEmptyBlocks && mcOut.mightHaveEmptyBlocks() ) {
    out = out.union( SparkUtils.getEmptyBlockRDD(sc, mcOut) );
  }

  //aggregate partial matrix blocks
  out = RDDAggregateUtils.mergeByKey(out, false);

  return out;
}
Example 5
Source File: RDDConverterUtils.java From systemds with Apache License 2.0 | 6 votes |
public static JavaPairRDD<MatrixIndexes, MatrixBlock> binaryCellToBinaryBlock(JavaSparkContext sc,
    JavaPairRDD<MatrixIndexes, MatrixCell> input, DataCharacteristics mcOut, boolean outputEmptyBlocks) {
  //convert binarycell rdd to binary block rdd (w/ partial blocks)
  JavaPairRDD<MatrixIndexes, MatrixBlock> out = input
      .mapPartitionsToPair(new BinaryCellToBinaryBlockFunction(mcOut));

  //inject empty blocks (if necessary)
  if( outputEmptyBlocks && mcOut.mightHaveEmptyBlocks() ) {
    out = out.union( SparkUtils.getEmptyBlockRDD(sc, mcOut) );
  }

  //aggregate partial matrix blocks
  out = RDDAggregateUtils.mergeByKey(out, false);

  return out;
}
Example 6
Source File: FrameAppendRSPInstruction.java From systemds with Apache License 2.0 | 5 votes |
@Override
public void processInstruction(ExecutionContext ec) {
  SparkExecutionContext sec = (SparkExecutionContext) ec;

  JavaPairRDD<Long,FrameBlock> in1 = sec.getFrameBinaryBlockRDDHandleForVariable( input1.getName() );
  JavaPairRDD<Long,FrameBlock> in2 = sec.getFrameBinaryBlockRDDHandleForVariable( input2.getName() );
  JavaPairRDD<Long,FrameBlock> out = null;
  long leftRows = sec.getDataCharacteristics(input1.getName()).getRows();

  if(_cbind) {
    JavaPairRDD<Long,FrameBlock> in1Aligned = in1.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
    in1Aligned = FrameRDDAggregateUtils.mergeByKey(in1Aligned);
    JavaPairRDD<Long,FrameBlock> in2Aligned = in2.mapToPair(new ReduceSideAppendAlignFunction(leftRows));
    in2Aligned = FrameRDDAggregateUtils.mergeByKey(in2Aligned);
    out = in1Aligned.join(in2Aligned).mapValues(new ReduceSideColumnsFunction(_cbind));
  }
  else { //rbind
    JavaPairRDD<Long,FrameBlock> right = in2.mapToPair( new ReduceSideAppendRowsFunction(leftRows));
    out = in1.union(right);
  }

  //put output RDD handle into symbol table
  updateBinaryAppendOutputDataCharacteristics(sec, _cbind);
  sec.setRDDHandleForVariable(output.getName(), out);
  sec.addLineageRDD(output.getName(), input1.getName());
  sec.addLineageRDD(output.getName(), input2.getName());

  //update schema of output with merged input schemas
  sec.getFrameObject(output.getName()).setSchema(
    sec.getFrameObject(input1.getName()).mergeSchemas(
    sec.getFrameObject(input2.getName())));
}
Example 7
Source File: SparkUHCDictionary.java From kylin with Apache License 2.0 | 4 votes |
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
  String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
  String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
  String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
  String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
  String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
  String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

  Class[] kryoClassArray = new Class[]{Class.forName("scala.reflect.ClassTag$$anon$1"),
      Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey")};

  SparkConf conf = new SparkConf().setAppName("Build uhc dictionary with spark for:" + cubeName + " segment " + segmentId);
  //serialization conf
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
  conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
  conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

  KylinSparkJobListener jobListener = new KylinSparkJobListener();
  try (JavaSparkContext sc = new JavaSparkContext(conf)) {
    sc.sc().addSparkListener(jobListener);
    HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

    Configuration hadoopConf = sc.hadoopConfiguration();
    hadoopConf.set("mapreduce.input.pathFilter.class", "org.apache.kylin.engine.mr.steps.filter.UHCDictPathFilter");

    final SerializableConfiguration sConf = new SerializableConfiguration(hadoopConf);
    KylinConfig config = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    CubeManager cubeMgr = CubeManager.getInstance(config);
    CubeInstance cube = cubeMgr.getCube(cubeName);
    final Job job = Job.getInstance(sConf.get());

    // calculate source record bytes size
    final LongAccumulator bytesWritten = sc.sc().longAccumulator();
    String hdfsDir = sc.hadoopConfiguration().get(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR);

    List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
    int reducerCount = uhcColumns.size();
    if (reducerCount == 0) {
      return;
    }

    logger.info("RDD Output path: {}", outputPath);
    logger.info("getTotalReducerNum: {}", reducerCount);
    logger.info("counter path {}", counterPath);

    JavaPairRDD<String, String> wholeSequenceFileNames = null;
    for (TblColRef tblColRef : uhcColumns) {
      String columnPath = inputPath + "/" + tblColRef.getIdentity();
      if (!HadoopUtil.getFileSystem(columnPath).exists(new Path(columnPath))) {
        continue;
      }
      if (wholeSequenceFileNames == null) {
        wholeSequenceFileNames = sc.wholeTextFiles(columnPath);
      } else {
        wholeSequenceFileNames = wholeSequenceFileNames.union(sc.wholeTextFiles(columnPath));
      }
    }

    if (wholeSequenceFileNames == null) {
      logger.error("There're no sequence files at " + inputPath + " !");
      return;
    }

    JavaPairRDD<String, Tuple3<Writable, Writable, String>> pairRDD = wholeSequenceFileNames.map(tuple -> tuple._1)
        .mapToPair(new InputPathAndFilterAddFunction2(config, uhcColumns))
        .filter(tuple -> tuple._1 != -1)
        .reduceByKey((list1, list2) -> combineAllColumnDistinctValues(list1, list2))
        .mapToPair(new ProcessUHCColumnValues(cubeName, config, hdfsDir, uhcColumns));

    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
        NullWritable.class, ArrayPrimitiveWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, outputPath);

    //prevent creating a zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(pairRDD);
    multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

    logger.info("Map input records={}", reducerCount);
    logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

    Map<String, String> counterMap = Maps.newHashMap();
    counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(reducerCount));
    counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

    // save counter to hdfs
    HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);
    HadoopUtil.deleteHDFSMeta(metaUrl);
  }
}
Example 8
Source File: DataVecSparkUtil.java From deeplearning4j with Apache License 2.0 | 4 votes |
/**
 * This is a convenience method to combine data from separate files together (intended to write to a sequence file, using
 * {@link org.apache.spark.api.java.JavaPairRDD#saveAsNewAPIHadoopFile(String, Class, Class, Class)})<br>
 * A typical use case is to combine input and label data from different files, for later parsing by a RecordReader
 * or SequenceRecordReader. A typical use case is as follows:<br>
 * Given two paths (directories), combine the files in these two directories into pairs.<br>
 * Then, for each pair of files, convert the file contents into a {@link BytesPairWritable}, which also contains
 * the original file paths of the files.<br>
 * The assumptions are as follows:<br>
 * - For every file in the first directory, there is an equivalent file in the second directory (i.e., same key)<br>
 * - The pairing of files can be done based on the paths of the files; paths are mapped to a key using a {@link PathToKeyConverter};
 *   keys are then matched to give pairs of files<br>
 * <br><br>
 * <b>Example usage</b>: to combine all files in directory {@code dir1} with equivalent files in {@code dir2}, by file name:
 * <pre>
 * <code>JavaSparkContext sc = ...;
 * String path1 = "/dir1";
 * String path2 = "/dir2";
 * PathToKeyConverter pathConverter = new PathToKeyConverterFilename();
 * JavaPairRDD<Text,BytesPairWritable> toWrite = DataVecSparkUtil.combineFilesForSequenceFile(sc, path1, path2, pathConverter, pathConverter);
 * String outputPath = "/my/output/path";
 * toWrite.saveAsNewAPIHadoopFile(outputPath, Text.class, BytesPairWritable.class, SequenceFileOutputFormat.class);
 * </code>
 * </pre>
 * Result: the file contents are aggregated (pairwise) and written to a Hadoop sequence file at /my/output/path
 *
 * @param sc         Spark context
 * @param path1      First directory (passed to JavaSparkContext.binaryFiles(path1))
 * @param path2      Second directory (passed to JavaSparkContext.binaryFiles(path2))
 * @param converter1 Converter, to convert file paths in the first directory to a key (to allow files to be matched/paired by key)
 * @param converter2 As above, for the second directory
 * @return Paired file contents, as {@code Text} keys and {@link BytesPairWritable} values
 */
public static JavaPairRDD<Text, BytesPairWritable> combineFilesForSequenceFile(JavaSparkContext sc, String path1,
    String path2, PathToKeyConverter converter1, PathToKeyConverter converter2) {
  JavaPairRDD<String, PortableDataStream> first = sc.binaryFiles(path1);
  JavaPairRDD<String, PortableDataStream> second = sc.binaryFiles(path2);

  //Now: process keys (paths) so that they can be merged
  JavaPairRDD<String, Tuple3<String, Integer, PortableDataStream>> first2 =
      first.mapToPair(new PathToKeyFunction(0, converter1));
  JavaPairRDD<String, Tuple3<String, Integer, PortableDataStream>> second2 =
      second.mapToPair(new PathToKeyFunction(1, converter2));
  JavaPairRDD<String, Tuple3<String, Integer, PortableDataStream>> merged = first2.union(second2);

  //Combine into pairs, and prepare for writing
  JavaPairRDD<Text, BytesPairWritable> toWrite = merged.groupByKey().mapToPair(new MapToBytesPairWritableFunction());

  return toWrite;
}
Example 9
Source File: CtableSPInstruction.java From systemds with Apache License 2.0 | 4 votes |
@Override
public void processInstruction(ExecutionContext ec) {
  SparkExecutionContext sec = (SparkExecutionContext) ec;
  Ctable.OperationTypes ctableOp = Ctable.findCtableOperationByInputDataTypes(
    input1.getDataType(), input2.getDataType(), input3.getDataType());
  ctableOp = _isExpand ? Ctable.OperationTypes.CTABLE_EXPAND_SCALAR_WEIGHT : ctableOp;

  //get input rdd handle
  JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
  JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = !ctableOp.hasSecondInput() ? null :
    sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
  JavaPairRDD<MatrixIndexes,MatrixBlock> in3 = null;
  double s2 = -1, s3 = -1; //scalars

  DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());
  DataCharacteristics mcOut = sec.getDataCharacteristics(output.getName());

  // handle known/unknown dimensions
  long dim1 = (_dim1Literal ? (long) Double.parseDouble(_outDim1) :
    (sec.getScalarInput(_outDim1, ValueType.FP64, false)).getLongValue());
  long dim2 = (_dim2Literal ? (long) Double.parseDouble(_outDim2) :
    (sec.getScalarInput(_outDim2, ValueType.FP64, false)).getLongValue());
  if( dim1 == -1 && dim2 == -1 ) {
    //note: if we need to determine the dimensions, we do so before
    //creating cells to avoid unnecessary caching, repeated joins, etc.
    dim1 = (long) RDDAggregateUtils.max(in1);
    dim2 = ctableOp.hasSecondInput() ? (long) RDDAggregateUtils.max(in2) :
      sec.getScalarInput(input3).getLongValue();
  }
  mcOut.set(dim1, dim2, mc1.getBlocksize(), mc1.getBlocksize());
  mcOut.setNonZerosBound(mc1.getRows());

  //compute preferred degree of parallelism
  int numParts = Math.max(4 * (mc1.dimsKnown() ?
    SparkUtils.getNumPreferredPartitions(mc1) : in1.getNumPartitions()),
    SparkUtils.getNumPreferredPartitions(mcOut));

  JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
  switch(ctableOp) {
    case CTABLE_TRANSFORM: //(VECTOR)
      // F=ctable(A,B,W)
      in3 = sec.getBinaryMatrixBlockRDDHandleForVariable( input3.getName() );
      out = in1.join(in2, numParts).join(in3, numParts)
        .mapValues(new MapJoinSignature3())
        .mapPartitionsToPair(new CTableFunction(ctableOp, s2, s3, _ignoreZeros, mcOut));
      break;
    case CTABLE_EXPAND_SCALAR_WEIGHT: //(VECTOR)
    case CTABLE_TRANSFORM_SCALAR_WEIGHT: //(VECTOR/MATRIX)
      // F = ctable(A,B) or F = ctable(A,B,1)
      s3 = sec.getScalarInput(input3).getDoubleValue();
      out = in1.join(in2, numParts).mapValues(new MapJoinSignature2())
        .mapPartitionsToPair(new CTableFunction(ctableOp, s2, s3, _ignoreZeros, mcOut));
      break;
    case CTABLE_TRANSFORM_HISTOGRAM: //(VECTOR)
      // F=ctable(A,1) or F = ctable(A,1,1)
      s2 = sec.getScalarInput(input2).getDoubleValue();
      s3 = sec.getScalarInput(input3).getDoubleValue();
      out = in1.mapValues(new MapJoinSignature1())
        .mapPartitionsToPair(new CTableFunction(ctableOp, s2, s3, _ignoreZeros, mcOut));
      break;
    case CTABLE_TRANSFORM_WEIGHTED_HISTOGRAM: //(VECTOR)
      // F=ctable(A,1,W)
      in3 = sec.getBinaryMatrixBlockRDDHandleForVariable( input3.getName() );
      s2 = sec.getScalarInput(input2).getDoubleValue();
      out = in1.join(in3, numParts).mapValues(new MapJoinSignature2())
        .mapPartitionsToPair(new CTableFunction(ctableOp, s2, s3, _ignoreZeros, mcOut));
      break;
    default:
      throw new DMLRuntimeException("Encountered an invalid ctable operation ("+ctableOp+") while executing instruction: " + this.toString());
  }

  //perform fused aggregation and reblock
  out = out.union(SparkUtils.getEmptyBlockRDD(sec.getSparkContext(), mcOut));
  out = RDDAggregateUtils.sumByKeyStable(out, numParts, false);

  //store output rdd handle
  sec.setRDDHandleForVariable(output.getName(), out);
  sec.addLineageRDD(output.getName(), input1.getName());
  if( ctableOp.hasSecondInput() )
    sec.addLineageRDD(output.getName(), input2.getName());
  if( ctableOp.hasThirdInput() )
    sec.addLineageRDD(output.getName(), input3.getName());
}
Example 10
Source File: PMapmmSPInstruction.java From systemds with Apache License 2.0 | 4 votes |
@Override
public void processInstruction(ExecutionContext ec) {
  SparkExecutionContext sec = (SparkExecutionContext) ec;

  //get inputs
  JavaPairRDD<MatrixIndexes,MatrixBlock> in1 = sec.getBinaryMatrixBlockRDDHandleForVariable( input1.getName() );
  JavaPairRDD<MatrixIndexes,MatrixBlock> in2 = sec.getBinaryMatrixBlockRDDHandleForVariable( input2.getName() );
  DataCharacteristics mc1 = sec.getDataCharacteristics(input1.getName());

  // This avoids errors such as java.lang.UnsupportedOperationException:
  // Cannot change storage level of an RDD after it was already assigned a level.
  // Ideally, we should ensure that we do not redundantly call persist on the same RDD.
  StorageLevel pmapmmStorageLevel = StorageLevel.MEMORY_AND_DISK();

  //cache right hand side because accessed many times
  in2 = in2.repartition(sec.getSparkContext().defaultParallelism())
    .persist(pmapmmStorageLevel);

  JavaPairRDD<MatrixIndexes,MatrixBlock> out = null;
  for( int i=0; i<mc1.getRows(); i+=NUM_ROWBLOCKS*mc1.getBlocksize() ) {
    //create broadcast for rdd partition
    JavaPairRDD<MatrixIndexes,MatrixBlock> rdd = in1
      .filter(new IsBlockInRange(i+1, i+NUM_ROWBLOCKS*mc1.getBlocksize(), 1, mc1.getCols(), mc1))
      .mapToPair(new PMapMMRebaseBlocksFunction(i/mc1.getBlocksize()));
    int rlen = (int)Math.min(mc1.getRows()-i, NUM_ROWBLOCKS*mc1.getBlocksize());
    PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(rdd, rlen, (int)mc1.getCols(), mc1.getBlocksize(), -1L);
    Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);

    //matrix multiplication
    JavaPairRDD<MatrixIndexes,MatrixBlock> rdd2 = in2
      .flatMapToPair(new PMapMMFunction(bpmb, i/mc1.getBlocksize()));
    rdd2 = RDDAggregateUtils.sumByKeyStable(rdd2, false);
    rdd2.persist(pmapmmStorageLevel)
        .count();
    bpmb.unpersist(false);

    if( out == null )
      out = rdd2;
    else
      out = out.union(rdd2);
  }

  //cache final result
  out = out.persist(pmapmmStorageLevel);
  out.count();

  //put output RDD handle into symbol table
  sec.setRDDHandleForVariable(output.getName(), out);
  sec.addLineageRDD(output.getName(), input1.getName());
  sec.addLineageRDD(output.getName(), input2.getName());

  //update output statistics if not inferred
  updateBinaryMMOutputDataCharacteristics(sec, true);
}
Example 11
Source File: RP_DBSCAN.java From RP-DBSCAN with Apache License 2.0 | 4 votes |
/**
 * Phase III : post-processing for RP-DBSCAN
 * Phase III-1 (Progressive Graph Merging) and Phase III-2 (Point Labeling)
 */
public void phaseIII() {
  /**
   * Phase III-1: Progressive Graph Merging
   */
  // Merge subgraphs into the global cell graph through the following parallel procedures:
  // Single Merger, Edge Type Detection and Edge Reduction.
  int curPartitionSize = Conf.numOfPartitions;
  while(curPartitionSize != 1) {
    curPartitionSize = curPartitionSize/2;
    edgeSet = edgeSet.mapPartitionsToPair(new Methods.BuildMST(conf, corePaths, curPartitionSize))
      .repartition(curPartitionSize);
  }
  List<Tuple2<Integer, Integer>> result = edgeSet.mapPartitionsToPair(new Methods.FinalPhase(conf, corePaths)).collect();

  // Count the number of clusters in the global cell graph.
  numOfClusters = result.get(0)._2;

  /**
   * Phase III-2: Point Labeling
   */
  //Assign border points to the proper clusters (partial condition of Theorem 3.5).
  JavaPairRDD<Integer, ApproximatedPoint> borderPts = dataset
    .flatMapToPair(new Methods.EmitConnectedCoreCellsFromBorderCell(conf, Conf.numOfPartitions))
    .groupByKey()
    .flatMapToPair(new Methods.AssignBorderPointToCluster(Conf.dim, Conf.epsilon, conf, Conf.pairOutputPath));

  //Assign core points to the proper clusters (full condition of Theorem 3.5).
  JavaPairRDD<Integer, ApproximatedPoint> corePts = dataset
    .mapPartitionsToPair(new Methods.AssignCorePointToCluster(conf, Conf.pairOutputPath));

  //Point labeling algorithm 1: faster than algorithm 2, but not scalable.
  //If an out-of-memory error occurs during the labeling procedure, use algorithm 2 below instead.
  //union the two results.
  JavaPairRDD<Integer, ApproximatedPoint> assignedResult = borderPts.union(corePts);

  //count the number of points in each cluster.
  numOfPtsInCluster = assignedResult.mapPartitionsToPair(new Methods.CountForEachCluster())
    .reduceByKey(new Methods.AggregateCount()).collect();

  /*
  // Point labeling algorithm 2: scalable, but slower than algorithm 1.
  List<Tuple2<Integer, Long>> borderPtsList = borderPts.mapPartitionsToPair(new Methods.CountForEachCluster())
    .reduceByKey(new Methods.AggregateCount()).collect();
  List<Tuple2<Integer, Long>> corePtsList = corePts.mapPartitionsToPair(new Methods.CountForEachCluster())
    .reduceByKey(new Methods.AggregateCount()).collect();
  HashMap<Integer, Long> numOfPtsInCluster = new HashMap<Integer, Long>();
  for(Tuple2<Integer, Long> core : corePtsList)
    numOfPtsInCluster.put(core._1, core._2);
  for(Tuple2<Integer, Long> border : borderPtsList)
    numOfPtsInCluster.put( border._1 , numOfPtsInCluster.get(border._1)+border._2);
  for(Entry<Integer, Long> entry : numOfPtsInCluster.entrySet())
    System.out.println("CLUSTER ["+(entry.getKey()+1)+"] : "+ entry.getValue());
  */
}