Java Code Examples for org.apache.spark.api.java.JavaPairRDD#collect()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#collect().
The examples are taken from open-source projects; you can go to the original project or source file by following the links above each example.
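Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: build a JavaPairRDD, call collect() to bring its key-value pairs back to the driver as a java.util.List, and iterate over the resulting Tuple2 objects. The class name, application name, and sample data below are illustrative assumptions rather than code from any of the listed projects; also keep in mind that collect() materializes the entire RDD on the driver, so it is only suitable for data that fits in driver memory.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

// Hypothetical standalone example; not taken from any of the projects below.
public class JavaPairRddCollectSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JavaPairRddCollectSketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Build a small pair RDD of (word, length) tuples.
        JavaPairRDD<String, Integer> pairs = sc
                .parallelize(Arrays.asList("spark", "java", "rdd"))
                .mapToPair(w -> new Tuple2<>(w, w.length()));

        // collect() triggers evaluation and returns all pairs to the driver as a List.
        List<Tuple2<String, Integer>> result = pairs.collect();
        for (Tuple2<String, Integer> t : result) {
            System.out.println(t._1() + " -> " + t._2());
        }

        sc.stop();
    }
}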
Example 1
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 6 votes |
public static PartitionedBlock<MatrixBlock> toPartitionedMatrixBlock(JavaPairRDD<MatrixIndexes,MatrixBlock> rdd, int rlen, int clen, int blen, long nnz) {
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    PartitionedBlock<MatrixBlock> out = new PartitionedBlock<>(rlen, clen, blen);
    List<Tuple2<MatrixIndexes,MatrixBlock>> list = rdd.collect();

    //copy blocks one-at-a-time into output matrix block
    for( Tuple2<MatrixIndexes,MatrixBlock> keyval : list ) {
        //unpack index-block pair
        MatrixIndexes ix = keyval._1();
        MatrixBlock block = keyval._2();
        out.setBlock((int)ix.getRowIndex(), (int)ix.getColumnIndex(), block);
    }

    if (DMLScript.STATISTICS) {
        Statistics.accSparkCollectTime(System.nanoTime() - t0);
        Statistics.incSparkCollectCount(1);
    }

    return out;
}
Example 2
Source File: WordCount.java From tutorials with MIT License | 6 votes |
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage: JavaWordCount <file>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("JavaWordCount")
        .setMaster("local");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    // Split each input line into words.
    JavaRDD<String> lines = ctx.textFile(args[0], 1);
    JavaRDD<String> words = lines.flatMap(s -> Arrays.asList(SPACE.split(s)).iterator());

    // Map each word to a (word, 1) pair and sum the counts per word.
    JavaPairRDD<String, Integer> wordAsTuple = words.mapToPair(word -> new Tuple2<>(word, 1));
    JavaPairRDD<String, Integer> wordWithCount = wordAsTuple.reduceByKey((Integer i1, Integer i2) -> i1 + i2);

    // Collect the counts back to the driver and print them.
    List<Tuple2<String, Integer>> output = wordWithCount.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + ": " + tuple._2());
    }
    ctx.stop();
}
Example 3
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 5 votes |
public static FrameBlock toFrameBlock(JavaPairRDD<Long,FrameBlock> rdd, ValueType[] schema, int rlen, int clen) {
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    if(schema == null)
        schema = UtilFunctions.nCopies(clen, ValueType.STRING);

    //create output frame block (w/ lazy allocation)
    FrameBlock out = new FrameBlock(schema);
    out.ensureAllocatedColumns(rlen);

    List<Tuple2<Long,FrameBlock>> list = rdd.collect();

    //copy blocks one-at-a-time into output frame block
    for( Tuple2<Long,FrameBlock> keyval : list ) {
        //unpack index-block pair
        int ix = (int)(keyval._1() - 1);
        FrameBlock block = keyval._2();

        //copy into output frame
        out.copy( ix, ix+block.getNumRows()-1, 0, block.getNumColumns()-1, block );
        if( ix == 0 ) {
            out.setColumnNames(block.getColumnNames());
            out.setColumnMetadata(block.getColumnMetadata());
        }
    }

    if (DMLScript.STATISTICS) {
        Statistics.accSparkCollectTime(System.nanoTime() - t0);
        Statistics.incSparkCollectCount(1);
    }

    return out;
}
Example 4
Source File: JavaLogQuery.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("JavaLogQuery")
        .getOrCreate();

    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

    JavaRDD<String> dataSet = (args.length == 1) ? jsc.textFile(args[0]) : jsc.parallelize(exampleApacheLogs);

    JavaPairRDD<Tuple3<String, String, String>, Stats> extracted =
        dataSet.mapToPair(new PairFunction<String, Tuple3<String, String, String>, Stats>() {
            @Override
            public Tuple2<Tuple3<String, String, String>, Stats> call(String s) {
                return new Tuple2<>(extractKey(s), extractStats(s));
            }
        });

    JavaPairRDD<Tuple3<String, String, String>, Stats> counts =
        extracted.reduceByKey(new Function2<Stats, Stats, Stats>() {
            @Override
            public Stats call(Stats stats, Stats stats2) {
                return stats.merge(stats2);
            }
        });

    List<Tuple2<Tuple3<String, String, String>, Stats>> output = counts.collect();
    for (Tuple2<?,?> t : output) {
        System.out.println(t._1() + "\t" + t._2());
    }
    spark.stop();
}
Example 5
Source File: TextPipelineTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * This test checks the generations retrieved using stop words.
 *
 * @throws Exception
 */
@Test
@Ignore //AB 2020/04/19 https://github.com/eclipse/deeplearning4j/issues/8849
public void testZipFunction1() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    // word2vec.setRemoveStop(false);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vec.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();

    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
    JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();

    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();

    JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD =
        vocabWordListRDD.zip(sentenceCountCumSumRDD);
    List<Tuple2<List<VocabWord>, Long>> lst = vocabWordListSentenceCumSumRDD.collect();

    List<VocabWord> vocabWordsList1 = lst.get(0)._1();
    Long cumSumSize1 = lst.get(0)._2();
    assertEquals(3, vocabWordsList1.size());
    assertEquals(vocabWordsList1.get(0).getWord(), "strange");
    assertEquals(vocabWordsList1.get(1).getWord(), "strange");
    assertEquals(vocabWordsList1.get(2).getWord(), "world");
    assertEquals(cumSumSize1, 6L, 0);

    List<VocabWord> vocabWordsList2 = lst.get(1)._1();
    Long cumSumSize2 = lst.get(1)._2();
    assertEquals(2, vocabWordsList2.size());
    assertEquals(vocabWordsList2.get(0).getWord(), "flowers");
    assertEquals(vocabWordsList2.get(1).getWord(), "red");
    assertEquals(cumSumSize2, 9L, 0);

    sc.stop();
}
Example 6
Source File: SMInputFormatIT.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
@Test
public void testSparkIntegrationWithInputFormat() throws IOException {
    config.set(MRConstants.SPLICE_TABLE_NAME, tableWatcherA.toString());
    Job job = Job.getInstance(config, "Test Scan");
    JavaPairRDD<RowLocation, ExecRow> table = sparkWatcher.jsc.newAPIHadoopRDD(
        job.getConfiguration(), SMInputFormat.class, RowLocation.class, ExecRow.class);
    List<Tuple2<RowLocation, ExecRow>> data = table.collect();
    int i = 0;
    for (Tuple2<RowLocation, ExecRow> tuple: data) {
        i++;
        Assert.assertNotNull(tuple._1());
        Assert.assertNotNull(tuple._2());
    }
    Assert.assertEquals("Incorrect Results Returned", 2, i);
}
Example 7
Source File: MockBatchUpdate.java From oryx with Apache License 2.0 | 5 votes |
private static Collection<Tuple2<String,String>> collect(JavaPairRDD<String,String> rdd) {
    if (rdd == null) {
        return Collections.emptyList();
    } else {
        return rdd.collect();
    }
}
Example 8
Source File: SparkPairDataSet.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
private <W> Multimap<K, W> generateMultimap(JavaPairRDD<K, W> rightPairDataSet){
    Multimap<K, W> returnValue = ArrayListMultimap.create();
    List<Tuple2<K, W>> value = rightPairDataSet.collect();
    for(Tuple2<K, W> tuple : value){
        returnValue.put(tuple._1, tuple._2);
    }
    return returnValue;
}
Example 9
Source File: SMInputFormatIT.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
@Test
public void testCountOverMultipleRegionsInSpark() throws IOException {
    config.set(MRConstants.SPLICE_TABLE_NAME, tableWatcherB.toString());
    Job job = Job.getInstance(config, "Test Scan");
    JavaPairRDD<RowLocation, ExecRow> table = sparkWatcher.jsc.newAPIHadoopRDD(
        job.getConfiguration(), SMInputFormat.class, RowLocation.class, ExecRow.class);
    List<Tuple2<RowLocation, ExecRow>> data = table.collect();
    int i = 0;
    for (Tuple2<RowLocation, ExecRow> tuple: data) {
        i++;
        Assert.assertNotNull(tuple._1());
        Assert.assertNotNull(tuple._2());
    }
    Assert.assertEquals("Incorrect Results Returned", 10000, i);
}
Example 10
Source File: PageRankSpark.java From graphify with Apache License 2.0 | 4 votes |
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");
        System.exit(1);
    }

    SparkConf sparkConf = new SparkConf().setAppName("Graphify");
    JavaSparkContext ctx = new JavaSparkContext(sparkConf);
    JavaRDD<String> lines = ctx.textFile(args[0], 1);

    // Loads all URLs from the input file and groups them with their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(s -> {
        String[] parts = SPACES.split(s);
        return new Tuple2<>(parts[0], parts[1]);
    }).distinct().groupByKey().cache();

    // Initializes the rank of every URL to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(rs -> 1.0);

    // Calculates and updates URL ranks iteratively using the PageRank algorithm.
    for (int current = 0; current < Integer.parseInt(args[1]); current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values()
            .flatMapToPair(s -> {
                int urlCount = Iterables.size(s._1());
                List<Tuple2<String, Double>> results = new ArrayList<>();
                for (String n : s._1()) {
                    results.add(new Tuple2<>(n, s._2() / urlCount));
                }
                return results;
            });

        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(sum -> 0.15 + sum * 0.85);
    }

    // Collects all URL ranks and prints them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?,?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }

    ctx.stop();
}
Example 11
Source File: HashingBalancedPartitionerTest.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test
public void hashPartitionerBalancesAtScale() {
    LinearCongruentialGenerator r = new LinearCongruentialGenerator(10000);
    List<String> elements = new ArrayList<String>();
    for (int i = 0; i < 10000; i++) {
        // The red occur towards the end
        if (r.nextDouble() < ((double) i / 10000D))
            elements.add("red");
        // The blue occur towards the front
        if (r.nextDouble() < (1 - (double) i / 10000D))
            elements.add("blue");
    }
    Integer countRed = 0;
    Integer countBlue = 0;
    for (String elem : elements) {
        if (elem.equals("red"))
            countRed++;
        else
            countBlue++;
    }
    JavaRDD<String> rdd = sc.parallelize(elements);
    JavaPairRDD<Tuple2<Long, Integer>, String> indexedRDD = rdd.zipWithUniqueId()
        .mapToPair(new PairFunction<Tuple2<String, Long>, Tuple2<Long, Integer>, String>() {
            @Override
            public Tuple2<Tuple2<Long, Integer>, String> call(Tuple2<String, Long> stringLongTuple2)
                    throws Exception {
                Integer elemClass = stringLongTuple2._1().equals("red") ? 0 : 1;
                return new Tuple2<Tuple2<Long, Integer>, String>(
                        new Tuple2<Long, Integer>(stringLongTuple2._2(), elemClass),
                        stringLongTuple2._1());
            }
        });
    Integer numPartitions = indexedRDD.getNumPartitions();

    // rdd and indexedRDD have the same partition distribution
    List<Tuple2<Integer, Integer>> partitionTuples =
        rdd.mapPartitionsWithIndex(new CountRedBluePartitionsFunction(), true).collect();
    List<Double> redWeights = new ArrayList<Double>();
    List<Double> blueWeights = new ArrayList<Double>();
    Float avgRed = (float) countRed / numPartitions;
    Float avgBlue = (float) countBlue / numPartitions;
    for (int i = 0; i < partitionTuples.size(); i++) {
        Tuple2<Integer, Integer> counts = partitionTuples.get(i);
        redWeights.add((double) counts._1() / avgRed);
        blueWeights.add((double) counts._2() / avgBlue);
    }

    List<List<Double>> partitionWeights = Arrays.asList(redWeights, blueWeights);
    HashingBalancedPartitioner hbp = new HashingBalancedPartitioner(partitionWeights);
    List<Tuple2<Tuple2<Long, Integer>, String>> testList = indexedRDD.collect();

    int[][] colorCountsByPartition = new int[numPartitions][2];
    for (final Tuple2<Tuple2<Long, Integer>, String> val : testList) {
        Integer partition = hbp.getPartition(val._1());
        if (val._2().equals("red"))
            colorCountsByPartition[partition][0] += 1;
        else
            colorCountsByPartition[partition][1] += 1;
    }
    // for (int i = 0; i < numPartitions; i++) {
    //     System.out.println(Arrays.toString(colorCountsByPartition[i]));
    // }
    //
    // System.out.println("Ideal red # per partition: " + avgRed);
    // System.out.println("Ideal blue # per partition: " + avgBlue);

    for (int i = 0; i < numPartitions; i++) {
        // avg red per partition : 2.33
        assertTrue(colorCountsByPartition[i][0] >= Math.round(avgRed * .99)
                && colorCountsByPartition[i][0] < Math.round(avgRed * 1.01) + 1);
        // avg blue per partition : 3.33
        assertTrue(colorCountsByPartition[i][1] >= Math.round(avgBlue * .99)
                && colorCountsByPartition[i][1] < Math.round(avgBlue * 1.01) + 1);
    }
}
Example 12
Source File: HashingBalancedPartitionerTest.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test
public void hashingBalancedPartitionerDoesBalance() {
    // partitionWeightsByClass = [[1.714, .429, .857], [0.9, 0.6, 1.5]]
    List<Double> reds = Arrays.asList(1.714D, 0.429D, .857D);
    List<Double> blues = Arrays.asList(0.9D, 0.6D, 1.5D);
    List<List<Double>> partitionWeights = Arrays.asList(reds, blues);

    HashingBalancedPartitioner hbp = new HashingBalancedPartitioner(partitionWeights);
    List<Tuple2<Integer, String>> l = new ArrayList<>();

    for (int i = 0; i < 4; i++) {
        l.add(new Tuple2<Integer, String>(0, "red"));
    }
    for (int i = 0; i < 3; i++) {
        l.add(new Tuple2<Integer, String>(0, "blue"));
    }
    for (int i = 0; i < 1; i++) {
        l.add(new Tuple2<Integer, String>(1, "red"));
    }
    for (int i = 0; i < 2; i++) {
        l.add(new Tuple2<Integer, String>(1, "blue"));
    }
    for (int i = 0; i < 2; i++) {
        l.add(new Tuple2<Integer, String>(2, "red"));
    }
    for (int i = 0; i < 5; i++) {
        l.add(new Tuple2<Integer, String>(2, "blue"));
    }

    // This should give exactly the sought distribution
    JavaPairRDD<Integer, String> rdd =
        JavaPairRDD.fromJavaRDD(sc.parallelize(l)).partitionBy(new HashPartitioner(3));

    // Let's reproduce UIDs
    JavaPairRDD<Tuple2<Long, Integer>, String> indexedRDD = rdd.zipWithUniqueId().mapToPair(
        new PairFunction<Tuple2<Tuple2<Integer, String>, Long>, Tuple2<Long, Integer>, String>() {
            @Override
            public Tuple2<Tuple2<Long, Integer>, String> call(
                    Tuple2<Tuple2<Integer, String>, Long> payLoadNuid) {
                Long uid = payLoadNuid._2();
                String value = payLoadNuid._1()._2();
                Integer elemClass = value.equals("red") ? 0 : 1;
                return new Tuple2<Tuple2<Long, Integer>, String>(
                        new Tuple2<Long, Integer>(uid, elemClass), value);
            }
        });

    List<Tuple2<Tuple2<Long, Integer>, String>> testList = indexedRDD.collect();

    int[][] colorCountsByPartition = new int[3][2];
    for (final Tuple2<Tuple2<Long, Integer>, String> val : testList) {
        // System.out.println(val);
        Integer partition = hbp.getPartition(val._1());
        // System.out.println(partition);
        if (val._2().equals("red"))
            colorCountsByPartition[partition][0] += 1;
        else
            colorCountsByPartition[partition][1] += 1;
    }
    // for (int i = 0; i < 3; i++) {
    //     System.out.println(Arrays.toString(colorCountsByPartition[i]));
    // }

    for (int i = 0; i < 3; i++) {
        // avg red per partition : 2.33
        assertTrue(colorCountsByPartition[i][0] >= 1 && colorCountsByPartition[i][0] < 4);
        // avg blue per partition : 3.33
        assertTrue(colorCountsByPartition[i][1] >= 2 && colorCountsByPartition[i][1] < 5);
    }
}
Example 13
Source File: TextPipelineTest.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test
@Ignore //AB 2020/04/19 https://github.com/eclipse/deeplearning4j/issues/8849
public void testZipFunction2() throws Exception {
    JavaSparkContext sc = getContext();
    JavaRDD<String> corpusRDD = getCorpusRDD(sc);
    // word2vec.setRemoveStop(false);
    Broadcast<Map<String, Object>> broadcastTokenizerVarMap = sc.broadcast(word2vecNoStop.getTokenizerVarMap());

    TextPipeline pipeline = new TextPipeline(corpusRDD, broadcastTokenizerVarMap);
    pipeline.buildVocabCache();
    pipeline.buildVocabWordListRDD();

    JavaRDD<AtomicLong> sentenceCountRDD = pipeline.getSentenceCountRDD();
    JavaRDD<List<VocabWord>> vocabWordListRDD = pipeline.getVocabWordListRDD();

    CountCumSum countCumSum = new CountCumSum(sentenceCountRDD);
    JavaRDD<Long> sentenceCountCumSumRDD = countCumSum.buildCumSum();

    JavaPairRDD<List<VocabWord>, Long> vocabWordListSentenceCumSumRDD =
        vocabWordListRDD.zip(sentenceCountCumSumRDD);
    List<Tuple2<List<VocabWord>, Long>> lst = vocabWordListSentenceCumSumRDD.collect();

    List<VocabWord> vocabWordsList1 = lst.get(0)._1();
    Long cumSumSize1 = lst.get(0)._2();
    assertEquals(6, vocabWordsList1.size());
    assertEquals(vocabWordsList1.get(0).getWord(), "this");
    assertEquals(vocabWordsList1.get(1).getWord(), "is");
    assertEquals(vocabWordsList1.get(2).getWord(), "a");
    assertEquals(vocabWordsList1.get(3).getWord(), "strange");
    assertEquals(vocabWordsList1.get(4).getWord(), "strange");
    assertEquals(vocabWordsList1.get(5).getWord(), "world");
    assertEquals(cumSumSize1, 6L, 0);

    List<VocabWord> vocabWordsList2 = lst.get(1)._1();
    Long cumSumSize2 = lst.get(1)._2();
    assertEquals(vocabWordsList2.size(), 3);
    assertEquals(vocabWordsList2.get(0).getWord(), "flowers");
    assertEquals(vocabWordsList2.get(1).getWord(), "are");
    assertEquals(vocabWordsList2.get(2).getWord(), "red");
    assertEquals(cumSumSize2, 9L, 0);

    sc.stop();
}
Example 14
Source File: GeoWaveSparkKMeansIT.java From geowave with Apache License 2.0 | 4 votes |
@Test
public void testKMeansRunner() throws Exception {

    // Load data
    TestUtils.testLocalIngest(inputDataStore, DimensionalityType.SPATIAL, HAIL_SHAPEFILE_FILE, 1);

    // Create the runner
    long mark = System.currentTimeMillis();
    final KMeansRunner runner = new KMeansRunner();
    runner.setSparkSession(SparkTestEnvironment.getInstance().defaultSession);
    runner.setInputDataStore(inputDataStore);
    runner.setTypeName("hail");
    runner.setCqlFilter(CQL_FILTER);
    runner.setUseTime(true);
    // Set output params to write centroids + hulls to store.
    runner.setOutputDataStore(inputDataStore);
    runner.setCentroidTypeName("kmeans-centroids-test");
    runner.setGenerateHulls(true);
    runner.setComputeHullData(true);
    runner.setHullTypeName("kmeans-hulls-test");

    // Run kmeans
    try {
        runner.run();
    } catch (final IOException e) {
        throw new RuntimeException("Failed to execute: " + e.getMessage());
    }

    // Create the output
    final KMeansModel clusterModel = runner.getOutputModel();

    long dur = (System.currentTimeMillis() - mark);
    LOGGER.warn("KMeans duration: " + dur + " ms.");

    // Write out the centroid features
    final short centroidInternalAdapterId =
        inputDataStore.createInternalAdapterStore().getAdapterId("kmeans-centroids-test");
    final DataTypeAdapter centroidAdapter =
        inputDataStore.createAdapterStore().getAdapter(centroidInternalAdapterId);

    // Query back from the new adapter
    mark = System.currentTimeMillis();
    queryFeatures(centroidAdapter, clusterModel.clusterCenters().length);
    dur = (System.currentTimeMillis() - mark);
    LOGGER.warn("Centroid verify: " + dur + " ms.");

    // Generate the hulls
    final JavaPairRDD<Integer, Iterable<Vector>> groupByRDD =
        KMeansHullGenerator.groupByIndex(runner.getInputCentroids(), clusterModel);
    final JavaPairRDD<Integer, Geometry> hullsRDD =
        KMeansHullGenerator.generateHullsRDD(groupByRDD);

    Assert.assertTrue(
        "centroids from the model should match the hull count",
        clusterModel.clusterCenters().length == hullsRDD.count());

    System.out.println("KMeans cluster hulls:");
    for (final Tuple2<Integer, Geometry> hull : hullsRDD.collect()) {
        System.out.println("> Hull size (verts): " + hull._2.getNumPoints());
        System.out.println("> Hull centroid: " + hull._2.getCentroid().toString());
    }

    final short hullInternalAdapterId =
        inputDataStore.createInternalAdapterStore().getAdapterId("kmeans-hulls-test");
    // Write out the hull features w/ metadata
    final DataTypeAdapter hullAdapter =
        inputDataStore.createAdapterStore().getAdapter(hullInternalAdapterId);

    mark = System.currentTimeMillis();
    // Query back from the new adapter
    queryFeatures(hullAdapter, clusterModel.clusterCenters().length);
    dur = (System.currentTimeMillis() - mark);
    LOGGER.warn("Hull verify: " + dur + " ms.");

    TestUtils.deleteAll(inputDataStore);
}
Example 15
Source File: RP_DBSCAN.java From RP-DBSCAN with Apache License 2.0 | 4 votes |
/**
 * Phase I : pre-processing for RP-DBSCAN.
 * Phase I-1 (Pseudo Random Partitioning) and Phase I-2 (Cell_Dictionary_Building & Broadcasting)
 */
public void phaseI()
{
    /**
     * Phase I-1. Pseudo Random Partitioning
     */

    //Read input data set from HDFS
    JavaRDD<String> lines = sc.textFile(Conf.inputPath, Conf.numOfPartitions);
    JavaPairRDD<List<Integer>, ApproximatedCell> dataMap = null;

    //Data partitioning
    if(Conf.boost)
    {
        dataMap = lines.mapToPair(new Methods.PointToCell(Conf.dim, Conf.epsilon))
            .combineByKey(new Methods.CreateLocalApproximatedPoint(Conf.dim, Conf.epsilon, Conf.rho),
                new Methods.LocalApproximation(Conf.dim, Conf.epsilon, Conf.rho),
                new Methods.GlobalApproximation(Conf.dim))
            .mapToPair(new Methods.PseudoRandomPartition2(Conf.metaBlockWindow))
            .persist(StorageLevel.MEMORY_AND_DISK_SER());
    } else
        dataMap = lines.mapToPair(new Methods.PointToCell(Conf.dim, Conf.epsilon))
            .groupByKey()
            .mapToPair(new Methods.PseudoRandomPartition(Conf.dim, Conf.epsilon, Conf.rho, Conf.metaBlockWindow, Conf.pairOutputPath))
            .persist(StorageLevel.MEMORY_AND_DISK_SER());

    numOfCells = dataMap.count();

    /**
     * Phase I-2. Cell_Dictionary_Building & Broadcasting
     */

    //Dictionary Defragmentation
    JavaPairRDD<List<Integer>, Long> ptsCountforEachMetaBlock =
        dataMap.mapToPair(new Methods.MetaBlockMergeWithApproximation())
            .reduceByKey(new Methods.AggregateCount());
    List<Tuple2<List<Integer>, Long>> numOfPtsInCell = ptsCountforEachMetaBlock.collect();
    //System.out.println("# of Blocks for virtually combining : " + numOfPtsInCell.size());

    HashMap<List<Integer>, List<Integer>> partitionIndex = new HashMap<List<Integer>, List<Integer>>();
    Tuple2<Long, List<Partition>> metaInfoForVirtualCombining =
        Methods.scalablePartition(numOfPtsInCell, Conf.dim, Conf.numOflvhCellsInMetaPartition/Conf.dim, partitionIndex);
    numOfSubCells = metaInfoForVirtualCombining._1;
    List<Partition> wholePartitions = metaInfoForVirtualCombining._2;
    numOfSubDictionaries = wholePartitions.size();

    //Build Two-Level Cell Dictionary composed of multiple sub-dictionaries
    JavaPairRDD<Integer, Iterable<ApproximatedCell>> evenlySplitPartitions =
        dataMap.flatMapToPair(new Methods.AssignApproximatedPointToPartition(partitionIndex))
            .groupByKey(wholePartitions.size());
    JavaPairRDD<Null, Null> metaDataSet =
        evenlySplitPartitions.mapToPair(new Methods.MetaGenerationWithApproximation(Conf.dim, Conf.epsilon, Conf.rho, Conf.minPts, conf, wholePartitions));
    metaDataSet.collect();

    //Re-partition the pseudo random partitions into Each Worker by a randomly assigned integer value for reducing the size of memory usage.
    dataset = dataMap.mapToPair(new Methods.Repartition(Conf.numOfPartitions))
        .repartition(Conf.numOfPartitions)
        .persist(StorageLevel.MEMORY_AND_DISK_SER());

    //Broadcast two-level cell dictionary to every workers.
    try {
        metaPaths = FileIO.broadCastData(sc, conf, Conf.metaFoler);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Example 16
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 4 votes |
public static TensorBlock toTensorBlock(JavaPairRDD<TensorIndexes, TensorBlock> rdd, DataCharacteristics dc) {
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    // TODO special case single block
    int[] idims = dc.getIntDims();
    // TODO asynchronous allocation
    List<Tuple2<TensorIndexes, TensorBlock>> list = rdd.collect();
    ValueType vt = (list.get(0)._2).getValueType();
    TensorBlock out = new TensorBlock(vt, idims).allocateBlock();

    //copy blocks one-at-a-time into output matrix block
    for( Tuple2<TensorIndexes, TensorBlock> keyval : list ) {
        //unpack index-block pair
        TensorIndexes ix = keyval._1();
        TensorBlock block = keyval._2();

        //compute row/column block offsets
        int[] lower = new int[ix.getNumDims()];
        int[] upper = new int[ix.getNumDims()];
        for (int i = 0; i < lower.length; i++) {
            lower[i] = (int) ((ix.getIndex(i) - 1) * dc.getBlocksize());
            upper[i] = lower[i] + block.getDim(i) - 1;
        }
        upper[upper.length - 1]++;
        for (int i = upper.length - 1; i > 0; i--) {
            if (upper[i] == block.getDim(i)) {
                upper[i] = 0;
                upper[i - 1]++;
            }
        }
        // TODO sparse copy
        out.copy(lower, upper, block);
        // TODO keep track of nnz
    }
    // TODO post-processing output tensor (nnz, sparsity)

    if (DMLScript.STATISTICS) {
        Statistics.accSparkCollectTime(System.nanoTime() - t0);
        Statistics.incSparkCollectCount(1);
    }
    return out;
}
Example 17
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 4 votes |
/**
 * Utility method for creating a single matrix block out of a binary cell RDD.
 * Note that this collect call might trigger execution of any pending transformations.
 *
 * @param rdd JavaPairRDD for matrix block
 * @param rlen number of rows
 * @param clen number of columns
 * @param nnz number of non-zeros
 * @return matrix block
 */
public static MatrixBlock toMatrixBlock(JavaPairRDD<MatrixIndexes, MatrixCell> rdd, int rlen, int clen, long nnz) {
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    MatrixBlock out = null;

    //determine target sparse/dense representation
    long lnnz = (nnz >= 0) ? nnz : (long)rlen * clen;
    boolean sparse = MatrixBlock.evalSparseFormatInMemory(rlen, clen, lnnz);

    //create output matrix block (w/ lazy allocation)
    out = new MatrixBlock(rlen, clen, sparse);

    List<Tuple2<MatrixIndexes,MatrixCell>> list = rdd.collect();

    //copy blocks one-at-a-time into output matrix block
    for( Tuple2<MatrixIndexes,MatrixCell> keyval : list ) {
        //unpack index-block pair
        MatrixIndexes ix = keyval._1();
        MatrixCell cell = keyval._2();

        //append cell to dense/sparse target in order to avoid shifting for sparse
        //note: this append requires a final sort of sparse rows
        out.appendValue((int)ix.getRowIndex()-1, (int)ix.getColumnIndex()-1, cell.getValue());
    }

    //post-processing output matrix
    if( sparse )
        out.sortSparseRows();
    out.recomputeNonZeros();
    out.examSparsity();

    if (DMLScript.STATISTICS) {
        Statistics.accSparkCollectTime(System.nanoTime() - t0);
        Statistics.incSparkCollectCount(1);
    }

    return out;
}