Java Code Examples for org.apache.spark.api.java.JavaRDD#collectPartitions()
The following examples show how to use
org.apache.spark.api.java.JavaRDD#collectPartitions() .
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: TestRepartitioning.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test public void testRepartitioning() { List<String> list = new ArrayList<>(); for (int i = 0; i < 1000; i++) { list.add(String.valueOf(i)); } JavaRDD<String> rdd = sc.parallelize(list); rdd = rdd.repartition(200); JavaRDD<String> rdd2 = SparkUtils.repartitionBalanceIfRequired(rdd, Repartition.Always, 100, 10); assertFalse(rdd == rdd2); //Should be different objects due to repartitioning assertEquals(10, rdd2.partitions().size()); for (int i = 0; i < 10; i++) { List<String> partition = rdd2.collectPartitions(new int[] {i})[0]; // System.out.println("Partition " + i + " size: " + partition.size()); assertEquals(100, partition.size()); //Should be exactly 100, for the util method (but NOT spark .repartition) } }
Example 2
Source File: TestRepartitioning.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test public void testRepartitioningApprox() { List<String> list = new ArrayList<>(); for (int i = 0; i < 1000; i++) { list.add(String.valueOf(i)); } JavaRDD<String> rdd = sc.parallelize(list); rdd = rdd.repartition(200); JavaRDD<String> rdd2 = SparkUtils.repartitionApproximateBalance(rdd, Repartition.Always, 10); assertFalse(rdd == rdd2); //Should be different objects due to repartitioning assertEquals(10, rdd2.partitions().size()); for (int i = 0; i < 10; i++) { List<String> partition = rdd2.collectPartitions(new int[] {i})[0]; // System.out.println("Partition " + i + " size: " + partition.size()); assertTrue(partition.size() >= 90 && partition.size() <= 110); } }
Example 3
Source File: SparkUtilsUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
@Test public void testSortQuerynameFixesPartitionBoundaries(){ JavaSparkContext ctx = SparkContextFactory.getTestSparkContext(); final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader(); header.setSortOrder(SAMFileHeader.SortOrder.queryname); final int numReadsWithSameName = 4; final List<GATKRead> pairedReads = createPairedReads(header, 100, numReadsWithSameName); final int numPartitions = 7; final JavaRDD<GATKRead> reads = ctx.parallelize(pairedReads, numPartitions); //assert that the grouping is not correct before sorting final List<GATKRead>[] partitions = reads.collectPartitions(IntStream.range(0, reads.getNumPartitions()).toArray()); Assert.assertTrue( Arrays.stream(partitions) //look through each partition and count the number of each read name seen .flatMap( readsInPartition -> readsInPartition.stream() .collect(Collectors.groupingBy(GATKRead::getName)) .values() .stream() .map(List::size) ) //check that at least one partition was not correctly distributed .anyMatch(size -> size != numReadsWithSameName), "The partitioning was correct before sorting so the test is meaningless."); final JavaRDD<GATKRead> sorted = SparkUtils.sortReadsAccordingToHeader(reads, header, numPartitions); //assert that the grouping is fixed after sorting final List<GATKRead>[] sortedPartitions = sorted.collectPartitions(IntStream.range(0, sorted.getNumPartitions()).toArray()); Assert.assertTrue(Arrays.stream(sortedPartitions) .flatMap( readsInPartition -> readsInPartition.stream() .collect(Collectors.groupingBy(GATKRead::getName)) .values() .stream() .map(List::size) ) .allMatch(size -> size == numReadsWithSameName), "Some reads names were split between multiple partitions"); }