Java Code Examples for org.apache.spark.api.java.JavaRDD#collectPartitions()
The following examples show how to use
org.apache.spark.api.java.JavaRDD#collectPartitions() .
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: TestRepartitioning.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test public void testRepartitioning() { List<String> list = new ArrayList<>(); for (int i = 0; i < 1000; i++) { list.add(String.valueOf(i)); } JavaRDD<String> rdd = sc.parallelize(list); rdd = rdd.repartition(200); JavaRDD<String> rdd2 = SparkUtils.repartitionBalanceIfRequired(rdd, Repartition.Always, 100, 10); assertFalse(rdd == rdd2); //Should be different objects due to repartitioning assertEquals(10, rdd2.partitions().size()); for (int i = 0; i < 10; i++) { List<String> partition = rdd2.collectPartitions(new int[] {i})[0]; // System.out.println("Partition " + i + " size: " + partition.size()); assertEquals(100, partition.size()); //Should be exactly 100, for the util method (but NOT spark .repartition) } }
Example 2
Source File: TestRepartitioning.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test public void testRepartitioningApprox() { List<String> list = new ArrayList<>(); for (int i = 0; i < 1000; i++) { list.add(String.valueOf(i)); } JavaRDD<String> rdd = sc.parallelize(list); rdd = rdd.repartition(200); JavaRDD<String> rdd2 = SparkUtils.repartitionApproximateBalance(rdd, Repartition.Always, 10); assertFalse(rdd == rdd2); //Should be different objects due to repartitioning assertEquals(10, rdd2.partitions().size()); for (int i = 0; i < 10; i++) { List<String> partition = rdd2.collectPartitions(new int[] {i})[0]; // System.out.println("Partition " + i + " size: " + partition.size()); assertTrue(partition.size() >= 90 && partition.size() <= 110); } }
Example 3
Source File: SparkUtilsUnitTest.java From gatk with BSD 3-Clause "New" or "Revised" License | 5 votes |
@Test public void testSortQuerynameFixesPartitionBoundaries(){ JavaSparkContext ctx = SparkContextFactory.getTestSparkContext(); final SAMFileHeader header = ArtificialReadUtils.createArtificialSamHeader(); header.setSortOrder(SAMFileHeader.SortOrder.queryname); final int numReadsWithSameName = 4; final List<GATKRead> pairedReads = createPairedReads(header, 100, numReadsWithSameName); final int numPartitions = 7; final JavaRDD<GATKRead> reads = ctx.parallelize(pairedReads, numPartitions); //assert that the grouping is not correct before sorting final List<GATKRead>[] partitions = reads.collectPartitions(IntStream.range(0, reads.getNumPartitions()).toArray()); Assert.assertTrue( Arrays.stream(partitions) //look through each partition and count the number of each read name seen .flatMap( readsInPartition -> readsInPartition.stream() .collect(Collectors.groupingBy(GATKRead::getName)) .values() .stream() .map(List::size) ) //check that at least one partition was not correctly distributed .anyMatch(size -> size != numReadsWithSameName), "The partitioning was correct before sorting so the test is meaningless."); final JavaRDD<GATKRead> sorted = SparkUtils.sortReadsAccordingToHeader(reads, header, numPartitions); //assert that the grouping is fixed after sorting final List<GATKRead>[] sortedPartitions = sorted.collectPartitions(IntStream.range(0, sorted.getNumPartitions()).toArray()); Assert.assertTrue(Arrays.stream(sortedPartitions) .flatMap( readsInPartition -> readsInPartition.stream() .collect(Collectors.groupingBy(GATKRead::getName)) .values() .stream() .map(List::size) ) .allMatch(size -> size == numReadsWithSameName), "Some reads names were split between multiple partitions"); }