Java Code Examples for org.apache.spark.api.java.JavaPairRDD#repartition()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#repartition().
You can go to the original project or source file by following the links above each example.
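For orientation, here is a minimal, self-contained sketch of the call itself (the class name, sample data, and partition counts are illustrative, not taken from any of the projects below): repartition(n) triggers a full shuffle and redistributes the pair RDD across n partitions.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class RepartitionSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("RepartitionSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // A small pair RDD, initially spread over 2 partitions.
        JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(
                Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("c", 3)), 2);

        // repartition() always performs a full shuffle, whether the
        // partition count grows or shrinks.
        JavaPairRDD<String, Integer> repartitioned = pairs.repartition(4);
        System.out.println("Partitions: " + repartitioned.getNumPartitions()); // prints 4

        sc.close();
    }
}

Because repartition() always shuffles, coalesce() is the usual choice when the partition count only needs to shrink; a short note on this follows Example 3.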
Example 1
Source File: RepartitionHadoopSequenceFile.java From mmtf-spark with Apache License 2.0
/**
 * Repartitions an MMTF-Hadoop Sequence file.
 *
 * @param args args[0] path to input Hadoop Sequence file,
 *             args[1] path to output Hadoop Sequence file,
 *             args[2] number of partitions
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    SparkConf conf = new SparkConf().setMaster("local[*]")
            .setAppName(RepartitionHadoopSequenceFile.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    long start = System.nanoTime();

    if (args.length != 3) {
        System.out.println("Usage: RepartitionHadoopSequenceFile <input-path> <output-path> <number-of-partitions>");
        System.exit(1); // exit early when the required arguments are missing
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int numPartitions = Integer.parseInt(args[2]);

    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(inputPath, sc);
    pdb = pdb.repartition(numPartitions);
    MmtfWriter.writeSequenceFile(outputPath, sc, pdb);

    long end = System.nanoTime();
    System.out.println("Time: " + TimeUnit.NANOSECONDS.toSeconds(end - start) + " sec.");

    sc.close();
}
Example 2
Source File: ProcessStreamingData.java From spark-streaming-direct-kafka with Apache License 2.0
public void execute(JavaPairRDD<String, byte[]> inputMessage) {
    JavaPairRDD<String, byte[]> partitionedRDD;
    if (config.getLocalMode()) {
        partitionedRDD = inputMessage;
    } else {
        // Helps scale beyond the number of input partitions in Kafka
        partitionedRDD = inputMessage.repartition(config.getRepartitionCount());
    }
    partitionedRDD.foreachPartition(prdd -> {
        // You can choose a binary or string encoder
        Producer validProducer = ConnectionManager.getKafkaSingletonConnectionWithBinaryEncoder(config);
        prdd.forEachRemaining(records -> {
            byte[] msg = records._2();
            try {
                // TODO: Add your logic here to process data
                // By default we just publish back to another Kafka topic
                logger.info("Processing event=" + new String(msg));
                publishMessagesToKafka(validProducer, msg);
            } catch (Exception e) {
                // decode the bytes so the log message is readable
                logger.error("Error processing message: " + new String(msg), e);
            }
        });
    });
}
Example 3
Source File: RepartitionFastq.java From ViraPipe with MIT License
public static void main(String[] args) throws IOException {
    // All three arguments are used below, so require all of them
    if (args.length < 3) {
        System.err.println("Usage: RepartitionFastq <input path> <output path> <number of partitions>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("RepartitionFastq");
    //conf.set("spark.default.parallelism", String.valueOf(args[2]));
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0],
            FastqInputFormat.class, Text.class, SequencedFragment.class, sc.hadoopConfiguration());

    JavaPairRDD<Text, SequencedFragment> repartitioned = fastqRDD.repartition(Integer.valueOf(args[2]));

    repartitioned.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class,
            FastqOutputFormat.class, sc.hadoopConfiguration());

    sc.stop();
}
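A side note on the design choice here (a sketch, not part of the ViraPipe source): because repartition() always performs a full shuffle, coalesce() is often preferred when the partition count only needs to shrink, for example to limit the number of output part files (one per partition).

// Hypothetical variant of the example above: merge down to 8 partitions
// without a full shuffle; each partition becomes one output part file.
JavaPairRDD<Text, SequencedFragment> merged = fastqRDD.coalesce(8);
merged.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class,
        FastqOutputFormat.class, sc.hadoopConfiguration());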