Java Code Examples for org.apache.spark.api.java.JavaPairRDD#coalesce()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#coalesce().
You can go to the original project or source file by following the links above each example.
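Before turning to the project examples, here is a minimal, self-contained sketch of the basic coalesce() call on a JavaPairRDD. The class name CoalesceSketch and the toy key/value data are illustrative assumptions, not taken from any of the projects below; the pattern of coalescing just before writing output is the one the examples use.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CoalesceSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("CoalesceSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // toy pair RDD spread over 8 partitions (illustrative data)
        JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(
                Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("c", 3)), 8);

        // coalesce() narrows the RDD to fewer partitions without a full shuffle,
        // which avoids writing many small output files
        JavaPairRDD<String, Integer> fewer = pairs.coalesce(2);

        System.out.println("partitions before: " + pairs.getNumPartitions());
        System.out.println("partitions after:  " + fewer.getNumPartitions());

        sc.close();
    }
}

Unlike repartition(), coalesce() with its default shuffle=false can only decrease the number of partitions, which keeps the operation cheap; the examples below rely on this to merge output into a handful of files before saving.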
Example 1
Source File: CreateRepresentativeSet.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(CreateRepresentativeSet.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // filter by representative protein chains at 40% sequence identity
    // and 2.5 A resolution using the Pisces filter. Any pair of protein
    // chains in the representative set will have <= 40% sequence identity.
    int sequenceIdentity = 40;
    double resolution = 2.5;

    // read PDB, split entries into polymer chains, and filter by Pisces filter
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readReducedSequenceFile(sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    System.out.println("# representative chains: " + pdb.count());

    // coalesce partitions to avoid saving many small files
    pdb = pdb.coalesce(12);

    // save representative set
    String path = MmtfReader.getMmtfReducedPath();
    MmtfWriter.writeSequenceFile(path + "_representatives_i40_r2.5", sc, pdb);

    sc.close();
}
Example 2
Source File: WriteMmtfCustom.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * @param args
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WriteMmtfCustom.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read a 20% random sample of the PDB
    double fraction = 0.2;
    long seed = 123;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, fraction, seed, sc);

    // retain high resolution X-ray structures
    pdb = pdb
            .filter(new ExperimentalMethods(ExperimentalMethods.X_RAY_DIFFRACTION))
            .filter(new Resolution(0, 2.0))
            .filter(new Rfree(0, 0.2));

    // coalesce this into 8 partitions to avoid creating many small files
    pdb = pdb.coalesce(8);

    // save this subset in a Hadoop Sequence file
    MmtfWriter.writeSequenceFile(path + "_xray", sc, pdb);

    System.out.println("# structures in custom set: " + pdb.count());

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    sc.close();
}
Example 3
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject( String var ) {
    MatrixObject mo = getMatrixObject(var);
    DataCharacteristics dcIn = mo.getDataCharacteristics();

    //double check size to avoid unnecessary spark context creation
    if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(),
        OptimizerUtils.estimateSizeExactSparsity(dcIn)) )
        return;

    //get input rdd and default storage level
    JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
        getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);

    //avoid unnecessary caching of input in order to reduce memory pressure
    if( mo.getRDDHandle().allowsShortCircuitRead()
        && isRDDMarkedForCaching(in.id()) && !isRDDCached(in.id()) ) {
        in = (JavaPairRDD<MatrixIndexes,MatrixBlock>)
            ((RDDObject)mo.getRDDHandle().getLineageChilds().get(0)).getRDD();

        //investigate issue of unnecessarily large number of partitions
        int numPartitions = SparkUtils.getNumPreferredPartitions(dcIn, in);
        if( numPartitions < in.getNumPartitions() )
            in = in.coalesce( numPartitions );
    }

    //repartition rdd (force creation of shuffled rdd via merge), note: without deep copy albeit
    //executed on the original data, because there will be no merge, i.e., no key duplicates
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = RDDAggregateUtils.mergeByKey(in, false);

    //convert mcsr into memory-efficient csr if potentially sparse
    if( OptimizerUtils.checkSparseBlockCSRConversion(dcIn) ) {
        out = out.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
    }

    //persist rdd in default storage level
    out.persist( Checkpoint.DEFAULT_STORAGE_LEVEL )
       .count(); //trigger caching to prevent contention

    //create new rdd handle, in-place of current matrix object
    RDDObject inro = mo.getRDDHandle();   //guaranteed to exist (see above)
    RDDObject outro = new RDDObject(out); //create new rdd object
    outro.setCheckpointRDD(true);         //mark as checkpointed
    outro.addLineageChild(inro);          //keep lineage to prevent cycles on cleanup
    mo.setRDDHandle(outro);
}
Example 4
Source File: MergeFastq.java From ViraPipe with MIT License | 4 votes |
public static void main(String[] args) throws IOException {

    if (args.length < 3) {
        System.err.println("Usage: MergeFastq <input path> <output path> <number of partitions>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("MergeFastq");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0], FastqInputFormat.class,
            Text.class, SequencedFragment.class, sc.hadoopConfiguration());

    // coalesce to the requested number of partitions before writing
    JavaPairRDD<Text, SequencedFragment> coalesced = fastqRDD.coalesce(Integer.valueOf(args[2]));

    coalesced.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class,
            FastqOutputFormat.class, sc.hadoopConfiguration());

    sc.stop();
}
Example 5
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject( String var ) {
    MatrixObject mo = getMatrixObject(var);
    DataCharacteristics dcIn = mo.getDataCharacteristics();

    //double check size to avoid unnecessary spark context creation
    if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(),
        OptimizerUtils.estimateSizeExactSparsity(dcIn)) )
        return;

    //get input rdd and default storage level
    JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
        getRDDHandleForMatrixObject(mo, FileFormat.BINARY);

    //avoid unnecessary caching of input in order to reduce memory pressure
    if( mo.getRDDHandle().allowsShortCircuitRead()
        && isRDDMarkedForCaching(in.id()) && !isRDDCached(in.id()) ) {
        in = (JavaPairRDD<MatrixIndexes,MatrixBlock>)
            ((RDDObject)mo.getRDDHandle().getLineageChilds().get(0)).getRDD();

        //investigate issue of unnecessarily large number of partitions
        int numPartitions = SparkUtils.getNumPreferredPartitions(dcIn, in);
        if( numPartitions < in.getNumPartitions() )
            in = in.coalesce( numPartitions );
    }

    //repartition rdd (force creation of shuffled rdd via merge), note: without deep copy albeit
    //executed on the original data, because there will be no merge, i.e., no key duplicates
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = RDDAggregateUtils.mergeByKey(in, false);

    //convert mcsr into memory-efficient csr if potentially sparse
    if( OptimizerUtils.checkSparseBlockCSRConversion(dcIn) ) {
        out = out.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
    }

    //persist rdd in default storage level
    out.persist( Checkpoint.DEFAULT_STORAGE_LEVEL )
       .count(); //trigger caching to prevent contention

    //create new rdd handle, in-place of current matrix object
    RDDObject inro = mo.getRDDHandle();   //guaranteed to exist (see above)
    RDDObject outro = new RDDObject(out); //create new rdd object
    outro.setCheckpointRDD(true);         //mark as checkpointed
    outro.addLineageChild(inro);          //keep lineage to prevent cycles on cleanup
    mo.setRDDHandle(outro);
}