Java Code Examples for org.apache.spark.api.java.JavaPairRDD#coalesce()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#coalesce().
You can go to the original project or source file by following the links above each example.
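Before turning to the project examples, here is a minimal, self-contained sketch of the basic coalesce() call on a JavaPairRDD. The class name CoalesceSketch and the toy key/value data are illustrative assumptions, not taken from any of the projects below; the pattern of coalescing just before writing output is the one the examples use.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CoalesceSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("CoalesceSketch");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // toy pair RDD spread over 8 partitions (illustrative data)
        JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(
                Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("c", 3)), 8);

        // coalesce() narrows the RDD to fewer partitions without a full shuffle,
        // which avoids writing many small output files
        JavaPairRDD<String, Integer> fewer = pairs.coalesce(2);

        System.out.println("partitions before: " + pairs.getNumPartitions());
        System.out.println("partitions after:  " + fewer.getNumPartitions());

        sc.close();
    }
}

Unlike repartition(), coalesce() with its default shuffle=false can only decrease the number of partitions, which keeps the operation cheap; the examples below rely on this to merge output into a handful of files before saving.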
Example 1
Source File: CreateRepresentativeSet.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(CreateRepresentativeSet.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // filter by representative protein chains at 40% sequence identity
    // and 2.5 A resolution using the Pisces filter. Any pair of protein
    // chains in the representative set will have <= 40% sequence identity.
    int sequenceIdentity = 40;
    double resolution = 2.5;

    // read PDB, split entries into polymer chains, and filter by Pisces filter
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader
            .readReducedSequenceFile(sc)
            .flatMapToPair(new StructureToPolymerChains())
            .filter(new Pisces(sequenceIdentity, resolution));

    System.out.println("# representative chains: " + pdb.count());

    // coalesce partitions to avoid saving many small files
    pdb = pdb.coalesce(12);

    // save representative set
    String path = MmtfReader.getMmtfReducedPath();
    MmtfWriter.writeSequenceFile(path + "_representatives_i40_r2.5", sc, pdb);

    sc.close();
}
Example 2
Source File: WriteMmtfCustom.java From mmtf-spark with Apache License 2.0 | 5 votes |
/**
 * @param args
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException {

    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(WriteMmtfCustom.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read a 20% random sample of the PDB
    double fraction = 0.2;
    long seed = 123;
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, fraction, seed, sc);

    // retain high resolution X-ray structures
    pdb = pdb
            .filter(new ExperimentalMethods(ExperimentalMethods.X_RAY_DIFFRACTION))
            .filter(new Resolution(0, 2.0))
            .filter(new Rfree(0, 0.2));

    // coalesce this into 8 partitions to avoid creating many small files
    pdb = pdb.coalesce(8);

    // save this subset in a Hadoop Sequence file
    MmtfWriter.writeSequenceFile(path + "_xray", sc, pdb);

    System.out.println("# structures in custom set: " + pdb.count());

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + " sec.");

    sc.close();
}
Example 3
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject( String var ) {
    MatrixObject mo = getMatrixObject(var);
    DataCharacteristics dcIn = mo.getDataCharacteristics();

    //double check size to avoid unnecessary spark context creation
    if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(),
        OptimizerUtils.estimateSizeExactSparsity(dcIn)) )
        return;

    //get input rdd and default storage level
    JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
        getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);

    //avoid unnecessary caching of input in order to reduce memory pressure
    if( mo.getRDDHandle().allowsShortCircuitRead()
        && isRDDMarkedForCaching(in.id()) && !isRDDCached(in.id()) ) {
        in = (JavaPairRDD<MatrixIndexes,MatrixBlock>)
            ((RDDObject)mo.getRDDHandle().getLineageChilds().get(0)).getRDD();

        //investigate issue of unnecessarily large number of partitions
        int numPartitions = SparkUtils.getNumPreferredPartitions(dcIn, in);
        if( numPartitions < in.getNumPartitions() )
            in = in.coalesce( numPartitions );
    }

    //repartition rdd (force creation of shuffled rdd via merge), note: without deep copy albeit
    //executed on the original data, because there will be no merge, i.e., no key duplicates
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = RDDAggregateUtils.mergeByKey(in, false);

    //convert mcsr into memory-efficient csr if potentially sparse
    if( OptimizerUtils.checkSparseBlockCSRConversion(dcIn) ) {
        out = out.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
    }

    //persist rdd in default storage level
    out.persist( Checkpoint.DEFAULT_STORAGE_LEVEL )
       .count(); //trigger caching to prevent contention

    //create new rdd handle, in-place of current matrix object
    RDDObject inro = mo.getRDDHandle();   //guaranteed to exist (see above)
    RDDObject outro = new RDDObject(out); //create new rdd object
    outro.setCheckpointRDD(true);         //mark as checkpointed
    outro.addLineageChild(inro);          //keep lineage to prevent cycles on cleanup
    mo.setRDDHandle(outro);
}
Example 4
Source File: MergeFastq.java From ViraPipe with MIT License | 4 votes |
public static void main(String[] args) throws IOException {

    if (args.length < 3) {
        System.err.println("Usage: MergeFastq <input path> <output path> <number of partitions>");
        System.exit(1);
    }

    SparkConf conf = new SparkConf().setAppName("MergeFastq");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaPairRDD<Text, SequencedFragment> fastqRDD = sc.newAPIHadoopFile(args[0], FastqInputFormat.class,
            Text.class, SequencedFragment.class, sc.hadoopConfiguration());

    // coalesce to the requested number of partitions before writing
    JavaPairRDD<Text, SequencedFragment> coalesced = fastqRDD.coalesce(Integer.valueOf(args[2]));

    coalesced.saveAsNewAPIHadoopFile(args[1], Text.class, SequencedFragment.class,
            FastqOutputFormat.class, sc.hadoopConfiguration());

    sc.stop();
}
Example 5
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject( String var ) {
    MatrixObject mo = getMatrixObject(var);
    DataCharacteristics dcIn = mo.getDataCharacteristics();

    //double check size to avoid unnecessary spark context creation
    if( !OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(),
        OptimizerUtils.estimateSizeExactSparsity(dcIn)) )
        return;

    //get input rdd and default storage level
    JavaPairRDD<MatrixIndexes,MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
        getRDDHandleForMatrixObject(mo, FileFormat.BINARY);

    //avoid unnecessary caching of input in order to reduce memory pressure
    if( mo.getRDDHandle().allowsShortCircuitRead()
        && isRDDMarkedForCaching(in.id()) && !isRDDCached(in.id()) ) {
        in = (JavaPairRDD<MatrixIndexes,MatrixBlock>)
            ((RDDObject)mo.getRDDHandle().getLineageChilds().get(0)).getRDD();

        //investigate issue of unnecessarily large number of partitions
        int numPartitions = SparkUtils.getNumPreferredPartitions(dcIn, in);
        if( numPartitions < in.getNumPartitions() )
            in = in.coalesce( numPartitions );
    }

    //repartition rdd (force creation of shuffled rdd via merge), note: without deep copy albeit
    //executed on the original data, because there will be no merge, i.e., no key duplicates
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = RDDAggregateUtils.mergeByKey(in, false);

    //convert mcsr into memory-efficient csr if potentially sparse
    if( OptimizerUtils.checkSparseBlockCSRConversion(dcIn) ) {
        out = out.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
    }

    //persist rdd in default storage level
    out.persist( Checkpoint.DEFAULT_STORAGE_LEVEL )
       .count(); //trigger caching to prevent contention

    //create new rdd handle, in-place of current matrix object
    RDDObject inro = mo.getRDDHandle();   //guaranteed to exist (see above)
    RDDObject outro = new RDDObject(out); //create new rdd object
    outro.setCheckpointRDD(true);         //mark as checkpointed
    outro.addLineageChild(inro);          //keep lineage to prevent cycles on cleanup
    mo.setRDDHandle(outro);
}