Java Code Examples for org.apache.spark.api.java.JavaPairRDD#sortByKey()
The following examples show how to use org.apache.spark.api.java.JavaPairRDD#sortByKey(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
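Before the project examples, here is a minimal, self-contained sketch of the common sortByKey overloads. The class name, sample data, and printed output are illustrative assumptions and do not come from any of the projects below.

import java.util.Arrays;
import java.util.Comparator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class SortByKeyDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SortByKeyDemo").setMaster("local[*]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            JavaPairRDD<Integer, String> pairs = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>(3, "c"), new Tuple2<>(1, "a"), new Tuple2<>(2, "b")));

            // natural key ordering, ascending (default)
            JavaPairRDD<Integer, String> ascending = pairs.sortByKey();
            // natural key ordering, descending
            JavaPairRDD<Integer, String> descending = pairs.sortByKey(false);
            // custom (serializable) comparator, ascending, with an explicit number of output partitions
            JavaPairRDD<Integer, String> custom = pairs.sortByKey(Comparator.<Integer>reverseOrder(), true, 2);

            System.out.println(ascending.collect());  // [(1,a), (2,b), (3,c)]
            System.out.println(descending.collect()); // [(3,c), (2,b), (1,a)]
            System.out.println(custom.collect());     // [(3,c), (2,b), (1,a)]
        }
    }
}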
Example 1
Source File: RDDConverterUtils.java From systemds with Apache License 2.0 | 7 votes |
public static JavaRDD<String> binaryBlockToCsv(JavaPairRDD<MatrixIndexes,MatrixBlock> in,
        DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict) {
    JavaPairRDD<MatrixIndexes,MatrixBlock> input = in;

    //fast path without, general case with shuffle
    if( mcIn.getCols()>mcIn.getBlocksize() ) {
        //create row partitioned matrix
        input = input
            .flatMapToPair(new SliceBinaryBlockToRowsFunction(mcIn.getBlocksize()))
            .groupByKey()
            .mapToPair(new ConcatenateBlocksFunction(mcIn.getCols(), mcIn.getBlocksize()));
    }

    //sort if required (on blocks/rows)
    if( strict ) {
        input = input.sortByKey(true);
    }

    //convert binary block to csv (from blocks/rows)
    JavaRDD<String> out = input
        .flatMap(new BinaryBlockToCSVFunction(props));

    return out;
}
Example 2
Source File: VariantsSparkSink.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
private static JavaRDD<VariantContext> sortVariants(final JavaRDD<VariantContext> variants, final VCFHeader header, final int numReducers) {
    // Turn into key-value pairs so we can sort (by key). Values are null so there is no overhead in the amount
    // of data going through the shuffle.
    final JavaPairRDD<VariantContext, Void> rddVariantPairs = variants.mapToPair(variant -> new Tuple2<>(variant, (Void) null));

    // do a total sort so that all the records in partition i are less than those in partition i+1
    final Comparator<VariantContext> comparator = header.getVCFRecordComparator();
    final JavaPairRDD<VariantContext, Void> variantVoidPairs;
    if (comparator == null){
        variantVoidPairs = rddVariantPairs; //no sort
    } else if (numReducers > 0) {
        variantVoidPairs = rddVariantPairs.sortByKey(comparator, true, numReducers);
    } else {
        variantVoidPairs = rddVariantPairs.sortByKey(comparator);
    }

    return variantVoidPairs.map(Tuple2::_1);
}
Example 3
Source File: SparkUtils.java From gatk with BSD 3-Clause "New" or "Revised" License | 6 votes |
/**
 * Do a global sort of an RDD using the given comparator.
 * This method uses the RDD elements themselves as the keys in the spark key/value sort. This may be inefficient
 * if the comparator only looks at a small fraction of the element to perform the comparison.
 */
public static <T> JavaRDD<T> sortUsingElementsAsKeys(JavaRDD<T> elements, Comparator<T> comparator, int numReducers) {
    Utils.nonNull(comparator);
    Utils.nonNull(elements);

    // Turn into key-value pairs so we can sort (by key). Values are null so there is no overhead in the amount
    // of data going through the shuffle.
    final JavaPairRDD<T, Void> rddReadPairs = elements.mapToPair(read -> new Tuple2<>(read, (Void) null));

    final JavaPairRDD<T, Void> readVoidPairs;
    if (numReducers > 0) {
        readVoidPairs = rddReadPairs.sortByKey(comparator, true, numReducers);
    } else {
        readVoidPairs = rddReadPairs.sortByKey(comparator);
    }
    return readVoidPairs.keys();
}
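A brief usage sketch for this helper: the JavaSparkContext sc, the sample data, and the comparator below are illustrative assumptions, not code from gatk.

// Sort an RDD of strings lexicographically using the helper above.
// Assumes a JavaSparkContext named sc and the required imports are in scope.
JavaRDD<String> names = sc.parallelize(Arrays.asList("readC", "readA", "readB"));
JavaRDD<String> sorted = SparkUtils.sortUsingElementsAsKeys(
        names,
        (Comparator<String> & Serializable) String::compareTo, // the comparator is shipped to executors, so it must be serializable
        0); // numReducers <= 0 falls through to sortByKey(comparator) with its default partitioning
System.out.println(sorted.collect()); // [readA, readB, readC]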
Example 4
Source File: RankConverter.java From spork with Apache License 2.0 | 6 votes |
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, PORank poRank)
        throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poRank, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    JavaPairRDD<Integer, Long> javaPairRdd = rdd.toJavaRDD()
            .mapToPair(new ToPairRdd());
    JavaPairRDD<Integer, Iterable<Long>> groupedByIndex = javaPairRdd
            .groupByKey();
    JavaPairRDD<Integer, Long> countsByIndex = groupedByIndex
            .mapToPair(new IndexCounters());
    JavaPairRDD<Integer, Long> sortedCountsByIndex = countsByIndex
            .sortByKey(true);
    Map<Integer, Long> counts = sortedCountsByIndex.collectAsMap();
    JavaRDD<Tuple> finalRdd = rdd.toJavaRDD()
            .map(new RankFunction(new HashMap<Integer, Long>(counts)));
    return finalRdd.rdd();
}
Example 5
Source File: SortConverter.java From spork with Apache License 2.0 | 6 votes |
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POSort sortOperator)
        throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, sortOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyValueFunction(),
            SparkUtil.<Tuple, Object> getTuple2Manifest());

    JavaPairRDD<Tuple, Object> r = new JavaPairRDD<Tuple, Object>(rddPair,
            SparkUtil.getManifest(Tuple.class),
            SparkUtil.getManifest(Object.class));

    JavaPairRDD<Tuple, Object> sorted = r.sortByKey(
            sortOperator.new SortComparator(), true);
    JavaRDD<Tuple> mapped = sorted.mapPartitions(TO_VALUE_FUNCTION);

    return mapped.rdd();
}
Example 6
Source File: JoinParirRDD.java From sparkResearch with Apache License 2.0 | 5 votes |
public static void run(JavaSparkContext sparkContext){
    JavaRDD<String> rdd = sparkContext.parallelize(Arrays.asList("test", "java", "python"));
    JavaRDD<String> otherRDD = sparkContext.parallelize(Arrays.asList("golang", "php", "hadoop"));
    PairFunction<String, String, String> pairFunction = new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String s) {
            return new Tuple2<>(s.split(" ")[0], s);
        }
    };
    JavaPairRDD<String, String> pairRDD = rdd.mapToPair(pairFunction);
    JavaPairRDD<String, String> pairRDDOther = otherRDD.mapToPair(pairFunction);
    // sortByKey returns a new RDD sorted in descending key order; the original pairRDD is left unchanged
    JavaPairRDD<String, String> sortedRDD = pairRDD.sortByKey(false);
}
Example 7
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0 | 5 votes |
public static JavaRDD<String> binaryBlockToCsv(JavaPairRDD<Long,FrameBlock> in,
        DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict) {
    JavaPairRDD<Long,FrameBlock> input = in;

    //sort if required (on blocks/rows)
    if( strict && !isSorted(input) ) {
        input = input.sortByKey(true);
    }

    //convert binary block to csv (from blocks/rows)
    return input.flatMap(
        new BinaryBlockToCSVFunction(props));
}
Example 8
Source File: GraknSparkExecutor.java From grakn with GNU Affero General Public License v3.0 | 5 votes |
public static <K, V> JavaPairRDD<K, V> executeMap(
        final JavaPairRDD<Object, VertexWritable> graphRDD, final MapReduce<K, V, ?, ?, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    JavaPairRDD<K, V> mapRDD = graphRDD.mapPartitionsToPair(partitionIterator -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        return new MapIterator<>(MapReduce.<MapReduce<K, V, ?, ?, ?>>createMapReduce(HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getMapKeySort().isPresent()){
        mapRDD = mapRDD.sortByKey(mapReduce.getMapKeySort().get(), true, 1);
    }
    return mapRDD;
}
Example 9
Source File: GraknSparkExecutor.java From grakn with GNU Affero General Public License v3.0 | 5 votes |
public static <K, V, OK, OV> JavaPairRDD<OK, OV> executeReduce(
        final JavaPairRDD<K, V> mapOrCombineRDD, final MapReduce<K, V, OK, OV, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    JavaPairRDD<OK, OV> reduceRDD = mapOrCombineRDD.groupByKey().mapPartitionsToPair(partitionIterator -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        return new ReduceIterator<>(MapReduce.<MapReduce<K, V, OK, OV, ?>>createMapReduce(HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getReduceKeySort().isPresent()){
        reduceRDD = reduceRDD.sortByKey(mapReduce.getReduceKeySort().get(), true, 1);
    }
    return reduceRDD;
}
Example 10
Source File: SparkExecutor.java From tinkerpop with Apache License 2.0 | 5 votes |
public static <K, V> JavaPairRDD<K, V> executeMap(
        final JavaPairRDD<Object, VertexWritable> graphRDD, final MapReduce<K, V, ?, ?, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    JavaPairRDD<K, V> mapRDD = graphRDD.mapPartitionsToPair(partitionIterator -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        return new MapIterator<>(MapReduce.<MapReduce<K, V, ?, ?, ?>>createMapReduce(HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getMapKeySort().isPresent())
        mapRDD = mapRDD.sortByKey(mapReduce.getMapKeySort().get(), true, 1);
    return mapRDD;
}
Example 11
Source File: SparkExecutor.java From tinkerpop with Apache License 2.0 | 5 votes |
public static <K, V, OK, OV> JavaPairRDD<OK, OV> executeReduce(
        final JavaPairRDD<K, V> mapOrCombineRDD, final MapReduce<K, V, OK, OV, ?> mapReduce,
        final Configuration graphComputerConfiguration) {
    JavaPairRDD<OK, OV> reduceRDD = mapOrCombineRDD.groupByKey().mapPartitionsToPair(partitionIterator -> {
        KryoShimServiceLoader.applyConfiguration(graphComputerConfiguration);
        return new ReduceIterator<>(MapReduce.<MapReduce<K, V, OK, OV, ?>>createMapReduce(HadoopGraph.open(graphComputerConfiguration), graphComputerConfiguration), partitionIterator);
    });
    if (mapReduce.getReduceKeySort().isPresent())
        reduceRDD = reduceRDD.sortByKey(mapReduce.getReduceKeySort().get(), true, 1);
    return reduceRDD;
}
Example 12
Source File: QuantilePickSPInstruction.java From systemds with Apache License 2.0 | 4 votes |
/**
 * Get a summary of weighted quantiles in the following form:
 * sum of weights, (keys of quantiles), (portions of quantiles), (values of quantiles)
 *
 * @param w rdd containing values and optionally weights, sorted by value
 * @param mc matrix characteristics
 * @param quantiles one or more quantiles between 0 and 1.
 * @return a summary of weighted quantiles
 */
private static double[] getWeightedQuantileSummary(JavaPairRDD<MatrixIndexes,MatrixBlock> w, DataCharacteristics mc, double[] quantiles)
{
    double[] ret = new double[3*quantiles.length + 1];
    if( mc.getCols()==2 ) //weighted
    {
        //sort blocks (values sorted but blocks and partitions are not)
        w = w.sortByKey();

        //compute cumsum weights per partition
        //with assumption that partition aggregates fit into memory
        List<Tuple2<Integer,Double>> partWeights = w
            .mapPartitionsWithIndex(new SumWeightsFunction(), false).collect();

        //compute sum of weights
        ret[0] = partWeights.stream().mapToDouble(p -> p._2()).sum();

        //compute total cumsum and determine partitions
        double[] qdKeys = new double[quantiles.length];
        long[] qiKeys = new long[quantiles.length];
        int[] partitionIDs = new int[quantiles.length];
        double[] offsets = new double[quantiles.length];
        for( int i=0; i<quantiles.length; i++ ) {
            qdKeys[i] = quantiles[i]*ret[0];
            qiKeys[i] = (long)Math.ceil(qdKeys[i]);
        }
        double cumSum = 0;
        for( Tuple2<Integer,Double> psum : partWeights ) {
            double tmp = cumSum + psum._2();
            for(int i=0; i<quantiles.length; i++)
                if( tmp >= qiKeys[i] && partitionIDs[i] == 0 ) {
                    partitionIDs[i] = psum._1();
                    offsets[i] = cumSum;
                }
            cumSum = tmp;
        }

        //get keys and values for quantile cutoffs
        List<Tuple2<Integer,double[]>> qVals = w
            .mapPartitionsWithIndex(new ExtractWeightedQuantileFunction(
                mc, qdKeys, qiKeys, partitionIDs, offsets), false).collect();
        for( Tuple2<Integer,double[]> qVal : qVals ) {
            ret[qVal._1()+1] = qVal._2()[0];
            ret[qVal._1()+quantiles.length+1] = qVal._2()[1];
            ret[qVal._1()+2*quantiles.length+1] = qVal._2()[2];
        }
    }
    else {
        ret[0] = mc.getRows();
        for( int i=0; i<quantiles.length; i++ ){
            ret[i+1] = quantiles[i] * mc.getRows();
            ret[i+quantiles.length+1] = Math.ceil(ret[i+1])-ret[i+1];
            ret[i+2*quantiles.length+1] = lookupKey(w, (long)Math.ceil(ret[i+1]), mc.getBlocksize());
        }
    }
    return ret;
}