Java Code Examples for org.apache.spark.api.java.JavaSparkContext#parallelizePairs()
The following examples show how to use
org.apache.spark.api.java.JavaSparkContext#parallelizePairs().
Each example is taken from an open-source project; the source file and project are noted above the code.
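As a quick orientation before the project examples: parallelizePairs() turns a local java.util.List of scala.Tuple2 key/value pairs into a JavaPairRDD, optionally with an explicit number of partitions. The following minimal sketch shows the basic call; the class name ParallelizePairsSketch and the sample data are illustrative and not taken from any of the projects below.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

import scala.Tuple2;

public class ParallelizePairsSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[2]").appName("ParallelizePairsSketch").getOrCreate();
        JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

        // local key/value data to distribute as a pair RDD
        List<Tuple2<String, Integer>> data = Arrays.asList(
                new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("a", 3));

        // the second argument (number of partitions) is optional
        JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(data, 2);

        // pair-specific operations such as reduceByKey are now available
        System.out.println(pairs.reduceByKey(Integer::sum).collectAsMap());

        spark.stop();
    }
}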
Example 1
Source File: BroadCastParam.java From sparkResearch with Apache License 2.0 | 6 votes |
/**
 * Broadcast variable test
 * @param args
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // initialize the SparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // define the data to broadcast
    // as noted earlier, broadcast variables are read-only
    final List<String> broadcastList = Arrays.asList("190099HJLL", "98392QUEYY", "561788LLKK");
    // create the broadcast variable and distribute it to the executors
    final Broadcast<List<String>> broadcast = javaSparkContext.broadcast(broadcastList);
    // define the data
    JavaPairRDD<String, String> pairRDD = javaSparkContext.parallelizePairs(Arrays.asList(new Tuple2<>("000", "000")));
    JavaPairRDD<String, String> resultPairRDD = pairRDD.filter(
            (Function<Tuple2<String, String>, Boolean>) v1 -> broadcast.value().contains(v1._2));
    resultPairRDD.foreach((VoidFunction<Tuple2<String, String>>) System.out::println);
}
Example 2
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 5 votes |
public static JavaPairRDD<MatrixIndexes,MatrixBlock> toMatrixJavaPairRDD(JavaSparkContext sc, MatrixBlock src,
    int blen, int numParts, boolean inclEmpty)
{
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    List<Tuple2<MatrixIndexes,MatrixBlock>> list = null;

    if( src.getNumRows() <= blen && src.getNumColumns() <= blen ) {
        list = Arrays.asList(new Tuple2<>(new MatrixIndexes(1,1), src));
    }
    else {
        MatrixCharacteristics mc = new MatrixCharacteristics(
            src.getNumRows(), src.getNumColumns(), blen, src.getNonZeros());
        list = LongStream.range(0, mc.getNumBlocks()).parallel()
            .mapToObj(i -> createIndexedMatrixBlock(src, mc, i))
            .filter(kv -> inclEmpty || !kv._2.isEmptyBlock(false))
            .collect(Collectors.toList());
    }

    JavaPairRDD<MatrixIndexes,MatrixBlock> result = (numParts > 1) ?
        sc.parallelizePairs(list, numParts) : sc.parallelizePairs(list);

    if (DMLScript.STATISTICS) {
        Statistics.accSparkParallelizeTime(System.nanoTime() - t0);
        Statistics.incSparkParallelizeCount(1);
    }

    return result;
}
Example 3
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 5 votes |
public static JavaPairRDD<TensorIndexes, TensorBlock> toTensorJavaPairRDD(JavaSparkContext sc, TensorBlock src,
    int blen, int numParts, boolean inclEmpty)
{
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    List<Tuple2<TensorIndexes, TensorBlock>> list;

    int numDims = src.getNumDims();
    boolean singleBlock = true;
    for (int i = 0; i < numDims; i++) {
        if (blen > src.getDim(i)) {
            singleBlock = false;
            break;
        }
    }

    if (singleBlock) {
        long[] ix = new long[numDims];
        Arrays.fill(ix, 1);
        list = Arrays.asList(new Tuple2<>(new TensorIndexes(ix), src));
    }
    else {
        // TODO rows and columns for matrix characteristics
        long[] dims = src.getLongDims();
        TensorCharacteristics mc = new TensorCharacteristics(dims, src.getNonZeros());
        list = LongStream.range(0, mc.getNumBlocks()).parallel()
            .mapToObj(i -> createIndexedTensorBlock(src, mc, i))
            .filter(kv -> inclEmpty || !kv._2.isEmpty(false))
            .collect(Collectors.toList());
    }

    JavaPairRDD<TensorIndexes, TensorBlock> result = (numParts > 1) ?
        sc.parallelizePairs(list, numParts) : sc.parallelizePairs(list);

    if (DMLScript.STATISTICS) {
        Statistics.accSparkParallelizeTime(System.nanoTime() - t0);
        Statistics.incSparkParallelizeCount(1);
    }

    return result;
}
Example 4
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 5 votes |
public static JavaPairRDD<Long,FrameBlock> toFrameJavaPairRDD(JavaSparkContext sc, FrameBlock src) {
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    LinkedList<Tuple2<Long,FrameBlock>> list = new LinkedList<>();

    //create and write subblocks of matrix
    int blksize = ConfigurationManager.getBlocksize();
    for(int blockRow = 0; blockRow < (int)Math.ceil(src.getNumRows()/(double)blksize); blockRow++) {
        int maxRow = (blockRow*blksize + blksize < src.getNumRows()) ?
            blksize : src.getNumRows() - blockRow*blksize;
        int roffset = blockRow*blksize;

        FrameBlock block = new FrameBlock(src.getSchema());

        //copy sub frame to block, incl meta data on first
        src.slice( roffset, roffset+maxRow-1, 0, src.getNumColumns()-1, block );
        if( roffset == 0 )
            block.setColumnMetadata(src.getColumnMetadata());

        //append block to sequence file
        list.addLast(new Tuple2<>((long)roffset+1, block));
    }

    JavaPairRDD<Long,FrameBlock> result = sc.parallelizePairs(list);
    if (DMLScript.STATISTICS) {
        Statistics.accSparkParallelizeTime(System.nanoTime() - t0);
        Statistics.incSparkParallelizeCount(1);
    }

    return result;
}
Example 5
Source File: Join.java From SparkDemo with MIT License | 5 votes |
static void join(JavaSparkContext sc) {
    List<Tuple2<Integer, String>> products = new ArrayList<>();
    products.add(new Tuple2<>(1, "apple"));
    products.add(new Tuple2<>(2, "pear"));
    products.add(new Tuple2<>(3, "banana"));
    products.add(new Tuple2<>(4, "pomegranate"));

    List<Tuple2<Integer, Integer>> counts = new ArrayList<>();
    counts.add(new Tuple2<>(1, 7));
    counts.add(new Tuple2<>(2, 3));
    counts.add(new Tuple2<>(3, 8));
    counts.add(new Tuple2<>(4, 3));
    counts.add(new Tuple2<>(5, 9));

    JavaPairRDD<Integer, String> productsRDD = sc.parallelizePairs(products);
    JavaPairRDD<Integer, Integer> countsRDD = sc.parallelizePairs(counts);

    /*
     * join on <K, V> and <K, W> returns (K, (V, W));
     * the outer-join variants are leftOuterJoin, rightOuterJoin, and fullOuterJoin
     */
    productsRDD.join(countsRDD)
        .foreach(new VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>>() {
            @Override
            public void call(Tuple2<Integer, Tuple2<String, Integer>> t) throws Exception {
                System.out.println(t._1 + "\t" + t._2());
            }
        });
}
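The comment block above also names the outer-join variants. As a small follow-on sketch in the same style (illustrative only, not part of the SparkDemo source), leftOuterJoin keeps left-side keys that have no match on the right and wraps the right value in Spark's Optional:

// sketch only, not from the SparkDemo project: same data shape as the example above
static void leftJoin(JavaSparkContext sc) {
    JavaPairRDD<Integer, String> productsRDD = sc.parallelizePairs(
            Arrays.asList(new Tuple2<>(1, "apple"), new Tuple2<>(6, "mango")));
    JavaPairRDD<Integer, Integer> countsRDD = sc.parallelizePairs(
            Arrays.asList(new Tuple2<>(1, 7)));

    productsRDD.leftOuterJoin(countsRDD)
            .foreach(t -> System.out.println(
                    t._1 + "\t" + t._2()._1() + "\t" + t._2()._2().orElse(0)));
    // key 1 prints "1  apple  7"; key 6 has no count, so the Optional falls back to 0
}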
Example 6
Source File: CoverageModelWPreconditionerSpark.java From gatk-protected with BSD 3-Clause "New" or "Revised" License | 5 votes |
public CoverageModelWPreconditionerSpark(@Nonnull final INDArray Q_ll,
                                         @Nonnull final INDArray Z_ll,
                                         @Nonnull final FourierLinearOperatorNDArray F_tt,
                                         final int numTargets,
                                         @Nonnull JavaSparkContext ctx,
                                         final int numPartitions) {
    this.numTargets = ParamUtils.isPositive(numTargets, "Number of target must be positive");
    this.numLatents = Q_ll.shape()[0];
    this.fftSize = F_tt.getFFTSize();
    if (Q_ll.shape()[1] != numLatents)
        throw new IllegalArgumentException("Malformed Q_ll.");
    if (Z_ll.shape()[0] != numLatents || Z_ll.shape()[1] != numLatents)
        throw new IllegalArgumentException("Malformed Z_ll.");
    if (F_tt.getRowDimension() != numTargets || F_tt.getColumnDimension() != numTargets)
        throw new IllegalArgumentException("Malformed F_tt.");
    this.F_tt = F_tt;
    orderedFourierFactors = Nd4j.create(F_tt.getOrderedFourierFactors(), new int[]{fftSize, 1});

    /* sparky stuff */
    this.ctx = ctx;
    fourierSpaceBlocks = CoverageModelSparkUtils.createLinearlySpacedIndexBlocks(fftSize, numPartitions, 1);
    final INDArray linOp = Nd4j.create(fftSize, numLatents, numLatents);
    IntStream.range(0, fftSize).parallel().forEach(k ->
        linOp.get(NDArrayIndex.point(k)).assign(
            Z_ll.mul(orderedFourierFactors.getDouble(k)).addi(Q_ll)));
    // linOpPairRDD = CoverageModelSparkUtils.rddFromINDArray(linOp, fourierSpaceBlocks, ctx, true);
    /* for a broadcast hash join, repartitioning is unnecessary */
    linOpPairRDD = ctx.parallelizePairs(
        CoverageModelSparkUtils.partitionINDArrayToList(fourierSpaceBlocks, linOp),
        fourierSpaceBlocks.size());
    linOpPairRDD.cache();
}
Example 7
Source File: MapSideJoinBroadcast.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
public static void main(String[] args) { SparkSession sparkSession = SparkSession.builder().master("local").appName("My App") .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate(); JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext()); JavaPairRDD<String, String> userIdToCityId = jsc.parallelizePairs( Arrays.asList(new Tuple2<String, String>("1", "101"), new Tuple2<String, String>("2", "102"), new Tuple2<String, String>("3", "107"), new Tuple2<String, String>("4", "103"), new Tuple2<String, String>("11", "101"), new Tuple2<String, String>("12", "102"), new Tuple2<String, String>("13", "107"), new Tuple2<String, String>("14", "103"))); JavaPairRDD<String, String> cityIdToCityName = jsc.parallelizePairs( Arrays.asList(new Tuple2<String, String>("101", "India"), new Tuple2<String, String>("102", "UK"), new Tuple2<String, String>("103", "Germany"), new Tuple2<String, String>("107", "USA"))); Broadcast<Map<String, String>> citiesBroadcasted = jsc.broadcast(cityIdToCityName.collectAsMap()); JavaRDD<Tuple3<String, String, String>> joined = userIdToCityId.map( v1 -> new Tuple3<String, String, String>(v1._1(), v1._2(), citiesBroadcasted.value().get(v1._2()))); System.out.println(joined.collect()); }
Example 8
Source File: Basic.java From learning-spark-with-java with MIT License | 4 votes |
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("Pairs-Basic")
        .master("local[4]")
        .getOrCreate();

    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    List<Tuple2<String, Integer>> pairs = Arrays.asList(
        new Tuple2<>("1", 9), new Tuple2<>("1", 2), new Tuple2<>("1", 1),
        new Tuple2<>("2", 3), new Tuple2<>("2", 4), new Tuple2<>("3", 1),
        new Tuple2<>("3", 5), new Tuple2<>("6", 2), new Tuple2<>("6", 1),
        new Tuple2<>("6", 4), new Tuple2<>("8", 1));

    // a randomly partitioned pair RDD
    JavaPairRDD<String, Integer> pairsRDD = sc.parallelizePairs(pairs, 4);

    System.out.println("*** the original pairs");
    pairsRDD.foreach(i -> System.out.println(i));

    //
    // Pairs can be collected as a Map, but this only works well if the
    // keys are unique. Here they aren't, so an arbitrary value is chosen for each:
    //
    Map<String, Integer> pairsAsMap = pairsRDD.collectAsMap();
    System.out.println("*** the pretty useless map");
    System.out.println(pairsAsMap);

    // let's say we just want the pair with minimum value for each key
    // we can use one of the handy methods in PairRDDFunctions. To reduce we need
    // only supply a single function to combine all the values for each key -- the result
    // has to have the same type as the values
    JavaPairRDD<String, Integer> reducedRDD = pairsRDD.reduceByKey(Math::min);

    System.out.println("*** the reduced pairs");
    reducedRDD.foreach(i -> System.out.println(i));

    // the reduced pairs have unique keys so collecting to a map works a lot better
    Map<String, Integer> reducedAsMap = reducedRDD.collectAsMap();
    System.out.println("*** the reduced pairs as a map");
    System.out.println(reducedAsMap);

    // folding is a little more general: we get to specify the identity value:
    // say 0 for adding and 1 for multiplying
    JavaPairRDD<String, Integer> foldedRDD = pairsRDD.foldByKey(1, (x, y) -> x * y);

    System.out.println("*** the folded pairs");
    foldedRDD.foreach(i -> System.out.println(i));

    // Combining is more general: you can produce values of a different type, which is very powerful.
    // You need to provide three functions: the first converts an individual value to the new type, the second
    // incorporates an additional value into the result, and the third combines intermediate results, which is
    // used by execution to avoid excessive communication between partitions. The first function is applied once
    // per partition and the second is used for each additional value in the partition.
    // Below is a pretty classical example of its use: compute a per-key average by first computing the sum and
    // count for each key and then dividing.
    JavaPairRDD<String, Tuple2<Integer, Integer>> combinedRDD = pairsRDD.combineByKey(
        value -> new Tuple2<>(value, 1),
        (sumAndCount, value) -> new Tuple2<>(sumAndCount._1() + value, sumAndCount._2() + 1),
        (sumAndCount1, sumAndCount2) ->
            new Tuple2<>(sumAndCount1._1() + sumAndCount2._1(), sumAndCount1._2() + sumAndCount2._2())
    );

    JavaPairRDD<String, Double> averageRDD =
        combinedRDD.mapValues(sumAndCount -> (double) sumAndCount._1() / sumAndCount._2());

    System.out.println("*** the average pairs");
    averageRDD.foreach(i -> System.out.println(i));

    // The dividing could be done just by calling map, but in Java this requires a lot of conversion between the
    // two kinds of RDD and ends up *VERY* cumbersome.
    JavaRDD<Tuple2<String, Tuple2<Integer, Integer>>> tupleCombinedRDD =
        JavaRDD.fromRDD(combinedRDD.rdd(), combinedRDD.classTag());
    JavaRDD<Tuple2<String, Double>> tupleDividedRDD = tupleCombinedRDD.map(keyAndsumAndCount ->
        new Tuple2<>(keyAndsumAndCount._1(),
            (double) keyAndsumAndCount._2()._1() / keyAndsumAndCount._2()._2()));
    JavaPairRDD<String, Double> averageRDDtheHardWay = JavaPairRDD.fromJavaRDD(tupleDividedRDD);

    // remember these won't necessarily come out in the same order so they may not obviously be
    // the same as above
    System.out.println("*** the average pairs the hard way");
    averageRDDtheHardWay.foreach(i -> System.out.println(i));

    spark.stop();
}
Example 9
Source File: CustomPartitionerExample.java From Apache-Spark-2x-for-Java-Developers with MIT License | 3 votes |
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Partitioning");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    JavaPairRDD<String, String> pairRdd = jsc.parallelizePairs(
            Arrays.asList(new Tuple2<String, String>("India", "Asia"), new Tuple2<String, String>("Germany", "Europe"),
                    new Tuple2<String, String>("Japan", "Asia"), new Tuple2<String, String>("France", "Europe")), 3);

    JavaPairRDD<String, String> customPartitioned = pairRdd.partitionBy(new CustomPartitioner());

    System.out.println(customPartitioned.getNumPartitions());

    JavaRDD<String> mapPartitionsWithIndex = customPartitioned.mapPartitionsWithIndex((index, tupleIterator) -> {
        List<String> list = new ArrayList<>();
        while (tupleIterator.hasNext()) {
            list.add("Partition number:" + index + ",key:" + tupleIterator.next()._1());
        }
        return list.iterator();
    }, true);

    System.out.println(mapPartitionsWithIndex.collect());
}
Example 10
Source File: Partitioning.java From Apache-Spark-2x-for-Java-Developers with MIT License | 2 votes |
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Partitioning");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    JavaPairRDD<Integer, String> pairRdd = jsc.parallelizePairs(
            Arrays.asList(new Tuple2<Integer, String>(1, "A"), new Tuple2<Integer, String>(2, "B"),
                    new Tuple2<Integer, String>(3, "C"), new Tuple2<Integer, String>(4, "D"),
                    new Tuple2<Integer, String>(5, "E"), new Tuple2<Integer, String>(6, "F"),
                    new Tuple2<Integer, String>(7, "G"), new Tuple2<Integer, String>(8, "H")), 3);

    RDD<Tuple2<Integer, String>> rdd = JavaPairRDD.toRDD(pairRdd);

    System.out.println(pairRdd.getNumPartitions());

    // JavaPairRDD<Integer, String> hashPartitioned = pairRdd.partitionBy(new HashPartitioner(2));
    // System.out.println(hashPartitioned.getNumPartitions());

    RangePartitioner rangePartitioner = new RangePartitioner(4, rdd, true,
            scala.math.Ordering.Int$.MODULE$, scala.reflect.ClassTag$.MODULE$.apply(Integer.class));

    JavaPairRDD<Integer, String> rangePartitioned = pairRdd.partitionBy(rangePartitioner);

    JavaRDD<String> mapPartitionsWithIndex = rangePartitioned.mapPartitionsWithIndex((index, tupleIterator) -> {
        List<String> list = new ArrayList<>();
        while (tupleIterator.hasNext()) {
            list.add("Partition number:" + index + ",key:" + tupleIterator.next()._1());
        }
        return list.iterator();
    }, true);

    System.out.println(mapPartitionsWithIndex.collect());
}