Java Code Examples for org.apache.spark.api.java.JavaSparkContext#parallelizePairs()
The following examples show how to use
org.apache.spark.api.java.JavaSparkContext#parallelizePairs().
Each example is taken from an open-source project; the source file and project are noted above the code.
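As a quick orientation before the project examples: parallelizePairs() turns a local java.util.List of scala.Tuple2 key/value pairs into a JavaPairRDD, optionally with an explicit number of partitions. The following minimal sketch shows the basic call; the class name ParallelizePairsSketch and the sample data are illustrative and not taken from any of the projects below.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

import scala.Tuple2;

public class ParallelizePairsSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[2]").appName("ParallelizePairsSketch").getOrCreate();
        JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

        // local key/value data to distribute as a pair RDD
        List<Tuple2<String, Integer>> data = Arrays.asList(
                new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("a", 3));

        // the second argument (number of partitions) is optional
        JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(data, 2);

        // pair-specific operations such as reduceByKey are now available
        System.out.println(pairs.reduceByKey(Integer::sum).collectAsMap());

        spark.stop();
    }
}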
Example 1
Source File: BroadCastParam.java From sparkResearch with Apache License 2.0 | 6 votes |
/**
 * Broadcast variable test
 * @param args
 */
public static void main(String[] args) {
    SparkSession sparkSession = SparkSession.builder()
            .master("local[4]").appName("AttackFind").getOrCreate();
    // initialize the SparkContext
    JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sparkSession.sparkContext());
    // define the data to broadcast
    // as noted earlier, broadcast variables are read-only
    final List<String> broadcastList = Arrays.asList("190099HJLL", "98392QUEYY", "561788LLKK");
    // create the broadcast variable and distribute it to the executors
    final Broadcast<List<String>> broadcast = javaSparkContext.broadcast(broadcastList);
    // define the data
    JavaPairRDD<String, String> pairRDD = javaSparkContext.parallelizePairs(Arrays.asList(new Tuple2<>("000", "000")));
    JavaPairRDD<String, String> resultPairRDD = pairRDD.filter(
            (Function<Tuple2<String, String>, Boolean>) v1 -> broadcast.value().contains(v1._2));
    resultPairRDD.foreach((VoidFunction<Tuple2<String, String>>) System.out::println);
}
Example 2
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 5 votes |
public static JavaPairRDD<MatrixIndexes,MatrixBlock> toMatrixJavaPairRDD(JavaSparkContext sc, MatrixBlock src,
    int blen, int numParts, boolean inclEmpty)
{
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    List<Tuple2<MatrixIndexes,MatrixBlock>> list = null;

    if( src.getNumRows() <= blen && src.getNumColumns() <= blen ) {
        list = Arrays.asList(new Tuple2<>(new MatrixIndexes(1,1), src));
    }
    else {
        MatrixCharacteristics mc = new MatrixCharacteristics(
            src.getNumRows(), src.getNumColumns(), blen, src.getNonZeros());
        list = LongStream.range(0, mc.getNumBlocks()).parallel()
            .mapToObj(i -> createIndexedMatrixBlock(src, mc, i))
            .filter(kv -> inclEmpty || !kv._2.isEmptyBlock(false))
            .collect(Collectors.toList());
    }

    JavaPairRDD<MatrixIndexes,MatrixBlock> result = (numParts > 1) ?
        sc.parallelizePairs(list, numParts) : sc.parallelizePairs(list);

    if (DMLScript.STATISTICS) {
        Statistics.accSparkParallelizeTime(System.nanoTime() - t0);
        Statistics.incSparkParallelizeCount(1);
    }

    return result;
}
Example 3
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 5 votes |
public static JavaPairRDD<TensorIndexes, TensorBlock> toTensorJavaPairRDD(JavaSparkContext sc, TensorBlock src,
    int blen, int numParts, boolean inclEmpty)
{
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    List<Tuple2<TensorIndexes, TensorBlock>> list;

    int numDims = src.getNumDims();
    boolean singleBlock = true;
    for (int i = 0; i < numDims; i++) {
        if (blen > src.getDim(i)) {
            singleBlock = false;
            break;
        }
    }

    if (singleBlock) {
        long[] ix = new long[numDims];
        Arrays.fill(ix, 1);
        list = Arrays.asList(new Tuple2<>(new TensorIndexes(ix), src));
    }
    else {
        // TODO rows and columns for matrix characteristics
        long[] dims = src.getLongDims();
        TensorCharacteristics mc = new TensorCharacteristics(dims, src.getNonZeros());
        list = LongStream.range(0, mc.getNumBlocks()).parallel()
            .mapToObj(i -> createIndexedTensorBlock(src, mc, i))
            .filter(kv -> inclEmpty || !kv._2.isEmpty(false))
            .collect(Collectors.toList());
    }

    JavaPairRDD<TensorIndexes, TensorBlock> result = (numParts > 1) ?
        sc.parallelizePairs(list, numParts) : sc.parallelizePairs(list);

    if (DMLScript.STATISTICS) {
        Statistics.accSparkParallelizeTime(System.nanoTime() - t0);
        Statistics.incSparkParallelizeCount(1);
    }

    return result;
}
Example 4
Source File: SparkExecutionContext.java From systemds with Apache License 2.0 | 5 votes |
public static JavaPairRDD<Long,FrameBlock> toFrameJavaPairRDD(JavaSparkContext sc, FrameBlock src) {
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    LinkedList<Tuple2<Long,FrameBlock>> list = new LinkedList<>();

    //create and write subblocks of matrix
    int blksize = ConfigurationManager.getBlocksize();
    for(int blockRow = 0; blockRow < (int)Math.ceil(src.getNumRows()/(double)blksize); blockRow++) {
        int maxRow = (blockRow*blksize + blksize < src.getNumRows()) ?
            blksize : src.getNumRows() - blockRow*blksize;
        int roffset = blockRow*blksize;

        FrameBlock block = new FrameBlock(src.getSchema());

        //copy sub frame to block, incl meta data on first
        src.slice( roffset, roffset+maxRow-1, 0, src.getNumColumns()-1, block );
        if( roffset == 0 )
            block.setColumnMetadata(src.getColumnMetadata());

        //append block to sequence file
        list.addLast(new Tuple2<>((long)roffset+1, block));
    }

    JavaPairRDD<Long,FrameBlock> result = sc.parallelizePairs(list);
    if (DMLScript.STATISTICS) {
        Statistics.accSparkParallelizeTime(System.nanoTime() - t0);
        Statistics.incSparkParallelizeCount(1);
    }

    return result;
}
Example 5
Source File: Join.java From SparkDemo with MIT License | 5 votes |
static void join(JavaSparkContext sc) {
    List<Tuple2<Integer, String>> products = new ArrayList<>();
    products.add(new Tuple2<>(1, "apple"));
    products.add(new Tuple2<>(2, "pear"));
    products.add(new Tuple2<>(3, "banana"));
    products.add(new Tuple2<>(4, "pomegranate"));

    List<Tuple2<Integer, Integer>> counts = new ArrayList<>();
    counts.add(new Tuple2<>(1, 7));
    counts.add(new Tuple2<>(2, 3));
    counts.add(new Tuple2<>(3, 8));
    counts.add(new Tuple2<>(4, 3));
    counts.add(new Tuple2<>(5, 9));

    JavaPairRDD<Integer, String> productsRDD = sc.parallelizePairs(products);
    JavaPairRDD<Integer, Integer> countsRDD = sc.parallelizePairs(counts);

    /*
     * join on <K, V> and <K, W> returns (K, (V, W));
     * the outer-join variants are leftOuterJoin, rightOuterJoin, and fullOuterJoin
     */
    productsRDD.join(countsRDD)
        .foreach(new VoidFunction<Tuple2<Integer, Tuple2<String, Integer>>>() {
            @Override
            public void call(Tuple2<Integer, Tuple2<String, Integer>> t) throws Exception {
                System.out.println(t._1 + "\t" + t._2());
            }
        });
}
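The comment block above also names the outer-join variants. As a small follow-on sketch in the same style (illustrative only, not part of the SparkDemo source), leftOuterJoin keeps left-side keys that have no match on the right and wraps the right value in Spark's Optional:

// sketch only, not from the SparkDemo project: same data shape as the example above
static void leftJoin(JavaSparkContext sc) {
    JavaPairRDD<Integer, String> productsRDD = sc.parallelizePairs(
            Arrays.asList(new Tuple2<>(1, "apple"), new Tuple2<>(6, "mango")));
    JavaPairRDD<Integer, Integer> countsRDD = sc.parallelizePairs(
            Arrays.asList(new Tuple2<>(1, 7)));

    productsRDD.leftOuterJoin(countsRDD)
            .foreach(t -> System.out.println(
                    t._1 + "\t" + t._2()._1() + "\t" + t._2()._2().orElse(0)));
    // key 1 prints "1  apple  7"; key 6 has no count, so the Optional falls back to 0
}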
Example 6
Source File: CoverageModelWPreconditionerSpark.java From gatk-protected with BSD 3-Clause "New" or "Revised" License | 5 votes |
public CoverageModelWPreconditionerSpark(@Nonnull final INDArray Q_ll,
                                         @Nonnull final INDArray Z_ll,
                                         @Nonnull final FourierLinearOperatorNDArray F_tt,
                                         final int numTargets,
                                         @Nonnull JavaSparkContext ctx,
                                         final int numPartitions) {
    this.numTargets = ParamUtils.isPositive(numTargets, "Number of target must be positive");
    this.numLatents = Q_ll.shape()[0];
    this.fftSize = F_tt.getFFTSize();
    if (Q_ll.shape()[1] != numLatents)
        throw new IllegalArgumentException("Malformed Q_ll.");
    if (Z_ll.shape()[0] != numLatents || Z_ll.shape()[1] != numLatents)
        throw new IllegalArgumentException("Malformed Z_ll.");
    if (F_tt.getRowDimension() != numTargets || F_tt.getColumnDimension() != numTargets)
        throw new IllegalArgumentException("Malformed F_tt.");
    this.F_tt = F_tt;
    orderedFourierFactors = Nd4j.create(F_tt.getOrderedFourierFactors(), new int[]{fftSize, 1});

    /* sparky stuff */
    this.ctx = ctx;
    fourierSpaceBlocks = CoverageModelSparkUtils.createLinearlySpacedIndexBlocks(fftSize, numPartitions, 1);
    final INDArray linOp = Nd4j.create(fftSize, numLatents, numLatents);
    IntStream.range(0, fftSize).parallel().forEach(k ->
        linOp.get(NDArrayIndex.point(k)).assign(
            Z_ll.mul(orderedFourierFactors.getDouble(k)).addi(Q_ll)));
    // linOpPairRDD = CoverageModelSparkUtils.rddFromINDArray(linOp, fourierSpaceBlocks, ctx, true);
    /* for a broadcast hash join, repartitioning is unnecessary */
    linOpPairRDD = ctx.parallelizePairs(
        CoverageModelSparkUtils.partitionINDArrayToList(fourierSpaceBlocks, linOp),
        fourierSpaceBlocks.size());
    linOpPairRDD.cache();
}
Example 7
Source File: MapSideJoinBroadcast.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
public static void main(String[] args) { SparkSession sparkSession = SparkSession.builder().master("local").appName("My App") .config("spark.sql.warehouse.dir", "file:////C:/Users/sgulati/spark-warehouse").getOrCreate(); JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext()); JavaPairRDD<String, String> userIdToCityId = jsc.parallelizePairs( Arrays.asList(new Tuple2<String, String>("1", "101"), new Tuple2<String, String>("2", "102"), new Tuple2<String, String>("3", "107"), new Tuple2<String, String>("4", "103"), new Tuple2<String, String>("11", "101"), new Tuple2<String, String>("12", "102"), new Tuple2<String, String>("13", "107"), new Tuple2<String, String>("14", "103"))); JavaPairRDD<String, String> cityIdToCityName = jsc.parallelizePairs( Arrays.asList(new Tuple2<String, String>("101", "India"), new Tuple2<String, String>("102", "UK"), new Tuple2<String, String>("103", "Germany"), new Tuple2<String, String>("107", "USA"))); Broadcast<Map<String, String>> citiesBroadcasted = jsc.broadcast(cityIdToCityName.collectAsMap()); JavaRDD<Tuple3<String, String, String>> joined = userIdToCityId.map( v1 -> new Tuple3<String, String, String>(v1._1(), v1._2(), citiesBroadcasted.value().get(v1._2()))); System.out.println(joined.collect()); }
Example 8
Source File: Basic.java From learning-spark-with-java with MIT License | 4 votes |
public static void main(String[] args) {
    SparkSession spark = SparkSession
        .builder()
        .appName("Pairs-Basic")
        .master("local[4]")
        .getOrCreate();

    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    List<Tuple2<String, Integer>> pairs = Arrays.asList(
        new Tuple2<>("1", 9), new Tuple2<>("1", 2), new Tuple2<>("1", 1),
        new Tuple2<>("2", 3), new Tuple2<>("2", 4), new Tuple2<>("3", 1),
        new Tuple2<>("3", 5), new Tuple2<>("6", 2), new Tuple2<>("6", 1),
        new Tuple2<>("6", 4), new Tuple2<>("8", 1));

    // a randomly partitioned pair RDD
    JavaPairRDD<String, Integer> pairsRDD = sc.parallelizePairs(pairs, 4);

    System.out.println("*** the original pairs");
    pairsRDD.foreach(i -> System.out.println(i));

    //
    // Pairs can be collected as a Map, but this only works well if the
    // keys are unique. Here they aren't, so an arbitrary value is chosen for each:
    //
    Map<String, Integer> pairsAsMap = pairsRDD.collectAsMap();
    System.out.println("*** the pretty useless map");
    System.out.println(pairsAsMap);

    // let's say we just want the pair with minimum value for each key
    // we can use one of the handy methods in PairRDDFunctions. To reduce we need
    // only supply a single function to combine all the values for each key -- the result
    // has to have the same type as the values
    JavaPairRDD<String, Integer> reducedRDD = pairsRDD.reduceByKey(Math::min);

    System.out.println("*** the reduced pairs");
    reducedRDD.foreach(i -> System.out.println(i));

    // the reduced pairs have unique keys so collecting to a map works a lot better
    Map<String, Integer> reducedAsMap = reducedRDD.collectAsMap();
    System.out.println("*** the reduced pairs as a map");
    System.out.println(reducedAsMap);

    // folding is a little more general: we get to specify the identity value:
    // say 0 for adding and 1 for multiplying
    JavaPairRDD<String, Integer> foldedRDD = pairsRDD.foldByKey(1, (x, y) -> x * y);

    System.out.println("*** the folded pairs");
    foldedRDD.foreach(i -> System.out.println(i));

    // Combining is more general: you can produce values of a different type, which is very powerful.
    // You need to provide three functions: the first converts an individual value to the new type, the second
    // incorporates an additional value into the result, and the third combines intermediate results, which is
    // used by execution to avoid excessive communication between partitions. The first function is applied once
    // per partition and the second is used for each additional value in the partition.
    // Below is a pretty classical example of its use: compute a per-key average by first computing the sum and
    // count for each key and then dividing.
    JavaPairRDD<String, Tuple2<Integer, Integer>> combinedRDD = pairsRDD.combineByKey(
        value -> new Tuple2<>(value, 1),
        (sumAndCount, value) -> new Tuple2<>(sumAndCount._1() + value, sumAndCount._2() + 1),
        (sumAndCount1, sumAndCount2) ->
            new Tuple2<>(sumAndCount1._1() + sumAndCount2._1(), sumAndCount1._2() + sumAndCount2._2())
    );

    JavaPairRDD<String, Double> averageRDD =
        combinedRDD.mapValues(sumAndCount -> (double) sumAndCount._1() / sumAndCount._2());

    System.out.println("*** the average pairs");
    averageRDD.foreach(i -> System.out.println(i));

    // The dividing could be done just by calling map, but in Java this requires a lot of conversion between the
    // two kinds of RDD and ends up *VERY* cumbersome.
    JavaRDD<Tuple2<String, Tuple2<Integer, Integer>>> tupleCombinedRDD =
        JavaRDD.fromRDD(combinedRDD.rdd(), combinedRDD.classTag());
    JavaRDD<Tuple2<String, Double>> tupleDividedRDD = tupleCombinedRDD.map(keyAndsumAndCount ->
        new Tuple2<>(keyAndsumAndCount._1(),
            (double) keyAndsumAndCount._2()._1() / keyAndsumAndCount._2()._2()));
    JavaPairRDD<String, Double> averageRDDtheHardWay = JavaPairRDD.fromJavaRDD(tupleDividedRDD);

    // remember these won't necessarily come out in the same order so they may not obviously be
    // the same as above
    System.out.println("*** the average pairs the hard way");
    averageRDDtheHardWay.foreach(i -> System.out.println(i));

    spark.stop();
}
Example 9
Source File: CustomPartitionerExample.java From Apache-Spark-2x-for-Java-Developers with MIT License | 3 votes |
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Partitioning");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    JavaPairRDD<String, String> pairRdd = jsc.parallelizePairs(
            Arrays.asList(new Tuple2<String, String>("India", "Asia"), new Tuple2<String, String>("Germany", "Europe"),
                    new Tuple2<String, String>("Japan", "Asia"), new Tuple2<String, String>("France", "Europe")), 3);

    JavaPairRDD<String, String> customPartitioned = pairRdd.partitionBy(new CustomPartitioner());

    System.out.println(customPartitioned.getNumPartitions());

    JavaRDD<String> mapPartitionsWithIndex = customPartitioned.mapPartitionsWithIndex((index, tupleIterator) -> {
        List<String> list = new ArrayList<>();
        while (tupleIterator.hasNext()) {
            list.add("Partition number:" + index + ",key:" + tupleIterator.next()._1());
        }
        return list.iterator();
    }, true);

    System.out.println(mapPartitionsWithIndex.collect());
}
Example 10
Source File: Partitioning.java From Apache-Spark-2x-for-Java-Developers with MIT License | 2 votes |
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "C:\\softwares\\Winutils");
    SparkConf conf = new SparkConf().setMaster("local").setAppName("Partitioning");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    JavaPairRDD<Integer, String> pairRdd = jsc.parallelizePairs(
            Arrays.asList(new Tuple2<Integer, String>(1, "A"), new Tuple2<Integer, String>(2, "B"),
                    new Tuple2<Integer, String>(3, "C"), new Tuple2<Integer, String>(4, "D"),
                    new Tuple2<Integer, String>(5, "E"), new Tuple2<Integer, String>(6, "F"),
                    new Tuple2<Integer, String>(7, "G"), new Tuple2<Integer, String>(8, "H")), 3);

    RDD<Tuple2<Integer, String>> rdd = JavaPairRDD.toRDD(pairRdd);

    System.out.println(pairRdd.getNumPartitions());

    // JavaPairRDD<Integer, String> hashPartitioned = pairRdd.partitionBy(new HashPartitioner(2));
    // System.out.println(hashPartitioned.getNumPartitions());

    RangePartitioner rangePartitioner = new RangePartitioner(4, rdd, true,
            scala.math.Ordering.Int$.MODULE$, scala.reflect.ClassTag$.MODULE$.apply(Integer.class));

    JavaPairRDD<Integer, String> rangePartitioned = pairRdd.partitionBy(rangePartitioner);

    JavaRDD<String> mapPartitionsWithIndex = rangePartitioned.mapPartitionsWithIndex((index, tupleIterator) -> {
        List<String> list = new ArrayList<>();
        while (tupleIterator.hasNext()) {
            list.add("Partition number:" + index + ",key:" + tupleIterator.next()._1());
        }
        return list.iterator();
    }, true);

    System.out.println(mapPartitionsWithIndex.collect());
}