Java Code Examples for org.apache.spark.api.java.JavaRDD#reduce()
The following examples show how to use org.apache.spark.api.java.JavaRDD#reduce(). Each example notes its original project, source file, and license.
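Before the project examples, here is a minimal sketch of the call itself (not taken from any of the projects below; the class and variable names are illustrative): reduce() collapses a JavaRDD<T> into a single value of the same type by repeatedly applying a two-argument function. The function should be commutative and associative, because Spark combines elements within and across partitions in no fixed order.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ReduceSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("ReduceSketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(3, 1, 4, 1, 5, 9));

        // Sum all elements; (a, b) -> a + b is commutative and associative.
        Integer sum = rdd.reduce((a, b) -> a + b);   // 23

        // Any other commutative, associative combiner works the same way, e.g. max.
        Integer max = rdd.reduce(Math::max);         // 9

        System.out.println("sum = " + sum + ", max = " + max);
        sc.close();
    }
}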
Example 1
Source File: Reduce.java From SparkDemo with MIT License
private static void reduce(JavaSparkContext sc) {
    List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
    JavaRDD<Integer> javaRDD = sc.parallelize(numberList);

    /**
     * =====================================================
     * |                  Cumulative sum                   |
     * =====================================================
     */
    Integer num = javaRDD.reduce(new Function2<Integer, Integer, Integer>() {
        /**
         * @param num1 the value returned by the previous computation
         * @param num2 the current value
         */
        @Override
        public Integer call(Integer num1, Integer num2) throws Exception {
            // System.out.println(num1 + "======" + num2);
            return num1 + num2;
        }
    });

    System.out.println(num);

    sc.close();
}
Example 2
Source File: Tokenizer.java From vn.vitk with GNU General Public License v3.0
/**
 * Counts the number of non-space characters in this data set. This utility method
 * is used to check the tokenization result.
 * @param lines
 * @return number of characters
 */
int numCharacters(JavaRDD<String> lines) {
    JavaRDD<Integer> lengths = lines.map(new Function<String, Integer>() {
        private static final long serialVersionUID = -2189399343462982586L;

        @Override
        public Integer call(String line) throws Exception {
            line = line.replaceAll("[\\s_]+", "");
            return line.length();
        }
    });
    return lengths.reduce(new Function2<Integer, Integer, Integer>() {
        private static final long serialVersionUID = -8438072946884289401L;

        @Override
        public Integer call(Integer e0, Integer e1) throws Exception {
            return e0 + e1;
        }
    });
}
Example 3
Source File: PiApproximation.java From tutorials with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("BaeldungPIApproximation").setMaster("local[2]");
    JavaSparkContext context = new JavaSparkContext(conf);
    int slices = args.length >= 1 ? Integer.valueOf(args[0]) : 2;
    int n = (100000L * slices) > Integer.MAX_VALUE ? Integer.MAX_VALUE : 100000 * slices;

    List<Integer> xs = IntStream.rangeClosed(0, n)
            .mapToObj(element -> Integer.valueOf(element))
            .collect(Collectors.toList());

    JavaRDD<Integer> dataSet = context.parallelize(xs, slices);

    // emit 1 for a random point inside the unit circle, 0 otherwise
    JavaRDD<Integer> pointsInsideTheCircle = dataSet.map(integer -> {
        double x = Math.random() * 2 - 1;
        double y = Math.random() * 2 - 1;
        return (x * x + y * y) < 1 ? 1 : 0;
    });

    int count = pointsInsideTheCircle.reduce((integer, integer2) -> integer + integer2);
    // count / n approximates PI / 4, so scale by 4 and divide as doubles to avoid integer division
    System.out.println("The pi was estimated as:" + 4.0 * count / n);

    context.stop();
}
Example 4
Source File: SparkDataValidation.java From deeplearning4j with Apache License 2.0
protected static ValidationResult validateDataSets(JavaSparkContext sc, String path, boolean recursive, boolean deleteInvalid,
                                                    int[] featuresShape, int[] labelsShape) {
    JavaRDD<String> paths;
    try {
        paths = SparkUtils.listPaths(sc, path, recursive);
    } catch (IOException e) {
        throw new RuntimeException("Error listing paths in directory", e);
    }

    JavaRDD<ValidationResult> results = paths.map(new ValidateDataSetFn(deleteInvalid, featuresShape, labelsShape));

    return results.reduce(new ValidationResultReduceFn());
}
Example 5
Source File: SparkDataValidation.java From deeplearning4j with Apache License 2.0
protected static ValidationResult validateMultiDataSets(JavaSparkContext sc, String path, boolean recursive, boolean deleteInvalid,
                                                         int numFeatureArrays, int numLabelArrays,
                                                         List<int[]> featuresShape, List<int[]> labelsShape) {
    JavaRDD<String> paths;
    try {
        paths = SparkUtils.listPaths(sc, path, recursive);
    } catch (IOException e) {
        throw new RuntimeException("Error listing paths in directory", e);
    }

    JavaRDD<ValidationResult> results = paths.map(new ValidateMultiDataSetFn(deleteInvalid, numFeatureArrays, numLabelArrays,
            featuresShape, labelsShape));

    return results.reduce(new ValidationResultReduceFn());
}
Example 6
Source File: SparkDl4jMultiLayer.java From deeplearning4j with Apache License 2.0
/**
 * Calculate the score for all examples in the provided {@code JavaRDD<DataSet>}, either by summing
 * or averaging over the entire data set. To calculate a score for each example individually, use {@link #scoreExamples(JavaPairRDD, boolean)}
 * or one of the similar methods
 *
 * @param data          Data to score
 * @param average       Whether to sum the scores, or average them
 * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
 *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition
 *                      in one go)
 */
public double calculateScore(JavaRDD<DataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Integer, Double>> rdd = data.mapPartitions(
            new ScoreFlatMapFunction(conf.toJson(), sc.broadcast(network.params(false)), minibatchSize));

    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Integer, Double> countAndSumScores = rdd.reduce(new IntDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
Example 7
Source File: SparkComputationGraph.java From deeplearning4j with Apache License 2.0
/**
 * Calculate the score for all examples in the provided {@code JavaRDD<DataSet>}, either by summing
 * or averaging over the entire data set. To calculate a score for each example individually, use {@link #scoreExamples(JavaPairRDD, boolean)}
 * or one of the similar methods
 *
 * @param data          Data to score
 * @param average       Whether to sum the scores, or average them
 * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
 *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition
 *                      in one go)
 */
public double calculateScore(JavaRDD<DataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Long, Double>> rdd = data.mapPartitions(new ScoreFlatMapFunctionCGDataSet(conf.toJson(),
            sc.broadcast(network.params()), minibatchSize));

    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Long, Double> countAndSumScores = rdd.reduce(new LongDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
Example 8
Source File: SparkComputationGraph.java From deeplearning4j with Apache License 2.0
/**
 * Calculate the score for all examples in the provided {@code JavaRDD<MultiDataSet>}, either by summing
 * or averaging over the entire data set.
 *
 * @param data          Data to score
 * @param average       Whether to sum the scores, or average them
 * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
 *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition
 *                      in one go)
 */
public double calculateScoreMultiDataSet(JavaRDD<MultiDataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Long, Double>> rdd = data.mapPartitions(new ScoreFlatMapFunctionCGMultiDataSet(conf.toJson(),
            sc.broadcast(network.params()), minibatchSize));

    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Long, Double> countAndSumScores = rdd.reduce(new LongDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
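Examples 6, 7, and 8 share the same reduction pattern: mapPartitions emits one (example count, score sum) tuple per minibatch, and reduce() adds the tuples element-wise before the optional division by the count. The DL4J reduce functions (IntDoubleReduceFunction, LongDoubleReduceFunction) are not shown on this page; the following stand-alone sketch of the same tuple reduction, using the Integer variant, is an illustration rather than the library code, and all names and numbers in it are made up.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CountAndSumReduceSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CountAndSumReduceSketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Pretend each tuple is (number of examples scored, sum of their scores) for one minibatch.
        JavaRDD<Tuple2<Integer, Double>> partials = sc.parallelize(Arrays.asList(
                new Tuple2<>(32, 12.5), new Tuple2<>(32, 11.0), new Tuple2<>(16, 5.5)));

        // Element-wise sum of counts and score sums; commutative and associative, so safe for reduce().
        Tuple2<Integer, Double> total = partials.reduce(
                (t1, t2) -> new Tuple2<>(t1._1() + t2._1(), t1._2() + t2._2()));

        double averageScore = total._2() / total._1();   // 29.0 / 80 = 0.3625
        System.out.println("examples = " + total._1() + ", average score = " + averageScore);
        sc.close();
    }
}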
Example 9
Source File: Tokenizer.java From vn.vitk with GNU General Public License v3.0
/**
 * Tokenizes a RDD of text lines and return a RDD of result.
 * @param input
 * @return a RDD of tokenized text lines.
 */
public JavaRDD<String> tokenize(JavaRDD<String> input) {
    if (verbose) {
        // print some basic statistic about the input, including
        // max line length, min line length, average line length in syllables
        JavaRDD<Integer> wordCount = input.map(new Function<String, Integer>() {
            private static final long serialVersionUID = 7214093453452927565L;
            @Override
            public Integer call(String line) throws Exception {
                return line.split("\\s+").length;
            }
        });
        Comparator<Integer> comp = new IntegerComparator();
        System.out.println("Max line length (in syllables) = " + wordCount.max(comp));
        System.out.println("Min line length (in syllables) = " + wordCount.min(comp));
        float totalCount = wordCount.reduce(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        System.out.println("Avg line length (in syllables) = " + (totalCount) / input.count());
    }

    JavaRDD<String> output = null;
    if (classifier == null) {
        // use phrase graph approach (shortest paths and bigram model)
        // to segment phrases
        output = input.map(new SegmentationFunction());
    } else {
        // use logistic regression approach to segment phrases
        JavaRDD<String> s = input.map(new SegmentationFunction());
        // make sure that the preceding lazy computation has been evaluated
        // so that whitespace contexts have been properly accumulated
        System.out.println("Number of text lines = " + s.count());
        System.out.println("Number of contexts = " + contexts.value().size());
        // use whitespace classification approach (logistic regression model)
        JavaRDD<WhitespaceContext> jrdd = jsc.parallelize(contexts.value());
        DataFrame df0 = (new SQLContext(jsc)).createDataFrame(jrdd, WhitespaceContext.class);
        DataFrame df1 = model.transform(df0);
        prediction = jsc.broadcast(df1.select("prediction").collect());
        if (df1.count() > 0) {
            output = s.map(new WhitespaceClassificationFunction());
        } else {
            System.err.println("Empty data frame!");
        }
    }

    if (verbose) {
        // print number of non-space characters of the input and output dataset
        System.out.println("#(non-space characters of input) = " + numCharacters(input));
        if (output != null) {
            System.out.println("#(non-space characters of output) = " + numCharacters(output));
        }
    }
    return output;
}
Example 10
Source File: DPMeansClusterer.java From ensemble-clustering with MIT License
private Map<String, Instance> initKMeans(SparkDataSet ds) {
    JavaRDD<Map<String, Instance>> singletons = ds.getRDD().map( new InstanceToClusterFunction(clusterFactory) );
    Map<String, Instance> kmeans = singletons.reduce( new AggregateClusterFunction(distFunc, Double.MAX_VALUE) );
    return kmeans;
}
Example 11
Source File: ThresholdClusterer.java From ensemble-clustering with MIT License
@Override
public SparkClusterResult doCluster(DataSet ds) {
    // SparkDataSet needs to be passed in
    SparkDataSet rdd = (SparkDataSet) ds;

    // cache dataset in memory
    // rdd.getRDD().cache();

    distFunc = new DistanceFunction(this.typeDefs);
    ClusterFactory clusterFactory = new ClusterFactory(this.typeDefs, this.onlineUpdate);

    log.info("Starting threshold clusterer with threshold {}", threshold);

    // TODO look at using a reduce function
    // Idea is the first step is a map<Instance, List<Instance>> that converts each instance to a single "cluster"
    // second step is a reduce where input is a List<Instances> and produces a List<Instances>
    // this step would merge clusters within threshold

    JavaPairRDD<String, Instance> instances = rdd.getRDD();
    instances.cache();

    // convert each instance into a singleton cluster
    JavaRDD<Map<String, Instance>> singletons = rdd.getRDD().map( new InstanceToClusterFunction(clusterFactory) );
    //singletons.cache();

    log.info("Generated initial singleton clusters");

    // merge clusters together
    Map<String, Instance> clusters = singletons.reduce( new AggregateClusterFunction(distFunc, threshold) );

    log.info("Merging clusters completed with {} clusters", clusters.size());

    // find the best cluster for each instance
    JavaPairRDD<String, Instance> bestCluster = instances.mapToPair( new BestClusterFunction(distFunc, clusters) );

    log.info("Output results");

    if (clusters != null && centroidsPath != null)
        rdd.getContext().parallelize(new ArrayList<Instance>(clusters.values())).saveAsTextFile(centroidsPath);

    if (bestCluster != null && clustersPath != null)
        bestCluster.saveAsTextFile(clustersPath);

    log.info("Threshold clusterer completed");

    // return the cluster membership rdd
    return new SparkClusterResult(bestCluster);
}