Java Code Examples for org.apache.spark.api.java.JavaRDD#reduce()
The following examples show how to use org.apache.spark.api.java.JavaRDD#reduce(). Each example notes its original project, source file, and license.
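Before the project examples, here is a minimal sketch of the call itself (not taken from any of the projects below; the class and variable names are illustrative): reduce() collapses a JavaRDD<T> into a single value of the same type by repeatedly applying a two-argument function. The function should be commutative and associative, because Spark combines elements within and across partitions in no fixed order.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ReduceSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("ReduceSketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(3, 1, 4, 1, 5, 9));

        // Sum all elements; (a, b) -> a + b is commutative and associative.
        Integer sum = rdd.reduce((a, b) -> a + b);   // 23

        // Any other commutative, associative combiner works the same way, e.g. max.
        Integer max = rdd.reduce(Math::max);         // 9

        System.out.println("sum = " + sum + ", max = " + max);
        sc.close();
    }
}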
Example 1
Source File: Reduce.java From SparkDemo with MIT License
private static void reduce(JavaSparkContext sc) {
    List<Integer> numberList = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
    JavaRDD<Integer> javaRDD = sc.parallelize(numberList);

    /**
     * =====================================================
     * |                  Cumulative sum                   |
     * =====================================================
     */
    Integer num = javaRDD.reduce(new Function2<Integer, Integer, Integer>() {
        /**
         * @param num1 the value returned by the previous computation
         * @param num2 the current value
         */
        @Override
        public Integer call(Integer num1, Integer num2) throws Exception {
            // System.out.println(num1 + "======" + num2);
            return num1 + num2;
        }
    });

    System.out.println(num);

    sc.close();
}
Example 2
Source File: Tokenizer.java From vn.vitk with GNU General Public License v3.0
/**
 * Counts the number of non-space characters in this data set. This utility method
 * is used to check the tokenization result.
 * @param lines
 * @return number of characters
 */
int numCharacters(JavaRDD<String> lines) {
    JavaRDD<Integer> lengths = lines.map(new Function<String, Integer>() {
        private static final long serialVersionUID = -2189399343462982586L;

        @Override
        public Integer call(String line) throws Exception {
            line = line.replaceAll("[\\s_]+", "");
            return line.length();
        }
    });
    return lengths.reduce(new Function2<Integer, Integer, Integer>() {
        private static final long serialVersionUID = -8438072946884289401L;

        @Override
        public Integer call(Integer e0, Integer e1) throws Exception {
            return e0 + e1;
        }
    });
}
Example 3
Source File: PiApproximation.java From tutorials with MIT License
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("BaeldungPIApproximation").setMaster("local[2]");
    JavaSparkContext context = new JavaSparkContext(conf);
    int slices = args.length >= 1 ? Integer.valueOf(args[0]) : 2;
    int n = (100000L * slices) > Integer.MAX_VALUE ? Integer.MAX_VALUE : 100000 * slices;

    List<Integer> xs = IntStream.rangeClosed(0, n)
            .mapToObj(element -> Integer.valueOf(element))
            .collect(Collectors.toList());

    JavaRDD<Integer> dataSet = context.parallelize(xs, slices);

    // emit 1 for a random point inside the unit circle, 0 otherwise
    JavaRDD<Integer> pointsInsideTheCircle = dataSet.map(integer -> {
        double x = Math.random() * 2 - 1;
        double y = Math.random() * 2 - 1;
        return (x * x + y * y) < 1 ? 1 : 0;
    });

    int count = pointsInsideTheCircle.reduce((integer, integer2) -> integer + integer2);
    // count / n approximates PI / 4, so scale by 4 and divide as doubles to avoid integer division
    System.out.println("The pi was estimated as:" + 4.0 * count / n);

    context.stop();
}
Example 4
Source File: SparkDataValidation.java From deeplearning4j with Apache License 2.0
protected static ValidationResult validateDataSets(JavaSparkContext sc, String path, boolean recursive, boolean deleteInvalid,
                                                    int[] featuresShape, int[] labelsShape) {
    JavaRDD<String> paths;
    try {
        paths = SparkUtils.listPaths(sc, path, recursive);
    } catch (IOException e) {
        throw new RuntimeException("Error listing paths in directory", e);
    }

    JavaRDD<ValidationResult> results = paths.map(new ValidateDataSetFn(deleteInvalid, featuresShape, labelsShape));

    return results.reduce(new ValidationResultReduceFn());
}
Example 5
Source File: SparkDataValidation.java From deeplearning4j with Apache License 2.0
protected static ValidationResult validateMultiDataSets(JavaSparkContext sc, String path, boolean recursive, boolean deleteInvalid,
                                                         int numFeatureArrays, int numLabelArrays,
                                                         List<int[]> featuresShape, List<int[]> labelsShape) {
    JavaRDD<String> paths;
    try {
        paths = SparkUtils.listPaths(sc, path, recursive);
    } catch (IOException e) {
        throw new RuntimeException("Error listing paths in directory", e);
    }

    JavaRDD<ValidationResult> results = paths.map(new ValidateMultiDataSetFn(deleteInvalid, numFeatureArrays, numLabelArrays,
            featuresShape, labelsShape));

    return results.reduce(new ValidationResultReduceFn());
}
Example 6
Source File: SparkDl4jMultiLayer.java From deeplearning4j with Apache License 2.0
/**
 * Calculate the score for all examples in the provided {@code JavaRDD<DataSet>}, either by summing
 * or averaging over the entire data set. To calculate a score for each example individually, use {@link #scoreExamples(JavaPairRDD, boolean)}
 * or one of the similar methods
 *
 * @param data          Data to score
 * @param average       Whether to sum the scores, or average them
 * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
 *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition
 *                      in one go)
 */
public double calculateScore(JavaRDD<DataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Integer, Double>> rdd = data.mapPartitions(
            new ScoreFlatMapFunction(conf.toJson(), sc.broadcast(network.params(false)), minibatchSize));

    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Integer, Double> countAndSumScores = rdd.reduce(new IntDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
Example 7
Source File: SparkComputationGraph.java From deeplearning4j with Apache License 2.0
/**
 * Calculate the score for all examples in the provided {@code JavaRDD<DataSet>}, either by summing
 * or averaging over the entire data set. To calculate a score for each example individually, use {@link #scoreExamples(JavaPairRDD, boolean)}
 * or one of the similar methods
 *
 * @param data          Data to score
 * @param average       Whether to sum the scores, or average them
 * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
 *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition
 *                      in one go)
 */
public double calculateScore(JavaRDD<DataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Long, Double>> rdd = data.mapPartitions(new ScoreFlatMapFunctionCGDataSet(conf.toJson(),
            sc.broadcast(network.params()), minibatchSize));

    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Long, Double> countAndSumScores = rdd.reduce(new LongDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
Example 8
Source File: SparkComputationGraph.java From deeplearning4j with Apache License 2.0
/**
 * Calculate the score for all examples in the provided {@code JavaRDD<MultiDataSet>}, either by summing
 * or averaging over the entire data set.
 *
 * @param data          Data to score
 * @param average       Whether to sum the scores, or average them
 * @param minibatchSize The number of examples to use in each minibatch when scoring. If more examples are in a partition than
 *                      this, multiple scoring operations will be done (to avoid using too much memory by doing the whole partition
 *                      in one go)
 */
public double calculateScoreMultiDataSet(JavaRDD<MultiDataSet> data, boolean average, int minibatchSize) {
    JavaRDD<Tuple2<Long, Double>> rdd = data.mapPartitions(new ScoreFlatMapFunctionCGMultiDataSet(conf.toJson(),
            sc.broadcast(network.params()), minibatchSize));

    //Reduce to a single tuple, with example count + sum of scores
    Tuple2<Long, Double> countAndSumScores = rdd.reduce(new LongDoubleReduceFunction());
    if (average) {
        return countAndSumScores._2() / countAndSumScores._1();
    } else {
        return countAndSumScores._2();
    }
}
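Examples 6, 7, and 8 share the same reduction pattern: mapPartitions emits one (example count, score sum) tuple per minibatch, and reduce() adds the tuples element-wise before the optional division by the count. The DL4J reduce functions (IntDoubleReduceFunction, LongDoubleReduceFunction) are not shown on this page; the following stand-alone sketch of the same tuple reduction, using the Integer variant, is an illustration rather than the library code, and all names and numbers in it are made up.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CountAndSumReduceSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("CountAndSumReduceSketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Pretend each tuple is (number of examples scored, sum of their scores) for one minibatch.
        JavaRDD<Tuple2<Integer, Double>> partials = sc.parallelize(Arrays.asList(
                new Tuple2<>(32, 12.5), new Tuple2<>(32, 11.0), new Tuple2<>(16, 5.5)));

        // Element-wise sum of counts and score sums; commutative and associative, so safe for reduce().
        Tuple2<Integer, Double> total = partials.reduce(
                (t1, t2) -> new Tuple2<>(t1._1() + t2._1(), t1._2() + t2._2()));

        double averageScore = total._2() / total._1();   // 29.0 / 80 = 0.3625
        System.out.println("examples = " + total._1() + ", average score = " + averageScore);
        sc.close();
    }
}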
Example 9
Source File: Tokenizer.java From vn.vitk with GNU General Public License v3.0
/**
 * Tokenizes a RDD of text lines and return a RDD of result.
 * @param input
 * @return a RDD of tokenized text lines.
 */
public JavaRDD<String> tokenize(JavaRDD<String> input) {
    if (verbose) {
        // print some basic statistic about the input, including
        // max line length, min line length, average line length in syllables
        JavaRDD<Integer> wordCount = input.map(new Function<String, Integer>() {
            private static final long serialVersionUID = 7214093453452927565L;
            @Override
            public Integer call(String line) throws Exception {
                return line.split("\\s+").length;
            }
        });
        Comparator<Integer> comp = new IntegerComparator();
        System.out.println("Max line length (in syllables) = " + wordCount.max(comp));
        System.out.println("Min line length (in syllables) = " + wordCount.min(comp));
        float totalCount = wordCount.reduce(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        System.out.println("Avg line length (in syllables) = " + (totalCount) / input.count());
    }

    JavaRDD<String> output = null;
    if (classifier == null) {
        // use phrase graph approach (shortest paths and bigram model)
        // to segment phrases
        output = input.map(new SegmentationFunction());
    } else {
        // use logistic regression approach to segment phrases
        JavaRDD<String> s = input.map(new SegmentationFunction());
        // make sure that the preceding lazy computation has been evaluated
        // so that whitespace contexts have been properly accumulated
        System.out.println("Number of text lines = " + s.count());
        System.out.println("Number of contexts = " + contexts.value().size());
        // use whitespace classification approach (logistic regression model)
        JavaRDD<WhitespaceContext> jrdd = jsc.parallelize(contexts.value());
        DataFrame df0 = (new SQLContext(jsc)).createDataFrame(jrdd, WhitespaceContext.class);
        DataFrame df1 = model.transform(df0);
        prediction = jsc.broadcast(df1.select("prediction").collect());
        if (df1.count() > 0) {
            output = s.map(new WhitespaceClassificationFunction());
        } else {
            System.err.println("Empty data frame!");
        }
    }

    if (verbose) {
        // print number of non-space characters of the input and output dataset
        System.out.println("#(non-space characters of input) = " + numCharacters(input));
        if (output != null) {
            System.out.println("#(non-space characters of output) = " + numCharacters(output));
        }
    }
    return output;
}
Example 10
Source File: DPMeansClusterer.java From ensemble-clustering with MIT License
private Map<String, Instance> initKMeans(SparkDataSet ds) {
    JavaRDD<Map<String, Instance>> singletons = ds.getRDD().map( new InstanceToClusterFunction(clusterFactory) );
    Map<String, Instance> kmeans = singletons.reduce( new AggregateClusterFunction(distFunc, Double.MAX_VALUE) );
    return kmeans;
}
Example 11
Source File: ThresholdClusterer.java From ensemble-clustering with MIT License
@Override
public SparkClusterResult doCluster(DataSet ds) {
    // SparkDataSet needs to be passed in
    SparkDataSet rdd = (SparkDataSet) ds;

    // cache dataset in memory
    // rdd.getRDD().cache();

    distFunc = new DistanceFunction(this.typeDefs);
    ClusterFactory clusterFactory = new ClusterFactory(this.typeDefs, this.onlineUpdate);

    log.info("Starting threshold clusterer with threshold {}", threshold);

    // TODO look at using a reduce function
    // Idea is the first step is a map<Instance, List<Instance>> that converts each instance to a single "cluster"
    // second step is a reduce where input is a List<Instances> and produces a List<Instances>
    // this step would merge clusters within threshold

    JavaPairRDD<String, Instance> instances = rdd.getRDD();
    instances.cache();

    // convert each instance into a singleton cluster
    JavaRDD<Map<String, Instance>> singletons = rdd.getRDD().map( new InstanceToClusterFunction(clusterFactory) );
    //singletons.cache();

    log.info("Generated initial singleton clusters");

    // merge clusters together
    Map<String, Instance> clusters = singletons.reduce( new AggregateClusterFunction(distFunc, threshold) );

    log.info("Merging clusters completed with {} clusters", clusters.size());

    // find the best cluster for each instance
    JavaPairRDD<String, Instance> bestCluster = instances.mapToPair( new BestClusterFunction(distFunc, clusters) );

    log.info("Output results");

    if (clusters != null && centroidsPath != null)
        rdd.getContext().parallelize(new ArrayList<Instance>(clusters.values())).saveAsTextFile(centroidsPath);

    if (bestCluster != null && clustersPath != null)
        bestCluster.saveAsTextFile(clustersPath);

    log.info("Threshold clusterer completed");

    // return the cluster membership rdd
    return new SparkClusterResult(bestCluster);
}