org.apache.spark.api.java.JavaDoubleRDD Java Examples
The following examples show how to use org.apache.spark.api.java.JavaDoubleRDD.
Each example notes its original project, source file, and license, so you can consult the upstream source for full context.
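Before the project examples, here is a minimal, self-contained sketch of the typical workflow. The local[*] master, app name, and sample values are illustrative assumptions, not code from any project below.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.StatCounter;

public class JavaDoubleRDDSketch {
    public static void main(String[] args) {
        // local[*] master is an assumption so the sketch runs standalone
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("JavaDoubleRDDSketch");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // parallelizeDoubles is the canonical way to obtain a JavaDoubleRDD
        JavaDoubleRDD values = jsc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 4.0));

        // stats() computes count/mean/min/max/variance/stdev in a single pass
        StatCounter stats = values.stats();
        System.out.println("mean = " + stats.mean() + ", stdev = " + stats.stdev());

        jsc.stop();
    }
}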
Example #1
Source File: PolyPeptideChainStatistics.java, from mmtf-spark (Apache License 2.0)
public static void main(String[] args) throws FileNotFoundException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(PolyPeptideChainStatistics.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaDoubleRDD chainLengths = MmtfReader
            .readReducedSequenceFile(sc) // read PDB from an MMTF-Hadoop sequence file
            .flatMapToPair(new StructureToPolymerChains(false, true)) // split (flatMap) into unique polymer chains
            .filter(new PolymerComposition(PolymerComposition.AMINO_ACIDS_20)) // only consider chains that contain the 20 standard amino acids
            .mapToDouble(t -> t._2.getNumGroups()); // get the number of groups (residues) in each chain using a lambda expression

    System.out.println("Protein chain length statistics for proteins in the PDB with the 20 standard amino acids:");
    System.out.println(chainLengths.stats());

    sc.close();
}
Example #2
Source File: JavaHypothesisTestingKolmogorovSmirnovTestExample.java, from SparkDemo (MIT License)
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25));
    KolmogorovSmirnovTestResult testResult =
            Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
    // summary of the test including the p-value, test statistic, and null hypothesis;
    // if our p-value indicates significance, we can reject the null hypothesis
    System.out.println(testResult);
    // $example off$

    jsc.stop();
}
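The result can also be inspected programmatically rather than just printed. A short sketch; the 0.05 significance threshold is an illustrative assumption, not part of the original example:

// pValue() and statistic() expose the numbers summarized by toString()
double pValue = testResult.pValue();
double statistic = testResult.statistic();
if (pValue < 0.05) { // illustrative significance level
    System.out.println("Reject the null hypothesis: the sample does not follow N(0, 1)");
}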
Example #3
Source File: Main.java, from SparkApps (Apache License 2.0)
public static void main(String[] args) {
    // Sample test data - all numbers from 1 to 99999
    List<Double> testData = IntStream.range(1, 100000).mapToDouble(d -> d)
            .collect(ArrayList::new, ArrayList::add, ArrayList::addAll);

    JavaDoubleRDD rdd = sc.parallelizeDoubles(testData);
    LOGGER.info("Mean: " + rdd.mean());

    // For efficiency, use StatCounter if more than one statistic is required.
    StatCounter statCounter = rdd.stats();
    LOGGER.info("Using StatCounter");
    LOGGER.info("Count: " + statCounter.count());
    LOGGER.info("Min: " + statCounter.min());
    LOGGER.info("Max: " + statCounter.max());
    LOGGER.info("Sum: " + statCounter.sum());
    LOGGER.info("Mean: " + statCounter.mean());
    LOGGER.info("Variance: " + statCounter.variance());
    LOGGER.info("Stdev: " + statCounter.stdev());
}
Example #4
Source File: XGBoostEvidenceFilterUnitTest.java, from gatk (BSD 3-Clause "New" or "Revised" License)
@Test(groups = "sv")
protected void testLocalXGBoostClassifierSpark() {
    final Predictor localPredictor = XGBoostEvidenceFilter.loadPredictor(localClassifierModelFile);
    // get spark ctx
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    // parallelize classifierAccuracyData to RDD
    JavaRDD<FVec> testFeaturesRdd = ctx.parallelize(Arrays.asList(classifierAccuracyData.features));
    // predict in parallel
    JavaDoubleRDD predictedProbabilityRdd = testFeaturesRdd.mapToDouble(
            f -> localPredictor.predictSingle(f, false, 0));
    // pull back to local array
    final double[] predictedProbabilitySpark = predictedProbabilityRdd.collect()
            .stream().mapToDouble(Double::doubleValue).toArray();
    // check probabilities from spark are identical to serial
    assertArrayEquals(predictedProbabilitySpark, predictedProbabilitySerial, 0.0,
            "Probabilities predicted in spark context differ from serial");
}
Example #5
Source File: StructureToBioJavaTest.java, from mmtf-spark (Apache License 2.0)
@Test
public void test() throws IOException {
    List<String> pdbIds = Arrays.asList("1STP", "4HHB", "1JLP", "5X6H", "5L2G", "2MK1");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();

    // 1STP: 1 L-protein chain
    // 4HHB: 4 polymer chains
    // 1JLP: 1 L-protein chain with non-polymer capping group (NH2)
    // 5X6H: 1 L-protein and 1 DNA chain
    // 5L2G: 2 DNA chains
    // 2MK1: 0 polymer chains
    // --------------------
    // tot : 10 polymer chains
    JavaDoubleRDD chainCounts = pdb
            .mapValues(new StructureToBioJava())
            .values()
            .mapToDouble(v -> v.getPolyChains().size());
    assertEquals(10, Math.round(chainCounts.sum()));

    // extract polymer chains and count chains again
    chainCounts = pdb
            .flatMapToPair(new StructureToPolymerChains())
            .mapValues(new StructureToBioJava())
            .values()
            .mapToDouble(v -> v.getChains().size());
    assertEquals(10, Math.round(chainCounts.sum()));
}
Example #6
Source File: CollabFilterCassandra7.java, from Spark-Cassandra-Collabfiltering (Apache License 2.0)
public double validate(JavaRDD<Rating> predictionJavaRdd, CassandraJavaRDD<CassandraRow> validationsCassRdd) {
    JavaPairRDD<Tuple2<Integer, Integer>, Double> predictionsJavaPairs = JavaPairRDD.fromJavaRDD(
            predictionJavaRdd.map(new org.apache.spark.api.java.function.Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
                @Override
                public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating pred) throws Exception {
                    return new Tuple2<Tuple2<Integer, Integer>, Double>(
                            new Tuple2<Integer, Integer>(pred.user(), pred.product()), pred.rating());
                }
            }));

    JavaRDD<Rating> validationRatings = validationsCassRdd.map(
            new org.apache.spark.api.java.function.Function<CassandraRow, Rating>() {
                @Override
                public Rating call(CassandraRow validation) throws Exception {
                    return new Rating(validation.getInt(RatingDO.USER_COL),
                            validation.getInt(RatingDO.PRODUCT_COL),
                            validation.getInt(RatingDO.RATING_COL));
                }
            });

    JavaRDD<Tuple2<Double, Double>> validationAndPredictions = JavaPairRDD.fromJavaRDD(
            validationRatings.map(new org.apache.spark.api.java.function.Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
                @Override
                public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating validationRating) throws Exception {
                    return new Tuple2<Tuple2<Integer, Integer>, Double>(
                            new Tuple2<Integer, Integer>(validationRating.user(), validationRating.product()),
                            validationRating.rating());
                }
            })).join(predictionsJavaPairs).values();

    double meanSquaredError = JavaDoubleRDD.fromRDD(validationAndPredictions.map(
            new org.apache.spark.api.java.function.Function<Tuple2<Double, Double>, Object>() {
                @Override
                public Object call(Tuple2<Double, Double> pair) throws Exception {
                    Double err = pair._1() - pair._2();
                    return (Object) (err * err); // No covariance! Need to cast to Object
                }
            }).rdd()).mean();

    double rmse = Math.sqrt(meanSquaredError);
    return rmse;
}
Example #7
Source File: CollabFilterCassandra8.java, from Spark-Cassandra-Collabfiltering (Apache License 2.0)
public double validate(JavaRDD<Rating> predictionJavaRdd, CassandraJavaRDD<CassandraRow> validationsCassRdd) {
    JavaPairRDD<Tuple2<Integer, Integer>, Double> predictionsJavaPairs = JavaPairRDD.fromJavaRDD(
            predictionJavaRdd.map(pred -> new Tuple2<Tuple2<Integer, Integer>, Double>(
                    new Tuple2<Integer, Integer>(pred.user(), pred.product()), pred.rating())));

    JavaRDD<Rating> validationRatings = validationsCassRdd.map(validation -> new Rating(
            validation.getInt(RatingDO.USER_COL),
            validation.getInt(RatingDO.PRODUCT_COL),
            validation.getInt(RatingDO.RATING_COL)));

    JavaRDD<Tuple2<Double, Double>> validationAndPredictions = JavaPairRDD.fromJavaRDD(
            validationRatings.map(validationRating -> new Tuple2<Tuple2<Integer, Integer>, Double>(
                    new Tuple2<Integer, Integer>(validationRating.user(), validationRating.product()),
                    validationRating.rating())))
            .join(predictionsJavaPairs).values();

    double meanSquaredError = JavaDoubleRDD.fromRDD(validationAndPredictions.map(pair -> {
        Double err = pair._1() - pair._2();
        return (Object) (err * err); // No covariance! Need to cast to Object
    }).rdd()).mean();

    double rmse = Math.sqrt(meanSquaredError);
    return rmse;
}
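The Object cast in the MSE step above works around the variance of the underlying Scala RDD in the Java API. A simpler alternative for that step (a sketch, not part of the original project) avoids both the cast and JavaDoubleRDD.fromRDD by mapping straight to a JavaDoubleRDD:

// mapToDouble yields a JavaDoubleRDD directly, so no cast to Object is needed
double meanSquaredError = validationAndPredictions
        .mapToDouble(pair -> {
            double err = pair._1() - pair._2();
            return err * err;
        })
        .mean();
double rmse = Math.sqrt(meanSquaredError);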
Example #8
Source File: ChronixRDD.java, from chronix.spark (Apache License 2.0)
/**
 * Action: Calculates the slope of a linear regression of every time series.
 *
 * Where: value = slope * timestamp
 * ... or: y = slope * x
 *
 * @return the slopes (simple linear regression) of each and every time series in the RDD
 */
public JavaDoubleRDD getSlopes() {
    return this.mapToDouble((DoubleFunction<MetricTimeSeries>) mts -> {
        SimpleRegression regression = new SimpleRegression();
        mts.points().forEach(p -> regression.addData(p.getTimestamp(), p.getValue()));
        return regression.getSlope();
    });
}
Example #9
Source File: JavaCorrelationsExample.java, from SparkDemo (MIT License)
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
            Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0)); // a series
    // must have the same number of partitions and cardinality as seriesX
    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
            Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
    System.out.println("Correlation is: " + correlation);

    // note that each Vector is a row and not a column
    JavaRDD<Vector> data = jsc.parallelize(
            Arrays.asList(
                    Vectors.dense(1.0, 10.0, 100.0),
                    Vectors.dense(2.0, 20.0, 200.0),
                    Vectors.dense(5.0, 33.0, 366.0)
            )
    );

    // calculate the correlation matrix using Pearson's method.
    // Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
    System.out.println(correlMatrix.toString());
    // $example off$

    jsc.stop();
}
Example #10
Source File: ChronixRDD.java, from chronix.spark (Apache License 2.0)
/**
 * Action: Counts the number of observations.
 *
 * @return the number of overall observations in all time series
 */
public long countObservations() {
    JavaDoubleRDD sizesRdd = this.mapToDouble(
            (DoubleFunction<MetricTimeSeries>) value -> (double) value.size());
    return sizesRdd.sum().longValue();
}
Example #11
Source File: SimpleSparkJob.java, from infinispan-simple-tutorials (Apache License 2.0)
public static void main(String[] args) throws UnknownHostException {
    // Obtain the Infinispan address
    String infinispanAddress = args[0];

    // Adjust log levels
    Logger.getLogger("org").setLevel(Level.WARN);

    // Create the remote cache manager
    Configuration build = new ConfigurationBuilder().addServer().host(infinispanAddress).build();
    RemoteCacheManager remoteCacheManager = new RemoteCacheManager(build);

    // Obtain the remote cache
    RemoteCache<Integer, Temperature> cache = remoteCacheManager.getCache();

    // Add some data
    cache.put(1, new Temperature(21, "London"));
    cache.put(2, new Temperature(34, "Rome"));
    cache.put(3, new Temperature(33, "Barcelona"));
    cache.put(4, new Temperature(8, "Oslo"));

    // Create the java spark context
    SparkConf conf = new SparkConf().setAppName("infinispan-spark-simple-job");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // Create the InfinispanRDD
    ConnectorConfiguration config = new ConnectorConfiguration().setServerList(infinispanAddress);
    JavaPairRDD<Integer, Temperature> infinispanRDD = InfinispanJavaRDD.createInfinispanRDD(jsc, config);

    // Convert the RDD to an RDD of doubles
    JavaDoubleRDD javaDoubleRDD = infinispanRDD.values().mapToDouble(Temperature::getValue);

    // Calculate average temperature
    Double meanTemp = javaDoubleRDD.mean();
    System.out.printf("\nAVERAGE TEMPERATURE: %f C\n", meanTemp);

    // Calculate standard deviation
    Double stdDev = javaDoubleRDD.sampleStdev();
    System.out.printf("STD DEVIATION: %f C\n", stdDev);

    // Calculate a histogram of temperatures
    System.out.println("TEMPERATURE HISTOGRAM:");
    double[] buckets = {0d, 10d, 20d, 30d, 40d};
    long[] histogram = javaDoubleRDD.histogram(buckets);
    for (int i = 0; i < buckets.length - 1; i++) {
        System.out.printf("Between %f C and %f C: %d cities\n", buckets[i], buckets[i + 1], histogram[i]);
    }
}
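JavaDoubleRDD can also compute the bucket boundaries itself. As a minimal sketch (assuming the same javaDoubleRDD as above), the single-argument histogram overload splits [min, max] into evenly spaced buckets and returns the boundaries together with the counts:

// histogram(int) returns the computed bucket boundaries and per-bucket counts
scala.Tuple2<double[], long[]> histo = javaDoubleRDD.histogram(4);
double[] computedBuckets = histo._1(); // 5 boundaries for 4 buckets
long[] counts = histo._2();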
Example #12
Source File: JavaLinearRegressionWithSGDExample.java, from SparkDemo (MIT License)
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaLinearRegressionWithSGDExample");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse the data
    String path = "data/mllib/ridge-data/lpsa.data";
    JavaRDD<String> data = sc.textFile(path);
    JavaRDD<LabeledPoint> parsedData = data.map(
            new Function<String, LabeledPoint>() {
                public LabeledPoint call(String line) {
                    String[] parts = line.split(",");
                    String[] features = parts[1].split(" ");
                    double[] v = new double[features.length];
                    for (int i = 0; i < features.length - 1; i++) {
                        v[i] = Double.parseDouble(features[i]);
                    }
                    return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v));
                }
            }
    );
    parsedData.cache();

    // Building the model
    int numIterations = 100;
    double stepSize = 0.00000001;
    final LinearRegressionModel model =
            LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations, stepSize);

    // Evaluate model on training examples and compute training error
    JavaRDD<Tuple2<Double, Double>> valuesAndPreds = parsedData.map(
            new Function<LabeledPoint, Tuple2<Double, Double>>() {
                public Tuple2<Double, Double> call(LabeledPoint point) {
                    double prediction = model.predict(point.features());
                    return new Tuple2<>(prediction, point.label());
                }
            }
    );
    double MSE = new JavaDoubleRDD(valuesAndPreds.map(
            new Function<Tuple2<Double, Double>, Object>() {
                public Object call(Tuple2<Double, Double> pair) {
                    return Math.pow(pair._1() - pair._2(), 2.0);
                }
            }
    ).rdd()).mean();
    System.out.println("training Mean Squared Error = " + MSE);

    // Save and load model
    model.save(sc.sc(), "target/tmp/javaLinearRegressionWithSGDModel");
    LinearRegressionModel sameModel =
            LinearRegressionModel.load(sc.sc(), "target/tmp/javaLinearRegressionWithSGDModel");
    // $example off$

    sc.stop();
}
Example #13
Source File: SparkDl4jMultiLayer.java, from deeplearning4j (Apache License 2.0)
/**
 * {@code RDD<DataSet>} overload of {@link #scoreExamples(JavaRDD, boolean)}
 */
public JavaDoubleRDD scoreExamples(RDD<DataSet> data, boolean includeRegularizationTerms) {
    return scoreExamples(data.toJavaRDD(), includeRegularizationTerms);
}
Example #14
Source File: SparkComputationGraph.java, from deeplearning4j (Apache License 2.0)
/**
 * DataSet version of {@link #scoreExamples(JavaRDD, boolean)}
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms) {
    return scoreExamplesMultiDataSet(data.map(new DataSetToMultiDataSetFn()), includeRegularizationTerms);
}
Example #15
Source File: SparkComputationGraph.java, from deeplearning4j (Apache License 2.0)
/**
 * DataSet version of {@link #scoreExamples(JavaPairRDD, boolean, int)}
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms, int batchSize) {
    return scoreExamplesMultiDataSet(data.map(new DataSetToMultiDataSetFn()), includeRegularizationTerms, batchSize);
}
Example #16
Source File: ChronixRDD.java, from chronix.spark (Apache License 2.0)
/**
 * Transformation: Get all values as a JavaDoubleRDD.
 *
 * @return an RDD with all observation values
 */
public JavaDoubleRDD getValuesAsRdd() {
    return this.flatMapToDouble(
            mts -> Arrays.asList(ArrayUtils.toObject(mts.getValuesAsArray())).iterator());
}
Example #17
Source File: SparkDl4jMultiLayer.java, from deeplearning4j (Apache License 2.0)
/**
 * Score the examples individually, using the default batch size {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}. Unlike
 * {@link #calculateScore(JavaRDD, boolean)}, this method returns a score for each example separately. If scoring
 * is needed for specific examples, use either {@link #scoreExamples(JavaPairRDD, boolean)} or
 * {@link #scoreExamples(JavaPairRDD, boolean, int)}, which can have a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @return A JavaDoubleRDD containing the scores of each example
 * @see MultiLayerNetwork#scoreExamples(DataSet, boolean)
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms) {
    return scoreExamples(data, includeRegularizationTerms, DEFAULT_EVAL_SCORE_BATCH_SIZE);
}
Example #18
Source File: SparkDl4jMultiLayer.java, from deeplearning4j (Apache License 2.0)
/**
 * {@code RDD<DataSet>} overload of {@link #scoreExamples(JavaRDD, boolean, int)}
 */
public JavaDoubleRDD scoreExamples(RDD<DataSet> data, boolean includeRegularizationTerms, int batchSize) {
    return scoreExamples(data.toJavaRDD(), includeRegularizationTerms, batchSize);
}
Example #19
Source File: SparkDl4jMultiLayer.java, from deeplearning4j (Apache License 2.0)
/**
 * Score the examples individually, using a specified batch size. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately. If scoring is needed for specific examples, use either
 * {@link #scoreExamples(JavaPairRDD, boolean)} or {@link #scoreExamples(JavaPairRDD, boolean, int)}, which can have
 * a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @param batchSize                  Batch size to use when doing scoring
 * @return A JavaDoubleRDD containing the scores of each example
 * @see MultiLayerNetwork#scoreExamples(DataSet, boolean)
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms, int batchSize) {
    return data.mapPartitionsToDouble(new ScoreExamplesFunction(sc.broadcast(network.params()),
            sc.broadcast(conf.toJson()), includeRegularizationTerms, batchSize));
}
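As a usage illustration (sparkNet and dataRdd are hypothetical names for a fitted SparkDl4jMultiLayer and a JavaRDD<DataSet>, not part of the original source), the returned scores behave like any other JavaDoubleRDD:

// hypothetical usage of the method above
JavaDoubleRDD exampleScores = sparkNet.scoreExamples(dataRdd, true, 64);
double meanScore = exampleScores.mean();        // aggregate the per-example scores
StatCounter scoreStats = exampleScores.stats(); // or get the full summary in one pass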
Example #20
Source File: SparkComputationGraph.java, from deeplearning4j (Apache License 2.0)
/**
 * Score the examples individually, using the default batch size {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}. Unlike
 * {@link #calculateScore(JavaRDD, boolean)}, this method returns a score for each example separately. If scoring
 * is needed for specific examples, use either {@link #scoreExamples(JavaPairRDD, boolean)} or
 * {@link #scoreExamples(JavaPairRDD, boolean, int)}, which can have a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @return A JavaDoubleRDD containing the scores of each example
 * @see ComputationGraph#scoreExamples(MultiDataSet, boolean)
 */
public JavaDoubleRDD scoreExamplesMultiDataSet(JavaRDD<MultiDataSet> data, boolean includeRegularizationTerms) {
    return scoreExamplesMultiDataSet(data, includeRegularizationTerms, DEFAULT_EVAL_SCORE_BATCH_SIZE);
}
Example #21
Source File: SparkComputationGraph.java, from deeplearning4j (Apache License 2.0)
/**
 * Score the examples individually, using a specified batch size. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately. If scoring is needed for specific examples, use either
 * {@link #scoreExamples(JavaPairRDD, boolean)} or {@link #scoreExamples(JavaPairRDD, boolean, int)}, which can have
 * a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @param batchSize                  Batch size to use when doing scoring
 * @return A JavaDoubleRDD containing the scores of each example
 * @see ComputationGraph#scoreExamples(MultiDataSet, boolean)
 */
public JavaDoubleRDD scoreExamplesMultiDataSet(JavaRDD<MultiDataSet> data, boolean includeRegularizationTerms, int batchSize) {
    return data.mapPartitionsToDouble(new ScoreExamplesFunction(sc.broadcast(network.params()),
            sc.broadcast(conf.toJson()), includeRegularizationTerms, batchSize));
}