org.apache.spark.api.java.JavaDoubleRDD Java Examples
The following examples show how to use org.apache.spark.api.java.JavaDoubleRDD.
Each example notes its original project, source file, and license, so you can consult the upstream source for full context.
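Before the project examples, here is a minimal, self-contained sketch of the typical workflow. The local[*] master, app name, and sample values are illustrative assumptions, not code from any project below.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.StatCounter;

public class JavaDoubleRDDSketch {
    public static void main(String[] args) {
        // local[*] master is an assumption so the sketch runs standalone
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("JavaDoubleRDDSketch");
        JavaSparkContext jsc = new JavaSparkContext(conf);

        // parallelizeDoubles is the canonical way to obtain a JavaDoubleRDD
        JavaDoubleRDD values = jsc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 4.0));

        // stats() computes count/mean/min/max/variance/stdev in a single pass
        StatCounter stats = values.stats();
        System.out.println("mean = " + stats.mean() + ", stdev = " + stats.stdev());

        jsc.stop();
    }
}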
Example #1
Source File: PolyPeptideChainStatistics.java, from mmtf-spark (Apache License 2.0)
public static void main(String[] args) throws FileNotFoundException {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(PolyPeptideChainStatistics.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaDoubleRDD chainLengths = MmtfReader
            .readReducedSequenceFile(sc) // read PDB from an MMTF-Hadoop sequence file
            .flatMapToPair(new StructureToPolymerChains(false, true)) // split (flatMap) into unique polymer chains
            .filter(new PolymerComposition(PolymerComposition.AMINO_ACIDS_20)) // only consider chains that contain the 20 standard amino acids
            .mapToDouble(t -> t._2.getNumGroups()); // get the number of groups (residues) in each chain using a lambda expression

    System.out.println("Protein chain length statistics for proteins in the PDB with the 20 standard amino acids:");
    System.out.println(chainLengths.stats());

    sc.close();
}
Example #2
Source File: JavaHypothesisTestingKolmogorovSmirnovTestExample.java, from SparkDemo (MIT License)
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaHypothesisTestingKolmogorovSmirnovTestExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD data = jsc.parallelizeDoubles(Arrays.asList(0.1, 0.15, 0.2, 0.3, 0.25));
    KolmogorovSmirnovTestResult testResult =
            Statistics.kolmogorovSmirnovTest(data, "norm", 0.0, 1.0);
    // summary of the test including the p-value, test statistic, and null hypothesis;
    // if our p-value indicates significance, we can reject the null hypothesis
    System.out.println(testResult);
    // $example off$

    jsc.stop();
}
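The result can also be inspected programmatically rather than just printed. A short sketch; the 0.05 significance threshold is an illustrative assumption, not part of the original example:

// pValue() and statistic() expose the numbers summarized by toString()
double pValue = testResult.pValue();
double statistic = testResult.statistic();
if (pValue < 0.05) { // illustrative significance level
    System.out.println("Reject the null hypothesis: the sample does not follow N(0, 1)");
}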
Example #3
Source File: Main.java, from SparkApps (Apache License 2.0)
public static void main(String[] args) {
    // Sample test data - all numbers from 1 to 99999
    List<Double> testData = IntStream.range(1, 100000).mapToDouble(d -> d)
            .collect(ArrayList::new, ArrayList::add, ArrayList::addAll);

    JavaDoubleRDD rdd = sc.parallelizeDoubles(testData);
    LOGGER.info("Mean: " + rdd.mean());

    // For efficiency, use StatCounter if more than one statistic is required.
    StatCounter statCounter = rdd.stats();
    LOGGER.info("Using StatCounter");
    LOGGER.info("Count: " + statCounter.count());
    LOGGER.info("Min: " + statCounter.min());
    LOGGER.info("Max: " + statCounter.max());
    LOGGER.info("Sum: " + statCounter.sum());
    LOGGER.info("Mean: " + statCounter.mean());
    LOGGER.info("Variance: " + statCounter.variance());
    LOGGER.info("Stdev: " + statCounter.stdev());
}
Example #4
Source File: XGBoostEvidenceFilterUnitTest.java, from gatk (BSD 3-Clause "New" or "Revised" License)
@Test(groups = "sv")
protected void testLocalXGBoostClassifierSpark() {
    final Predictor localPredictor = XGBoostEvidenceFilter.loadPredictor(localClassifierModelFile);
    // get spark ctx
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    // parallelize classifierAccuracyData to RDD
    JavaRDD<FVec> testFeaturesRdd = ctx.parallelize(Arrays.asList(classifierAccuracyData.features));
    // predict in parallel
    JavaDoubleRDD predictedProbabilityRdd = testFeaturesRdd.mapToDouble(
            f -> localPredictor.predictSingle(f, false, 0));
    // pull back to local array
    final double[] predictedProbabilitySpark = predictedProbabilityRdd.collect()
            .stream().mapToDouble(Double::doubleValue).toArray();
    // check probabilities from spark are identical to serial
    assertArrayEquals(predictedProbabilitySpark, predictedProbabilitySerial, 0.0,
            "Probabilities predicted in spark context differ from serial");
}
Example #5
Source File: StructureToBioJavaTest.java, from mmtf-spark (Apache License 2.0)
@Test
public void test() throws IOException {
    List<String> pdbIds = Arrays.asList("1STP", "4HHB", "1JLP", "5X6H", "5L2G", "2MK1");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();

    // 1STP: 1 L-protein chain
    // 4HHB: 4 polymer chains
    // 1JLP: 1 L-protein chain with non-polymer capping group (NH2)
    // 5X6H: 1 L-protein and 1 DNA chain
    // 5L2G: 2 DNA chains
    // 2MK1: 0 polymer chains
    // --------------------
    // tot : 10 polymer chains
    JavaDoubleRDD chainCounts = pdb
            .mapValues(new StructureToBioJava())
            .values()
            .mapToDouble(v -> v.getPolyChains().size());
    assertEquals(10, Math.round(chainCounts.sum()));

    // extract polymer chains and count chains again
    chainCounts = pdb
            .flatMapToPair(new StructureToPolymerChains())
            .mapValues(new StructureToBioJava())
            .values()
            .mapToDouble(v -> v.getChains().size());
    assertEquals(10, Math.round(chainCounts.sum()));
}
Example #6
Source File: CollabFilterCassandra7.java, from Spark-Cassandra-Collabfiltering (Apache License 2.0)
public double validate(JavaRDD<Rating> predictionJavaRdd, CassandraJavaRDD<CassandraRow> validationsCassRdd) {
    JavaPairRDD<Tuple2<Integer, Integer>, Double> predictionsJavaPairs = JavaPairRDD.fromJavaRDD(
            predictionJavaRdd.map(new org.apache.spark.api.java.function.Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
                @Override
                public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating pred) throws Exception {
                    return new Tuple2<Tuple2<Integer, Integer>, Double>(
                            new Tuple2<Integer, Integer>(pred.user(), pred.product()), pred.rating());
                }
            }));

    JavaRDD<Rating> validationRatings = validationsCassRdd.map(
            new org.apache.spark.api.java.function.Function<CassandraRow, Rating>() {
                @Override
                public Rating call(CassandraRow validation) throws Exception {
                    return new Rating(validation.getInt(RatingDO.USER_COL),
                            validation.getInt(RatingDO.PRODUCT_COL),
                            validation.getInt(RatingDO.RATING_COL));
                }
            });

    JavaRDD<Tuple2<Double, Double>> validationAndPredictions = JavaPairRDD.fromJavaRDD(
            validationRatings.map(new org.apache.spark.api.java.function.Function<Rating, Tuple2<Tuple2<Integer, Integer>, Double>>() {
                @Override
                public Tuple2<Tuple2<Integer, Integer>, Double> call(Rating validationRating) throws Exception {
                    return new Tuple2<Tuple2<Integer, Integer>, Double>(
                            new Tuple2<Integer, Integer>(validationRating.user(), validationRating.product()),
                            validationRating.rating());
                }
            })).join(predictionsJavaPairs).values();

    double meanSquaredError = JavaDoubleRDD.fromRDD(validationAndPredictions.map(
            new org.apache.spark.api.java.function.Function<Tuple2<Double, Double>, Object>() {
                @Override
                public Object call(Tuple2<Double, Double> pair) throws Exception {
                    Double err = pair._1() - pair._2();
                    return (Object) (err * err); // No covariance! Need to cast to Object
                }
            }).rdd()).mean();

    double rmse = Math.sqrt(meanSquaredError);
    return rmse;
}
Example #7
Source File: CollabFilterCassandra8.java, from Spark-Cassandra-Collabfiltering (Apache License 2.0)
public double validate(JavaRDD<Rating> predictionJavaRdd, CassandraJavaRDD<CassandraRow> validationsCassRdd) {
    JavaPairRDD<Tuple2<Integer, Integer>, Double> predictionsJavaPairs = JavaPairRDD.fromJavaRDD(
            predictionJavaRdd.map(pred -> new Tuple2<Tuple2<Integer, Integer>, Double>(
                    new Tuple2<Integer, Integer>(pred.user(), pred.product()), pred.rating())));

    JavaRDD<Rating> validationRatings = validationsCassRdd.map(validation -> new Rating(
            validation.getInt(RatingDO.USER_COL),
            validation.getInt(RatingDO.PRODUCT_COL),
            validation.getInt(RatingDO.RATING_COL)));

    JavaRDD<Tuple2<Double, Double>> validationAndPredictions = JavaPairRDD.fromJavaRDD(
            validationRatings.map(validationRating -> new Tuple2<Tuple2<Integer, Integer>, Double>(
                    new Tuple2<Integer, Integer>(validationRating.user(), validationRating.product()),
                    validationRating.rating())))
            .join(predictionsJavaPairs).values();

    double meanSquaredError = JavaDoubleRDD.fromRDD(validationAndPredictions.map(pair -> {
        Double err = pair._1() - pair._2();
        return (Object) (err * err); // No covariance! Need to cast to Object
    }).rdd()).mean();

    double rmse = Math.sqrt(meanSquaredError);
    return rmse;
}
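The Object cast in the MSE step above works around the variance of the underlying Scala RDD in the Java API. A simpler alternative for that step (a sketch, not part of the original project) avoids both the cast and JavaDoubleRDD.fromRDD by mapping straight to a JavaDoubleRDD:

// mapToDouble yields a JavaDoubleRDD directly, so no cast to Object is needed
double meanSquaredError = validationAndPredictions
        .mapToDouble(pair -> {
            double err = pair._1() - pair._2();
            return err * err;
        })
        .mean();
double rmse = Math.sqrt(meanSquaredError);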
Example #8
Source File: ChronixRDD.java, from chronix.spark (Apache License 2.0)
/**
 * Action: Calculates the slope of a linear regression of every time series.
 *
 * Where: value = slope * timestamp
 * ... or: y = slope * x
 *
 * @return the slopes (simple linear regression) of each and every time series in the RDD
 */
public JavaDoubleRDD getSlopes() {
    return this.mapToDouble((DoubleFunction<MetricTimeSeries>) mts -> {
        SimpleRegression regression = new SimpleRegression();
        mts.points().forEach(p -> regression.addData(p.getTimestamp(), p.getValue()));
        return regression.getSlope();
    });
}
Example #9
Source File: JavaCorrelationsExample.java, from SparkDemo (MIT License)
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // $example on$
    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
            Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0)); // a series
    // must have the same number of partitions and cardinality as seriesX
    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
            Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
    System.out.println("Correlation is: " + correlation);

    // note that each Vector is a row and not a column
    JavaRDD<Vector> data = jsc.parallelize(
            Arrays.asList(
                    Vectors.dense(1.0, 10.0, 100.0),
                    Vectors.dense(2.0, 20.0, 200.0),
                    Vectors.dense(5.0, 33.0, 366.0)
            )
    );

    // calculate the correlation matrix using Pearson's method.
    // Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
    System.out.println(correlMatrix.toString());
    // $example off$

    jsc.stop();
}
Example #10
Source File: ChronixRDD.java, from chronix.spark (Apache License 2.0)
/**
 * Action: Counts the number of observations.
 *
 * @return the number of overall observations in all time series
 */
public long countObservations() {
    JavaDoubleRDD sizesRdd = this.mapToDouble(
            (DoubleFunction<MetricTimeSeries>) value -> (double) value.size());
    return sizesRdd.sum().longValue();
}
Example #11
Source File: SimpleSparkJob.java, from infinispan-simple-tutorials (Apache License 2.0)
public static void main(String[] args) throws UnknownHostException {
    // Obtain the Infinispan address
    String infinispanAddress = args[0];

    // Adjust log levels
    Logger.getLogger("org").setLevel(Level.WARN);

    // Create the remote cache manager
    Configuration build = new ConfigurationBuilder().addServer().host(infinispanAddress).build();
    RemoteCacheManager remoteCacheManager = new RemoteCacheManager(build);

    // Obtain the remote cache
    RemoteCache<Integer, Temperature> cache = remoteCacheManager.getCache();

    // Add some data
    cache.put(1, new Temperature(21, "London"));
    cache.put(2, new Temperature(34, "Rome"));
    cache.put(3, new Temperature(33, "Barcelona"));
    cache.put(4, new Temperature(8, "Oslo"));

    // Create the java spark context
    SparkConf conf = new SparkConf().setAppName("infinispan-spark-simple-job");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // Create the InfinispanRDD
    ConnectorConfiguration config = new ConnectorConfiguration().setServerList(infinispanAddress);
    JavaPairRDD<Integer, Temperature> infinispanRDD = InfinispanJavaRDD.createInfinispanRDD(jsc, config);

    // Convert the RDD to an RDD of doubles
    JavaDoubleRDD javaDoubleRDD = infinispanRDD.values().mapToDouble(Temperature::getValue);

    // Calculate average temperature
    Double meanTemp = javaDoubleRDD.mean();
    System.out.printf("\nAVERAGE TEMPERATURE: %f C\n", meanTemp);

    // Calculate standard deviation
    Double stdDev = javaDoubleRDD.sampleStdev();
    System.out.printf("STD DEVIATION: %f C\n", stdDev);

    // Calculate a histogram of temperatures
    System.out.println("TEMPERATURE HISTOGRAM:");
    double[] buckets = {0d, 10d, 20d, 30d, 40d};
    long[] histogram = javaDoubleRDD.histogram(buckets);
    for (int i = 0; i < buckets.length - 1; i++) {
        System.out.printf("Between %f C and %f C: %d cities\n", buckets[i], buckets[i + 1], histogram[i]);
    }
}
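JavaDoubleRDD can also compute the bucket boundaries itself. As a minimal sketch (assuming the same javaDoubleRDD as above), the single-argument histogram overload splits [min, max] into evenly spaced buckets and returns the boundaries together with the counts:

// histogram(int) returns the computed bucket boundaries and per-bucket counts
scala.Tuple2<double[], long[]> histo = javaDoubleRDD.histogram(4);
double[] computedBuckets = histo._1(); // 5 boundaries for 4 buckets
long[] counts = histo._2();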
Example #12
Source File: JavaLinearRegressionWithSGDExample.java, from SparkDemo (MIT License)
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaLinearRegressionWithSGDExample");
    JavaSparkContext sc = new JavaSparkContext(conf);

    // $example on$
    // Load and parse the data
    String path = "data/mllib/ridge-data/lpsa.data";
    JavaRDD<String> data = sc.textFile(path);
    JavaRDD<LabeledPoint> parsedData = data.map(
            new Function<String, LabeledPoint>() {
                public LabeledPoint call(String line) {
                    String[] parts = line.split(",");
                    String[] features = parts[1].split(" ");
                    double[] v = new double[features.length];
                    for (int i = 0; i < features.length - 1; i++) {
                        v[i] = Double.parseDouble(features[i]);
                    }
                    return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(v));
                }
            }
    );
    parsedData.cache();

    // Building the model
    int numIterations = 100;
    double stepSize = 0.00000001;
    final LinearRegressionModel model =
            LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations, stepSize);

    // Evaluate model on training examples and compute training error
    JavaRDD<Tuple2<Double, Double>> valuesAndPreds = parsedData.map(
            new Function<LabeledPoint, Tuple2<Double, Double>>() {
                public Tuple2<Double, Double> call(LabeledPoint point) {
                    double prediction = model.predict(point.features());
                    return new Tuple2<>(prediction, point.label());
                }
            }
    );
    double MSE = new JavaDoubleRDD(valuesAndPreds.map(
            new Function<Tuple2<Double, Double>, Object>() {
                public Object call(Tuple2<Double, Double> pair) {
                    return Math.pow(pair._1() - pair._2(), 2.0);
                }
            }
    ).rdd()).mean();
    System.out.println("training Mean Squared Error = " + MSE);

    // Save and load model
    model.save(sc.sc(), "target/tmp/javaLinearRegressionWithSGDModel");
    LinearRegressionModel sameModel =
            LinearRegressionModel.load(sc.sc(), "target/tmp/javaLinearRegressionWithSGDModel");
    // $example off$

    sc.stop();
}
Example #13
Source File: SparkDl4jMultiLayer.java, from deeplearning4j (Apache License 2.0)
/**
 * {@code RDD<DataSet>} overload of {@link #scoreExamples(JavaRDD, boolean)}
 */
public JavaDoubleRDD scoreExamples(RDD<DataSet> data, boolean includeRegularizationTerms) {
    return scoreExamples(data.toJavaRDD(), includeRegularizationTerms);
}
Example #14
Source File: SparkComputationGraph.java, from deeplearning4j (Apache License 2.0)
/**
 * DataSet version of {@link #scoreExamples(JavaRDD, boolean)}
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms) {
    return scoreExamplesMultiDataSet(data.map(new DataSetToMultiDataSetFn()), includeRegularizationTerms);
}
Example #15
Source File: SparkComputationGraph.java, from deeplearning4j (Apache License 2.0)
/**
 * DataSet version of {@link #scoreExamples(JavaPairRDD, boolean, int)}
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms, int batchSize) {
    return scoreExamplesMultiDataSet(data.map(new DataSetToMultiDataSetFn()), includeRegularizationTerms, batchSize);
}
Example #16
Source File: ChronixRDD.java, from chronix.spark (Apache License 2.0)
/**
 * Transformation: Get all values as a JavaDoubleRDD.
 *
 * @return an RDD with all observation values
 */
public JavaDoubleRDD getValuesAsRdd() {
    return this.flatMapToDouble(
            mts -> Arrays.asList(ArrayUtils.toObject(mts.getValuesAsArray())).iterator());
}
Example #17
Source File: SparkDl4jMultiLayer.java, from deeplearning4j (Apache License 2.0)
/**
 * Score the examples individually, using the default batch size {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}. Unlike
 * {@link #calculateScore(JavaRDD, boolean)}, this method returns a score for each example separately. If scoring
 * is needed for specific examples, use either {@link #scoreExamples(JavaPairRDD, boolean)} or
 * {@link #scoreExamples(JavaPairRDD, boolean, int)}, which can have a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @return A JavaDoubleRDD containing the scores of each example
 * @see MultiLayerNetwork#scoreExamples(DataSet, boolean)
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms) {
    return scoreExamples(data, includeRegularizationTerms, DEFAULT_EVAL_SCORE_BATCH_SIZE);
}
Example #18
Source File: SparkDl4jMultiLayer.java, from deeplearning4j (Apache License 2.0)
/**
 * {@code RDD<DataSet>} overload of {@link #scoreExamples(JavaRDD, boolean, int)}
 */
public JavaDoubleRDD scoreExamples(RDD<DataSet> data, boolean includeRegularizationTerms, int batchSize) {
    return scoreExamples(data.toJavaRDD(), includeRegularizationTerms, batchSize);
}
Example #19
Source File: SparkDl4jMultiLayer.java, from deeplearning4j (Apache License 2.0)
/**
 * Score the examples individually, using a specified batch size. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately. If scoring is needed for specific examples, use either
 * {@link #scoreExamples(JavaPairRDD, boolean)} or {@link #scoreExamples(JavaPairRDD, boolean, int)}, which can have
 * a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @param batchSize                  Batch size to use when doing scoring
 * @return A JavaDoubleRDD containing the scores of each example
 * @see MultiLayerNetwork#scoreExamples(DataSet, boolean)
 */
public JavaDoubleRDD scoreExamples(JavaRDD<DataSet> data, boolean includeRegularizationTerms, int batchSize) {
    return data.mapPartitionsToDouble(new ScoreExamplesFunction(sc.broadcast(network.params()),
            sc.broadcast(conf.toJson()), includeRegularizationTerms, batchSize));
}
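As a usage illustration (sparkNet and dataRdd are hypothetical names for a fitted SparkDl4jMultiLayer and a JavaRDD<DataSet>, not part of the original source), the returned scores behave like any other JavaDoubleRDD:

// hypothetical usage of the method above
JavaDoubleRDD exampleScores = sparkNet.scoreExamples(dataRdd, true, 64);
double meanScore = exampleScores.mean();        // aggregate the per-example scores
StatCounter scoreStats = exampleScores.stats(); // or get the full summary in one pass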
Example #20
Source File: SparkComputationGraph.java, from deeplearning4j (Apache License 2.0)
/**
 * Score the examples individually, using the default batch size {@link #DEFAULT_EVAL_SCORE_BATCH_SIZE}. Unlike
 * {@link #calculateScore(JavaRDD, boolean)}, this method returns a score for each example separately. If scoring
 * is needed for specific examples, use either {@link #scoreExamples(JavaPairRDD, boolean)} or
 * {@link #scoreExamples(JavaPairRDD, boolean, int)}, which can have a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @return A JavaDoubleRDD containing the scores of each example
 * @see ComputationGraph#scoreExamples(MultiDataSet, boolean)
 */
public JavaDoubleRDD scoreExamplesMultiDataSet(JavaRDD<MultiDataSet> data, boolean includeRegularizationTerms) {
    return scoreExamplesMultiDataSet(data, includeRegularizationTerms, DEFAULT_EVAL_SCORE_BATCH_SIZE);
}
Example #21
Source File: SparkComputationGraph.java, from deeplearning4j (Apache License 2.0)
/**
 * Score the examples individually, using a specified batch size. Unlike {@link #calculateScore(JavaRDD, boolean)},
 * this method returns a score for each example separately. If scoring is needed for specific examples, use either
 * {@link #scoreExamples(JavaPairRDD, boolean)} or {@link #scoreExamples(JavaPairRDD, boolean, int)}, which can have
 * a key for each example.
 *
 * @param data                       Data to score
 * @param includeRegularizationTerms If true: include the l1/l2 regularization terms with the score (if any)
 * @param batchSize                  Batch size to use when doing scoring
 * @return A JavaDoubleRDD containing the scores of each example
 * @see ComputationGraph#scoreExamples(MultiDataSet, boolean)
 */
public JavaDoubleRDD scoreExamplesMultiDataSet(JavaRDD<MultiDataSet> data, boolean includeRegularizationTerms, int batchSize) {
    return data.mapPartitionsToDouble(new ScoreExamplesFunction(sc.broadcast(network.params()),
            sc.broadcast(conf.toJson()), includeRegularizationTerms, batchSize));
}