org.apache.spark.mllib.util.MLUtils Java Examples
The following examples show how to use org.apache.spark.mllib.util.MLUtils.
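Common to almost every example below is the same entry point: MLUtils.loadLibSVMFile parses a LIBSVM-formatted text file into an RDD of LabeledPoint instances, which is then bridged to the Java API with toJavaRDD(). The following is a minimal, self-contained sketch of that pattern; the file path data/sample.libsvm and the class name MLUtilsSketch are placeholders of ours, not taken from any project below, and a local Spark context is assumed.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;

public class MLUtilsSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("MLUtilsSketch");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    // Each input line "label index1:value1 index2:value2 ..." becomes one LabeledPoint;
    // the feature count is inferred from the data. The path below is a placeholder.
    JavaRDD<LabeledPoint> data =
        MLUtils.loadLibSVMFile(jsc.sc(), "data/sample.libsvm").toJavaRDD();
    System.out.println("Loaded " + data.count() + " labeled points");
    jsc.stop();
  }
}

The same call also has an overload taking explicit numFeatures and minPartitions arguments, which Example #12 below uses.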
Example #1
Source File: LogisticRegressionExporterTest.java From spark-transformers with Apache License 2.0
@Test
public void shouldExportAndImportCorrectly() {
  String datapath = "src/test/resources/binary_classification_test.libsvm";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

  //Train model in spark
  LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(data.rdd());

  //Export this model
  byte[] exportedModel = ModelExporter.export(lrmodel, null);

  //Import it back
  LogisticRegressionModelInfo importedModel =
      (LogisticRegressionModelInfo) ModelImporter.importModelInfo(exportedModel);

  //Check that the two models agree field by field
  //(edge cases aside, e.g. the order of elements in a list may change)
  assertEquals(lrmodel.intercept(), importedModel.getIntercept(), EPSILON);
  assertEquals(lrmodel.numClasses(), importedModel.getNumClasses(), EPSILON);
  assertEquals(lrmodel.numFeatures(), importedModel.getNumFeatures(), EPSILON);
  assertEquals((double) lrmodel.getThreshold().get(), importedModel.getThreshold(), EPSILON);
  for (int i = 0; i < importedModel.getNumFeatures(); i++)
    assertEquals(lrmodel.weights().toArray()[i], importedModel.getWeights()[i], EPSILON);
}
Example #2
Source File: LogisticRegressionExporterTest.java From spark-transformers with Apache License 2.0
@Test
public void shouldExportAndImportCorrectly() {
  String datapath = "src/test/resources/binary_classification_test.libsvm";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

  //Train model in spark
  LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(data.rdd());

  //Export this model
  byte[] exportedModel = ModelExporter.export(lrmodel);

  //Import it back
  LogisticRegressionModelInfo importedModel =
      (LogisticRegressionModelInfo) ModelImporter.importModelInfo(exportedModel);

  //Check that the two models agree field by field
  //(edge cases aside, e.g. the order of elements in a list may change)
  assertEquals(lrmodel.intercept(), importedModel.getIntercept(), 0.01);
  assertEquals(lrmodel.numClasses(), importedModel.getNumClasses(), 0.01);
  assertEquals(lrmodel.numFeatures(), importedModel.getNumFeatures(), 0.01);
  assertEquals((double) lrmodel.getThreshold().get(), importedModel.getThreshold(), 0.01);
  for (int i = 0; i < importedModel.getNumFeatures(); i++)
    assertEquals(lrmodel.weights().toArray()[i], importedModel.getWeights()[i], 0.01);
}
Example #3
Source File: TestSparkMultiLayerParameterAveraging.java From deeplearning4j with Apache License 2.0
@Test
public void testFromSvmLight() throws Exception {
  JavaRDD<LabeledPoint> data = MLUtils
      .loadLibSVMFile(sc.sc(),
          new ClassPathResource("svmLight/iris_svmLight_0.txt")
              .getTempFileFromArchive().getAbsolutePath())
      .toJavaRDD().map(new TestFn());

  MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(123)
      .updater(new Adam(1e-6))
      .weightInit(WeightInit.XAVIER)
      .list()
      .layer(new BatchNormalization.Builder().nIn(4).nOut(4).build())
      .layer(new DenseLayer.Builder().nIn(4).nOut(32).activation(Activation.RELU).build())
      .layer(new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
          .nIn(32).nOut(3).activation(Activation.SOFTMAX).build())
      .build();

  MultiLayerNetwork network = new MultiLayerNetwork(conf);
  network.init();
  System.out.println("Initializing network");

  SparkDl4jMultiLayer master = new SparkDl4jMultiLayer(sc, getBasicConf(),
      new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0));

  master.fitLabeledPoint(data);
}
Example #4
Source File: TestSparkMultiLayerParameterAveraging.java From deeplearning4j with Apache License 2.0
@Test
public void testFromSvmLightBackprop() throws Exception {
  JavaRDD<LabeledPoint> data = MLUtils
      .loadLibSVMFile(sc.sc(),
          new ClassPathResource("svmLight/iris_svmLight_0.txt")
              .getTempFileFromArchive().getAbsolutePath())
      .toJavaRDD().map(new TestFn());

  DataSet d = new IrisDataSetIterator(150, 150).next();
  MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(123)
      .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).list()
      .layer(0, new DenseLayer.Builder().nIn(4).nOut(100).weightInit(WeightInit.XAVIER)
          .activation(Activation.RELU).build())
      .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(
          LossFunctions.LossFunction.MCXENT).nIn(100).nOut(3)
          .activation(Activation.SOFTMAX).weightInit(WeightInit.XAVIER)
          .build())
      .build();

  MultiLayerNetwork network = new MultiLayerNetwork(conf);
  network.init();
  System.out.println("Initializing network");

  SparkDl4jMultiLayer master = new SparkDl4jMultiLayer(sc, conf,
      new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0));

  MultiLayerNetwork network2 = master.fitLabeledPoint(data);
}
Example #5
Source File: JavaNaiveBayesExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
  JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
  JavaRDD<LabeledPoint> training = tmp[0]; // training set
  JavaRDD<LabeledPoint> test = tmp[1]; // test set
  final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
  JavaPairRDD<Double, Double> predictionAndLabel =
    test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
    @Override
    public Boolean call(Tuple2<Double, Double> pl) {
      return pl._1().equals(pl._2());
    }
  }).count() / (double) test.count();

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myNaiveBayesModel");
  NaiveBayesModel sameModel = NaiveBayesModel.load(jsc.sc(), "target/tmp/myNaiveBayesModel");
  // $example off$

  jsc.stop();
}
Example #6
Source File: LogisticRegression1BridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
  //prepare data
  String datapath = "src/test/resources/binary_classification_test.libsvm";
  DataFrame trainingData = sqlContext.read().format("libsvm").load(datapath);

  //Train model in spark
  LogisticRegressionModel lrmodel = new LogisticRegression().fit(trainingData);

  //Export this model
  byte[] exportedModel = ModelExporter.export(lrmodel, trainingData);

  //Import and get Transformer
  Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

  //validate predictions
  List<LabeledPoint> testPoints = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD().collect();
  for (LabeledPoint i : testPoints) {
    Vector v = i.features();
    double actual = lrmodel.predict(v);

    Map<String, Object> data = new HashMap<String, Object>();
    data.put("features", v.toArray());
    transformer.transform(data);
    double predicted = (double) data.get("prediction");

    assertEquals(actual, predicted, EPSILON);
  }
}
Example #7
Source File: LogisticRegressionBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
  //prepare data
  String datapath = "src/test/resources/binary_classification_test.libsvm";
  JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

  //Train model in spark
  LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

  //Export this model
  byte[] exportedModel = ModelExporter.export(lrmodel, null);

  //Import and get Transformer
  Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

  //validate predictions
  List<LabeledPoint> testPoints = trainingData.collect();
  for (LabeledPoint i : testPoints) {
    Vector v = i.features();
    double actual = lrmodel.predict(v);

    Map<String, Object> data = new HashMap<String, Object>();
    data.put("features", v.toArray());
    transformer.transform(data);
    double predicted = (double) data.get("prediction");

    assertEquals(actual, predicted, EPSILON);
  }
}
Example #8
Source File: JavaLogisticRegressionWithLBFGSExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithLBFGSExample");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L);
  JavaRDD<LabeledPoint> training = splits[0].cache();
  JavaRDD<LabeledPoint> test = splits[1];

  // Run training algorithm to build the model.
  final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
    .setNumClasses(10)
    .run(training.rdd());

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double prediction = model.predict(p.features());
        return new Tuple2<Object, Object>(prediction, p.label());
      }
    }
  );

  // Get evaluation metrics.
  MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
  double accuracy = metrics.accuracy();
  System.out.println("Accuracy = " + accuracy);

  // Save and load model
  model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel");
  LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc,
    "target/tmp/javaLogisticRegressionWithLBFGSModel");
  // $example off$

  sc.stop();
}
Example #9
Source File: LogisticRegression1BridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
  //prepare data
  String datapath = "src/test/resources/binary_classification_test.libsvm";
  Dataset<Row> trainingData = spark.read().format("libsvm").load(datapath);

  //Train model in spark
  LogisticRegressionModel lrmodel = new LogisticRegression().fit(trainingData);

  //Export this model
  byte[] exportedModel = ModelExporter.export(lrmodel);

  //Import and get Transformer
  Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

  //validate predictions
  List<LabeledPoint> testPoints = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD().collect();
  for (LabeledPoint i : testPoints) {
    Vector v = i.features().asML();
    double actual = lrmodel.predict(v);

    Map<String, Object> data = new HashMap<String, Object>();
    data.put("features", v.toArray());
    transformer.transform(data);
    double predicted = (double) data.get("prediction");

    assertEquals(actual, predicted, 0.01);
  }
}
Example #10
Source File: LogisticRegressionBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
  //prepare data
  String datapath = "src/test/resources/binary_classification_test.libsvm";
  JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

  //Train model in spark
  LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

  //Export this model
  byte[] exportedModel = ModelExporter.export(lrmodel);

  //Import and get Transformer
  Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

  //validate predictions
  List<LabeledPoint> testPoints = trainingData.collect();
  for (LabeledPoint i : testPoints) {
    Vector v = i.features();
    double actual = lrmodel.predict(v);

    Map<String, Object> data = new HashMap<String, Object>();
    data.put("features", v.toArray());
    transformer.transform(data);
    double predicted = (double) data.get("prediction");

    assertEquals(actual, predicted, 0.01);
  }
}
Example #11
Source File: JavaDecisionTreeRegressionExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTreeRegressionExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);

  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Set parameters.
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  String impurity = "variance";
  Integer maxDepth = 5;
  Integer maxBins = 32;

  // Train a DecisionTree model.
  final DecisionTreeModel model = DecisionTree.trainRegressor(trainingData,
    categoricalFeaturesInfo, impurity, maxDepth, maxBins);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testMSE = predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
    @Override
    public Double call(Tuple2<Double, Double> pl) {
      Double diff = pl._1() - pl._2();
      return diff * diff;
    }
  }).reduce(new Function2<Double, Double, Double>() {
    @Override
    public Double call(Double a, Double b) {
      return a + b;
    }
  }) / data.count();
  System.out.println("Test Mean Squared Error: " + testMSE);
  System.out.println("Learned regression tree model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel");
  DecisionTreeModel sameModel = DecisionTreeModel
    .load(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel");
  // $example off$
}
Example #12
Source File: RDDConverterUtils.java From systemds with Apache License 2.0
/**
 * Converts a libsvm text input file into two binary block matrices for features
 * and labels, and saves these to the specified output files. This call also deletes
 * existing files at the specified output locations, as well as determines and
 * writes the meta data files of both output matrices.
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing
 * the libsvm input files in order to ensure consistency with Spark.
 *
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn,
    String pathX, String pathY, DataCharacteristics mcOutX) {
  if( !mcOutX.dimsKnown() )
    throw new DMLRuntimeException("Matrix characteristics "
      + "required to convert sparse input representation.");
  try {
    //cleanup existing output files
    HDFSTool.deleteFileIfExistOnHDFS(pathX);
    HDFSTool.deleteFileIfExistOnHDFS(pathY);

    //convert libsvm to labeled points
    int numFeatures = (int) mcOutX.getCols();
    int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
    JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints =
      MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();

    //append row index and best-effort caching to avoid repeated text parsing
    JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint,Long> ilpoints =
      lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK());

    //extract labels and convert to binary block
    DataCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getBlocksize(), -1);
    LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
    JavaPairRDD<MatrixIndexes,MatrixBlock> out1 = ilpoints
      .mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
    int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
    out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
    out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
    mc1.setNonZeros(aNnz1.value()); //update nnz after triggered save
    HDFSTool.writeMetaDataFile(pathY+".mtd", ValueType.FP64, mc1, FileFormat.BINARY);

    //extract data and convert to binary block
    DataCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getBlocksize(), -1);
    LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
    JavaPairRDD<MatrixIndexes,MatrixBlock> out2 = ilpoints
      .mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
    out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
    out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
    mc2.setNonZeros(aNnz2.value()); //update nnz after triggered save
    HDFSTool.writeMetaDataFile(pathX+".mtd", ValueType.FP64, mc2, FileFormat.BINARY);

    //asynchronous cleanup of cached intermediates
    ilpoints.unpersist(false);
  }
  catch(IOException ex) {
    throw new DMLRuntimeException(ex);
  }
}
Example #13
Source File: JavaBinaryClassificationMetricsExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("Java Binary Classification Metrics Example");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_binary_classification_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L);
  JavaRDD<LabeledPoint> training = splits[0].cache();
  JavaRDD<LabeledPoint> test = splits[1];

  // Run training algorithm to build the model.
  final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
    .setNumClasses(2)
    .run(training.rdd());

  // Clear the prediction threshold so the model will return probabilities
  model.clearThreshold();

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      @Override
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double prediction = model.predict(p.features());
        return new Tuple2<Object, Object>(prediction, p.label());
      }
    }
  );

  // Get evaluation metrics.
  BinaryClassificationMetrics metrics =
    new BinaryClassificationMetrics(predictionAndLabels.rdd());

  // Precision by threshold
  JavaRDD<Tuple2<Object, Object>> precision = metrics.precisionByThreshold().toJavaRDD();
  System.out.println("Precision by threshold: " + precision.collect());

  // Recall by threshold
  JavaRDD<Tuple2<Object, Object>> recall = metrics.recallByThreshold().toJavaRDD();
  System.out.println("Recall by threshold: " + recall.collect());

  // F Score by threshold
  JavaRDD<Tuple2<Object, Object>> f1Score = metrics.fMeasureByThreshold().toJavaRDD();
  System.out.println("F1 Score by threshold: " + f1Score.collect());

  JavaRDD<Tuple2<Object, Object>> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD();
  System.out.println("F2 Score by threshold: " + f2Score.collect());

  // Precision-recall curve
  JavaRDD<Tuple2<Object, Object>> prc = metrics.pr().toJavaRDD();
  System.out.println("Precision-recall curve: " + prc.collect());

  // Thresholds
  JavaRDD<Double> thresholds = precision.map(
    new Function<Tuple2<Object, Object>, Double>() {
      @Override
      public Double call(Tuple2<Object, Object> t) {
        return new Double(t._1().toString());
      }
    }
  );

  // ROC Curve
  JavaRDD<Tuple2<Object, Object>> roc = metrics.roc().toJavaRDD();
  System.out.println("ROC curve: " + roc.collect());

  // AUPRC
  System.out.println("Area under precision-recall curve = " + metrics.areaUnderPR());

  // AUROC
  System.out.println("Area under ROC = " + metrics.areaUnderROC());

  // Save and load model
  model.save(sc, "target/tmp/LogisticRegressionModel");
  LogisticRegressionModel.load(sc, "target/tmp/LogisticRegressionModel");
  // $example off$
}
Example #14
Source File: RDDConverterUtils.java From systemds with Apache License 2.0
/**
 * Converts a libsvm text input file into two binary block matrices for features
 * and labels, and saves these to the specified output files. This call also deletes
 * existing files at the specified output locations, as well as determines and
 * writes the meta data files of both output matrices.
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing
 * the libsvm input files in order to ensure consistency with Spark.
 *
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn,
    String pathX, String pathY, DataCharacteristics mcOutX) {
  if( !mcOutX.dimsKnown() )
    throw new DMLRuntimeException("Matrix characteristics "
      + "required to convert sparse input representation.");
  try {
    //cleanup existing output files
    HDFSTool.deleteFileIfExistOnHDFS(pathX);
    HDFSTool.deleteFileIfExistOnHDFS(pathY);

    //convert libsvm to labeled points
    int numFeatures = (int) mcOutX.getCols();
    int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
    JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints =
      MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();

    //append row index and best-effort caching to avoid repeated text parsing
    JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint,Long> ilpoints =
      lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK());

    //extract labels and convert to binary block
    DataCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getBlocksize(), -1);
    LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
    JavaPairRDD<MatrixIndexes,MatrixBlock> out1 = ilpoints
      .mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
    int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
    out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
    out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
    mc1.setNonZeros(aNnz1.value()); //update nnz after triggered save
    HDFSTool.writeMetaDataFile(pathY+".mtd", ValueType.FP64, mc1, OutputInfo.BinaryBlockOutputInfo);

    //extract data and convert to binary block
    DataCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getBlocksize(), -1);
    LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
    JavaPairRDD<MatrixIndexes,MatrixBlock> out2 = ilpoints
      .mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
    out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
    out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
    mc2.setNonZeros(aNnz2.value()); //update nnz after triggered save
    HDFSTool.writeMetaDataFile(pathX+".mtd", ValueType.FP64, mc2, OutputInfo.BinaryBlockOutputInfo);

    //asynchronous cleanup of cached intermediates
    ilpoints.unpersist(false);
  }
  catch(IOException ex) {
    throw new DMLRuntimeException(ex);
  }
}
Example #15
Source File: JavaDecisionTreeClassificationExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTreeClassificationExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);

  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Set parameters.
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Integer numClasses = 2;
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  String impurity = "gini";
  Integer maxDepth = 5;
  Integer maxBins = 32;

  // Train a DecisionTree model for classification.
  final DecisionTreeModel model = DecisionTree.trainClassifier(trainingData, numClasses,
    categoricalFeaturesInfo, impurity, maxDepth, maxBins);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testErr = 1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
    @Override
    public Boolean call(Tuple2<Double, Double> pl) {
      return !pl._1().equals(pl._2());
    }
  }).count() / testData.count();

  System.out.println("Test Error: " + testErr);
  System.out.println("Learned classification tree model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");
  DecisionTreeModel sameModel = DecisionTreeModel
    .load(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");
  // $example off$
}
Example #16
Source File: JavaGradientBoostingRegressionExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf()
    .setAppName("JavaGradientBoostedTreesRegressionExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Train a GradientBoostedTrees model.
  // The defaultParams for Regression use SquaredError by default.
  BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Regression");
  boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice.
  boostingStrategy.getTreeStrategy().setMaxDepth(5);
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo);

  final GradientBoostedTreesModel model =
    GradientBoostedTrees.train(trainingData, boostingStrategy);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testMSE = predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
    @Override
    public Double call(Tuple2<Double, Double> pl) {
      Double diff = pl._1() - pl._2();
      return diff * diff;
    }
  }).reduce(new Function2<Double, Double, Double>() {
    @Override
    public Double call(Double a, Double b) {
      return a + b;
    }
  }) / data.count();
  System.out.println("Test Mean Squared Error: " + testMSE);
  System.out.println("Learned regression GBT model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myGradientBoostingRegressionModel");
  GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(jsc.sc(),
    "target/tmp/myGradientBoostingRegressionModel");
  // $example off$

  jsc.stop();
}
Example #17
Source File: JavaSVMWithSGDExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaSVMWithSGDExample");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint> training = data.sample(false, 0.6, 11L);
  training.cache();
  JavaRDD<LabeledPoint> test = data.subtract(training);

  // Run training algorithm to build the model.
  int numIterations = 100;
  final SVMModel model = SVMWithSGD.train(training.rdd(), numIterations);

  // Clear the default threshold.
  model.clearThreshold();

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> scoreAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double score = model.predict(p.features());
        return new Tuple2<Object, Object>(score, p.label());
      }
    }
  );

  // Get evaluation metrics.
  BinaryClassificationMetrics metrics =
    new BinaryClassificationMetrics(JavaRDD.toRDD(scoreAndLabels));
  double auROC = metrics.areaUnderROC();

  System.out.println("Area under ROC = " + auROC);

  // Save and load model
  model.save(sc, "target/tmp/javaSVMWithSGDModel");
  SVMModel sameModel = SVMModel.load(sc, "target/tmp/javaSVMWithSGDModel");
  // $example off$

  sc.stop();
}
Example #18
Source File: JavaGradientBoostingClassificationExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf()
    .setAppName("JavaGradientBoostedTreesClassificationExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Train a GradientBoostedTrees model.
  // The defaultParams for Classification use LogLoss by default.
  BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Classification");
  boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice.
  boostingStrategy.getTreeStrategy().setNumClasses(2);
  boostingStrategy.getTreeStrategy().setMaxDepth(5);
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo);

  final GradientBoostedTreesModel model =
    GradientBoostedTrees.train(trainingData, boostingStrategy);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testErr = 1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
    @Override
    public Boolean call(Tuple2<Double, Double> pl) {
      return !pl._1().equals(pl._2());
    }
  }).count() / testData.count();

  System.out.println("Test Error: " + testErr);
  System.out.println("Learned classification GBT model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myGradientBoostingClassificationModel");
  GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(jsc.sc(),
    "target/tmp/myGradientBoostingClassificationModel");
  // $example off$

  jsc.stop();
}
Example #19
Source File: JavaMulticlassClassificationMetricsExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics Example");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_multiclass_classification_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L);
  JavaRDD<LabeledPoint> training = splits[0].cache();
  JavaRDD<LabeledPoint> test = splits[1];

  // Run training algorithm to build the model.
  final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
    .setNumClasses(3)
    .run(training.rdd());

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double prediction = model.predict(p.features());
        return new Tuple2<Object, Object>(prediction, p.label());
      }
    }
  );

  // Get evaluation metrics.
  MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());

  // Confusion matrix
  Matrix confusion = metrics.confusionMatrix();
  System.out.println("Confusion matrix: \n" + confusion);

  // Overall statistics
  System.out.println("Accuracy = " + metrics.accuracy());

  // Stats by labels
  for (int i = 0; i < metrics.labels().length; i++) {
    System.out.format("Class %f precision = %f\n", metrics.labels()[i],
      metrics.precision(metrics.labels()[i]));
    System.out.format("Class %f recall = %f\n", metrics.labels()[i],
      metrics.recall(metrics.labels()[i]));
    System.out.format("Class %f F1 score = %f\n", metrics.labels()[i],
      metrics.fMeasure(metrics.labels()[i]));
  }

  //Weighted stats
  System.out.format("Weighted precision = %f\n", metrics.weightedPrecision());
  System.out.format("Weighted recall = %f\n", metrics.weightedRecall());
  System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure());
  System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate());

  // Save and load model
  model.save(sc, "target/tmp/LogisticRegressionModel");
  LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc,
    "target/tmp/LogisticRegressionModel");
  // $example off$
}
Example #20
Source File: JavaRandomForestClassificationExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestClassificationExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Train a RandomForest model.
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Integer numClasses = 2;
  HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  Integer numTrees = 3; // Use more in practice.
  String featureSubsetStrategy = "auto"; // Let the algorithm choose.
  String impurity = "gini";
  Integer maxDepth = 5;
  Integer maxBins = 32;
  Integer seed = 12345;

  final RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses,
    categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
    seed);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testErr = 1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
    @Override
    public Boolean call(Tuple2<Double, Double> pl) {
      return !pl._1().equals(pl._2());
    }
  }).count() / testData.count();

  System.out.println("Test Error: " + testErr);
  System.out.println("Learned classification forest model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myRandomForestClassificationModel");
  RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
    "target/tmp/myRandomForestClassificationModel");
  // $example off$

  jsc.stop();
}
Example #21
Source File: JavaRandomForestRegressionExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestRegressionExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Set parameters.
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  Integer numTrees = 3; // Use more in practice.
  String featureSubsetStrategy = "auto"; // Let the algorithm choose.
  String impurity = "variance";
  Integer maxDepth = 4;
  Integer maxBins = 32;
  Integer seed = 12345;
  // Train a RandomForest model.
  final RandomForestModel model = RandomForest.trainRegressor(trainingData,
    categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
    seed);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testMSE = predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
    @Override
    public Double call(Tuple2<Double, Double> pl) {
      Double diff = pl._1() - pl._2();
      return diff * diff;
    }
  }).reduce(new Function2<Double, Double, Double>() {
    @Override
    public Double call(Double a, Double b) {
      return a + b;
    }
  }) / testData.count();
  System.out.println("Test Mean Squared Error: " + testMSE);
  System.out.println("Learned regression forest model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myRandomForestRegressionModel");
  RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
    "target/tmp/myRandomForestRegressionModel");
  // $example off$

  jsc.stop();
}
Example #22
Source File: RandomForestMlib.java From Java-Data-Science-Cookbook with MIT License
public static void main(String args[]) {
  SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("Any");
  JavaSparkContext sc = new JavaSparkContext(configuration);

  // Load and parse the data file.
  String input = "data/rf-data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), input).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] dataSplits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = dataSplits[0];
  JavaRDD<LabeledPoint> testData = dataSplits[1];

  // Train a RandomForest model.
  Integer numClasses = 2;
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
  Integer numTrees = 3; // Use more in practice.
  String featureSubsetStrategy = "auto"; // Let the algorithm choose.
  String impurity = "gini";
  Integer maxDepth = 5;
  Integer maxBins = 32;
  Integer seed = 12345;

  final RandomForestModel rfModel = RandomForest.trainClassifier(trainingData, numClasses,
    categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
    seed);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> label =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<Double, Double>(rfModel.predict(p.features()), p.label());
      }
    });
  Double testError = 1.0 * label.filter(new Function<Tuple2<Double, Double>, Boolean>() {
    public Boolean call(Tuple2<Double, Double> pl) {
      return !pl._1().equals(pl._2());
    }
  }).count() / testData.count();

  System.out.println("Test Error: " + testError);
  System.out.println("Learned classification forest model:\n" + rfModel.toDebugString());
}