org.apache.spark.mllib.util.MLUtils Java Examples
The following examples show how to use org.apache.spark.mllib.util.MLUtils.
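Common to almost every example below is the same entry point: MLUtils.loadLibSVMFile parses a LIBSVM-formatted text file into an RDD of LabeledPoint instances, which is then bridged to the Java API with toJavaRDD(). The following is a minimal, self-contained sketch of that pattern; the file path data/sample.libsvm and the class name MLUtilsSketch are placeholders of ours, not taken from any project below, and a local Spark context is assumed.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;

public class MLUtilsSketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("MLUtilsSketch");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    // Each input line "label index1:value1 index2:value2 ..." becomes one LabeledPoint;
    // the feature count is inferred from the data. The path below is a placeholder.
    JavaRDD<LabeledPoint> data =
        MLUtils.loadLibSVMFile(jsc.sc(), "data/sample.libsvm").toJavaRDD();
    System.out.println("Loaded " + data.count() + " labeled points");
    jsc.stop();
  }
}

The same call also has an overload taking explicit numFeatures and minPartitions arguments, which Example #12 below uses.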
Example #1
Source File: LogisticRegressionExporterTest.java From spark-transformers with Apache License 2.0
@Test
public void shouldExportAndImportCorrectly() {
  String datapath = "src/test/resources/binary_classification_test.libsvm";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

  //Train model in spark
  LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(data.rdd());

  //Export this model
  byte[] exportedModel = ModelExporter.export(lrmodel, null);

  //Import it back
  LogisticRegressionModelInfo importedModel =
      (LogisticRegressionModelInfo) ModelImporter.importModelInfo(exportedModel);

  //Check that the two models agree field by field
  //(edge cases aside, e.g. the order of elements in a list may change)
  assertEquals(lrmodel.intercept(), importedModel.getIntercept(), EPSILON);
  assertEquals(lrmodel.numClasses(), importedModel.getNumClasses(), EPSILON);
  assertEquals(lrmodel.numFeatures(), importedModel.getNumFeatures(), EPSILON);
  assertEquals((double) lrmodel.getThreshold().get(), importedModel.getThreshold(), EPSILON);
  for (int i = 0; i < importedModel.getNumFeatures(); i++)
    assertEquals(lrmodel.weights().toArray()[i], importedModel.getWeights()[i], EPSILON);
}
Example #2
Source File: LogisticRegressionExporterTest.java From spark-transformers with Apache License 2.0
@Test
public void shouldExportAndImportCorrectly() {
  String datapath = "src/test/resources/binary_classification_test.libsvm";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

  //Train model in spark
  LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(data.rdd());

  //Export this model
  byte[] exportedModel = ModelExporter.export(lrmodel);

  //Import it back
  LogisticRegressionModelInfo importedModel =
      (LogisticRegressionModelInfo) ModelImporter.importModelInfo(exportedModel);

  //Check that the two models agree field by field
  //(edge cases aside, e.g. the order of elements in a list may change)
  assertEquals(lrmodel.intercept(), importedModel.getIntercept(), 0.01);
  assertEquals(lrmodel.numClasses(), importedModel.getNumClasses(), 0.01);
  assertEquals(lrmodel.numFeatures(), importedModel.getNumFeatures(), 0.01);
  assertEquals((double) lrmodel.getThreshold().get(), importedModel.getThreshold(), 0.01);
  for (int i = 0; i < importedModel.getNumFeatures(); i++)
    assertEquals(lrmodel.weights().toArray()[i], importedModel.getWeights()[i], 0.01);
}
Example #3
Source File: TestSparkMultiLayerParameterAveraging.java From deeplearning4j with Apache License 2.0
@Test
public void testFromSvmLight() throws Exception {
  JavaRDD<LabeledPoint> data = MLUtils
      .loadLibSVMFile(sc.sc(),
          new ClassPathResource("svmLight/iris_svmLight_0.txt")
              .getTempFileFromArchive().getAbsolutePath())
      .toJavaRDD().map(new TestFn());

  MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(123)
      .updater(new Adam(1e-6))
      .weightInit(WeightInit.XAVIER)
      .list()
      .layer(new BatchNormalization.Builder().nIn(4).nOut(4).build())
      .layer(new DenseLayer.Builder().nIn(4).nOut(32).activation(Activation.RELU).build())
      .layer(new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
          .nIn(32).nOut(3).activation(Activation.SOFTMAX).build())
      .build();

  MultiLayerNetwork network = new MultiLayerNetwork(conf);
  network.init();
  System.out.println("Initializing network");

  SparkDl4jMultiLayer master = new SparkDl4jMultiLayer(sc, getBasicConf(),
      new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0));

  master.fitLabeledPoint(data);
}
Example #4
Source File: TestSparkMultiLayerParameterAveraging.java From deeplearning4j with Apache License 2.0
@Test
public void testFromSvmLightBackprop() throws Exception {
  JavaRDD<LabeledPoint> data = MLUtils
      .loadLibSVMFile(sc.sc(),
          new ClassPathResource("svmLight/iris_svmLight_0.txt")
              .getTempFileFromArchive().getAbsolutePath())
      .toJavaRDD().map(new TestFn());

  DataSet d = new IrisDataSetIterator(150, 150).next();
  MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().seed(123)
      .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).list()
      .layer(0, new DenseLayer.Builder().nIn(4).nOut(100).weightInit(WeightInit.XAVIER)
          .activation(Activation.RELU).build())
      .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(
          LossFunctions.LossFunction.MCXENT).nIn(100).nOut(3)
          .activation(Activation.SOFTMAX).weightInit(WeightInit.XAVIER)
          .build())
      .build();

  MultiLayerNetwork network = new MultiLayerNetwork(conf);
  network.init();
  System.out.println("Initializing network");

  SparkDl4jMultiLayer master = new SparkDl4jMultiLayer(sc, conf,
      new ParameterAveragingTrainingMaster(true, numExecutors(), 1, 5, 1, 0));

  MultiLayerNetwork network2 = master.fitLabeledPoint(data);
}
Example #5
Source File: JavaNaiveBayesExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
  JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
  JavaRDD<LabeledPoint> training = tmp[0]; // training set
  JavaRDD<LabeledPoint> test = tmp[1]; // test set
  final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
  JavaPairRDD<Double, Double> predictionAndLabel =
    test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
    @Override
    public Boolean call(Tuple2<Double, Double> pl) {
      return pl._1().equals(pl._2());
    }
  }).count() / (double) test.count();

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myNaiveBayesModel");
  NaiveBayesModel sameModel = NaiveBayesModel.load(jsc.sc(), "target/tmp/myNaiveBayesModel");
  // $example off$

  jsc.stop();
}
Example #6
Source File: LogisticRegression1BridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
  //prepare data
  String datapath = "src/test/resources/binary_classification_test.libsvm";
  DataFrame trainingData = sqlContext.read().format("libsvm").load(datapath);

  //Train model in spark
  LogisticRegressionModel lrmodel = new LogisticRegression().fit(trainingData);

  //Export this model
  byte[] exportedModel = ModelExporter.export(lrmodel, trainingData);

  //Import and get Transformer
  Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

  //validate predictions
  List<LabeledPoint> testPoints = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD().collect();
  for (LabeledPoint i : testPoints) {
    Vector v = i.features();
    double actual = lrmodel.predict(v);

    Map<String, Object> data = new HashMap<String, Object>();
    data.put("features", v.toArray());
    transformer.transform(data);
    double predicted = (double) data.get("prediction");

    assertEquals(actual, predicted, EPSILON);
  }
}
Example #7
Source File: LogisticRegressionBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
  //prepare data
  String datapath = "src/test/resources/binary_classification_test.libsvm";
  JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(sc.sc(), datapath).toJavaRDD();

  //Train model in spark
  LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

  //Export this model
  byte[] exportedModel = ModelExporter.export(lrmodel, null);

  //Import and get Transformer
  Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

  //validate predictions
  List<LabeledPoint> testPoints = trainingData.collect();
  for (LabeledPoint i : testPoints) {
    Vector v = i.features();
    double actual = lrmodel.predict(v);

    Map<String, Object> data = new HashMap<String, Object>();
    data.put("features", v.toArray());
    transformer.transform(data);
    double predicted = (double) data.get("prediction");

    assertEquals(actual, predicted, EPSILON);
  }
}
Example #8
Source File: JavaLogisticRegressionWithLBFGSExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaLogisticRegressionWithLBFGSExample");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[] {0.6, 0.4}, 11L);
  JavaRDD<LabeledPoint> training = splits[0].cache();
  JavaRDD<LabeledPoint> test = splits[1];

  // Run training algorithm to build the model.
  final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
    .setNumClasses(10)
    .run(training.rdd());

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double prediction = model.predict(p.features());
        return new Tuple2<Object, Object>(prediction, p.label());
      }
    }
  );

  // Get evaluation metrics.
  MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());
  double accuracy = metrics.accuracy();
  System.out.println("Accuracy = " + accuracy);

  // Save and load model
  model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel");
  LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc,
    "target/tmp/javaLogisticRegressionWithLBFGSModel");
  // $example off$

  sc.stop();
}
Example #9
Source File: LogisticRegression1BridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
  //prepare data
  String datapath = "src/test/resources/binary_classification_test.libsvm";
  Dataset<Row> trainingData = spark.read().format("libsvm").load(datapath);

  //Train model in spark
  LogisticRegressionModel lrmodel = new LogisticRegression().fit(trainingData);

  //Export this model
  byte[] exportedModel = ModelExporter.export(lrmodel);

  //Import and get Transformer
  Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

  //validate predictions
  List<LabeledPoint> testPoints = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD().collect();
  for (LabeledPoint i : testPoints) {
    Vector v = i.features().asML();
    double actual = lrmodel.predict(v);

    Map<String, Object> data = new HashMap<String, Object>();
    data.put("features", v.toArray());
    transformer.transform(data);
    double predicted = (double) data.get("prediction");

    assertEquals(actual, predicted, 0.01);
  }
}
Example #10
Source File: LogisticRegressionBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testLogisticRegression() {
  //prepare data
  String datapath = "src/test/resources/binary_classification_test.libsvm";
  JavaRDD<LabeledPoint> trainingData = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();

  //Train model in spark
  LogisticRegressionModel lrmodel = new LogisticRegressionWithSGD().run(trainingData.rdd());

  //Export this model
  byte[] exportedModel = ModelExporter.export(lrmodel);

  //Import and get Transformer
  Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

  //validate predictions
  List<LabeledPoint> testPoints = trainingData.collect();
  for (LabeledPoint i : testPoints) {
    Vector v = i.features();
    double actual = lrmodel.predict(v);

    Map<String, Object> data = new HashMap<String, Object>();
    data.put("features", v.toArray());
    transformer.transform(data);
    double predicted = (double) data.get("prediction");

    assertEquals(actual, predicted, 0.01);
  }
}
Example #11
Source File: JavaDecisionTreeRegressionExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTreeRegressionExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);

  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Set parameters.
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  String impurity = "variance";
  Integer maxDepth = 5;
  Integer maxBins = 32;

  // Train a DecisionTree model.
  final DecisionTreeModel model = DecisionTree.trainRegressor(trainingData,
    categoricalFeaturesInfo, impurity, maxDepth, maxBins);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testMSE = predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
    @Override
    public Double call(Tuple2<Double, Double> pl) {
      Double diff = pl._1() - pl._2();
      return diff * diff;
    }
  }).reduce(new Function2<Double, Double, Double>() {
    @Override
    public Double call(Double a, Double b) {
      return a + b;
    }
  }) / data.count();
  System.out.println("Test Mean Squared Error: " + testMSE);
  System.out.println("Learned regression tree model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel");
  DecisionTreeModel sameModel = DecisionTreeModel
    .load(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel");
  // $example off$
}
Example #12
Source File: RDDConverterUtils.java From systemds with Apache License 2.0
/**
 * Converts a libsvm text input file into two binary block matrices for features
 * and labels, and saves these to the specified output files. This call also deletes
 * existing files at the specified output locations, as well as determines and
 * writes the meta data files of both output matrices.
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing
 * the libsvm input files in order to ensure consistency with Spark.
 *
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn,
    String pathX, String pathY, DataCharacteristics mcOutX) {
  if( !mcOutX.dimsKnown() )
    throw new DMLRuntimeException("Matrix characteristics "
      + "required to convert sparse input representation.");
  try {
    //cleanup existing output files
    HDFSTool.deleteFileIfExistOnHDFS(pathX);
    HDFSTool.deleteFileIfExistOnHDFS(pathY);

    //convert libsvm to labeled points
    int numFeatures = (int) mcOutX.getCols();
    int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
    JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints =
      MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();

    //append row index and best-effort caching to avoid repeated text parsing
    JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint,Long> ilpoints =
      lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK());

    //extract labels and convert to binary block
    DataCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getBlocksize(), -1);
    LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
    JavaPairRDD<MatrixIndexes,MatrixBlock> out1 = ilpoints
      .mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
    int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
    out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
    out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
    mc1.setNonZeros(aNnz1.value()); //update nnz after triggered save
    HDFSTool.writeMetaDataFile(pathY+".mtd", ValueType.FP64, mc1, FileFormat.BINARY);

    //extract data and convert to binary block
    DataCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getBlocksize(), -1);
    LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
    JavaPairRDD<MatrixIndexes,MatrixBlock> out2 = ilpoints
      .mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
    out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
    out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
    mc2.setNonZeros(aNnz2.value()); //update nnz after triggered save
    HDFSTool.writeMetaDataFile(pathX+".mtd", ValueType.FP64, mc2, FileFormat.BINARY);

    //asynchronous cleanup of cached intermediates
    ilpoints.unpersist(false);
  }
  catch(IOException ex) {
    throw new DMLRuntimeException(ex);
  }
}
Example #13
Source File: JavaBinaryClassificationMetricsExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("Java Binary Classification Metrics Example");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_binary_classification_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L);
  JavaRDD<LabeledPoint> training = splits[0].cache();
  JavaRDD<LabeledPoint> test = splits[1];

  // Run training algorithm to build the model.
  final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
    .setNumClasses(2)
    .run(training.rdd());

  // Clear the prediction threshold so the model will return probabilities
  model.clearThreshold();

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      @Override
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double prediction = model.predict(p.features());
        return new Tuple2<Object, Object>(prediction, p.label());
      }
    }
  );

  // Get evaluation metrics.
  BinaryClassificationMetrics metrics =
    new BinaryClassificationMetrics(predictionAndLabels.rdd());

  // Precision by threshold
  JavaRDD<Tuple2<Object, Object>> precision = metrics.precisionByThreshold().toJavaRDD();
  System.out.println("Precision by threshold: " + precision.collect());

  // Recall by threshold
  JavaRDD<Tuple2<Object, Object>> recall = metrics.recallByThreshold().toJavaRDD();
  System.out.println("Recall by threshold: " + recall.collect());

  // F Score by threshold
  JavaRDD<Tuple2<Object, Object>> f1Score = metrics.fMeasureByThreshold().toJavaRDD();
  System.out.println("F1 Score by threshold: " + f1Score.collect());

  JavaRDD<Tuple2<Object, Object>> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD();
  System.out.println("F2 Score by threshold: " + f2Score.collect());

  // Precision-recall curve
  JavaRDD<Tuple2<Object, Object>> prc = metrics.pr().toJavaRDD();
  System.out.println("Precision-recall curve: " + prc.collect());

  // Thresholds
  JavaRDD<Double> thresholds = precision.map(
    new Function<Tuple2<Object, Object>, Double>() {
      @Override
      public Double call(Tuple2<Object, Object> t) {
        return new Double(t._1().toString());
      }
    }
  );

  // ROC Curve
  JavaRDD<Tuple2<Object, Object>> roc = metrics.roc().toJavaRDD();
  System.out.println("ROC curve: " + roc.collect());

  // AUPRC
  System.out.println("Area under precision-recall curve = " + metrics.areaUnderPR());

  // AUROC
  System.out.println("Area under ROC = " + metrics.areaUnderROC());

  // Save and load model
  model.save(sc, "target/tmp/LogisticRegressionModel");
  LogisticRegressionModel.load(sc, "target/tmp/LogisticRegressionModel");
  // $example off$
}
Example #14
Source File: RDDConverterUtils.java From systemds with Apache License 2.0
/**
 * Converts a libsvm text input file into two binary block matrices for features
 * and labels, and saves these to the specified output files. This call also deletes
 * existing files at the specified output locations, as well as determines and
 * writes the meta data files of both output matrices.
 * <p>
 * Note: We use {@code org.apache.spark.mllib.util.MLUtils.loadLibSVMFile} for parsing
 * the libsvm input files in order to ensure consistency with Spark.
 *
 * @param sc java spark context
 * @param pathIn path to libsvm input file
 * @param pathX path to binary block output file of features
 * @param pathY path to binary block output file of labels
 * @param mcOutX matrix characteristics of output matrix X
 */
public static void libsvmToBinaryBlock(JavaSparkContext sc, String pathIn,
    String pathX, String pathY, DataCharacteristics mcOutX) {
  if( !mcOutX.dimsKnown() )
    throw new DMLRuntimeException("Matrix characteristics "
      + "required to convert sparse input representation.");
  try {
    //cleanup existing output files
    HDFSTool.deleteFileIfExistOnHDFS(pathX);
    HDFSTool.deleteFileIfExistOnHDFS(pathY);

    //convert libsvm to labeled points
    int numFeatures = (int) mcOutX.getCols();
    int numPartitions = SparkUtils.getNumPreferredPartitions(mcOutX, null);
    JavaRDD<org.apache.spark.mllib.regression.LabeledPoint> lpoints =
      MLUtils.loadLibSVMFile(sc.sc(), pathIn, numFeatures, numPartitions).toJavaRDD();

    //append row index and best-effort caching to avoid repeated text parsing
    JavaPairRDD<org.apache.spark.mllib.regression.LabeledPoint,Long> ilpoints =
      lpoints.zipWithIndex().persist(StorageLevel.MEMORY_AND_DISK());

    //extract labels and convert to binary block
    DataCharacteristics mc1 = new MatrixCharacteristics(mcOutX.getRows(), 1, mcOutX.getBlocksize(), -1);
    LongAccumulator aNnz1 = sc.sc().longAccumulator("nnz");
    JavaPairRDD<MatrixIndexes,MatrixBlock> out1 = ilpoints
      .mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc1, true, aNnz1));
    int numPartitions2 = SparkUtils.getNumPreferredPartitions(mc1, null);
    out1 = RDDAggregateUtils.mergeByKey(out1, numPartitions2, false);
    out1.saveAsHadoopFile(pathY, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
    mc1.setNonZeros(aNnz1.value()); //update nnz after triggered save
    HDFSTool.writeMetaDataFile(pathY+".mtd", ValueType.FP64, mc1, OutputInfo.BinaryBlockOutputInfo);

    //extract data and convert to binary block
    DataCharacteristics mc2 = new MatrixCharacteristics(mcOutX.getRows(), mcOutX.getCols(), mcOutX.getBlocksize(), -1);
    LongAccumulator aNnz2 = sc.sc().longAccumulator("nnz");
    JavaPairRDD<MatrixIndexes,MatrixBlock> out2 = ilpoints
      .mapPartitionsToPair(new LabeledPointToBinaryBlockFunction(mc2, false, aNnz2));
    out2 = RDDAggregateUtils.mergeByKey(out2, numPartitions, false);
    out2.saveAsHadoopFile(pathX, MatrixIndexes.class, MatrixBlock.class, SequenceFileOutputFormat.class);
    mc2.setNonZeros(aNnz2.value()); //update nnz after triggered save
    HDFSTool.writeMetaDataFile(pathX+".mtd", ValueType.FP64, mc2, OutputInfo.BinaryBlockOutputInfo);

    //asynchronous cleanup of cached intermediates
    ilpoints.unpersist(false);
  }
  catch(IOException ex) {
    throw new DMLRuntimeException(ex);
  }
}
Example #15
Source File: JavaDecisionTreeClassificationExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTreeClassificationExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);

  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Set parameters.
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Integer numClasses = 2;
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  String impurity = "gini";
  Integer maxDepth = 5;
  Integer maxBins = 32;

  // Train a DecisionTree model for classification.
  final DecisionTreeModel model = DecisionTree.trainClassifier(trainingData, numClasses,
    categoricalFeaturesInfo, impurity, maxDepth, maxBins);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testErr = 1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
    @Override
    public Boolean call(Tuple2<Double, Double> pl) {
      return !pl._1().equals(pl._2());
    }
  }).count() / testData.count();

  System.out.println("Test Error: " + testErr);
  System.out.println("Learned classification tree model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");
  DecisionTreeModel sameModel = DecisionTreeModel
    .load(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");
  // $example off$
}
Example #16
Source File: JavaGradientBoostingRegressionExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf()
    .setAppName("JavaGradientBoostedTreesRegressionExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Train a GradientBoostedTrees model.
  // The defaultParams for Regression use SquaredError by default.
  BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Regression");
  boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice.
  boostingStrategy.getTreeStrategy().setMaxDepth(5);
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo);

  final GradientBoostedTreesModel model =
    GradientBoostedTrees.train(trainingData, boostingStrategy);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testMSE = predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
    @Override
    public Double call(Tuple2<Double, Double> pl) {
      Double diff = pl._1() - pl._2();
      return diff * diff;
    }
  }).reduce(new Function2<Double, Double, Double>() {
    @Override
    public Double call(Double a, Double b) {
      return a + b;
    }
  }) / data.count();
  System.out.println("Test Mean Squared Error: " + testMSE);
  System.out.println("Learned regression GBT model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myGradientBoostingRegressionModel");
  GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(jsc.sc(),
    "target/tmp/myGradientBoostingRegressionModel");
  // $example off$

  jsc.stop();
}
Example #17
Source File: JavaSVMWithSGDExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("JavaSVMWithSGDExample");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint> training = data.sample(false, 0.6, 11L);
  training.cache();
  JavaRDD<LabeledPoint> test = data.subtract(training);

  // Run training algorithm to build the model.
  int numIterations = 100;
  final SVMModel model = SVMWithSGD.train(training.rdd(), numIterations);

  // Clear the default threshold.
  model.clearThreshold();

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> scoreAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double score = model.predict(p.features());
        return new Tuple2<Object, Object>(score, p.label());
      }
    }
  );

  // Get evaluation metrics.
  BinaryClassificationMetrics metrics =
    new BinaryClassificationMetrics(JavaRDD.toRDD(scoreAndLabels));
  double auROC = metrics.areaUnderROC();

  System.out.println("Area under ROC = " + auROC);

  // Save and load model
  model.save(sc, "target/tmp/javaSVMWithSGDModel");
  SVMModel sameModel = SVMModel.load(sc, "target/tmp/javaSVMWithSGDModel");
  // $example off$

  sc.stop();
}
Example #18
Source File: JavaGradientBoostingClassificationExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf()
    .setAppName("JavaGradientBoostedTreesClassificationExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Train a GradientBoostedTrees model.
  // The defaultParams for Classification use LogLoss by default.
  BoostingStrategy boostingStrategy = BoostingStrategy.defaultParams("Classification");
  boostingStrategy.setNumIterations(3); // Note: Use more iterations in practice.
  boostingStrategy.getTreeStrategy().setNumClasses(2);
  boostingStrategy.getTreeStrategy().setMaxDepth(5);
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  boostingStrategy.treeStrategy().setCategoricalFeaturesInfo(categoricalFeaturesInfo);

  final GradientBoostedTreesModel model =
    GradientBoostedTrees.train(trainingData, boostingStrategy);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testErr = 1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
    @Override
    public Boolean call(Tuple2<Double, Double> pl) {
      return !pl._1().equals(pl._2());
    }
  }).count() / testData.count();

  System.out.println("Test Error: " + testErr);
  System.out.println("Learned classification GBT model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myGradientBoostingClassificationModel");
  GradientBoostedTreesModel sameModel = GradientBoostedTreesModel.load(jsc.sc(),
    "target/tmp/myGradientBoostingClassificationModel");
  // $example off$

  jsc.stop();
}
Example #19
Source File: JavaMulticlassClassificationMetricsExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("Multi class Classification Metrics Example");
  SparkContext sc = new SparkContext(conf);
  // $example on$
  String path = "data/mllib/sample_multiclass_classification_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

  // Split initial RDD into two... [60% training data, 40% testing data].
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.6, 0.4}, 11L);
  JavaRDD<LabeledPoint> training = splits[0].cache();
  JavaRDD<LabeledPoint> test = splits[1];

  // Run training algorithm to build the model.
  final LogisticRegressionModel model = new LogisticRegressionWithLBFGS()
    .setNumClasses(3)
    .run(training.rdd());

  // Compute raw scores on the test set.
  JavaRDD<Tuple2<Object, Object>> predictionAndLabels = test.map(
    new Function<LabeledPoint, Tuple2<Object, Object>>() {
      public Tuple2<Object, Object> call(LabeledPoint p) {
        Double prediction = model.predict(p.features());
        return new Tuple2<Object, Object>(prediction, p.label());
      }
    }
  );

  // Get evaluation metrics.
  MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd());

  // Confusion matrix
  Matrix confusion = metrics.confusionMatrix();
  System.out.println("Confusion matrix: \n" + confusion);

  // Overall statistics
  System.out.println("Accuracy = " + metrics.accuracy());

  // Stats by labels
  for (int i = 0; i < metrics.labels().length; i++) {
    System.out.format("Class %f precision = %f\n", metrics.labels()[i],
      metrics.precision(metrics.labels()[i]));
    System.out.format("Class %f recall = %f\n", metrics.labels()[i],
      metrics.recall(metrics.labels()[i]));
    System.out.format("Class %f F1 score = %f\n", metrics.labels()[i],
      metrics.fMeasure(metrics.labels()[i]));
  }

  //Weighted stats
  System.out.format("Weighted precision = %f\n", metrics.weightedPrecision());
  System.out.format("Weighted recall = %f\n", metrics.weightedRecall());
  System.out.format("Weighted F1 score = %f\n", metrics.weightedFMeasure());
  System.out.format("Weighted false positive rate = %f\n", metrics.weightedFalsePositiveRate());

  // Save and load model
  model.save(sc, "target/tmp/LogisticRegressionModel");
  LogisticRegressionModel sameModel = LogisticRegressionModel.load(sc,
    "target/tmp/LogisticRegressionModel");
  // $example off$
}
Example #20
Source File: JavaRandomForestClassificationExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestClassificationExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Train a RandomForest model.
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Integer numClasses = 2;
  HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  Integer numTrees = 3; // Use more in practice.
  String featureSubsetStrategy = "auto"; // Let the algorithm choose.
  String impurity = "gini";
  Integer maxDepth = 5;
  Integer maxBins = 32;
  Integer seed = 12345;

  final RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses,
    categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
    seed);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testErr = 1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
    @Override
    public Boolean call(Tuple2<Double, Double> pl) {
      return !pl._1().equals(pl._2());
    }
  }).count() / testData.count();

  System.out.println("Test Error: " + testErr);
  System.out.println("Learned classification forest model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myRandomForestClassificationModel");
  RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
    "target/tmp/myRandomForestClassificationModel");
  // $example off$

  jsc.stop();
}
Example #21
Source File: JavaRandomForestRegressionExample.java From SparkDemo with MIT License
public static void main(String[] args) {
  // $example on$
  SparkConf sparkConf = new SparkConf().setAppName("JavaRandomForestRegressionExample");
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  // Load and parse the data file.
  String datapath = "data/mllib/sample_libsvm_data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = splits[0];
  JavaRDD<LabeledPoint> testData = splits[1];

  // Set parameters.
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
  Integer numTrees = 3; // Use more in practice.
  String featureSubsetStrategy = "auto"; // Let the algorithm choose.
  String impurity = "variance";
  Integer maxDepth = 4;
  Integer maxBins = 32;
  Integer seed = 12345;
  // Train a RandomForest model.
  final RandomForestModel model = RandomForest.trainRegressor(trainingData,
    categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
    seed);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> predictionAndLabel =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      @Override
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<>(model.predict(p.features()), p.label());
      }
    });
  Double testMSE = predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
    @Override
    public Double call(Tuple2<Double, Double> pl) {
      Double diff = pl._1() - pl._2();
      return diff * diff;
    }
  }).reduce(new Function2<Double, Double, Double>() {
    @Override
    public Double call(Double a, Double b) {
      return a + b;
    }
  }) / testData.count();
  System.out.println("Test Mean Squared Error: " + testMSE);
  System.out.println("Learned regression forest model:\n" + model.toDebugString());

  // Save and load model
  model.save(jsc.sc(), "target/tmp/myRandomForestRegressionModel");
  RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
    "target/tmp/myRandomForestRegressionModel");
  // $example off$

  jsc.stop();
}
Example #22
Source File: RandomForestMlib.java From Java-Data-Science-Cookbook with MIT License
public static void main(String args[]) {
  SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("Any");
  JavaSparkContext sc = new JavaSparkContext(configuration);

  // Load and parse the data file.
  String input = "data/rf-data.txt";
  JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), input).toJavaRDD();
  // Split the data into training and test sets (30% held out for testing)
  JavaRDD<LabeledPoint>[] dataSplits = data.randomSplit(new double[]{0.7, 0.3});
  JavaRDD<LabeledPoint> trainingData = dataSplits[0];
  JavaRDD<LabeledPoint> testData = dataSplits[1];

  // Train a RandomForest model.
  Integer numClasses = 2;
  // Empty categoricalFeaturesInfo indicates all features are continuous.
  HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, Integer>();
  Integer numTrees = 3; // Use more in practice.
  String featureSubsetStrategy = "auto"; // Let the algorithm choose.
  String impurity = "gini";
  Integer maxDepth = 5;
  Integer maxBins = 32;
  Integer seed = 12345;

  final RandomForestModel rfModel = RandomForest.trainClassifier(trainingData, numClasses,
    categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
    seed);

  // Evaluate model on test instances and compute test error
  JavaPairRDD<Double, Double> label =
    testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
      public Tuple2<Double, Double> call(LabeledPoint p) {
        return new Tuple2<Double, Double>(rfModel.predict(p.features()), p.label());
      }
    });
  Double testError = 1.0 * label.filter(new Function<Tuple2<Double, Double>, Boolean>() {
    public Boolean call(Tuple2<Double, Double> pl) {
      return !pl._1().equals(pl._2());
    }
  }).count() / testData.count();

  System.out.println("Test Error: " + testError);
  System.out.println("Learned classification forest model:\n" + rfModel.toDebugString());
}