org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator Java Examples
The following examples show how to use
org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SparkClassificationModel.java From ambiverse-nlu with Apache License 2.0 | 6 votes |
/**
 * Chooses the evaluator that goes with the given predictor.
 *
 * <p>A {@code LogisticRegression} predictor is paired with a
 * {@code BinaryClassificationEvaluator}; a random forest, gradient-boosted tree or
 * decision tree classifier is paired with a {@code MulticlassClassificationEvaluator}
 * that reads the "indexedLabel" and "prediction" columns. In both cases the metric name
 * is taken from the training settings.
 *
 * @param trainingSettings supplies the metric name via {@code getMetricName()}
 * @param predictor the predictor whose concrete type decides the evaluator
 * @return the matching evaluator, or {@code null} when the predictor is none of the
 *     handled types
 */
private Evaluator getEvaluator(TrainingSettings trainingSettings, Predictor predictor) {
  if (predictor instanceof LogisticRegression) {
    return new BinaryClassificationEvaluator()
        .setMetricName(trainingSettings.getMetricName());
  }

  boolean treeBased = predictor instanceof RandomForestClassifier
      || predictor instanceof GBTClassifier
      || predictor instanceof DecisionTreeClassifier;
  if (treeBased) {
    return new MulticlassClassificationEvaluator()
        .setLabelCol("indexedLabel")
        .setPredictionCol("prediction")
        .setMetricName(trainingSettings.getMetricName());
  }

  // No evaluator is defined for any other predictor type.
  return null;
}
Example #2
Source File: JavaRandomForestClassifierExample.java From SparkDemo with MIT License | 6 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaRandomForestClassifierExample") .getOrCreate(); // $example on$ // Load and parse the data file, converting it to a DataFrame. Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. StringIndexerModel labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") .fit(data); // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. VectorIndexerModel featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") .setMaxCategories(4) .fit(data); // Split the data into training and test sets (30% held out for testing) Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; // Train a RandomForest model. RandomForestClassifier rf = new RandomForestClassifier() .setLabelCol("indexedLabel") .setFeaturesCol("indexedFeatures"); // Convert indexed labels back to original labels. IndexToString labelConverter = new IndexToString() .setInputCol("prediction") .setOutputCol("predictedLabel") .setLabels(labelIndexer.labels()); // Chain indexers and forest in a Pipeline Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {labelIndexer, featureIndexer, rf, labelConverter}); // Train model. This also runs the indexers. PipelineModel model = pipeline.fit(trainingData); // Make predictions. Dataset<Row> predictions = model.transform(testData); // Select example rows to display. 
predictions.select("predictedLabel", "label", "features").show(5); // Select (prediction, true label) and compute test error MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() .setLabelCol("indexedLabel") .setPredictionCol("prediction") .setMetricName("accuracy"); double accuracy = evaluator.evaluate(predictions); System.out.println("Test Error = " + (1.0 - accuracy)); RandomForestClassificationModel rfModel = (RandomForestClassificationModel)(model.stages()[2]); System.out.println("Learned classification forest model:\n" + rfModel.toDebugString()); // $example off$ spark.stop(); }
Example #3
Source File: JavaNaiveBayesExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaNaiveBayesExample") .getOrCreate(); // $example on$ // Load training data Dataset<Row> dataFrame = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Split the data into train and test Dataset<Row>[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L); Dataset<Row> train = splits[0]; Dataset<Row> test = splits[1]; // create the trainer and set its parameters NaiveBayes nb = new NaiveBayes(); // train the model NaiveBayesModel model = nb.fit(train); // Select example rows to display. Dataset<Row> predictions = model.transform(test); predictions.show(); // compute accuracy on the test set MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("accuracy"); double accuracy = evaluator.evaluate(predictions); System.out.println("Test set accuracy = " + accuracy); // $example off$ spark.stop(); }
Example #4
Source File: JavaMultilayerPerceptronClassifierExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaMultilayerPerceptronClassifierExample") .getOrCreate(); // $example on$ // Load training data String path = "data/mllib/sample_multiclass_classification_data.txt"; Dataset<Row> dataFrame = spark.read().format("libsvm").load(path); // Split the data into train and test Dataset<Row>[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L); Dataset<Row> train = splits[0]; Dataset<Row> test = splits[1]; // specify layers for the neural network: // input layer of size 4 (features), two intermediate of size 5 and 4 // and output of size 3 (classes) int[] layers = new int[] {4, 5, 4, 3}; // create the trainer and set its parameters MultilayerPerceptronClassifier trainer = new MultilayerPerceptronClassifier() .setLayers(layers) .setBlockSize(128) .setSeed(1234L) .setMaxIter(100); // train the model MultilayerPerceptronClassificationModel model = trainer.fit(train); // compute accuracy on the test set Dataset<Row> result = model.transform(test); Dataset<Row> predictionAndLabels = result.select("prediction", "label"); MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() .setMetricName("accuracy"); System.out.println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels)); // $example off$ spark.stop(); }
Example #5
Source File: JavaGradientBoostedTreeClassifierExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaGradientBoostedTreeClassifierExample") .getOrCreate(); // $example on$ // Load and parse the data file, converting it to a DataFrame. Dataset<Row> data = spark .read() .format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. StringIndexerModel labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") .fit(data); // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. VectorIndexerModel featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") .setMaxCategories(4) .fit(data); // Split the data into training and test sets (30% held out for testing) Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; // Train a GBT model. GBTClassifier gbt = new GBTClassifier() .setLabelCol("indexedLabel") .setFeaturesCol("indexedFeatures") .setMaxIter(10); // Convert indexed labels back to original labels. IndexToString labelConverter = new IndexToString() .setInputCol("prediction") .setOutputCol("predictedLabel") .setLabels(labelIndexer.labels()); // Chain indexers and GBT in a Pipeline. Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[] {labelIndexer, featureIndexer, gbt, labelConverter}); // Train model. This also runs the indexers. PipelineModel model = pipeline.fit(trainingData); // Make predictions. Dataset<Row> predictions = model.transform(testData); // Select example rows to display. predictions.select("predictedLabel", "label", "features").show(5); // Select (prediction, true label) and compute test error. 
MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() .setLabelCol("indexedLabel") .setPredictionCol("prediction") .setMetricName("accuracy"); double accuracy = evaluator.evaluate(predictions); System.out.println("Test Error = " + (1.0 - accuracy)); GBTClassificationModel gbtModel = (GBTClassificationModel)(model.stages()[2]); System.out.println("Learned classification GBT model:\n" + gbtModel.toDebugString()); // $example off$ spark.stop(); }
Example #6
Source File: JavaDecisionTreeClassificationExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaDecisionTreeClassificationExample") .getOrCreate(); // $example on$ // Load the data stored in LIBSVM format as a DataFrame. Dataset<Row> data = spark .read() .format("libsvm") .load("data/mllib/sample_libsvm_data.txt"); // Index labels, adding metadata to the label column. // Fit on whole dataset to include all labels in index. StringIndexerModel labelIndexer = new StringIndexer() .setInputCol("label") .setOutputCol("indexedLabel") .fit(data); // Automatically identify categorical features, and index them. VectorIndexerModel featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") .setMaxCategories(4) // features with > 4 distinct values are treated as continuous. .fit(data); // Split the data into training and test sets (30% held out for testing). Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; // Train a DecisionTree model. DecisionTreeClassifier dt = new DecisionTreeClassifier() .setLabelCol("indexedLabel") .setFeaturesCol("indexedFeatures"); // Convert indexed labels back to original labels. IndexToString labelConverter = new IndexToString() .setInputCol("prediction") .setOutputCol("predictedLabel") .setLabels(labelIndexer.labels()); // Chain indexers and tree in a Pipeline. Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{labelIndexer, featureIndexer, dt, labelConverter}); // Train model. This also runs the indexers. PipelineModel model = pipeline.fit(trainingData); // Make predictions. Dataset<Row> predictions = model.transform(testData); // Select example rows to display. predictions.select("predictedLabel", "label", "features").show(5); // Select (prediction, true label) and compute test error. 
MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() .setLabelCol("indexedLabel") .setPredictionCol("prediction") .setMetricName("accuracy"); double accuracy = evaluator.evaluate(predictions); System.out.println("Test Error = " + (1.0 - accuracy)); DecisionTreeClassificationModel treeModel = (DecisionTreeClassificationModel) (model.stages()[2]); System.out.println("Learned classification tree model:\n" + treeModel.toDebugString()); // $example off$ spark.stop(); }
Example #7
Source File: JavaOneVsRestExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaOneVsRestExample") .getOrCreate(); // $example on$ // load data file. Dataset<Row> inputData = spark.read().format("libsvm") .load("data/mllib/sample_multiclass_classification_data.txt"); // generate the train/test split. Dataset<Row>[] tmp = inputData.randomSplit(new double[]{0.8, 0.2}); Dataset<Row> train = tmp[0]; Dataset<Row> test = tmp[1]; // configure the base classifier. LogisticRegression classifier = new LogisticRegression() .setMaxIter(10) .setTol(1E-6) .setFitIntercept(true); // instantiate the One Vs Rest Classifier. OneVsRest ovr = new OneVsRest().setClassifier(classifier); // train the multiclass model. OneVsRestModel ovrModel = ovr.fit(train); // score the model on test data. Dataset<Row> predictions = ovrModel.transform(test) .select("prediction", "label"); // obtain evaluator. MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator() .setMetricName("accuracy"); // compute the classification error on test data. double accuracy = evaluator.evaluate(predictions); System.out.println("Test Error = " + (1 - accuracy)); // $example off$ spark.stop(); }
Example #8
Source File: WhitespaceClassifier.java From vn.vitk with GNU General Public License v3.0 | 4 votes |
/**
 * Trains the whitespace classifier pipeline (tokenizer, hashing TF, logistic regression)
 * and saves the fitted pipeline model to the given file.
 *
 * <p>One training example is created for every space or underscore character found in
 * the trimmed sentences: a space is labelled 0 and an underscore is labelled 1. After
 * fitting, the model is evaluated on the same training data and the objective history of
 * the logistic regression stage is printed.
 *
 * @param sentences a list of tokenized sentences
 * @param pipelineModelFileName where the fitted pipeline model is written (overwriting)
 * @param numFeatures number of features used by the hashing TF stage
 */
public void train(List<String> sentences, String pipelineModelFileName, int numFeatures) {
  List<WhitespaceContext> examples = new ArrayList<WhitespaceContext>(sentences.size());
  int nextId = 0;
  for (String rawSentence : sentences) {
    String text = rawSentence.trim();
    int length = text.length();
    for (int position = 0; position < length; position++) {
      char ch = text.charAt(position);
      if (ch != ' ' && ch != '_') {
        continue;
      }
      WhitespaceContext example = new WhitespaceContext();
      example.setId(nextId++);
      example.setContext(extractContext(text, position));
      if (ch == ' ') {
        example.setLabel(0d);
      } else {
        example.setLabel(1d);
      }
      examples.add(example);
    }
  }

  // Turn the collected examples into a data frame and print some statistics.
  JavaRDD<WhitespaceContext> exampleRdd = jsc.parallelize(examples);
  DataFrame trainingFrame = sqlContext.createDataFrame(exampleRdd, WhitespaceContext.class);
  trainingFrame.show(false);
  System.out.println("N = " + trainingFrame.count());
  trainingFrame.groupBy("label").count().show();

  // Pipeline stages: context -> words -> hashed features -> logistic regression.
  org.apache.spark.ml.feature.Tokenizer wordTokenizer = new Tokenizer()
      .setInputCol("context")
      .setOutputCol("words");
  HashingTF termHasher = new HashingTF()
      .setNumFeatures(numFeatures)
      .setInputCol(wordTokenizer.getOutputCol())
      .setOutputCol("features");
  LogisticRegression regression = new LogisticRegression()
      .setMaxIter(100)
      .setRegParam(0.01);
  PipelineStage[] stages = {wordTokenizer, termHasher, regression};
  Pipeline pipeline = new Pipeline().setStages(stages);
  model = pipeline.fit(trainingFrame);

  // A failed save is reported on stderr; evaluation below still runs.
  try {
    model.write().overwrite().save(pipelineModelFileName);
  } catch (IOException e) {
    e.printStackTrace();
  }

  // Evaluate on the training data itself using the "precision" metric.
  DataFrame scored = model.transform(trainingFrame);
  scored.show();
  MulticlassClassificationEvaluator precisionEvaluator =
      new MulticlassClassificationEvaluator().setMetricName("precision");
  double accuracy = precisionEvaluator.evaluate(scored);
  System.out.println("training accuracy = " + accuracy);

  // Stage 2 of the fitted pipeline is the logistic regression model.
  LogisticRegressionModel regressionModel = (LogisticRegressionModel) model.stages()[2];
  LogisticRegressionTrainingSummary summary = regressionModel.summary();
  double[] history = summary.objectiveHistory();
  System.out.println("#(iterations) = " + history.length);
  for (int i = 0; i < history.length; i++) {
    System.out.println(history[i]);
  }
}
Example #9
Source File: TransitionClassifier.java From vn.vitk with GNU General Public License v3.0 | 4 votes |
/** * Trains a transition classifier on the data frame. * @param jsc * @param graphs * @param featureFrame * @param classifierFileName * @param numHiddenUnits * @return a transition classifier. */ public Transformer trainMLP(JavaSparkContext jsc, List<DependencyGraph> graphs, FeatureFrame featureFrame, String classifierFileName, int numHiddenUnits) { // create a SQLContext this.sqlContext = new SQLContext(jsc); // extract a data frame from these graphs DataFrame dataset = toDataFrame(jsc, graphs, featureFrame); // create a processing pipeline and fit it to the data frame Pipeline pipeline = createPipeline(); PipelineModel pipelineModel = pipeline.fit(dataset); DataFrame trainingData = pipelineModel.transform(dataset); // cache the training data for better performance trainingData.cache(); if (verbose) { trainingData.show(false); } // compute the number of different labels, which is the maximum element // in the 'label' column. trainingData.registerTempTable("dfTable"); Row row = sqlContext.sql("SELECT MAX(label) as maxValue from dfTable").first(); int numLabels = (int)row.getDouble(0); numLabels++; int vocabSize = ((CountVectorizerModel)(pipelineModel.stages()[1])).getVocabSize(); // default is a two-layer MLP int[] layers = {vocabSize, numLabels}; // if user specify a hidden layer, use a 3-layer MLP: if (numHiddenUnits > 0) { layers = new int[3]; layers[0] = vocabSize; layers[1] = numHiddenUnits; layers[2] = numLabels; } MultilayerPerceptronClassifier classifier = new MultilayerPerceptronClassifier() .setLayers(layers) .setBlockSize(128) .setSeed(1234L) .setTol((Double)params.getOrDefault(params.getTolerance())) .setMaxIter((Integer)params.getOrDefault(params.getMaxIter())); MultilayerPerceptronClassificationModel model = classifier.fit(trainingData); // compute precision on the training data // DataFrame result = model.transform(trainingData); DataFrame predictionAndLabel = result.select("prediction", "label"); MulticlassClassificationEvaluator evaluator = new 
MulticlassClassificationEvaluator().setMetricName("precision"); if (verbose) { System.out.println("N = " + trainingData.count()); System.out.println("D = " + vocabSize); System.out.println("K = " + numLabels); System.out.println("H = " + numHiddenUnits); System.out.println("training precision = " + evaluator.evaluate(predictionAndLabel)); } // save the trained MLP to a file // String classifierPath = new Path(classifierFileName, "data").toString(); jsc.parallelize(Arrays.asList(model), 1).saveAsObjectFile(classifierPath); // save the pipeline model to sub-directory "pipelineModel" // try { String pipelinePath = new Path(classifierFileName, "pipelineModel").toString(); pipelineModel.write().overwrite().save(pipelinePath); } catch (IOException e) { e.printStackTrace(); } return model; }