org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator Java Exaples

Source File: SparkClassificationModel.java From ambiverse-nlu with Apache License 2.0

6 votes

private Evaluator getEvaluator(TrainingSettings trainingSettings, Predictor predictor) {

        Evaluator evaluator = null;
        if(predictor instanceof RandomForestClassifier
               || predictor instanceof GBTClassifier
               || predictor instanceof DecisionTreeClassifier) {

            evaluator = new MulticlassClassificationEvaluator()
                    .setLabelCol("indexedLabel")
                    .setPredictionCol("prediction")
                    .setMetricName(trainingSettings.getMetricName());
        }

        if(predictor instanceof LogisticRegression) {
            evaluator = new BinaryClassificationEvaluator().setMetricName(trainingSettings.getMetricName());
        }

        return evaluator;
    }

Source File: JavaRandomForestClassifierExample.java From SparkDemo with MIT License

6 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaRandomForestClassifierExample")
    .getOrCreate();

  // $example on$
  // Load and parse the data file, converting it to a DataFrame.
  Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

  // Index labels, adding metadata to the label column.
  // Fit on whole dataset to include all labels in index.
  StringIndexerModel labelIndexer = new StringIndexer()
    .setInputCol("label")
    .setOutputCol("indexedLabel")
    .fit(data);
  // Automatically identify categorical features, and index them.
  // Set maxCategories so features with > 4 distinct values are treated as continuous.
  VectorIndexerModel featureIndexer = new VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(4)
    .fit(data);

  // Split the data into training and test sets (30% held out for testing)
  Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3});
  Dataset<Row> trainingData = splits[0];
  Dataset<Row> testData = splits[1];

  // Train a RandomForest model.
  RandomForestClassifier rf = new RandomForestClassifier()
    .setLabelCol("indexedLabel")
    .setFeaturesCol("indexedFeatures");

  // Convert indexed labels back to original labels.
  IndexToString labelConverter = new IndexToString()
    .setInputCol("prediction")
    .setOutputCol("predictedLabel")
    .setLabels(labelIndexer.labels());

  // Chain indexers and forest in a Pipeline
  Pipeline pipeline = new Pipeline()
    .setStages(new PipelineStage[] {labelIndexer, featureIndexer, rf, labelConverter});

  // Train model. This also runs the indexers.
  PipelineModel model = pipeline.fit(trainingData);

  // Make predictions.
  Dataset<Row> predictions = model.transform(testData);

  // Select example rows to display.
  predictions.select("predictedLabel", "label", "features").show(5);

  // Select (prediction, true label) and compute test error
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("indexedLabel")
    .setPredictionCol("prediction")
    .setMetricName("accuracy");
  double accuracy = evaluator.evaluate(predictions);
  System.out.println("Test Error = " + (1.0 - accuracy));

  RandomForestClassificationModel rfModel = (RandomForestClassificationModel)(model.stages()[2]);
  System.out.println("Learned classification forest model:\n" + rfModel.toDebugString());
  // $example off$

  spark.stop();
}

Source File: JavaNaiveBayesExample.java From SparkDemo with MIT License

5 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaNaiveBayesExample")
    .getOrCreate();

  // $example on$
  // Load training data
  Dataset<Row> dataFrame =
    spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
  // Split the data into train and test
  Dataset<Row>[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L);
  Dataset<Row> train = splits[0];
  Dataset<Row> test = splits[1];

  // create the trainer and set its parameters
  NaiveBayes nb = new NaiveBayes();

  // train the model
  NaiveBayesModel model = nb.fit(train);

  // Select example rows to display.
  Dataset<Row> predictions = model.transform(test);
  predictions.show();

  // compute accuracy on the test set
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("label")
    .setPredictionCol("prediction")
    .setMetricName("accuracy");
  double accuracy = evaluator.evaluate(predictions);
  System.out.println("Test set accuracy = " + accuracy);
  // $example off$

  spark.stop();
}

Source File: JavaMultilayerPerceptronClassifierExample.java From SparkDemo with MIT License

4 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaMultilayerPerceptronClassifierExample")
    .getOrCreate();

  // $example on$
  // Load training data
  String path = "data/mllib/sample_multiclass_classification_data.txt";
  Dataset<Row> dataFrame = spark.read().format("libsvm").load(path);

  // Split the data into train and test
  Dataset<Row>[] splits = dataFrame.randomSplit(new double[]{0.6, 0.4}, 1234L);
  Dataset<Row> train = splits[0];
  Dataset<Row> test = splits[1];

  // specify layers for the neural network:
  // input layer of size 4 (features), two intermediate of size 5 and 4
  // and output of size 3 (classes)
  int[] layers = new int[] {4, 5, 4, 3};

  // create the trainer and set its parameters
  MultilayerPerceptronClassifier trainer = new MultilayerPerceptronClassifier()
    .setLayers(layers)
    .setBlockSize(128)
    .setSeed(1234L)
    .setMaxIter(100);

  // train the model
  MultilayerPerceptronClassificationModel model = trainer.fit(train);

  // compute accuracy on the test set
  Dataset<Row> result = model.transform(test);
  Dataset<Row> predictionAndLabels = result.select("prediction", "label");
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
    .setMetricName("accuracy");

  System.out.println("Test set accuracy = " + evaluator.evaluate(predictionAndLabels));
  // $example off$

  spark.stop();
}

Source File: JavaGradientBoostedTreeClassifierExample.java From SparkDemo with MIT License

4 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaGradientBoostedTreeClassifierExample")
    .getOrCreate();

  // $example on$
  // Load and parse the data file, converting it to a DataFrame.
  Dataset<Row> data = spark
    .read()
    .format("libsvm")
    .load("data/mllib/sample_libsvm_data.txt");

  // Index labels, adding metadata to the label column.
  // Fit on whole dataset to include all labels in index.
  StringIndexerModel labelIndexer = new StringIndexer()
    .setInputCol("label")
    .setOutputCol("indexedLabel")
    .fit(data);
  // Automatically identify categorical features, and index them.
  // Set maxCategories so features with > 4 distinct values are treated as continuous.
  VectorIndexerModel featureIndexer = new VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(4)
    .fit(data);

  // Split the data into training and test sets (30% held out for testing)
  Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3});
  Dataset<Row> trainingData = splits[0];
  Dataset<Row> testData = splits[1];

  // Train a GBT model.
  GBTClassifier gbt = new GBTClassifier()
    .setLabelCol("indexedLabel")
    .setFeaturesCol("indexedFeatures")
    .setMaxIter(10);

  // Convert indexed labels back to original labels.
  IndexToString labelConverter = new IndexToString()
    .setInputCol("prediction")
    .setOutputCol("predictedLabel")
    .setLabels(labelIndexer.labels());

  // Chain indexers and GBT in a Pipeline.
  Pipeline pipeline = new Pipeline()
    .setStages(new PipelineStage[] {labelIndexer, featureIndexer, gbt, labelConverter});

  // Train model. This also runs the indexers.
  PipelineModel model = pipeline.fit(trainingData);

  // Make predictions.
  Dataset<Row> predictions = model.transform(testData);

  // Select example rows to display.
  predictions.select("predictedLabel", "label", "features").show(5);

  // Select (prediction, true label) and compute test error.
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("indexedLabel")
    .setPredictionCol("prediction")
    .setMetricName("accuracy");
  double accuracy = evaluator.evaluate(predictions);
  System.out.println("Test Error = " + (1.0 - accuracy));

  GBTClassificationModel gbtModel = (GBTClassificationModel)(model.stages()[2]);
  System.out.println("Learned classification GBT model:\n" + gbtModel.toDebugString());
  // $example off$

  spark.stop();
}

Source File: JavaDecisionTreeClassificationExample.java From SparkDemo with MIT License

4 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaDecisionTreeClassificationExample")
    .getOrCreate();

  // $example on$
  // Load the data stored in LIBSVM format as a DataFrame.
  Dataset<Row> data = spark
    .read()
    .format("libsvm")
    .load("data/mllib/sample_libsvm_data.txt");

  // Index labels, adding metadata to the label column.
  // Fit on whole dataset to include all labels in index.
  StringIndexerModel labelIndexer = new StringIndexer()
    .setInputCol("label")
    .setOutputCol("indexedLabel")
    .fit(data);

  // Automatically identify categorical features, and index them.
  VectorIndexerModel featureIndexer = new VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(4) // features with > 4 distinct values are treated as continuous.
    .fit(data);

  // Split the data into training and test sets (30% held out for testing).
  Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  Dataset<Row> trainingData = splits[0];
  Dataset<Row> testData = splits[1];

  // Train a DecisionTree model.
  DecisionTreeClassifier dt = new DecisionTreeClassifier()
    .setLabelCol("indexedLabel")
    .setFeaturesCol("indexedFeatures");

  // Convert indexed labels back to original labels.
  IndexToString labelConverter = new IndexToString()
    .setInputCol("prediction")
    .setOutputCol("predictedLabel")
    .setLabels(labelIndexer.labels());

  // Chain indexers and tree in a Pipeline.
  Pipeline pipeline = new Pipeline()
    .setStages(new PipelineStage[]{labelIndexer, featureIndexer, dt, labelConverter});

  // Train model. This also runs the indexers.
  PipelineModel model = pipeline.fit(trainingData);

  // Make predictions.
  Dataset<Row> predictions = model.transform(testData);

  // Select example rows to display.
  predictions.select("predictedLabel", "label", "features").show(5);

  // Select (prediction, true label) and compute test error.
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("indexedLabel")
    .setPredictionCol("prediction")
    .setMetricName("accuracy");
  double accuracy = evaluator.evaluate(predictions);
  System.out.println("Test Error = " + (1.0 - accuracy));

  DecisionTreeClassificationModel treeModel =
    (DecisionTreeClassificationModel) (model.stages()[2]);
  System.out.println("Learned classification tree model:\n" + treeModel.toDebugString());
  // $example off$

  spark.stop();
}

Source File: JavaOneVsRestExample.java From SparkDemo with MIT License

4 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaOneVsRestExample")
    .getOrCreate();

  // $example on$
  // load data file.
  Dataset<Row> inputData = spark.read().format("libsvm")
    .load("data/mllib/sample_multiclass_classification_data.txt");

  // generate the train/test split.
  Dataset<Row>[] tmp = inputData.randomSplit(new double[]{0.8, 0.2});
  Dataset<Row> train = tmp[0];
  Dataset<Row> test = tmp[1];

  // configure the base classifier.
  LogisticRegression classifier = new LogisticRegression()
    .setMaxIter(10)
    .setTol(1E-6)
    .setFitIntercept(true);

  // instantiate the One Vs Rest Classifier.
  OneVsRest ovr = new OneVsRest().setClassifier(classifier);

  // train the multiclass model.
  OneVsRestModel ovrModel = ovr.fit(train);

  // score the model on test data.
  Dataset<Row> predictions = ovrModel.transform(test)
    .select("prediction", "label");

  // obtain evaluator.
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
          .setMetricName("accuracy");

  // compute the classification error on test data.
  double accuracy = evaluator.evaluate(predictions);
  System.out.println("Test Error = " + (1 - accuracy));
  // $example off$

  spark.stop();
}

Source File: WhitespaceClassifier.java From vn.vitk with GNU General Public License v3.0

4 votes

/**
 * Trains a whitespace classifier model and save the resulting pipeline model
 * to an external file. 
 * @param sentences a list of tokenized sentences.
 * @param pipelineModelFileName
 * @param numFeatures
 */
public void train(List<String> sentences, String pipelineModelFileName, int numFeatures) {
	List<WhitespaceContext> contexts = new ArrayList<WhitespaceContext>(sentences.size());
	int id = 0;
	for (String sentence : sentences) {
		sentence = sentence.trim();
		for (int j = 0; j < sentence.length(); j++) {
			char c = sentence.charAt(j);
			if (c == ' ' || c == '_') {
				WhitespaceContext context = new WhitespaceContext();
				context.setId(id++);
				context.setContext(extractContext(sentence, j));
				context.setLabel(c == ' ' ? 0d : 1d);
				contexts.add(context);
			}
		}
	}
	JavaRDD<WhitespaceContext> jrdd = jsc.parallelize(contexts);
	DataFrame df = sqlContext.createDataFrame(jrdd, WhitespaceContext.class);
	df.show(false);
	System.out.println("N = " + df.count());
	df.groupBy("label").count().show();
	
	org.apache.spark.ml.feature.Tokenizer tokenizer = new Tokenizer()
			.setInputCol("context").setOutputCol("words");
	HashingTF hashingTF = new HashingTF().setNumFeatures(numFeatures)
			.setInputCol(tokenizer.getOutputCol()).setOutputCol("features");
	LogisticRegression lr = new LogisticRegression().setMaxIter(100)
			.setRegParam(0.01);
	Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] {
			tokenizer, hashingTF, lr });
	model = pipeline.fit(df);
	
	try {
		model.write().overwrite().save(pipelineModelFileName);
	} catch (IOException e) {
		e.printStackTrace();
	}
	
	DataFrame predictions = model.transform(df);
	predictions.show();
	MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator().setMetricName("precision");
	double accuracy = evaluator.evaluate(predictions);
	System.out.println("training accuracy = " + accuracy);
	
	LogisticRegressionModel lrModel = (LogisticRegressionModel) model.stages()[2];
	LogisticRegressionTrainingSummary trainingSummary = lrModel.summary();
	double[] objectiveHistory = trainingSummary.objectiveHistory();
	System.out.println("#(iterations) = " + objectiveHistory.length);
	for (double lossPerIteration : objectiveHistory) {
	  System.out.println(lossPerIteration);
	}
	
}

Source File: TransitionClassifier.java From vn.vitk with GNU General Public License v3.0

4 votes

/**
 * Trains a transition classifier on the data frame.
 * @param jsc
 * @param graphs
 * @param featureFrame
 * @param classifierFileName
 * @param numHiddenUnits
 * @return a transition classifier.
 */
public Transformer trainMLP(JavaSparkContext jsc,
		List<DependencyGraph> graphs, FeatureFrame featureFrame,
		String classifierFileName, int numHiddenUnits) {
	// create a SQLContext
	this.sqlContext = new SQLContext(jsc);
	// extract a data frame from these graphs
	DataFrame dataset = toDataFrame(jsc, graphs, featureFrame);
	
	// create a processing pipeline and fit it to the data frame
	Pipeline pipeline = createPipeline();
	PipelineModel pipelineModel = pipeline.fit(dataset);
	DataFrame trainingData = pipelineModel.transform(dataset);
	
	// cache the training data for better performance
	trainingData.cache();
	
	if (verbose) {
		trainingData.show(false);
	}
	
	// compute the number of different labels, which is the maximum element 
	// in the 'label' column.
	trainingData.registerTempTable("dfTable");
	Row row = sqlContext.sql("SELECT MAX(label) as maxValue from dfTable").first();
	int numLabels = (int)row.getDouble(0);
	numLabels++;
	
	int vocabSize = ((CountVectorizerModel)(pipelineModel.stages()[1])).getVocabSize();
	
	// default is a two-layer MLP
	int[] layers = {vocabSize, numLabels};
	// if user specify a hidden layer, use a 3-layer MLP:
	if (numHiddenUnits > 0) {
		layers = new int[3];
		layers[0] = vocabSize;
		layers[1] = numHiddenUnits;
		layers[2] = numLabels;
	}
	MultilayerPerceptronClassifier classifier = new MultilayerPerceptronClassifier()
		.setLayers(layers)
		.setBlockSize(128)
		.setSeed(1234L)
		.setTol((Double)params.getOrDefault(params.getTolerance()))
		.setMaxIter((Integer)params.getOrDefault(params.getMaxIter()));
	MultilayerPerceptronClassificationModel model = classifier.fit(trainingData);
	
	// compute precision on the training data
	//
	DataFrame result = model.transform(trainingData);
	DataFrame predictionAndLabel = result.select("prediction", "label");
	MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator().setMetricName("precision");
	if (verbose) {
		System.out.println("N = " + trainingData.count());
		System.out.println("D = " + vocabSize);
		System.out.println("K = " + numLabels);
		System.out.println("H = " + numHiddenUnits);
		System.out.println("training precision = " + evaluator.evaluate(predictionAndLabel));
	}
	
	// save the trained MLP to a file
	//
	String classifierPath = new Path(classifierFileName, "data").toString();
	jsc.parallelize(Arrays.asList(model), 1).saveAsObjectFile(classifierPath);
	// save the pipeline model to sub-directory "pipelineModel"
	// 
	try {
		String pipelinePath = new Path(classifierFileName, "pipelineModel").toString(); 
		pipelineModel.write().overwrite().save(pipelinePath);
	} catch (IOException e) {
		e.printStackTrace();
	}
	return model;
}

org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator Java Examples