org.apache.spark.ml.classification.RandomForestClassificationModel Java Exaples

Source File: JavaRandomForestClassifierExample.java From SparkDemo with MIT License

6 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaRandomForestClassifierExample")
    .getOrCreate();

  // $example on$
  // Load and parse the data file, converting it to a DataFrame.
  Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

  // Index labels, adding metadata to the label column.
  // Fit on whole dataset to include all labels in index.
  StringIndexerModel labelIndexer = new StringIndexer()
    .setInputCol("label")
    .setOutputCol("indexedLabel")
    .fit(data);
  // Automatically identify categorical features, and index them.
  // Set maxCategories so features with > 4 distinct values are treated as continuous.
  VectorIndexerModel featureIndexer = new VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(4)
    .fit(data);

  // Split the data into training and test sets (30% held out for testing)
  Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3});
  Dataset<Row> trainingData = splits[0];
  Dataset<Row> testData = splits[1];

  // Train a RandomForest model.
  RandomForestClassifier rf = new RandomForestClassifier()
    .setLabelCol("indexedLabel")
    .setFeaturesCol("indexedFeatures");

  // Convert indexed labels back to original labels.
  IndexToString labelConverter = new IndexToString()
    .setInputCol("prediction")
    .setOutputCol("predictedLabel")
    .setLabels(labelIndexer.labels());

  // Chain indexers and forest in a Pipeline
  Pipeline pipeline = new Pipeline()
    .setStages(new PipelineStage[] {labelIndexer, featureIndexer, rf, labelConverter});

  // Train model. This also runs the indexers.
  PipelineModel model = pipeline.fit(trainingData);

  // Make predictions.
  Dataset<Row> predictions = model.transform(testData);

  // Select example rows to display.
  predictions.select("predictedLabel", "label", "features").show(5);

  // Select (prediction, true label) and compute test error
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("indexedLabel")
    .setPredictionCol("prediction")
    .setMetricName("accuracy");
  double accuracy = evaluator.evaluate(predictions);
  System.out.println("Test Error = " + (1.0 - accuracy));

  RandomForestClassificationModel rfModel = (RandomForestClassificationModel)(model.stages()[2]);
  System.out.println("Learned classification forest model:\n" + rfModel.toDebugString());
  // $example off$

  spark.stop();
}

Source File: EntitySalienceSpark.java From ambiverse-nlu with Apache License 2.0

5 votes

@Override
public void process(JCas jCas) throws AnalysisEngineProcessException {
    long startTime = System.currentTimeMillis();

    FeatureExtractor fe = new NYTEntitySalienceFeatureExtractor();
    List<EntityInstance> entityInstances;
    try {
        entityInstances = fe.getEntityInstances(jCas, TrainingSettings.FeatureExtractor.ENTITY_SALIENCE);

        final int featureVectorSize = FeatureSetFactory.createFeatureSet(TrainingSettings.FeatureExtractor.ENTITY_SALIENCE).getFeatureVectorSize();

        //TODO: For each model create separate implementation.
        RandomForestClassificationModel rfm = (RandomForestClassificationModel)trainingModel.stages()[2];
        for(EntityInstance ei : entityInstances) {
            Vector vei = FeatureValueInstanceUtils.convertToSparkMLVector(ei, featureVectorSize);

            double label = rfm.predict(vei);
            Vector probabilities = rfm.predictProbability(vei);
            double salience = probabilities.toArray()[1];

            SalientEntity salientEntity = new SalientEntity(jCas, 0, 0);
            salientEntity.setLabel(label);
            salientEntity.setID(ei.getEntityId());
            salientEntity.setSalience(salience);
            salientEntity.addToIndexes();
        }
        long endTime = System.currentTimeMillis() - startTime;
        logger.debug("Annotating salient entities finished in {}ms.", endTime);


    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }

}

Source File: RandomForestClassificationModelInfoAdapter.java From spark-transformers with Apache License 2.0

5 votes

@Override
RandomForestModelInfo getModelInfo(final RandomForestClassificationModel sparkRfModel, final DataFrame df) {
    final RandomForestModelInfo modelInfo = new RandomForestModelInfo();

    modelInfo.setNumClasses(sparkRfModel.numClasses());
    modelInfo.setNumFeatures(sparkRfModel.numFeatures());
    modelInfo.setRegression(false); //false for classification

    final List<Double> treeWeights = new ArrayList<Double>();
    for (double w : sparkRfModel.treeWeights()) {
        treeWeights.add(w);
    }
    modelInfo.setTreeWeights(treeWeights);

    final List<DecisionTreeModelInfo> decisionTrees = new ArrayList<>();
    for (DecisionTreeModel decisionTreeModel : sparkRfModel.trees()) {
        decisionTrees.add(DECISION_TREE_ADAPTER.getModelInfo((DecisionTreeClassificationModel) decisionTreeModel, df));
    }
    modelInfo.setTrees(decisionTrees);

    final Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(sparkRfModel.getFeaturesCol());
    inputKeys.add(sparkRfModel.getLabelCol());
    modelInfo.setInputKeys(inputKeys);

    final Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(sparkRfModel.getPredictionCol());
    outputKeys.add(sparkRfModel.getProbabilityCol());
    outputKeys.add(sparkRfModel.getRawPredictionCol());
    modelInfo.setProbabilityKey(sparkRfModel.getProbabilityCol());
    modelInfo.setRawPredictionKey(sparkRfModel.getRawPredictionCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}

Source File: RandomForestClassificationModelConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0

4 votes

public RandomForestClassificationModelConverter(RandomForestClassificationModel model){
	super(model);
}

Source File: RandomForestClassificationModelInfoAdapter.java From spark-transformers with Apache License 2.0

4 votes

@Override
public Class<RandomForestClassificationModel> getSource() {
    return RandomForestClassificationModel.class;
}

Source File: RandomForestClassificationModelInfoAdapterBridgeTest.java From spark-transformers with Apache License 2.0

4 votes

@Test
public void testRandomForestClassification() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm");

    StringIndexerModel stringIndexerModel = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex")
            .fit(data);

    data = stringIndexerModel.transform(data);

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a RandomForest model.
    RandomForestClassificationModel classificationModel = new RandomForestClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features")
            .setPredictionCol("prediction")
            .setRawPredictionCol("rawPrediction")
            .setProbabilityCol("probability")
            .fit(trainingData);


    byte[] exportedModel = ModelExporter.export(classificationModel, null);

    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = classificationModel.transform(testData).select("features", "prediction", "rawPrediction", "probability").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(0);
        double actual = row.getDouble(1);
        double [] actualProbability = ((Vector) row.get(3)).toArray();
        double[] actualRaw = ((Vector) row.get(2)).toArray();

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put(transformer.getInputKeys().iterator().next(), v.toArray());
        transformer.transform(inputData);
        double predicted = (double) inputData.get("prediction");
        double[] probability = (double[]) inputData.get("probability");
        double[] rawPrediction = (double[]) inputData.get("rawPrediction");

        assertEquals(actual, predicted, EPSILON);
        assertArrayEquals(actualProbability, probability, EPSILON);
        assertArrayEquals(actualRaw, rawPrediction, EPSILON);


    }

}

org.apache.spark.ml.classification.RandomForestClassificationModel Java Examples