Java Code Examples for org.apache.spark.ml.PipelineModel#transform()

The following examples show how to use org.apache.spark.ml.PipelineModel#transform() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: JavaRandomForestClassifierExample.java From SparkDemo with MIT License

6 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaRandomForestClassifierExample")
    .getOrCreate();

  // $example on$
  // Load and parse the data file, converting it to a DataFrame.
  Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

  // Index labels, adding metadata to the label column.
  // Fit on whole dataset to include all labels in index.
  StringIndexerModel labelIndexer = new StringIndexer()
    .setInputCol("label")
    .setOutputCol("indexedLabel")
    .fit(data);
  // Automatically identify categorical features, and index them.
  // Set maxCategories so features with > 4 distinct values are treated as continuous.
  VectorIndexerModel featureIndexer = new VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(4)
    .fit(data);

  // Split the data into training and test sets (30% held out for testing)
  Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3});
  Dataset<Row> trainingData = splits[0];
  Dataset<Row> testData = splits[1];

  // Train a RandomForest model.
  RandomForestClassifier rf = new RandomForestClassifier()
    .setLabelCol("indexedLabel")
    .setFeaturesCol("indexedFeatures");

  // Convert indexed labels back to original labels.
  IndexToString labelConverter = new IndexToString()
    .setInputCol("prediction")
    .setOutputCol("predictedLabel")
    .setLabels(labelIndexer.labels());

  // Chain indexers and forest in a Pipeline
  Pipeline pipeline = new Pipeline()
    .setStages(new PipelineStage[] {labelIndexer, featureIndexer, rf, labelConverter});

  // Train model. This also runs the indexers.
  PipelineModel model = pipeline.fit(trainingData);

  // Make predictions.
  Dataset<Row> predictions = model.transform(testData);

  // Select example rows to display.
  predictions.select("predictedLabel", "label", "features").show(5);

  // Select (prediction, true label) and compute test error
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("indexedLabel")
    .setPredictionCol("prediction")
    .setMetricName("accuracy");
  double accuracy = evaluator.evaluate(predictions);
  System.out.println("Test Error = " + (1.0 - accuracy));

  RandomForestClassificationModel rfModel = (RandomForestClassificationModel)(model.stages()[2]);
  System.out.println("Learned classification forest model:\n" + rfModel.toDebugString());
  // $example off$

  spark.stop();
}

Example 2

Source File: SparkMLHouses.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License

5 votes

public static void main(String[] args) throws InterruptedException, StreamingQueryException {

                System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

                // * the schema can be written on disk, and read from disk
                // * the schema is not mandatory to be complete, it can contain only the needed fields    
                StructType HOUSES_SCHEMA = 
                       new StructType()
                           .add("House", LongType, true)
                           .add("Taxes", LongType, true)
                           .add("Bedrooms", LongType, true)
                           .add("Baths", FloatType, true)
                           .add("Quadrant", LongType, true)
                           .add("NW", StringType, true)
                           .add("Price($)", LongType, false)
                           .add("Size(sqft)", LongType, false)
                           .add("lot", LongType, true);

                final SparkConf conf = new SparkConf()
                    .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
                    .setAppName(APPLICATION_NAME)
                    .set("spark.sql.caseSensitive", CASE_SENSITIVE);

                SparkSession sparkSession = SparkSession.builder()
                    .config(conf)
                    .getOrCreate();

                Dataset<Row> housesDF = sparkSession.read()
                     .schema(HOUSES_SCHEMA)
                     .json(HOUSES_FILE_PATH);
             
                // Gathering Data				
                Dataset<Row> gatheredDF = housesDF.select(col("Taxes"), 
                    col("Bedrooms"), col("Baths"),
                    col("Size(sqft)"), col("Price($)"));
                
                // Data Preparation  
                Dataset<Row> labelDF = gatheredDF.withColumnRenamed("Price($)", "label");
                
                Imputer imputer = new Imputer()
                    // .setMissingValue(1.0d)
                    .setInputCols(new String[] { "Baths" })
                    .setOutputCols(new String[] { "~Baths~" });

                VectorAssembler assembler = new VectorAssembler()
                    .setInputCols(new String[] { "Taxes", "Bedrooms", "~Baths~", "Size(sqft)" })
                    .setOutputCol("features");
                
                // Choosing a Model               
                LinearRegression linearRegression = new LinearRegression();
                linearRegression.setMaxIter(1000);

                Pipeline pipeline = new Pipeline()
                                .setStages(new PipelineStage[] {
                                    imputer, assembler, linearRegression 
                                });

                // Training The Data
                Dataset<Row>[] splitDF = labelDF.randomSplit(new double[] { 0.8, 0.2 });

                Dataset<Row> trainDF = splitDF[0];
                Dataset<Row> evaluationDF = splitDF[1];

                PipelineModel pipelineModel = pipeline.fit(trainDF);
                
                // Evaluation 
                Dataset<Row> predictionsDF = pipelineModel.transform(evaluationDF);

                predictionsDF.show(false);

                Dataset<Row> forEvaluationDF = predictionsDF.select(col("label"), 
                    col("prediction"));

                RegressionEvaluator evaluteR2 = new RegressionEvaluator().setMetricName("r2");
                RegressionEvaluator evaluteRMSE = new RegressionEvaluator().setMetricName("rmse");

                double r2 = evaluteR2.evaluate(forEvaluationDF);
                double rmse = evaluteRMSE.evaluate(forEvaluationDF);

                logger.info("---------------------------");
                logger.info("R2 =" + r2);
                logger.info("RMSE =" + rmse);
                logger.info("---------------------------");
        }

Example 3

Source File: PMMLBuilder.java From jpmml-sparkml with GNU Affero General Public License v3.0

5 votes

public PMMLBuilder verify(Dataset<Row> dataset, double precision, double zeroThreshold){
	PipelineModel pipelineModel = getPipelineModel();

	Dataset<Row> transformedDataset = pipelineModel.transform(dataset);

	Verification verification = new Verification(dataset, transformedDataset)
		.setPrecision(precision)
		.setZeroThreshold(zeroThreshold);

	return setVerification(verification);
}

Example 4

Source File: SparkMLScoringOnline.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License

4 votes

public static void main(String[] args) throws InterruptedException, StreamingQueryException {
 
      System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

      // * the schema can be written on disk, and read from disk
      // * the schema is not mandatory to be complete, it can contain only the needed fields    
      StructType RSVP_SCHEMA = new StructType()                                
              .add("event",
                      new StructType()
                              .add("event_id", StringType, true)
                              .add("event_name", StringType, true)
                              .add("event_url", StringType, true)
                              .add("time", LongType, true))
              .add("group",
                      new StructType()
                              .add("group_city", StringType, true)
                              .add("group_country", StringType, true)
                              .add("group_id", LongType, true)
                              .add("group_lat", DoubleType, true)
                              .add("group_lon", DoubleType, true)
                              .add("group_name", StringType, true)
                              .add("group_state", StringType, true)
                              .add("group_topics", DataTypes.createArrayType(
                                      new StructType()
                                              .add("topicName", StringType, true)
                                              .add("urlkey", StringType, true)), true)
                              .add("group_urlname", StringType, true))
              .add("guests", LongType, true)
              .add("member",
                      new StructType()
                              .add("member_id", LongType, true)
                              .add("member_name", StringType, true)                                
                              .add("photo", StringType, true))
              .add("mtime", LongType, true)
              .add("response", StringType, true)
              .add("rsvp_id", LongType, true)
              .add("venue",
                      new StructType()
                              .add("lat", DoubleType, true)
                              .add("lon", DoubleType, true)
                              .add("venue_id", LongType, true)
                              .add("venue_name", StringType, true))
              .add("visibility", StringType, true);

      final SparkConf conf = new SparkConf()
              .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
              .setAppName(APPLICATION_NAME)
              .set("spark.sql.caseSensitive", CASE_SENSITIVE);

      SparkSession spark = SparkSession
              .builder()
              .config(conf)
              .getOrCreate();

      PipelineModel pipelineModel = PipelineModel.load(MODEL_FOLDER_PATH);
     
      Dataset<Row> meetupStream = spark.readStream()
              .format(KAFKA_FORMAT)
              .option("kafka.bootstrap.servers", KAFKA_BROKERS)
              .option("subscribe", KAFKA_TOPIC)
              .load();

      Dataset<Row> gatheredDF = meetupStream.select(
    (from_json(col("value").cast("string"), RSVP_SCHEMA))
	        .alias("rsvp"))
	.alias("meetup")
          .select("meetup.*");
		
      Dataset<Row> filteredDF = gatheredDF.filter(e -> !e.anyNull());  

      Dataset<Row> preparedDF = filteredDF.select(
        col("rsvp.group.group_city"),
        col("rsvp.group.group_lat"), col("rsvp.group.group_lon"), 
		col("rsvp.response")
);
		                
      preparedDF.printSchema();
   
      Dataset<Row> predictionDF = pipelineModel.transform(preparedDF);
      
      StreamingQuery query = predictionDF.writeStream()                
              .format(JSON_FORMAT)
              .option("path", RESULT_FOLDER_PATH)
              .option("checkpointLocation", CHECKPOINT_LOCATION)
              .trigger(Trigger.ProcessingTime(QUERY_INTERVAL_SECONDS))
              .option("truncate", false)
              .start();

      query.awaitTermination();
  }

Example 5

Source File: BikeRentalPrediction.java From Apache-Spark-2x-for-Java-Developers with MIT License

4 votes

public static void main(String[] args) {
	System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
	SparkSession sparkSession = SparkSession
			.builder()
			.master("local")
			.config("spark.sql.warehouse.dir",
					"file:///E:/sumitK/Hadoop/warehouse")
			.appName("BikeRentalPrediction").getOrCreate();
	Logger rootLogger = LogManager.getRootLogger();
	rootLogger.setLevel(Level.WARN);
	//We use the sqlContext.read method to read the data and set a few options:
	//  'format': specifies the Spark CSV data source
	//  'header': set to true to indicate that the first line of the CSV data file is a header
    // The file is called 'hour.csv'.	
	Dataset<Row> ds=sparkSession.read()
			  .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat")
			  .option("header", "true")
			  .load("E:\\sumitK\\Hadoop\\Bike-Sharing-Dataset\\hour.csv");
	
	ds.cache();
	
	ds.select("season").show();;
	
	ds.show();
	
	System.out.println("Our dataset has rows :: "+ ds.count());
	
	Dataset<Row> df = ds.drop("instant").drop("dteday").drop("casual").drop("registered");
	df.printSchema();
	//col("...") is preferable to df.col("...")
	Dataset<Row> dformatted = df.select(col("season").cast(DataTypes.IntegerType),
			                            col("yr").cast(DataTypes.IntegerType),
										col("mnth").cast(DataTypes.IntegerType),
										col("hr").cast(DataTypes.IntegerType),
										col("holiday").cast(DataTypes.IntegerType),
										col("weekday").cast(DataTypes.IntegerType),
										col("workingday").cast(DataTypes.IntegerType),
										col("weathersit").cast(DataTypes.IntegerType),
										col("temp").cast(DataTypes.IntegerType),
										col("atemp").cast(DataTypes.IntegerType),
										col("hum").cast(DataTypes.IntegerType),
										col("windspeed").cast(DataTypes.IntegerType),
										col("cnt").cast(DataTypes.IntegerType));
	
	
dformatted.printSchema();	
Dataset<Row>[] data=	dformatted.randomSplit(new double[]{0.7,0.3});
System.out.println("We have training examples count :: "+ data[0].count()+" and test examples count ::"+data[1].count());

///
//removing 'cnt' cloumn and then forming str array
String[] featuresCols = dformatted.drop("cnt").columns();

for(String str:featuresCols){
	System.out.println(str+" :: ");
}

//This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
VectorAssembler vectorAssembler = new VectorAssembler().setInputCols(featuresCols).setOutputCol("rawFeatures");
//This identifies categorical features and indexes them.
VectorIndexer vectorIndexer= new VectorIndexer().setInputCol("rawFeatures").setOutputCol("features").setMaxCategories(4);

//Takes the "features" column and learns to predict "cnt"
GBTRegressor gbt = new GBTRegressor().setLabelCol("cnt");
		
// Define a grid of hyperparameters to test:
  //  - maxDepth: max depth of each decision tree in the GBT ensemble
//  - maxIter: iterations, i.e., number of trees in each GBT ensemble
// In this example notebook, we keep these values small.  In practice, to get the highest accuracy, you would likely want to try deeper trees (10 or higher) and more trees in the ensemble (>100).
ParamMap[]	paramGrid = new ParamGridBuilder().addGrid(gbt.maxDepth(),new int[]{2, 5}).addGrid(gbt.maxIter(),new int[] {10, 100}).build();
// We define an evaluation metric.  This tells CrossValidator how well we are doing by comparing the true labels with predictions.
RegressionEvaluator evaluator = new RegressionEvaluator().setMetricName("rmse").setLabelCol(gbt.getLabelCol()).setPredictionCol(gbt.getPredictionCol());

//	# Declare the CrossValidator, which runs model tuning for us.
	CrossValidator cv = new CrossValidator().setEstimator(gbt).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid);
		
	Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{vectorAssembler,vectorIndexer,cv});
			
	PipelineModel pipelineModel=pipeline.fit(data[0]);
	
	Dataset<Row> predictions = pipelineModel.transform(data[1]);
	
	predictions.show();
	//predictions.select("cnt", "prediction", *featuresCols);
}

Example 6

Source File: SparkRegressor.java From mmtf-spark with Apache License 2.0

4 votes

/**
 * Dataset must at least contain the following two columns:
 * label: the class labels
 * features: feature vector
 * @param data
 * @return map with metrics
 */
public Map<String,String> fit(Dataset<Row> data) {

	// Split the data into training and test sets (30% held out for testing)
	Dataset<Row>[] splits = data.randomSplit(new double[] {1.0-testFraction, testFraction}, seed);
	Dataset<Row> trainingData = splits[0];
	Dataset<Row> testData = splits[1];

	// Train a RandomForest model.
	predictor
	  .setLabelCol(label)
	  .setFeaturesCol("features");

	// Chain indexer and forest in a Pipeline
	Pipeline pipeline = new Pipeline()
	  .setStages(new PipelineStage[] {predictor});

	// Train model. This also runs the indexer.
	PipelineModel model = pipeline.fit(trainingData);

	// Make predictions.
	Dataset<Row> predictions = model.transform(testData);

	// Display some sample predictions
	System.out.println("Sample predictions: " + predictor.getClass().getSimpleName());
	String primaryKey = predictions.columns()[0];
	predictions.select(primaryKey, label, "prediction").sample(false, 0.1, seed).show(50);
	
	Map<String,String> metrics = new LinkedHashMap<>();
        
    metrics.put("Method", predictor.getClass().getSimpleName());
    
    // Select (prediction, true label) and compute test error
    RegressionEvaluator evaluator = new RegressionEvaluator()
 		  .setLabelCol(label)
 		  .setPredictionCol("prediction")
 		  .setMetricName("rmse");
    
    metrics.put("rmse", Double.toString(evaluator.evaluate(predictions)));

	return metrics;
}

Example 7

Source File: JavaPipelineExample.java From SparkDemo with MIT License

4 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaPipelineExample")
    .getOrCreate();

  // $example on$
  // Prepare training documents, which are labeled.
  Dataset<Row> training = spark.createDataFrame(Arrays.asList(
    new JavaLabeledDocument(0L, "a b c d e spark", 1.0),
    new JavaLabeledDocument(1L, "b d", 0.0),
    new JavaLabeledDocument(2L, "spark f g h", 1.0),
    new JavaLabeledDocument(3L, "hadoop mapreduce", 0.0)
  ), JavaLabeledDocument.class);

  // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
  Tokenizer tokenizer = new Tokenizer()
    .setInputCol("text")
    .setOutputCol("words");
  HashingTF hashingTF = new HashingTF()
    .setNumFeatures(1000)
    .setInputCol(tokenizer.getOutputCol())
    .setOutputCol("features");
  LogisticRegression lr = new LogisticRegression()
    .setMaxIter(10)
    .setRegParam(0.001);
  Pipeline pipeline = new Pipeline()
    .setStages(new PipelineStage[] {tokenizer, hashingTF, lr});

  // Fit the pipeline to training documents.
  PipelineModel model = pipeline.fit(training);

  // Prepare test documents, which are unlabeled.
  Dataset<Row> test = spark.createDataFrame(Arrays.asList(
    new JavaDocument(4L, "spark i j k"),
    new JavaDocument(5L, "l m n"),
    new JavaDocument(6L, "spark hadoop spark"),
    new JavaDocument(7L, "apache hadoop")
  ), JavaDocument.class);

  // Make predictions on test documents.
  Dataset<Row> predictions = model.transform(test);
  for (Row r : predictions.select("id", "text", "probability", "prediction").collectAsList()) {
    System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
      + ", prediction=" + r.get(3));
  }
  // $example off$

  spark.stop();
}

Example 8

Source File: JavaGradientBoostedTreeRegressorExample.java From SparkDemo with MIT License

4 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaGradientBoostedTreeRegressorExample")
    .getOrCreate();

  // $example on$
  // Load and parse the data file, converting it to a DataFrame.
  Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

  // Automatically identify categorical features, and index them.
  // Set maxCategories so features with > 4 distinct values are treated as continuous.
  VectorIndexerModel featureIndexer = new VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(4)
    .fit(data);

  // Split the data into training and test sets (30% held out for testing).
  Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3});
  Dataset<Row> trainingData = splits[0];
  Dataset<Row> testData = splits[1];

  // Train a GBT model.
  GBTRegressor gbt = new GBTRegressor()
    .setLabelCol("label")
    .setFeaturesCol("indexedFeatures")
    .setMaxIter(10);

  // Chain indexer and GBT in a Pipeline.
  Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] {featureIndexer, gbt});

  // Train model. This also runs the indexer.
  PipelineModel model = pipeline.fit(trainingData);

  // Make predictions.
  Dataset<Row> predictions = model.transform(testData);

  // Select example rows to display.
  predictions.select("prediction", "label", "features").show(5);

  // Select (prediction, true label) and compute test error.
  RegressionEvaluator evaluator = new RegressionEvaluator()
    .setLabelCol("label")
    .setPredictionCol("prediction")
    .setMetricName("rmse");
  double rmse = evaluator.evaluate(predictions);
  System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);

  GBTRegressionModel gbtModel = (GBTRegressionModel)(model.stages()[1]);
  System.out.println("Learned regression GBT model:\n" + gbtModel.toDebugString());
  // $example off$

  spark.stop();
}

Example 9

Source File: JavaRandomForestRegressorExample.java From SparkDemo with MIT License

4 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaRandomForestRegressorExample")
    .getOrCreate();

  // $example on$
  // Load and parse the data file, converting it to a DataFrame.
  Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");

  // Automatically identify categorical features, and index them.
  // Set maxCategories so features with > 4 distinct values are treated as continuous.
  VectorIndexerModel featureIndexer = new VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(4)
    .fit(data);

  // Split the data into training and test sets (30% held out for testing)
  Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3});
  Dataset<Row> trainingData = splits[0];
  Dataset<Row> testData = splits[1];

  // Train a RandomForest model.
  RandomForestRegressor rf = new RandomForestRegressor()
    .setLabelCol("label")
    .setFeaturesCol("indexedFeatures");

  // Chain indexer and forest in a Pipeline
  Pipeline pipeline = new Pipeline()
    .setStages(new PipelineStage[] {featureIndexer, rf});

  // Train model. This also runs the indexer.
  PipelineModel model = pipeline.fit(trainingData);

  // Make predictions.
  Dataset<Row> predictions = model.transform(testData);

  // Select example rows to display.
  predictions.select("prediction", "label", "features").show(5);

  // Select (prediction, true label) and compute test error
  RegressionEvaluator evaluator = new RegressionEvaluator()
    .setLabelCol("label")
    .setPredictionCol("prediction")
    .setMetricName("rmse");
  double rmse = evaluator.evaluate(predictions);
  System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);

  RandomForestRegressionModel rfModel = (RandomForestRegressionModel)(model.stages()[1]);
  System.out.println("Learned regression forest model:\n" + rfModel.toDebugString());
  // $example off$

  spark.stop();
}

Example 10

Source File: JavaGradientBoostedTreeClassifierExample.java From SparkDemo with MIT License

4 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaGradientBoostedTreeClassifierExample")
    .getOrCreate();

  // $example on$
  // Load and parse the data file, converting it to a DataFrame.
  Dataset<Row> data = spark
    .read()
    .format("libsvm")
    .load("data/mllib/sample_libsvm_data.txt");

  // Index labels, adding metadata to the label column.
  // Fit on whole dataset to include all labels in index.
  StringIndexerModel labelIndexer = new StringIndexer()
    .setInputCol("label")
    .setOutputCol("indexedLabel")
    .fit(data);
  // Automatically identify categorical features, and index them.
  // Set maxCategories so features with > 4 distinct values are treated as continuous.
  VectorIndexerModel featureIndexer = new VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(4)
    .fit(data);

  // Split the data into training and test sets (30% held out for testing)
  Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3});
  Dataset<Row> trainingData = splits[0];
  Dataset<Row> testData = splits[1];

  // Train a GBT model.
  GBTClassifier gbt = new GBTClassifier()
    .setLabelCol("indexedLabel")
    .setFeaturesCol("indexedFeatures")
    .setMaxIter(10);

  // Convert indexed labels back to original labels.
  IndexToString labelConverter = new IndexToString()
    .setInputCol("prediction")
    .setOutputCol("predictedLabel")
    .setLabels(labelIndexer.labels());

  // Chain indexers and GBT in a Pipeline.
  Pipeline pipeline = new Pipeline()
    .setStages(new PipelineStage[] {labelIndexer, featureIndexer, gbt, labelConverter});

  // Train model. This also runs the indexers.
  PipelineModel model = pipeline.fit(trainingData);

  // Make predictions.
  Dataset<Row> predictions = model.transform(testData);

  // Select example rows to display.
  predictions.select("predictedLabel", "label", "features").show(5);

  // Select (prediction, true label) and compute test error.
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("indexedLabel")
    .setPredictionCol("prediction")
    .setMetricName("accuracy");
  double accuracy = evaluator.evaluate(predictions);
  System.out.println("Test Error = " + (1.0 - accuracy));

  GBTClassificationModel gbtModel = (GBTClassificationModel)(model.stages()[2]);
  System.out.println("Learned classification GBT model:\n" + gbtModel.toDebugString());
  // $example off$

  spark.stop();
}

Example 11

Source File: JavaDecisionTreeClassificationExample.java From SparkDemo with MIT License

4 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaDecisionTreeClassificationExample")
    .getOrCreate();

  // $example on$
  // Load the data stored in LIBSVM format as a DataFrame.
  Dataset<Row> data = spark
    .read()
    .format("libsvm")
    .load("data/mllib/sample_libsvm_data.txt");

  // Index labels, adding metadata to the label column.
  // Fit on whole dataset to include all labels in index.
  StringIndexerModel labelIndexer = new StringIndexer()
    .setInputCol("label")
    .setOutputCol("indexedLabel")
    .fit(data);

  // Automatically identify categorical features, and index them.
  VectorIndexerModel featureIndexer = new VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(4) // features with > 4 distinct values are treated as continuous.
    .fit(data);

  // Split the data into training and test sets (30% held out for testing).
  Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  Dataset<Row> trainingData = splits[0];
  Dataset<Row> testData = splits[1];

  // Train a DecisionTree model.
  DecisionTreeClassifier dt = new DecisionTreeClassifier()
    .setLabelCol("indexedLabel")
    .setFeaturesCol("indexedFeatures");

  // Convert indexed labels back to original labels.
  IndexToString labelConverter = new IndexToString()
    .setInputCol("prediction")
    .setOutputCol("predictedLabel")
    .setLabels(labelIndexer.labels());

  // Chain indexers and tree in a Pipeline.
  Pipeline pipeline = new Pipeline()
    .setStages(new PipelineStage[]{labelIndexer, featureIndexer, dt, labelConverter});

  // Train model. This also runs the indexers.
  PipelineModel model = pipeline.fit(trainingData);

  // Make predictions.
  Dataset<Row> predictions = model.transform(testData);

  // Select example rows to display.
  predictions.select("predictedLabel", "label", "features").show(5);

  // Select (prediction, true label) and compute test error.
  MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("indexedLabel")
    .setPredictionCol("prediction")
    .setMetricName("accuracy");
  double accuracy = evaluator.evaluate(predictions);
  System.out.println("Test Error = " + (1.0 - accuracy));

  DecisionTreeClassificationModel treeModel =
    (DecisionTreeClassificationModel) (model.stages()[2]);
  System.out.println("Learned classification tree model:\n" + treeModel.toDebugString());
  // $example off$

  spark.stop();
}

Example 12

Source File: JavaDecisionTreeRegressionExample.java From SparkDemo with MIT License

4 votes

public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaDecisionTreeRegressionExample")
    .getOrCreate();
  // $example on$
  // Load the data stored in LIBSVM format as a DataFrame.
  Dataset<Row> data = spark.read().format("libsvm")
    .load("data/mllib/sample_libsvm_data.txt");

  // Automatically identify categorical features, and index them.
  // Set maxCategories so features with > 4 distinct values are treated as continuous.
  VectorIndexerModel featureIndexer = new VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(4)
    .fit(data);

  // Split the data into training and test sets (30% held out for testing).
  Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
  Dataset<Row> trainingData = splits[0];
  Dataset<Row> testData = splits[1];

  // Train a DecisionTree model.
  DecisionTreeRegressor dt = new DecisionTreeRegressor()
    .setFeaturesCol("indexedFeatures");

  // Chain indexer and tree in a Pipeline.
  Pipeline pipeline = new Pipeline()
    .setStages(new PipelineStage[]{featureIndexer, dt});

  // Train model. This also runs the indexer.
  PipelineModel model = pipeline.fit(trainingData);

  // Make predictions.
  Dataset<Row> predictions = model.transform(testData);

  // Select example rows to display.
  predictions.select("label", "features").show(5);

  // Select (prediction, true label) and compute test error.
  RegressionEvaluator evaluator = new RegressionEvaluator()
    .setLabelCol("label")
    .setPredictionCol("prediction")
    .setMetricName("rmse");
  double rmse = evaluator.evaluate(predictions);
  System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);

  DecisionTreeRegressionModel treeModel =
    (DecisionTreeRegressionModel) (model.stages()[1]);
  System.out.println("Learned regression tree model:\n" + treeModel.toDebugString());
  // $example off$

  spark.stop();
}

Example 13

Source File: TransitionClassifier.java From vn.vitk with GNU General Public License v3.0

4 votes

/**
 * Trains a transition classifier on the data frame.
 * @param jsc
 * @param graphs
 * @param featureFrame
 * @param classifierFileName
 * @param numHiddenUnits
 * @return a transition classifier.
 */
public Transformer trainMLP(JavaSparkContext jsc,
		List<DependencyGraph> graphs, FeatureFrame featureFrame,
		String classifierFileName, int numHiddenUnits) {
	// create a SQLContext
	this.sqlContext = new SQLContext(jsc);
	// extract a data frame from these graphs
	DataFrame dataset = toDataFrame(jsc, graphs, featureFrame);
	
	// create a processing pipeline and fit it to the data frame
	Pipeline pipeline = createPipeline();
	PipelineModel pipelineModel = pipeline.fit(dataset);
	DataFrame trainingData = pipelineModel.transform(dataset);
	
	// cache the training data for better performance
	trainingData.cache();
	
	if (verbose) {
		trainingData.show(false);
	}
	
	// compute the number of different labels, which is the maximum element 
	// in the 'label' column.
	trainingData.registerTempTable("dfTable");
	Row row = sqlContext.sql("SELECT MAX(label) as maxValue from dfTable").first();
	int numLabels = (int)row.getDouble(0);
	numLabels++;
	
	int vocabSize = ((CountVectorizerModel)(pipelineModel.stages()[1])).getVocabSize();
	
	// default is a two-layer MLP
	int[] layers = {vocabSize, numLabels};
	// if user specify a hidden layer, use a 3-layer MLP:
	if (numHiddenUnits > 0) {
		layers = new int[3];
		layers[0] = vocabSize;
		layers[1] = numHiddenUnits;
		layers[2] = numLabels;
	}
	MultilayerPerceptronClassifier classifier = new MultilayerPerceptronClassifier()
		.setLayers(layers)
		.setBlockSize(128)
		.setSeed(1234L)
		.setTol((Double)params.getOrDefault(params.getTolerance()))
		.setMaxIter((Integer)params.getOrDefault(params.getMaxIter()));
	MultilayerPerceptronClassificationModel model = classifier.fit(trainingData);
	
	// compute precision on the training data
	//
	DataFrame result = model.transform(trainingData);
	DataFrame predictionAndLabel = result.select("prediction", "label");
	MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator().setMetricName("precision");
	if (verbose) {
		System.out.println("N = " + trainingData.count());
		System.out.println("D = " + vocabSize);
		System.out.println("K = " + numLabels);
		System.out.println("H = " + numHiddenUnits);
		System.out.println("training precision = " + evaluator.evaluate(predictionAndLabel));
	}
	
	// save the trained MLP to a file
	//
	String classifierPath = new Path(classifierFileName, "data").toString();
	jsc.parallelize(Arrays.asList(model), 1).saveAsObjectFile(classifierPath);
	// save the pipeline model to sub-directory "pipelineModel"
	// 
	try {
		String pipelinePath = new Path(classifierFileName, "pipelineModel").toString(); 
		pipelineModel.write().overwrite().save(pipelinePath);
	} catch (IOException e) {
		e.printStackTrace();
	}
	return model;
}