org.apache.spark.ml.regression.GBTRegressor Java Examples
The following examples show how to use
org.apache.spark.ml.regression.GBTRegressor.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: BikeRentalPrediction.java From Apache-Spark-2x-for-Java-Developers with MIT License | 4 votes |
public static void main(String[] args) { System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop"); SparkSession sparkSession = SparkSession .builder() .master("local") .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse") .appName("BikeRentalPrediction").getOrCreate(); Logger rootLogger = LogManager.getRootLogger(); rootLogger.setLevel(Level.WARN); //We use the sqlContext.read method to read the data and set a few options: // 'format': specifies the Spark CSV data source // 'header': set to true to indicate that the first line of the CSV data file is a header // The file is called 'hour.csv'. Dataset<Row> ds=sparkSession.read() .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat") .option("header", "true") .load("E:\\sumitK\\Hadoop\\Bike-Sharing-Dataset\\hour.csv"); ds.cache(); ds.select("season").show();; ds.show(); System.out.println("Our dataset has rows :: "+ ds.count()); Dataset<Row> df = ds.drop("instant").drop("dteday").drop("casual").drop("registered"); df.printSchema(); //col("...") is preferable to df.col("...") Dataset<Row> dformatted = df.select(col("season").cast(DataTypes.IntegerType), col("yr").cast(DataTypes.IntegerType), col("mnth").cast(DataTypes.IntegerType), col("hr").cast(DataTypes.IntegerType), col("holiday").cast(DataTypes.IntegerType), col("weekday").cast(DataTypes.IntegerType), col("workingday").cast(DataTypes.IntegerType), col("weathersit").cast(DataTypes.IntegerType), col("temp").cast(DataTypes.IntegerType), col("atemp").cast(DataTypes.IntegerType), col("hum").cast(DataTypes.IntegerType), col("windspeed").cast(DataTypes.IntegerType), col("cnt").cast(DataTypes.IntegerType)); dformatted.printSchema(); Dataset<Row>[] data= dformatted.randomSplit(new double[]{0.7,0.3}); System.out.println("We have training examples count :: "+ data[0].count()+" and test examples count ::"+data[1].count()); /// //removing 'cnt' cloumn and then forming str array String[] featuresCols = dformatted.drop("cnt").columns(); for(String str:featuresCols){ System.out.println(str+" :: "); } //This concatenates all feature columns into a single feature vector in a new column "rawFeatures". VectorAssembler vectorAssembler = new VectorAssembler().setInputCols(featuresCols).setOutputCol("rawFeatures"); //This identifies categorical features and indexes them. VectorIndexer vectorIndexer= new VectorIndexer().setInputCol("rawFeatures").setOutputCol("features").setMaxCategories(4); //Takes the "features" column and learns to predict "cnt" GBTRegressor gbt = new GBTRegressor().setLabelCol("cnt"); // Define a grid of hyperparameters to test: // - maxDepth: max depth of each decision tree in the GBT ensemble // - maxIter: iterations, i.e., number of trees in each GBT ensemble // In this example notebook, we keep these values small. In practice, to get the highest accuracy, you would likely want to try deeper trees (10 or higher) and more trees in the ensemble (>100). ParamMap[] paramGrid = new ParamGridBuilder().addGrid(gbt.maxDepth(),new int[]{2, 5}).addGrid(gbt.maxIter(),new int[] {10, 100}).build(); // We define an evaluation metric. This tells CrossValidator how well we are doing by comparing the true labels with predictions. RegressionEvaluator evaluator = new RegressionEvaluator().setMetricName("rmse").setLabelCol(gbt.getLabelCol()).setPredictionCol(gbt.getPredictionCol()); // # Declare the CrossValidator, which runs model tuning for us. CrossValidator cv = new CrossValidator().setEstimator(gbt).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid); Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{vectorAssembler,vectorIndexer,cv}); PipelineModel pipelineModel=pipeline.fit(data[0]); Dataset<Row> predictions = pipelineModel.transform(data[1]); predictions.show(); //predictions.select("cnt", "prediction", *featuresCols); }
Example #2
Source File: DatasetRegressor.java From mmtf-spark with Apache License 2.0 | 4 votes |
/** * @param args args[0] path to parquet file, args[1] name of the prediction column * @throws IOException * @throws StructureException */ public static void main(String[] args) throws IOException { if (args.length != 2) { System.err.println("Usage: " + DatasetRegressor.class.getSimpleName() + " <parquet file> <prediction column name>"); System.exit(1); } // name of the prediction column String label = args[1]; long start = System.nanoTime(); SparkSession spark = SparkSession .builder() .master("local[*]") .appName(DatasetRegressor.class.getSimpleName()) .getOrCreate(); Dataset<Row> data = spark.read().parquet(args[0]).cache(); int featureCount = ((DenseVector)data.first().getAs("features")).numActives(); System.out.println("Feature count: " + featureCount); System.out.println("Dataset size : " + data.count()); double testFraction = 0.3; long seed = 123; LinearRegression lr = new LinearRegression() .setLabelCol(label) .setFeaturesCol("features"); SparkRegressor reg = new SparkRegressor(lr, label, testFraction, seed); System.out.println(reg.fit(data)); GBTRegressor gbt = new GBTRegressor() .setLabelCol(label) .setFeaturesCol("features"); reg = new SparkRegressor(gbt, label, testFraction, seed); System.out.println(reg.fit(data)); GeneralizedLinearRegression glr = new GeneralizedLinearRegression() .setLabelCol(label) .setFeaturesCol("features") .setFamily("gaussian") .setLink("identity") .setMaxIter(10) .setRegParam(0.3); reg = new SparkRegressor(glr, label, testFraction, seed); System.out.println(reg.fit(data)); long end = System.nanoTime(); System.out.println((end-start)/1E9 + " sec"); }
Example #3
Source File: JavaGradientBoostedTreeRegressorExample.java From SparkDemo with MIT License | 4 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaGradientBoostedTreeRegressorExample") .getOrCreate(); // $example on$ // Load and parse the data file, converting it to a DataFrame. Dataset<Row> data = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); // Automatically identify categorical features, and index them. // Set maxCategories so features with > 4 distinct values are treated as continuous. VectorIndexerModel featureIndexer = new VectorIndexer() .setInputCol("features") .setOutputCol("indexedFeatures") .setMaxCategories(4) .fit(data); // Split the data into training and test sets (30% held out for testing). Dataset<Row>[] splits = data.randomSplit(new double[] {0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; // Train a GBT model. GBTRegressor gbt = new GBTRegressor() .setLabelCol("label") .setFeaturesCol("indexedFeatures") .setMaxIter(10); // Chain indexer and GBT in a Pipeline. Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] {featureIndexer, gbt}); // Train model. This also runs the indexer. PipelineModel model = pipeline.fit(trainingData); // Make predictions. Dataset<Row> predictions = model.transform(testData); // Select example rows to display. predictions.select("prediction", "label", "features").show(5); // Select (prediction, true label) and compute test error. RegressionEvaluator evaluator = new RegressionEvaluator() .setLabelCol("label") .setPredictionCol("prediction") .setMetricName("rmse"); double rmse = evaluator.evaluate(predictions); System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse); GBTRegressionModel gbtModel = (GBTRegressionModel)(model.stages()[1]); System.out.println("Learned regression GBT model:\n" + gbtModel.toDebugString()); // $example off$ spark.stop(); }