org.apache.spark.ml.feature.VectorAssembler Java Examples
The following examples show how to use org.apache.spark.ml.feature.VectorAssembler.
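
All of the examples share the same core pattern: configure the assembler with the input column names and a single output column, then call transform on a Dataset<Row>. A minimal, self-contained sketch of that pattern (the schema and column names here are illustrative, not taken from any example below):

import java.util.Arrays;

import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class VectorAssemblerSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local")
                .appName("VectorAssemblerSketch")
                .getOrCreate();

        // two numeric columns; the names are illustrative only
        StructType schema = new StructType(new StructField[]{
                new StructField("x1", DataTypes.DoubleType, false, Metadata.empty()),
                new StructField("x2", DataTypes.DoubleType, false, Metadata.empty())
        });
        Dataset<Row> df = spark.createDataFrame(
                Arrays.asList(RowFactory.create(1.0, 2.0), RowFactory.create(3.0, 4.0)),
                schema);

        // concatenate the input columns into a single vector column "features"
        Dataset<Row> assembled = new VectorAssembler()
                .setInputCols(new String[]{"x1", "x2"})
                .setOutputCol("features")
                .transform(df);
        assembled.show(false);

        spark.stop();
    }
}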
Example #1
Source File: SparkMLHouses.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException, StreamingQueryException {

    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    // * the schema can be written on disk, and read from disk
    // * the schema is not mandatory to be complete; it can contain only the needed fields
    StructType HOUSES_SCHEMA = new StructType()
            .add("House", LongType, true)
            .add("Taxes", LongType, true)
            .add("Bedrooms", LongType, true)
            .add("Baths", FloatType, true)
            .add("Quadrant", LongType, true)
            .add("NW", StringType, true)
            .add("Price($)", LongType, false)
            .add("Size(sqft)", LongType, false)
            .add("lot", LongType, true);

    final SparkConf conf = new SparkConf()
            .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
            .setAppName(APPLICATION_NAME)
            .set("spark.sql.caseSensitive", CASE_SENSITIVE);

    SparkSession sparkSession = SparkSession.builder()
            .config(conf)
            .getOrCreate();

    Dataset<Row> housesDF = sparkSession.read()
            .schema(HOUSES_SCHEMA)
            .json(HOUSES_FILE_PATH);

    // Gathering Data
    Dataset<Row> gatheredDF = housesDF.select(col("Taxes"), col("Bedrooms"),
            col("Baths"), col("Size(sqft)"), col("Price($)"));

    // Data Preparation
    Dataset<Row> labelDF = gatheredDF.withColumnRenamed("Price($)", "label");

    Imputer imputer = new Imputer()
            // .setMissingValue(1.0d)
            .setInputCols(new String[] { "Baths" })
            .setOutputCols(new String[] { "~Baths~" });

    VectorAssembler assembler = new VectorAssembler()
            .setInputCols(new String[] { "Taxes", "Bedrooms", "~Baths~", "Size(sqft)" })
            .setOutputCol("features");

    // Choosing a Model
    LinearRegression linearRegression = new LinearRegression();
    linearRegression.setMaxIter(1000);

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[] { imputer, assembler, linearRegression });

    // Training the Data
    Dataset<Row>[] splitDF = labelDF.randomSplit(new double[] { 0.8, 0.2 });
    Dataset<Row> trainDF = splitDF[0];
    Dataset<Row> evaluationDF = splitDF[1];

    PipelineModel pipelineModel = pipeline.fit(trainDF);

    // Evaluation
    Dataset<Row> predictionsDF = pipelineModel.transform(evaluationDF);
    predictionsDF.show(false);

    Dataset<Row> forEvaluationDF = predictionsDF.select(col("label"), col("prediction"));

    RegressionEvaluator evaluateR2 = new RegressionEvaluator().setMetricName("r2");
    RegressionEvaluator evaluateRMSE = new RegressionEvaluator().setMetricName("rmse");

    double r2 = evaluateR2.evaluate(forEvaluationDF);
    double rmse = evaluateRMSE.evaluate(forEvaluationDF);

    logger.info("---------------------------");
    logger.info("R2 = " + r2);
    logger.info("RMSE = " + rmse);
    logger.info("---------------------------");
}
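
Once a pipeline like the one above is fitted, the trained regression model can be pulled back out of the PipelineModel to inspect the weights learned for the assembled features. A short sketch, assuming an import for org.apache.spark.ml.regression.LinearRegressionModel and that the regression is the third stage (index 2), as in Example #1:

// the fitted stages mirror the Pipeline stages: imputer, assembler, regression
LinearRegressionModel lrModel = (LinearRegressionModel) pipelineModel.stages()[2];

// one coefficient per assembled input: Taxes, Bedrooms, ~Baths~, Size(sqft)
logger.info("coefficients = " + lrModel.coefficients());
logger.info("intercept = " + lrModel.intercept());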
Example #2
Source File: JavaVectorAssemblerExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
            .builder()
            .appName("JavaVectorAssemblerExample")
            .getOrCreate();

    // $example on$
    StructType schema = createStructType(new StructField[]{
            createStructField("id", IntegerType, false),
            createStructField("hour", IntegerType, false),
            createStructField("mobile", DoubleType, false),
            createStructField("userFeatures", new VectorUDT(), false),
            createStructField("clicked", DoubleType, false)
    });
    Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
    Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema);

    VectorAssembler assembler = new VectorAssembler()
            .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
            .setOutputCol("features");

    Dataset<Row> output = assembler.transform(dataset);
    System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column "
            + "'features'");
    output.select("features", "clicked").show(false);
    // $example off$

    spark.stop();
}
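
By default a VectorAssembler throws when an input column contains invalid (null) values. From Spark 2.4 onward the handleInvalid parameter makes this configurable; a sketch of the variant applied to the assembler above:

// "error" (default) throws on rows with invalid values;
// "skip" silently drops such rows; "keep" emits NaN entries instead
VectorAssembler tolerantAssembler = new VectorAssembler()
        .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
        .setOutputCol("features")
        .setHandleInvalid("skip");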
Example #3
Source File: DataPreview.java From StockPrediction with MIT License
public static void main(String[] args) throws IOException {
    SparkSession spark = SparkSession.builder().master("local").appName("DataProcess").getOrCreate();
    String filename = "prices-split-adjusted.csv";
    String symbol = "GOOG";

    // load data from csv file
    Dataset<Row> data = spark.read().format("csv").option("header", true)
            .load(new ClassPathResource(filename).getFile().getAbsolutePath())
            //.filter(functions.col("symbol").equalTo(symbol))
            //.drop("date").drop("symbol")
            .withColumn("openPrice", functions.col("open").cast("double")).drop("open")
            .withColumn("closePrice", functions.col("close").cast("double")).drop("close")
            .withColumn("lowPrice", functions.col("low").cast("double")).drop("low")
            .withColumn("highPrice", functions.col("high").cast("double")).drop("high")
            .withColumn("volumeTmp", functions.col("volume").cast("double")).drop("volume")
            .toDF("date", "symbol", "open", "close", "low", "high", "volume");

    data.show();

    Dataset<Row> symbols = data.select("date", "symbol")
            .groupBy("symbol")
            .agg(functions.count("date").as("count"));
    System.out.println("Number of Symbols: " + symbols.count());
    symbols.show();

    VectorAssembler assembler = new VectorAssembler()
            .setInputCols(new String[] {"open", "low", "high", "volume", "close"})
            .setOutputCol("features");

    data = assembler.transform(data).drop("open", "low", "high", "volume", "close");

    // scale the assembled features into [0, 1]; note that toDF must name
    // all remaining columns (date, symbol, normalizedFeatures), not just one
    data = new MinMaxScaler().setMin(0).setMax(1)
            .setInputCol("features").setOutputCol("normalizedFeatures")
            .fit(data).transform(data)
            .drop("features").toDF("date", "symbol", "features");
}
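
Note that MinMaxScaler is an estimator: fit returns a MinMaxScalerModel holding the per-dimension minima and maxima observed during fitting. Keeping a reference to that model, rather than chaining fit and transform in one expression as above, lets the identical scaling be reapplied to new data later. A sketch under that assumption:

// fit once, reuse for any later dataset with the same "features" column
MinMaxScalerModel scalerModel = new MinMaxScaler().setMin(0).setMax(1)
        .setInputCol("features").setOutputCol("normalizedFeatures")
        .fit(data);

// the observed per-dimension ranges the scaling is based on
System.out.println("original min: " + scalerModel.originalMin());
System.out.println("original max: " + scalerModel.originalMax());

Dataset<Row> normalized = scalerModel.transform(data);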
Example #4
Source File: VectorAssemblerConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder){
    VectorAssembler transformer = getTransformer();

    List<Feature> result = new ArrayList<>();

    // a VectorAssembler performs no computation of its own, so encoding it
    // amounts to concatenating the features of each input column in order
    String[] inputCols = transformer.getInputCols();
    for(String inputCol : inputCols){
        List<Feature> features = encoder.getFeatures(inputCol);

        result.addAll(features);
    }

    return result;
}
Example #5
Source File: VectorAssemblerModelAdapter.java From spark-transformers with Apache License 2.0
@Override
VectorAssemblerModelInfo getModelInfo(VectorAssembler from) {
    // a VectorAssembler has no learned state; only its column names are exported
    VectorAssemblerModelInfo vectorAssemblerModelInfo = new VectorAssemblerModelInfo();
    vectorAssemblerModelInfo.setInputKeys(new LinkedHashSet<>(Arrays.asList(from.getInputCols())));

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    vectorAssemblerModelInfo.setOutputKeys(outputKeys);

    return vectorAssemblerModelInfo;
}
Example #6
Source File: VectorAssemblerModelAdapter.java From spark-transformers with Apache License 2.0
@Override
VectorAssemblerModelInfo getModelInfo(VectorAssembler from, DataFrame df) {
    VectorAssemblerModelInfo vectorAssemblerModelInfo = new VectorAssemblerModelInfo();
    vectorAssemblerModelInfo.setInputKeys(new LinkedHashSet<>(Arrays.asList(from.getInputCols())));

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    vectorAssemblerModelInfo.setOutputKeys(outputKeys);

    return vectorAssemblerModelInfo;
}
Example #7
Source File: BikeRentalPrediction.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
    SparkSession sparkSession = SparkSession
            .builder()
            .master("local")
            .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
            .appName("BikeRentalPrediction").getOrCreate();
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    // We use the sparkSession.read method to read the data and set a few options:
    //  'format': specifies the Spark CSV data source
    //  'header': set to true to indicate that the first line of the CSV data file is a header
    // The file is called 'hour.csv'.
    Dataset<Row> ds = sparkSession.read()
            .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat")
            .option("header", "true")
            .load("E:\\sumitK\\Hadoop\\Bike-Sharing-Dataset\\hour.csv");

    ds.cache();

    ds.select("season").show();
    ds.show();

    System.out.println("Our dataset has rows :: " + ds.count());

    Dataset<Row> df = ds.drop("instant").drop("dteday").drop("casual").drop("registered");
    df.printSchema();

    // col("...") is preferable to df.col("...")
    Dataset<Row> dformatted = df.select(
            col("season").cast(DataTypes.IntegerType),
            col("yr").cast(DataTypes.IntegerType),
            col("mnth").cast(DataTypes.IntegerType),
            col("hr").cast(DataTypes.IntegerType),
            col("holiday").cast(DataTypes.IntegerType),
            col("weekday").cast(DataTypes.IntegerType),
            col("workingday").cast(DataTypes.IntegerType),
            col("weathersit").cast(DataTypes.IntegerType),
            col("temp").cast(DataTypes.IntegerType),
            col("atemp").cast(DataTypes.IntegerType),
            col("hum").cast(DataTypes.IntegerType),
            col("windspeed").cast(DataTypes.IntegerType),
            col("cnt").cast(DataTypes.IntegerType));
    dformatted.printSchema();

    Dataset<Row>[] data = dformatted.randomSplit(new double[]{0.7, 0.3});
    System.out.println("We have training examples count :: " + data[0].count()
            + " and test examples count :: " + data[1].count());

    // remove the 'cnt' column and keep the remaining column names as the features
    String[] featuresCols = dformatted.drop("cnt").columns();
    for (String str : featuresCols) {
        System.out.println(str + " :: ");
    }

    // This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
    VectorAssembler vectorAssembler = new VectorAssembler()
            .setInputCols(featuresCols)
            .setOutputCol("rawFeatures");

    // This identifies categorical features and indexes them.
    VectorIndexer vectorIndexer = new VectorIndexer()
            .setInputCol("rawFeatures")
            .setOutputCol("features")
            .setMaxCategories(4);

    // Takes the "features" column and learns to predict "cnt".
    GBTRegressor gbt = new GBTRegressor().setLabelCol("cnt");

    // Define a grid of hyperparameters to test:
    //  - maxDepth: max depth of each decision tree in the GBT ensemble
    //  - maxIter: iterations, i.e., number of trees in each GBT ensemble
    // In this example we keep these values small. In practice, to get the highest
    // accuracy, you would likely want to try deeper trees (10 or higher) and more
    // trees in the ensemble (>100).
    ParamMap[] paramGrid = new ParamGridBuilder()
            .addGrid(gbt.maxDepth(), new int[]{2, 5})
            .addGrid(gbt.maxIter(), new int[]{10, 100})
            .build();

    // We define an evaluation metric. This tells CrossValidator how well we are
    // doing by comparing the true labels with predictions.
    RegressionEvaluator evaluator = new RegressionEvaluator()
            .setMetricName("rmse")
            .setLabelCol(gbt.getLabelCol())
            .setPredictionCol(gbt.getPredictionCol());

    // Declare the CrossValidator, which runs model tuning for us.
    CrossValidator cv = new CrossValidator()
            .setEstimator(gbt)
            .setEvaluator(evaluator)
            .setEstimatorParamMaps(paramGrid);

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{vectorAssembler, vectorIndexer, cv});
    PipelineModel pipelineModel = pipeline.fit(data[0]);

    Dataset<Row> predictions = pipelineModel.transform(data[1]);
    predictions.show();
    //predictions.select("cnt", "prediction", *featuresCols);
}
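
After fitting, the tuned regressor sits inside the PipelineModel as a CrossValidatorModel, from which the winning model and its hyperparameters can be recovered. A sketch against the pipeline above, where the CrossValidator is the third stage (index 2):

// fitted stages: assembler, indexer model, cross-validator model
CrossValidatorModel cvModel = (CrossValidatorModel) pipelineModel.stages()[2];

// the GBT refit on the full training set with the best grid entry
GBTRegressionModel bestGbt = (GBTRegressionModel) cvModel.bestModel();
System.out.println("best maxDepth = " + bestGbt.getMaxDepth());
System.out.println("best maxIter = " + bestGbt.getMaxIter());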
Example #8
Source File: JavaInteractionExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
            .builder()
            .appName("JavaInteractionExample")
            .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
            RowFactory.create(1, 1, 2, 3, 8, 4, 5),
            RowFactory.create(2, 4, 3, 8, 7, 9, 8),
            RowFactory.create(3, 6, 1, 9, 2, 3, 6),
            RowFactory.create(4, 10, 8, 6, 9, 4, 5),
            RowFactory.create(5, 9, 2, 7, 10, 7, 3),
            RowFactory.create(6, 1, 1, 4, 2, 8, 4)
    );

    StructType schema = new StructType(new StructField[]{
            new StructField("id1", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("id2", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("id3", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("id4", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("id5", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("id6", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("id7", DataTypes.IntegerType, false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(data, schema);

    VectorAssembler assembler1 = new VectorAssembler()
            .setInputCols(new String[]{"id2", "id3", "id4"})
            .setOutputCol("vec1");

    Dataset<Row> assembled1 = assembler1.transform(df);

    VectorAssembler assembler2 = new VectorAssembler()
            .setInputCols(new String[]{"id5", "id6", "id7"})
            .setOutputCol("vec2");

    Dataset<Row> assembled2 = assembler2.transform(assembled1).select("id1", "vec1", "vec2");

    Interaction interaction = new Interaction()
            .setInputCols(new String[]{"id1", "vec1", "vec2"})
            .setOutputCol("interactedCol");

    Dataset<Row> interacted = interaction.transform(assembled2);

    interacted.show(false);
    // $example off$

    spark.stop();
}
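
For each row, Interaction emits every product of one element drawn from each input column: with the scalar id1 and two 3-element vectors, interactedCol is a 9-element vector. For the first row above (id1 = 1, vec1 = [1, 2, 3], vec2 = [8, 4, 5]) that works out to [8, 4, 5, 16, 8, 10, 24, 12, 15].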
Example #9
Source File: VectorAssemblerConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0
public VectorAssemblerConverter(VectorAssembler transformer){
    super(transformer);
}
Example #10
Source File: VectorAssemblerModelAdapter.java From spark-transformers with Apache License 2.0
@Override
public Class<VectorAssembler> getSource() {
    return VectorAssembler.class;
}
Example #11
Source File: VectorAssemblerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testVectorAssembler() {
    // prepare data
    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
            RowFactory.create(0d, 1d, new DenseVector(new double[]{2d, 3d})),
            RowFactory.create(1d, 2d, new DenseVector(new double[]{3d, 4d})),
            RowFactory.create(2d, 3d, new DenseVector(new double[]{4d, 5d})),
            RowFactory.create(3d, 4d, new DenseVector(new double[]{5d, 6d})),
            RowFactory.create(4d, 5d, new DenseVector(new double[]{6d, 7d}))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(jrdd, schema);

    VectorAssembler vectorAssembler = new VectorAssembler()
            .setInputCols(new String[]{"value1", "vector1"})
            .setOutputCol("feature");

    // export this model
    byte[] exportedModel = ModelExporter.export(vectorAssembler);
    String exportedModelJson = new String(exportedModel);

    // import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // compare predictions
    List<Row> sparkOutput = vectorAssembler.transform(df).orderBy("id")
            .select("id", "value1", "vector1", "feature").collectAsList();
    for (Row row : sparkOutput) {
        Map<String, Object> data = new HashMap<>();
        data.put(vectorAssembler.getInputCols()[0], row.get(1));
        data.put(vectorAssembler.getInputCols()[1], ((DenseVector) row.get(2)).toArray());
        transformer.transform(data);
        double[] output = (double[]) data.get(vectorAssembler.getOutputCol());

        assertArrayEquals(output, ((DenseVector) row.get(3)).toArray(), 0d);
    }
}
Example #12
Source File: VectorAssemblerModelAdapter.java From spark-transformers with Apache License 2.0
@Override
public Class<VectorAssembler> getSource() {
    return VectorAssembler.class;
}
Example #13
Source File: VectorAssemblerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testVectorAssembler() {
    // prepare data
    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create(0d, 1d, new DenseVector(new double[]{2d, 3d})),
            RowFactory.create(1d, 2d, new DenseVector(new double[]{3d, 4d})),
            RowFactory.create(2d, 3d, new DenseVector(new double[]{4d, 5d})),
            RowFactory.create(3d, 4d, new DenseVector(new double[]{5d, 6d})),
            RowFactory.create(4d, 5d, new DenseVector(new double[]{6d, 7d}))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);

    VectorAssembler vectorAssembler = new VectorAssembler()
            .setInputCols(new String[]{"value1", "vector1"})
            .setOutputCol("feature");

    // export this model
    byte[] exportedModel = ModelExporter.export(vectorAssembler, null);
    String exportedModelJson = new String(exportedModel);

    // import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // compare predictions
    Row[] sparkOutput = vectorAssembler.transform(df).orderBy("id")
            .select("id", "value1", "vector1", "feature").collect();
    for (Row row : sparkOutput) {
        Map<String, Object> data = new HashMap<>();
        data.put(vectorAssembler.getInputCols()[0], row.get(1));
        data.put(vectorAssembler.getInputCols()[1], ((DenseVector) row.get(2)).toArray());
        transformer.transform(data);
        double[] output = (double[]) data.get(vectorAssembler.getOutputCol());

        assertArrayEquals(output, ((DenseVector) row.get(3)).toArray(), 0d);
    }
}