org.apache.spark.ml.feature.VectorAssembler Java Examples

The following examples show how to use org.apache.spark.ml.feature.VectorAssembler. Each example comes from an open-source project; the source file, project, and license are noted above each listing.
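For orientation before the project examples: VectorAssembler concatenates one or more numeric, boolean, or vector columns into a single vector column, which most Spark ML estimators expect as their features input. A minimal, self-contained sketch (the column names x1 and x2 are illustrative, not taken from any of the projects below):

import java.util.Arrays;

import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class MinimalVectorAssemblerExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
      .master("local")
      .appName("MinimalVectorAssemblerExample")
      .getOrCreate();

    StructType schema = new StructType(new StructField[]{
      new StructField("x1", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("x2", DataTypes.DoubleType, false, Metadata.empty())
    });
    Dataset<Row> df = spark.createDataFrame(Arrays.asList(
      RowFactory.create(1.0, 2.0),
      RowFactory.create(3.0, 4.0)), schema);

    // Merge the two numeric columns into a single vector column "features".
    VectorAssembler assembler = new VectorAssembler()
      .setInputCols(new String[]{"x1", "x2"})
      .setOutputCol("features");

    assembler.transform(df).show(false); // first row yields [1.0,2.0]

    spark.stop();
  }
}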
Example #1
Source File: SparkMLHouses.java    From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException, StreamingQueryException {

                System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

                // * The schema can be written to disk and read back from disk.
                // * The schema need not be complete; it can contain only the fields that are needed.
                StructType HOUSES_SCHEMA = 
                       new StructType()
                           .add("House", LongType, true)
                           .add("Taxes", LongType, true)
                           .add("Bedrooms", LongType, true)
                           .add("Baths", FloatType, true)
                           .add("Quadrant", LongType, true)
                           .add("NW", StringType, true)
                           .add("Price($)", LongType, false)
                           .add("Size(sqft)", LongType, false)
                           .add("lot", LongType, true);

                final SparkConf conf = new SparkConf()
                    .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
                    .setAppName(APPLICATION_NAME)
                    .set("spark.sql.caseSensitive", CASE_SENSITIVE);

                SparkSession sparkSession = SparkSession.builder()
                    .config(conf)
                    .getOrCreate();

                Dataset<Row> housesDF = sparkSession.read()
                     .schema(HOUSES_SCHEMA)
                     .json(HOUSES_FILE_PATH);
             
                // Gathering Data
                Dataset<Row> gatheredDF = housesDF.select(col("Taxes"), 
                    col("Bedrooms"), col("Baths"),
                    col("Size(sqft)"), col("Price($)"));
                
                // Data Preparation  
                Dataset<Row> labelDF = gatheredDF.withColumnRenamed("Price($)", "label");
                
                Imputer imputer = new Imputer()
                    // .setMissingValue(1.0d)
                    .setInputCols(new String[] { "Baths" })
                    .setOutputCols(new String[] { "~Baths~" });

                VectorAssembler assembler = new VectorAssembler()
                    .setInputCols(new String[] { "Taxes", "Bedrooms", "~Baths~", "Size(sqft)" })
                    .setOutputCol("features");
                
                // Choosing a Model               
                LinearRegression linearRegression = new LinearRegression();
                linearRegression.setMaxIter(1000);

                Pipeline pipeline = new Pipeline()
                                .setStages(new PipelineStage[] {
                                    imputer, assembler, linearRegression 
                                });

                // Training The Data
                Dataset<Row>[] splitDF = labelDF.randomSplit(new double[] { 0.8, 0.2 });

                Dataset<Row> trainDF = splitDF[0];
                Dataset<Row> evaluationDF = splitDF[1];

                PipelineModel pipelineModel = pipeline.fit(trainDF);
                
                // Evaluation 
                Dataset<Row> predictionsDF = pipelineModel.transform(evaluationDF);

                predictionsDF.show(false);

                Dataset<Row> forEvaluationDF = predictionsDF.select(col("label"), 
                    col("prediction"));

                RegressionEvaluator evaluateR2 = new RegressionEvaluator().setMetricName("r2");
                RegressionEvaluator evaluateRMSE = new RegressionEvaluator().setMetricName("rmse");

                double r2 = evaluateR2.evaluate(forEvaluationDF);
                double rmse = evaluateRMSE.evaluate(forEvaluationDF);

                logger.info("---------------------------");
                logger.info("R2 = " + r2);
                logger.info("RMSE = " + rmse);
                logger.info("---------------------------");
        }
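A note on missing values: since Spark 2.4, VectorAssembler can handle null/NaN inputs itself through its handleInvalid parameter, which can be a lighter-weight alternative to the Imputer stage above when dropping or keeping invalid rows is acceptable. A minimal sketch, reusing the column names from Example #1 (assumes Spark 2.4+):

                VectorAssembler assembler = new VectorAssembler()
                    .setInputCols(new String[] { "Taxes", "Bedrooms", "Baths", "Size(sqft)" })
                    .setOutputCol("features")
                    // "skip" drops rows with null/NaN inputs; "keep" propagates NaN into
                    // the assembled vector; "error" (the default) fails fast.
                    .setHandleInvalid("skip");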
 
Example #2
Source File: JavaVectorAssemblerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaVectorAssemblerExample")
    .getOrCreate();

  // $example on$
  StructType schema = createStructType(new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("hour", IntegerType, false),
    createStructField("mobile", DoubleType, false),
    createStructField("userFeatures", new VectorUDT(), false),
    createStructField("clicked", DoubleType, false)
  });
  Row row = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
  Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema);

  VectorAssembler assembler = new VectorAssembler()
    .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
    .setOutputCol("features");

  Dataset<Row> output = assembler.transform(dataset);
  System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
      "'features'");
  output.select("features", "clicked").show(false);
  // $example off$

  spark.stop();
}
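For the single input row above, the assembled features column is [18.0, 1.0, 0.0, 10.0, 0.5]: the scalar columns hour and mobile come first, followed by the three entries of userFeatures, in the order passed to setInputCols.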
 
Example #3
Source File: DataPreview.java    From StockPrediction with MIT License
public static void main (String[] args) throws IOException {
    SparkSession spark = SparkSession.builder().master("local").appName("DataProcess").getOrCreate();
    String filename = "prices-split-adjusted.csv";
    String symbol = "GOOG";
    // load data from csv file
    Dataset<Row> data = spark.read().format("csv").option("header", true)
            .load(new ClassPathResource(filename).getFile().getAbsolutePath())
            //.filter(functions.col("symbol").equalTo(symbol))
            //.drop("date").drop("symbol")
            .withColumn("openPrice", functions.col("open").cast("double")).drop("open")
            .withColumn("closePrice", functions.col("close").cast("double")).drop("close")
            .withColumn("lowPrice", functions.col("low").cast("double")).drop("low")
            .withColumn("highPrice", functions.col("high").cast("double")).drop("high")
            .withColumn("volumeTmp", functions.col("volume").cast("double")).drop("volume")
            .toDF("date", "symbol", "open", "close", "low", "high", "volume");

    data.show();

    Dataset<Row> symbols = data.select("date", "symbol").groupBy("symbol").agg(functions.count("date").as("count"));
    System.out.println("Number of Symbols: " + symbols.count());
    symbols.show();

    VectorAssembler assembler = new VectorAssembler()
            .setInputCols(new String[] {"open", "low", "high", "volume", "close"})
            .setOutputCol("features");

    data = assembler.transform(data).drop("open", "low", "high", "volume", "close");

    data = new MinMaxScaler().setMin(0).setMax(1)
            .setInputCol("features").setOutputCol("normalizedFeatures")
            .fit(data).transform(data)
            // toDF("features") would fail here: the dataset still has the "date" and
            // "symbol" columns, so rename the single scaled column instead.
            .drop("features").withColumnRenamed("normalizedFeatures", "features");
}
 
Example #4
Source File: VectorAssemblerConverter.java    From jpmml-sparkml with GNU Affero General Public License v3.0
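VectorAssembler does no arithmetic of its own, so this jpmml-sparkml converter simply concatenates the features already registered for each input column: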
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder){
	VectorAssembler transformer = getTransformer();

	List<Feature> result = new ArrayList<>();

	String[] inputCols = transformer.getInputCols();
	for(String inputCol : inputCols){
		List<Feature> features = encoder.getFeatures(inputCol);

		result.addAll(features);
	}

	return result;
}
 
Example #5
Source File: VectorAssemblerModelAdapter.java    From spark-transformers with Apache License 2.0
@Override
VectorAssemblerModelInfo getModelInfo(VectorAssembler from) {
    VectorAssemblerModelInfo vectorAssemblerModelInfo = new VectorAssemblerModelInfo();

    vectorAssemblerModelInfo.setInputKeys(new LinkedHashSet<>(Arrays.asList(from.getInputCols())));

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    vectorAssemblerModelInfo.setOutputKeys(outputKeys);

    return vectorAssemblerModelInfo;
}
 
Example #6
Source File: VectorAssemblerModelAdapter.java    From spark-transformers with Apache License 2.0
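This overload of getModelInfo also receives the DataFrame being transformed (an older spark-transformers API variant); a VectorAssembler carries all of its state in its parameters, so the DataFrame argument goes unused here.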
@Override
VectorAssemblerModelInfo getModelInfo(VectorAssembler from, DataFrame df) {
    VectorAssemblerModelInfo vectorAssemblerModelInfo = new VectorAssemblerModelInfo();

    vectorAssemblerModelInfo.setInputKeys(new LinkedHashSet<>(Arrays.asList(from.getInputCols())));

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    vectorAssemblerModelInfo.setOutputKeys(outputKeys);

    return vectorAssemblerModelInfo;
}
 
Example #7
Source File: BikeRentalPrediction.java    From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
	System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");
	SparkSession sparkSession = SparkSession
			.builder()
			.master("local")
			.config("spark.sql.warehouse.dir",
					"file:///E:/sumitK/Hadoop/warehouse")
			.appName("BikeRentalPrediction").getOrCreate();
	Logger rootLogger = LogManager.getRootLogger();
	rootLogger.setLevel(Level.WARN);
	//We use the sparkSession.read() method to read the data and set a few options:
	//  'format': specifies the Spark CSV data source
	//  'header': set to true to indicate that the first line of the CSV file is a header
	// The file is called 'hour.csv'.
	Dataset<Row> ds=sparkSession.read()
			  .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat")
			  .option("header", "true")
			  .load("E:\\sumitK\\Hadoop\\Bike-Sharing-Dataset\\hour.csv");
	
	ds.cache();
	
	ds.select("season").show();;
	
	ds.show();
	
	System.out.println("Our dataset has rows :: "+ ds.count());
	
	Dataset<Row> df = ds.drop("instant").drop("dteday").drop("casual").drop("registered");
	df.printSchema();
	//col("...") is preferable to df.col("...")
	Dataset<Row> dformatted = df.select(col("season").cast(DataTypes.IntegerType),
			                            col("yr").cast(DataTypes.IntegerType),
										col("mnth").cast(DataTypes.IntegerType),
										col("hr").cast(DataTypes.IntegerType),
										col("holiday").cast(DataTypes.IntegerType),
										col("weekday").cast(DataTypes.IntegerType),
										col("workingday").cast(DataTypes.IntegerType),
										col("weathersit").cast(DataTypes.IntegerType),
										col("temp").cast(DataTypes.IntegerType),
										col("atemp").cast(DataTypes.IntegerType),
										col("hum").cast(DataTypes.IntegerType),
										col("windspeed").cast(DataTypes.IntegerType),
										col("cnt").cast(DataTypes.IntegerType));
	
	
dformatted.printSchema();	
Dataset<Row>[] data=	dformatted.randomSplit(new double[]{0.7,0.3});
System.out.println("We have training examples count :: "+ data[0].count()+" and test examples count ::"+data[1].count());

// Remove the label column 'cnt', then use the remaining column names as features.
String[] featuresCols = dformatted.drop("cnt").columns();

for (String str : featuresCols) {
	System.out.println(str);
}

//This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
VectorAssembler vectorAssembler = new VectorAssembler().setInputCols(featuresCols).setOutputCol("rawFeatures");
//This identifies categorical features and indexes them.
VectorIndexer vectorIndexer= new VectorIndexer().setInputCol("rawFeatures").setOutputCol("features").setMaxCategories(4);

//Takes the "features" column and learns to predict "cnt"
GBTRegressor gbt = new GBTRegressor().setLabelCol("cnt");
		
// Define a grid of hyperparameters to test:
//  - maxDepth: max depth of each decision tree in the GBT ensemble
//  - maxIter: iterations, i.e., number of trees in each GBT ensemble
// The values here are kept small. In practice, to get the highest accuracy you would likely want to try deeper trees (10 or higher) and more trees in the ensemble (>100).
ParamMap[]	paramGrid = new ParamGridBuilder().addGrid(gbt.maxDepth(),new int[]{2, 5}).addGrid(gbt.maxIter(),new int[] {10, 100}).build();
// We define an evaluation metric.  This tells CrossValidator how well we are doing by comparing the true labels with predictions.
RegressionEvaluator evaluator = new RegressionEvaluator().setMetricName("rmse").setLabelCol(gbt.getLabelCol()).setPredictionCol(gbt.getPredictionCol());

// Declare the CrossValidator, which runs model tuning for us.
	CrossValidator cv = new CrossValidator().setEstimator(gbt).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid);
		
	Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{vectorAssembler,vectorIndexer,cv});
			
	PipelineModel pipelineModel=pipeline.fit(data[0]);
	
	Dataset<Row> predictions = pipelineModel.transform(data[1]);
	
	predictions.show();
	//predictions.select("cnt", "prediction", *featuresCols);
}
 
Example #8
Source File: JavaInteractionExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaInteractionExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(1, 1, 2, 3, 8, 4, 5),
    RowFactory.create(2, 4, 3, 8, 7, 9, 8),
    RowFactory.create(3, 6, 1, 9, 2, 3, 6),
    RowFactory.create(4, 10, 8, 6, 9, 4, 5),
    RowFactory.create(5, 9, 2, 7, 10, 7, 3),
    RowFactory.create(6, 1, 1, 4, 2, 8, 4)
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id1", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("id2", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("id3", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("id4", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("id5", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("id6", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("id7", DataTypes.IntegerType, false, Metadata.empty())
  });

  Dataset<Row> df = spark.createDataFrame(data, schema);

  VectorAssembler assembler1 = new VectorAssembler()
          .setInputCols(new String[]{"id2", "id3", "id4"})
          .setOutputCol("vec1");

  Dataset<Row> assembled1 = assembler1.transform(df);

  VectorAssembler assembler2 = new VectorAssembler()
          .setInputCols(new String[]{"id5", "id6", "id7"})
          .setOutputCol("vec2");

  Dataset<Row> assembled2 = assembler2.transform(assembled1).select("id1", "vec1", "vec2");

  Interaction interaction = new Interaction()
          .setInputCols(new String[]{"id1","vec1","vec2"})
          .setOutputCol("interactedCol");

  Dataset<Row> interacted = interaction.transform(assembled2);

  interacted.show(false);
  // $example off$

  spark.stop();
}
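Interaction takes the element-wise product across all combinations of its input columns. For the first row above (id1 = 1, vec1 = [1.0, 2.0, 3.0], vec2 = [8.0, 4.0, 5.0]), the interactedCol value is the 9-element vector [8.0, 4.0, 5.0, 16.0, 8.0, 10.0, 24.0, 12.0, 15.0].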
 
Example #9
Source File: VectorAssemblerConverter.java    From jpmml-sparkml with GNU Affero General Public License v3.0
public VectorAssemblerConverter(VectorAssembler transformer){
	super(transformer);
}
 
Example #10
Source File: VectorAssemblerModelAdapter.java    From spark-transformers with Apache License 2.0
@Override
public Class<VectorAssembler> getSource() {
    return VectorAssembler.class;
}
 
Example #11
Source File: VectorAssemblerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void testVectorAssembler() {
    // prepare data

    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
            RowFactory.create(0d, 1d, new DenseVector(new double[]{2d, 3d})),
            RowFactory.create(1d, 2d, new DenseVector(new double[]{3d, 4d})),
            RowFactory.create(2d, 3d, new DenseVector(new double[]{4d, 5d})),
            RowFactory.create(3d, 4d, new DenseVector(new double[]{5d, 6d})),
            RowFactory.create(4d, 5d, new DenseVector(new double[]{6d, 7d}))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(jrdd, schema);
    VectorAssembler vectorAssembler = new VectorAssembler()
            .setInputCols(new String[]{"value1", "vector1"})
            .setOutputCol("feature");


    //Export this model
    byte[] exportedModel = ModelExporter.export(vectorAssembler);

    String exportedModelJson = new String(exportedModel);
    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    //compare predictions
    List<Row> sparkOutput = vectorAssembler.transform(df).orderBy("id").select("id", "value1", "vector1", "feature").collectAsList();
    for (Row row : sparkOutput) {

        Map<String, Object> data = new HashMap<>();
        data.put(vectorAssembler.getInputCols()[0], row.get(1));
        data.put(vectorAssembler.getInputCols()[1], ((DenseVector) row.get(2)).toArray());
        transformer.transform(data);
        double[] output = (double[]) data.get(vectorAssembler.getOutputCol());
        assertArrayEquals(output, ((DenseVector) row.get(3)).toArray(), 0d);
    }
}
 
Example #12
Source File: VectorAssemblerModelAdapter.java    From spark-transformers with Apache License 2.0
@Override
public Class<VectorAssembler> getSource() {
    return VectorAssembler.class;
}
 
Example #13
Source File: VectorAssemblerBridgeTest.java    From spark-transformers with Apache License 2.0
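Note: this is the same test as Example #11, but written against the older Spark 1.x style API of spark-transformers: SQLContext and DataFrame instead of SparkSession and Dataset<Row>, and an extra (here null) DataFrame argument to ModelExporter.export.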
@Test
public void testVectorAssembler() {
    // prepare data

    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create(0d, 1d, new DenseVector(new double[]{2d, 3d})),
            RowFactory.create(1d, 2d, new DenseVector(new double[]{3d, 4d})),
            RowFactory.create(2d, 3d, new DenseVector(new double[]{4d, 5d})),
            RowFactory.create(3d, 4d, new DenseVector(new double[]{5d, 6d})),
            RowFactory.create(4d, 5d, new DenseVector(new double[]{6d, 7d}))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);
    VectorAssembler vectorAssembler = new VectorAssembler()
            .setInputCols(new String[]{"value1", "vector1"})
            .setOutputCol("feature");


    //Export this model
    byte[] exportedModel = ModelExporter.export(vectorAssembler, null);

    String exportedModelJson = new String(exportedModel);
    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    //compare predictions
    Row[] sparkOutput = vectorAssembler.transform(df).orderBy("id").select("id", "value1", "vector1", "feature").collect();
    for (Row row : sparkOutput) {

        Map<String, Object> data = new HashMap<>();
        data.put(vectorAssembler.getInputCols()[0], row.get(1));
        data.put(vectorAssembler.getInputCols()[1], ((DenseVector) row.get(2)).toArray());
        transformer.transform(data);
        double[] output = (double[]) data.get(vectorAssembler.getOutputCol());
        assertArrayEquals(output, ((DenseVector) row.get(3)).toArray(), 0d);
    }
}