org.apache.spark.ml.feature.StandardScaler Java Examples
The following examples show how to use
org.apache.spark.ml.feature.StandardScaler.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JavaStandardScalerExample.java From SparkDemo with MIT License | 6 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaStandardScalerExample") .getOrCreate(); // $example on$ Dataset<Row> dataFrame = spark.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt"); StandardScaler scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaledFeatures") .setWithStd(true) .setWithMean(false); // Compute summary statistics by fitting the StandardScaler StandardScalerModel scalerModel = scaler.fit(dataFrame); // Normalize each feature to have unit standard deviation. Dataset<Row> scaledData = scalerModel.transform(dataFrame); scaledData.show(); // $example off$ spark.stop(); }
Example #2
Source File: StandardScalerBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
/**
 * Verifies exported-and-reimported StandardScaler models against Spark's own
 * output for all four combinations of the withMean / withStd flags.
 *
 * <p>The test builds a three-row frame from {@code data[0..2]}, fits one model
 * per flag combination, round-trips each model through {@code ModelExporter}
 * and {@code ModelImporter}, and hands Spark's collected rows, the expected
 * values and the imported transformer to {@code assertCorrectness}.
 */
@Test
public void testStandardScaler() {
    // Input rows: (label, dense feature vector).
    List<Row> inputRows = Arrays.asList(
            RowFactory.create(1.0, Vectors.dense(data[0])),
            RowFactory.create(2.0, Vectors.dense(data[1])),
            RowFactory.create(3.0, Vectors.dense(data[2]))
    );
    JavaRDD<Row> rowRdd = jsc.parallelize(inputRows);

    StructField labelField =
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty());
    StructField featuresField =
            new StructField("features", new VectorUDT(), false, Metadata.empty());
    StructType schema = new StructType(new StructField[]{labelField, featuresField});

    Dataset<Row> frame = spark.createDataFrame(rowRdd, schema);

    // Train one model per (withMean, withStd) combination.
    StandardScalerModel modelNone = fitScaler(frame, false, false);
    StandardScalerModel modelMean = fitScaler(frame, true, false);
    StandardScalerModel modelStd = fitScaler(frame, false, true);
    StandardScalerModel modelBoth = fitScaler(frame, true, true);

    // Export each model, import it back and obtain its transformer.
    final Transformer transformerNone = exportAndImport(modelNone);
    final Transformer transformerWithMean = exportAndImport(modelMean);
    final Transformer transformerWithStd = exportAndImport(modelStd);
    final Transformer transformerWithBoth = exportAndImport(modelBoth);

    // Compare Spark's predictions with the imported transformers.
    List<Row> noneRows = collectScaled(modelNone, frame);
    assertCorrectness(noneRows, data, transformerNone);

    List<Row> meanRows = collectScaled(modelMean, frame);
    assertCorrectness(meanRows, resWithMean, transformerWithMean);

    List<Row> stdRows = collectScaled(modelStd, frame);
    assertCorrectness(stdRows, resWithStd, transformerWithStd);

    List<Row> bothRows = collectScaled(modelBoth, frame);
    assertCorrectness(bothRows, resWithBoth, transformerWithBoth);
}

/** Fits a new StandardScaler reading "features" and writing "scaledOutput" with the given flags. */
private StandardScalerModel fitScaler(Dataset<Row> frame, boolean withMean, boolean withStd) {
    return new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(withMean)
            .setWithStd(withStd)
            .fit(frame);
}

/** Exports the model to bytes and imports those bytes back as a transformer. */
private Transformer exportAndImport(StandardScalerModel model) {
    byte[] serialized = ModelExporter.export(model);
    return ModelImporter.importAndGetTransformer(serialized);
}

/** Applies the model to the frame and collects "features" and "scaledOutput", ordered by "label". */
private List<Row> collectScaled(StandardScalerModel model, Dataset<Row> frame) {
    return model.transform(frame)
            .orderBy("label")
            .select("features", "scaledOutput")
            .collectAsList();
}
Example #3
Source File: StandardScalerBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testStandardScaler() { //prepare data List<LabeledPoint> localTraining = Arrays.asList( new LabeledPoint(1.0, Vectors.dense(data[0])), new LabeledPoint(2.0, Vectors.dense(data[1])), new LabeledPoint(3.0, Vectors.dense(data[2]))); DataFrame df = sqlContext.createDataFrame(sc.parallelize(localTraining), LabeledPoint.class); //train model in spark StandardScalerModel sparkModelNone = new StandardScaler() .setInputCol("features") .setOutputCol("scaledOutput") .setWithMean(false) .setWithStd(false) .fit(df); StandardScalerModel sparkModelWithMean = new StandardScaler() .setInputCol("features") .setOutputCol("scaledOutput") .setWithMean(true) .setWithStd(false) .fit(df); StandardScalerModel sparkModelWithStd = new StandardScaler() .setInputCol("features") .setOutputCol("scaledOutput") .setWithMean(false) .setWithStd(true) .fit(df); StandardScalerModel sparkModelWithBoth = new StandardScaler() .setInputCol("features") .setOutputCol("scaledOutput") .setWithMean(true) .setWithStd(true) .fit(df); //Export model, import it back and get transformer byte[] exportedModel = ModelExporter.export(sparkModelNone, df); final Transformer transformerNone = ModelImporter.importAndGetTransformer(exportedModel); exportedModel = ModelExporter.export(sparkModelWithMean, df); final Transformer transformerWithMean = ModelImporter.importAndGetTransformer(exportedModel); exportedModel = ModelExporter.export(sparkModelWithStd, df); final Transformer transformerWithStd = ModelImporter.importAndGetTransformer(exportedModel); exportedModel = ModelExporter.export(sparkModelWithBoth, df); final Transformer transformerWithBoth = ModelImporter.importAndGetTransformer(exportedModel); //compare predictions Row[] sparkNoneOutput = sparkModelNone.transform(df).orderBy("label").select("features", "scaledOutput").collect(); assertCorrectness(sparkNoneOutput, data, transformerNone); Row[] sparkWithMeanOutput = sparkModelWithMean.transform(df).orderBy("label").select("features", 
"scaledOutput").collect(); assertCorrectness(sparkWithMeanOutput, resWithMean, transformerWithMean); Row[] sparkWithStdOutput = sparkModelWithStd.transform(df).orderBy("label").select("features", "scaledOutput").collect(); assertCorrectness(sparkWithStdOutput, resWithStd, transformerWithStd); Row[] sparkWithBothOutput = sparkModelWithBoth.transform(df).orderBy("label").select("features", "scaledOutput").collect(); assertCorrectness(sparkWithBothOutput, resWithBoth, transformerWithBoth); }