org.apache.spark.ml.feature.StandardScalerModel Java Examples
The following examples show how to use
org.apache.spark.ml.feature.StandardScalerModel.
Follow the link above each example to view the original project or source file.
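Before the individual examples, here is a minimal sketch of the full StandardScalerModel lifecycle (fit, inspect, persist, reload, transform), assuming the sample libsvm dataset that ships with Spark; the class name and the output path /tmp/scaler-model are placeholders:

import org.apache.spark.ml.feature.StandardScaler;
import org.apache.spark.ml.feature.StandardScalerModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class StandardScalerLifecycleSketch {
    public static void main(String[] args) throws Exception {
        SparkSession spark = SparkSession.builder()
                .appName("StandardScalerLifecycleSketch")
                .getOrCreate();

        Dataset<Row> dataFrame = spark.read().format("libsvm")
                .load("data/mllib/sample_libsvm_data.txt");

        // fit() computes the per-feature mean and standard deviation.
        StandardScalerModel model = new StandardScaler()
                .setInputCol("features")
                .setOutputCol("scaledFeatures")
                .fit(dataFrame);

        // The fitted statistics are exposed as vectors on the model.
        System.out.println("mean: " + model.mean());
        System.out.println("std:  " + model.std());

        // StandardScalerModel is MLWritable, so it can be saved and reloaded.
        model.write().overwrite().save("/tmp/scaler-model");
        StandardScalerModel reloaded = StandardScalerModel.load("/tmp/scaler-model");
        reloaded.transform(dataFrame).show(5);

        spark.stop();
    }
}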
Example #1
Source File: JavaStandardScalerExample.java From SparkDemo with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
            .builder()
            .appName("JavaStandardScalerExample")
            .getOrCreate();

    // $example on$
    Dataset<Row> dataFrame = spark.read().format("libsvm")
            .load("data/mllib/sample_libsvm_data.txt");

    StandardScaler scaler = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledFeatures")
            .setWithStd(true)
            .setWithMean(false);

    // Compute summary statistics by fitting the StandardScaler.
    StandardScalerModel scalerModel = scaler.fit(dataFrame);

    // Normalize each feature to have unit standard deviation.
    Dataset<Row> scaledData = scalerModel.transform(dataFrame);
    scaledData.show();
    // $example off$

    spark.stop();
}
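A note on the flags: withMean is left at false here because centering subtracts the mean from every component, which would turn the sparse libsvm feature vectors dense; scaling by the standard deviation alone preserves sparsity.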
Example #2
Source File: StandardScalerModelInfoAdapter.java From spark-transformers with Apache License 2.0
@Override
public StandardScalerModelInfo getModelInfo(final StandardScalerModel from) {
    final StandardScalerModelInfo modelInfo = new StandardScalerModelInfo();

    // Copy the fitted statistics and configuration flags into the portable model info.
    modelInfo.setMean(from.mean().toArray());
    modelInfo.setStd(from.std().toArray());
    modelInfo.setWithMean(from.getWithMean());
    modelInfo.setWithStd(from.getWithStd());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
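The adapter copies the fitted statistics into a plain StandardScalerModelInfo object so that scaling can be reproduced without a Spark runtime. As a rough sketch of what a consumer of that exported data could do, the hypothetical helper below (not part of the spark-transformers API) applies the same transformation to a raw double array:

// Hypothetical helper: applies the exported mean/std to one feature vector.
// The zero-std guard is an assumption; Spark's own handling of zero variance
// may differ.
static double[] scale(double[] input, double[] mean, double[] std,
                      boolean withMean, boolean withStd) {
    double[] out = new double[input.length];
    for (int i = 0; i < input.length; i++) {
        double v = input[i];
        if (withMean) {
            v -= mean[i];       // center on the training mean
        }
        if (withStd && std[i] != 0d) {
            v /= std[i];        // scale to unit standard deviation
        }
        out[i] = v;
    }
    return out;
}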
Example #3
Source File: StandardScalerModelInfoAdapter.java From spark-transformers with Apache License 2.0
@Override
public StandardScalerModelInfo getModelInfo(final StandardScalerModel from, final DataFrame df) {
    final StandardScalerModelInfo modelInfo = new StandardScalerModelInfo();

    // The df parameter is not needed for StandardScaler; all required state
    // lives on the fitted model itself.
    modelInfo.setMean(from.mean().toArray());
    modelInfo.setStd(from.std().toArray());
    modelInfo.setWithMean(from.getWithMean());
    modelInfo.setWithStd(from.getWithStd());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
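This overload is the variant for the older DataFrame-based bridge; since the df argument goes unused for StandardScaler, the body is identical to the adapter in Example #2.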
Example #4
Source File: StandardScalerModelConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0
public StandardScalerModelConverter(StandardScalerModel transformer){
    super(transformer);
}
Example #5
Source File: StandardScalerModelConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder){
    StandardScalerModel transformer = getTransformer();

    Vector mean = transformer.mean();
    Vector std = transformer.std();

    boolean withMean = transformer.getWithMean();
    boolean withStd = transformer.getWithStd();

    List<Feature> features = encoder.getFeatures(transformer.getInputCol());

    if(withMean){
        SchemaUtil.checkSize(mean.size(), features);
    } // End if

    if(withStd){
        SchemaUtil.checkSize(std.size(), features);
    }

    List<Feature> result = new ArrayList<>();

    for(int i = 0, length = features.size(); i < length; i++){
        Feature feature = features.get(i);

        FieldName name = formatName(transformer, i, length);

        Expression expression = null;

        if(withMean){
            double meanValue = mean.apply(i);

            if(!ValueUtil.isZero(meanValue)){
                ContinuousFeature continuousFeature = feature.toContinuousFeature();

                expression = PMMLUtil.createApply(PMMLFunctions.SUBTRACT, continuousFeature.ref(), PMMLUtil.createConstant(meanValue));
            }
        } // End if

        if(withStd){
            double stdValue = std.apply(i);

            if(!ValueUtil.isOne(stdValue)){
                Double factor = (1d / stdValue);

                if(expression != null){
                    expression = PMMLUtil.createApply(PMMLFunctions.MULTIPLY, expression, PMMLUtil.createConstant(factor));
                } else {
                    feature = new ProductFeature(encoder, feature, factor){

                        @Override
                        public ContinuousFeature toContinuousFeature(){
                            Supplier<Apply> applySupplier = () -> {
                                Feature feature = getFeature();
                                Number factor = getFactor();

                                return PMMLUtil.createApply(PMMLFunctions.MULTIPLY, (feature.toContinuousFeature()).ref(), PMMLUtil.createConstant(factor));
                            };

                            return toContinuousFeature(name, DataType.DOUBLE, applySupplier);
                        }
                    };
                }
            }
        } // End if

        if(expression != null){
            DerivedField derivedField = encoder.createDerivedField(name, OpType.CONTINUOUS, DataType.DOUBLE, expression);

            result.add(new ContinuousFeature(encoder, derivedField));
        } else {
            result.add(feature);
        }
    }

    return result;
}
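Algebraically, the converter encodes Spark's scaling (x - mean) / std as a SUBTRACT followed by a MULTIPLY with the precomputed factor 1/std, skipping the subtraction when the mean is zero and the multiplication when the standard deviation is one, so that no-op steps never reach the PMML document. A quick numeric sanity check of that rewrite, with made-up statistics:

// Illustrative values only: the SUBTRACT/MULTIPLY chain emitted above
// computes the same quantity as Spark's (x - mean) / std.
double x = 4.0, mean = 2.0, std = 0.5;
double pmmlValue = (x - mean) * (1d / std); // SUBTRACT, then MULTIPLY by factor
double sparkValue = (x - mean) / std;       // Spark ML's scaling formula
assert pmmlValue == sparkValue;             // both evaluate to 4.0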
Example #6
Source File: StandardScalerModelInfoAdapter.java From spark-transformers with Apache License 2.0
@Override
public Class<StandardScalerModel> getSource() {
    return StandardScalerModel.class;
}
Example #7
Source File: StandardScalerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testStandardScaler() {
    // Prepare data.
    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
            RowFactory.create(1.0, Vectors.dense(data[0])),
            RowFactory.create(2.0, Vectors.dense(data[1])),
            RowFactory.create(3.0, Vectors.dense(data[2]))
    ));
    StructType schema = new StructType(new StructField[]{
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });
    Dataset<Row> df = spark.createDataFrame(jrdd, schema);

    // Train models in Spark, one per withMean/withStd combination.
    StandardScalerModel sparkModelNone = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(false)
            .setWithStd(false)
            .fit(df);

    StandardScalerModel sparkModelWithMean = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(true)
            .setWithStd(false)
            .fit(df);

    StandardScalerModel sparkModelWithStd = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(false)
            .setWithStd(true)
            .fit(df);

    StandardScalerModel sparkModelWithBoth = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(true)
            .setWithStd(true)
            .fit(df);

    // Export each model, import it back, and get the transformer.
    byte[] exportedModel = ModelExporter.export(sparkModelNone);
    final Transformer transformerNone = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithMean);
    final Transformer transformerWithMean = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithStd);
    final Transformer transformerWithStd = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithBoth);
    final Transformer transformerWithBoth = ModelImporter.importAndGetTransformer(exportedModel);

    // Compare predictions.
    List<Row> sparkNoneOutput = sparkModelNone.transform(df).orderBy("label").select("features", "scaledOutput").collectAsList();
    assertCorrectness(sparkNoneOutput, data, transformerNone);

    List<Row> sparkWithMeanOutput = sparkModelWithMean.transform(df).orderBy("label").select("features", "scaledOutput").collectAsList();
    assertCorrectness(sparkWithMeanOutput, resWithMean, transformerWithMean);

    List<Row> sparkWithStdOutput = sparkModelWithStd.transform(df).orderBy("label").select("features", "scaledOutput").collectAsList();
    assertCorrectness(sparkWithStdOutput, resWithStd, transformerWithStd);

    List<Row> sparkWithBothOutput = sparkModelWithBoth.transform(df).orderBy("label").select("features", "scaledOutput").collectAsList();
    assertCorrectness(sparkWithBothOutput, resWithBoth, transformerWithBoth);
}
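The four fitted models cover every combination of withMean and withStd, including the pass-through case where both are false, so the export/import round trip is checked against Spark's own output for each configuration.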
Example #8
Source File: StandardScalerModelInfoAdapter.java From spark-transformers with Apache License 2.0
@Override
public Class<StandardScalerModel> getSource() {
    return StandardScalerModel.class;
}
Example #9
Source File: StandardScalerBridgeTest.java From spark-transformers with Apache License 2.0
@Test
public void testStandardScaler() {
    // Prepare data.
    List<LabeledPoint> localTraining = Arrays.asList(
            new LabeledPoint(1.0, Vectors.dense(data[0])),
            new LabeledPoint(2.0, Vectors.dense(data[1])),
            new LabeledPoint(3.0, Vectors.dense(data[2])));
    DataFrame df = sqlContext.createDataFrame(sc.parallelize(localTraining), LabeledPoint.class);

    // Train models in Spark, one per withMean/withStd combination.
    StandardScalerModel sparkModelNone = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(false)
            .setWithStd(false)
            .fit(df);

    StandardScalerModel sparkModelWithMean = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(true)
            .setWithStd(false)
            .fit(df);

    StandardScalerModel sparkModelWithStd = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(false)
            .setWithStd(true)
            .fit(df);

    StandardScalerModel sparkModelWithBoth = new StandardScaler()
            .setInputCol("features")
            .setOutputCol("scaledOutput")
            .setWithMean(true)
            .setWithStd(true)
            .fit(df);

    // Export each model, import it back, and get the transformer.
    byte[] exportedModel = ModelExporter.export(sparkModelNone, df);
    final Transformer transformerNone = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithMean, df);
    final Transformer transformerWithMean = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithStd, df);
    final Transformer transformerWithStd = ModelImporter.importAndGetTransformer(exportedModel);

    exportedModel = ModelExporter.export(sparkModelWithBoth, df);
    final Transformer transformerWithBoth = ModelImporter.importAndGetTransformer(exportedModel);

    // Compare predictions.
    Row[] sparkNoneOutput = sparkModelNone.transform(df).orderBy("label").select("features", "scaledOutput").collect();
    assertCorrectness(sparkNoneOutput, data, transformerNone);

    Row[] sparkWithMeanOutput = sparkModelWithMean.transform(df).orderBy("label").select("features", "scaledOutput").collect();
    assertCorrectness(sparkWithMeanOutput, resWithMean, transformerWithMean);

    Row[] sparkWithStdOutput = sparkModelWithStd.transform(df).orderBy("label").select("features", "scaledOutput").collect();
    assertCorrectness(sparkWithStdOutput, resWithStd, transformerWithStd);

    Row[] sparkWithBothOutput = sparkModelWithBoth.transform(df).orderBy("label").select("features", "scaledOutput").collect();
    assertCorrectness(sparkWithBothOutput, resWithBoth, transformerWithBoth);
}
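This is the Spark 1.x counterpart of Example #7: it builds a DataFrame from LabeledPoint beans via SQLContext rather than a Dataset&lt;Row&gt; via SparkSession, ModelExporter.export() takes the training DataFrame as a second argument, and collect() returns a Row[] instead of a List&lt;Row&gt;. The assertions themselves are unchanged.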