org.apache.spark.ml.feature.Bucketizer Java Examples
The following examples show how to use
org.apache.spark.ml.feature.Bucketizer.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JavaBucketizerExample.java From SparkDemo with MIT License | 5 votes |
public static void main(String[] args) { SparkSession spark = SparkSession .builder() .appName("JavaBucketizerExample") .getOrCreate(); // $example on$ double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY}; List<Row> data = Arrays.asList( RowFactory.create(-999.9), RowFactory.create(-0.5), RowFactory.create(-0.3), RowFactory.create(0.0), RowFactory.create(0.2), RowFactory.create(999.9) ); StructType schema = new StructType(new StructField[]{ new StructField("features", DataTypes.DoubleType, false, Metadata.empty()) }); Dataset<Row> dataFrame = spark.createDataFrame(data, schema); Bucketizer bucketizer = new Bucketizer() .setInputCol("features") .setOutputCol("bucketedFeatures") .setSplits(splits); // Transform original data into its bucket index. Dataset<Row> bucketedData = bucketizer.transform(dataFrame); System.out.println("Bucketizer output with " + (bucketizer.getSplits().length-1) + " buckets"); bucketedData.show(); // $example off$ spark.stop(); }
Example #2
Source File: BucketizerModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 5 votes |
/**
 * Builds a {@link BucketizerModelInfo} from the given {@link Bucketizer}.
 *
 * <p>The splits are copied over, and the input and output column names are each stored
 * in their own single-element {@link LinkedHashSet}.
 *
 * @param from the Bucketizer whose splits and column names are read
 * @return a newly created model info populated from {@code from}
 */
@Override
public BucketizerModelInfo getModelInfo(final Bucketizer from) {
    final BucketizerModelInfo info = new BucketizerModelInfo();
    info.setSplits(from.getSplits());

    final Set<String> inputs = new LinkedHashSet<String>();
    inputs.add(from.getInputCol());
    info.setInputKeys(inputs);

    final Set<String> outputs = new LinkedHashSet<String>();
    outputs.add(from.getOutputCol());
    info.setOutputKeys(outputs);

    return info;
}
Example #3
Source File: BucketizerModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 5 votes |
/**
 * Builds a {@link BucketizerModelInfo} from the given {@link Bucketizer}.
 *
 * <p>The splits are copied over, and the input and output column names are each stored
 * in their own single-element {@link LinkedHashSet}.
 *
 * @param from the Bucketizer whose splits and column names are read
 * @param df   not read by this implementation
 * @return a newly created model info populated from {@code from}
 */
@Override
public BucketizerModelInfo getModelInfo(final Bucketizer from, final DataFrame df) {
    final BucketizerModelInfo info = new BucketizerModelInfo();
    info.setSplits(from.getSplits());

    final Set<String> inputs = new LinkedHashSet<String>();
    inputs.add(from.getInputCol());
    info.setInputKeys(inputs);

    final Set<String> outputs = new LinkedHashSet<String>();
    outputs.add(from.getOutputCol());
    info.setOutputKeys(outputs);

    return info;
}
Example #4
Source File: BucketizerConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0 | 4 votes |
/**
 * Creates a converter for the given {@link Bucketizer} by handing it to the superclass
 * constructor.
 *
 * @param transformer the Bucketizer to convert
 */
public BucketizerConverter(Bucketizer transformer) {
    super(transformer);
}
Example #5
Source File: BucketizerConverter.java From jpmml-sparkml with GNU Affero General Public License v3.0 | 4 votes |
/**
 * Encodes each input column of the {@link Bucketizer} as an integer-valued categorical
 * derived field backed by a {@link Discretize} expression.
 *
 * <p>For a column with {@code n} split points, {@code n - 1} bins are produced, numbered
 * from 0. Every bin uses a closed-open interval except the last one, which is closed on
 * both sides.
 *
 * @param encoder the encoder used to look up input features and create derived fields
 * @return one {@link IndexFeature} per input column, in input-column order
 * @throws IllegalArgumentException if the input mode is neither SINGLE nor MULTIPLE
 */
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder) {
    final Bucketizer transformer = getTransformer();
    final InOutMode inputMode = getInputMode();

    final String[] inputCols;
    final double[][] splitsArray;

    if ((InOutMode.SINGLE).equals(inputMode)) {
        inputCols = inputMode.getInputCols(transformer);
        // Wrap the single splits vector so both modes share the loop below.
        splitsArray = new double[][]{transformer.getSplits()};
    } else if ((InOutMode.MULTIPLE).equals(inputMode)) {
        inputCols = inputMode.getInputCols(transformer);
        splitsArray = transformer.getSplitsArray();
    } else {
        throw new IllegalArgumentException();
    }

    final List<Feature> result = new ArrayList<>();

    for (int col = 0; col < inputCols.length; col++) {
        final String inputCol = inputCols[col];
        final double[] splits = splitsArray[col];

        final Feature feature = encoder.getOnlyFeature(inputCol);
        final ContinuousFeature continuousFeature = feature.toContinuousFeature();

        final Discretize discretize = new Discretize(continuousFeature.getName())
            .setDataType(DataType.INTEGER);

        final List<Integer> categories = new ArrayList<>();
        final int binCount = splits.length - 1;

        for (int bin = 0; bin < binCount; bin++) {
            final Integer category = bin;
            categories.add(category);

            // Only the final bin includes its right margin.
            final Interval.Closure closure;
            if (bin < (binCount - 1)) {
                closure = Interval.Closure.CLOSED_OPEN;
            } else {
                closure = Interval.Closure.CLOSED_CLOSED;
            }

            final Interval interval = new Interval(closure)
                .setLeftMargin(formatMargin(splits[bin]))
                .setRightMargin(formatMargin(splits[bin + 1]));

            discretize.addDiscretizeBins(new DiscretizeBin(category, interval));
        }

        final DerivedField derivedField = encoder.createDerivedField(
            formatName(transformer, col), OpType.CATEGORICAL, DataType.INTEGER, discretize);

        result.add(new IndexFeature(encoder, derivedField, categories));
    }

    return result;
}
Example #6
Source File: BucketizerModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 4 votes |
/**
 * Returns the source type for this override.
 *
 * @return always {@code Bucketizer.class}
 */
@Override
public Class<Bucketizer> getSource() {
    return Bucketizer.class;
}
Example #7
Source File: BucketizerBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void bucketizerTest() { double[] validData = {-0.5, -0.3, 0.0, 0.2}; double[] expectedBuckets = {0.0, 0.0, 1.0, 1.0}; double[] splits = {-0.5, 0.0, 0.5}; StructType schema = new StructType(new StructField[]{ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("feature", DataTypes.DoubleType, false, Metadata.empty()) }); List<Row> trainingData = Arrays.asList( cr(0, validData[0]), cr(1, validData[1]), cr(2, validData[2]), cr(3, validData[3])); Dataset<Row> df = spark.createDataFrame(trainingData, schema); Bucketizer sparkModel = new Bucketizer() .setInputCol("feature") .setOutputCol("result") .setSplits(splits); //Export this model byte[] exportedModel = ModelExporter.export(sparkModel); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); List<Row> sparkOutput = sparkModel.transform(df).orderBy("id").select("id", "feature", "result").collectAsList(); for (Row r : sparkOutput) { double input = r.getDouble(1); double sparkOp = r.getDouble(2); Map<String, Object> data = new HashMap<String, Object>(); data.put(sparkModel.getInputCol(), input); transformer.transform(data); double transformedInput = (double) data.get(sparkModel.getOutputCol()); assertTrue((transformedInput >= 0) && (transformedInput <= 1)); assertEquals(transformedInput, sparkOp, 0.01); assertEquals(transformedInput, expectedBuckets[r.getInt(0)], 0.01); } }
Example #8
Source File: BucketizerModelInfoAdapter.java From spark-transformers with Apache License 2.0 | 4 votes |
/**
 * Returns the source type for this override.
 *
 * @return always {@code Bucketizer.class}
 */
@Override
public Class<Bucketizer> getSource() {
    return Bucketizer.class;
}
Example #9
Source File: BucketizerBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void bucketizerTest() { double[] validData = {-0.5, -0.3, 0.0, 0.2}; double[] expectedBuckets = {0.0, 0.0, 1.0, 1.0}; double[] splits = {-0.5, 0.0, 0.5}; StructType schema = new StructType(new StructField[]{ new StructField("id", DataTypes.IntegerType, false, Metadata.empty()), new StructField("feature", DataTypes.DoubleType, false, Metadata.empty()) }); List<Row> trainingData = Arrays.asList( cr(0, validData[0]), cr(1, validData[1]), cr(2, validData[2]), cr(3, validData[3])); DataFrame df = sqlContext.createDataFrame(trainingData, schema); Bucketizer sparkModel = new Bucketizer() .setInputCol("feature") .setOutputCol("result") .setSplits(splits); //Export this model byte[] exportedModel = ModelExporter.export(sparkModel, df); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); Row[] sparkOutput = sparkModel.transform(df).orderBy("id").select("id", "feature", "result").collect(); for (Row r : sparkOutput) { double input = r.getDouble(1); double sparkOp = r.getDouble(2); Map<String, Object> data = new HashMap<String, Object>(); data.put(sparkModel.getInputCol(), input); transformer.transform(data); double transformedInput = (double) data.get(sparkModel.getOutputCol()); assertTrue((transformedInput >= 0) && (transformedInput <= 1)); assertEquals(transformedInput, sparkOp, EPSILON); assertEquals(transformedInput, expectedBuckets[r.getInt(0)], EPSILON); } }