org.apache.spark.ml.feature.Bucketizer Java Examples

The following examples show how to use org.apache.spark.ml.feature.Bucketizer, Spark ML's transformer for mapping a column of continuous values to a column of bucket indices defined by user-supplied split points. Each example notes the project it comes from and that project's license.
Example #1
Source File: JavaBucketizerExample.java    From SparkDemo with MIT License
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaBucketizerExample")
    .getOrCreate();

  // $example on$
  double[] splits = {Double.NEGATIVE_INFINITY, -0.5, 0.0, 0.5, Double.POSITIVE_INFINITY};

  List<Row> data = Arrays.asList(
    RowFactory.create(-999.9),
    RowFactory.create(-0.5),
    RowFactory.create(-0.3),
    RowFactory.create(0.0),
    RowFactory.create(0.2),
    RowFactory.create(999.9)
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("features", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

  Bucketizer bucketizer = new Bucketizer()
    .setInputCol("features")
    .setOutputCol("bucketedFeatures")
    .setSplits(splits);

  // Transform original data into its bucket index.
  Dataset<Row> bucketedData = bucketizer.transform(dataFrame);

  System.out.println("Bucketizer output with " + (bucketizer.getSplits().length-1) + " buckets");
  bucketedData.show();
  // $example off$

  spark.stop();
}
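
With these splits, the Bucketizer defines four buckets: [-inf, -0.5), [-0.5, 0.0), [0.0, 0.5), and [0.5, +inf] (the last bucket also includes its upper bound). Running the example should therefore print something along these lines:

Bucketizer output with 4 buckets
+--------+----------------+
|features|bucketedFeatures|
+--------+----------------+
|  -999.9|             0.0|
|    -0.5|             1.0|
|    -0.3|             1.0|
|     0.0|             2.0|
|     0.2|             2.0|
|   999.9|             3.0|
+--------+----------------+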
 
Example #2
Source File: BucketizerModelInfoAdapter.java    From spark-transformers with Apache License 2.0
@Override
public BucketizerModelInfo getModelInfo(final Bucketizer from) {
    final BucketizerModelInfo modelInfo = new BucketizerModelInfo();
    modelInfo.setSplits(from.getSplits());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);

    return modelInfo;
}
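
The adapter only copies state off the transformer: the split points and the input/output column names (LinkedHashSet keeps the keys in insertion order). Locating the adapter for a given Spark class is handled elsewhere, via getSource() (Example #6), and the resulting BucketizerModelInfo is what the export/import round trip in Example #7 serializes.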
 
Example #3
Source File: BucketizerModelInfoAdapter.java    From spark-transformers with Apache License 2.0
@Override
public BucketizerModelInfo getModelInfo(final Bucketizer from, final DataFrame df) {
    final BucketizerModelInfo modelInfo = new BucketizerModelInfo();
    modelInfo.setSplits(from.getSplits());

    Set<String> inputKeys = new LinkedHashSet<String>();
    inputKeys.add(from.getInputCol());
    modelInfo.setInputKeys(inputKeys);

    Set<String> outputKeys = new LinkedHashSet<String>();
    outputKeys.add(from.getOutputCol());
    modelInfo.setOutputKeys(outputKeys);
    return modelInfo;
}
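
This overload additionally receives the training DataFrame; it belongs to the Spark 1.x line of spark-transformers and pairs with the ModelExporter.export(sparkModel, df) call in Example #8 below. For Bucketizer the DataFrame goes unused, since all of the transformer's state (splits and column names) is available directly.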
 
Example #4
Source File: BucketizerConverter.java    From jpmml-sparkml with GNU Affero General Public License v3.0
public BucketizerConverter(Bucketizer transformer){
	super(transformer);
}
 
Example #5
Source File: BucketizerConverter.java    From jpmml-sparkml with GNU Affero General Public License v3.0
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder){
	Bucketizer transformer = getTransformer();

	InOutMode inputMode = getInputMode();

	String[] inputCols;
	double[][] splitsArray;

	if ((InOutMode.SINGLE).equals(inputMode)) {
		inputCols = inputMode.getInputCols(transformer);
		splitsArray = new double[][]{transformer.getSplits()};
	} else if ((InOutMode.MULTIPLE).equals(inputMode)) {
		inputCols = inputMode.getInputCols(transformer);
		splitsArray = transformer.getSplitsArray();
	} else {
		throw new IllegalArgumentException();
	}

	List<Feature> result = new ArrayList<>();

	for(int i = 0; i < inputCols.length; i++){
		String inputCol = inputCols[i];
		double[] splits = splitsArray[i];

		Feature feature = encoder.getOnlyFeature(inputCol);

		ContinuousFeature continuousFeature = feature.toContinuousFeature();

		Discretize discretize = new Discretize(continuousFeature.getName())
			.setDataType(DataType.INTEGER);

		List<Integer> categories = new ArrayList<>();

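		// Each pair of adjacent splits becomes one bin: left-closed, right-open,
		// except the last bin, which is closed on both ends (matching Bucketizer semantics).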
		for(int j = 0; j < (splits.length - 1); j++){
			Integer category = j;

			categories.add(category);

			Interval interval = new Interval((j < (splits.length - 2)) ? Interval.Closure.CLOSED_OPEN : Interval.Closure.CLOSED_CLOSED)
				.setLeftMargin(formatMargin(splits[j]))
				.setRightMargin(formatMargin(splits[j + 1]));

			DiscretizeBin discretizeBin = new DiscretizeBin(category, interval);

			discretize.addDiscretizeBins(discretizeBin);
		}

		DerivedField derivedField = encoder.createDerivedField(formatName(transformer, i), OpType.CATEGORICAL, DataType.INTEGER, discretize);

		result.add(new IndexFeature(encoder, derivedField, categories));
	}

	return result;
}
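
For context, converters like this are normally driven by jpmml-sparkml's top-level builder rather than called directly. A minimal sketch, assuming a recent jpmml-sparkml (which exposes org.jpmml.sparkml.PMMLBuilder; the pipeline and dataFrame names are placeholders):

import org.apache.spark.ml.PipelineModel;
import org.dmg.pmml.PMML;
import org.jpmml.sparkml.PMMLBuilder;

PipelineModel pipelineModel = pipeline.fit(dataFrame);

// The builder walks the fitted pipeline stages; a Bucketizer stage is handled by
// BucketizerConverter, which emits the Discretize expression shown above.
PMML pmml = new PMMLBuilder(dataFrame.schema(), pipelineModel).build();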
 
Example #6
Source File: BucketizerModelInfoAdapter.java    From spark-transformers with Apache License 2.0
@Override
public Class<Bucketizer> getSource() {
    return Bucketizer.class;
}
 
Example #7
Source File: BucketizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void bucketizerTest() {
    double[] validData = {-0.5, -0.3, 0.0, 0.2};
    double[] expectedBuckets = {0.0, 0.0, 1.0, 1.0};
    double[] splits = {-0.5, 0.0, 0.5};

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
    });
    List<Row> trainingData = Arrays.asList(
            cr(0, validData[0]),
            cr(1, validData[1]),
            cr(2, validData[2]),
            cr(3, validData[3]));

    Dataset<Row> df = spark.createDataFrame(trainingData, schema);

    Bucketizer sparkModel = new Bucketizer()
            .setInputCol("feature")
            .setOutputCol("result")
            .setSplits(splits);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    List<Row> sparkOutput = sparkModel.transform(df).orderBy("id").select("id", "feature", "result").collectAsList();

    for (Row r : sparkOutput) {
        double input = r.getDouble(1);
        double sparkOp = r.getDouble(2);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), input);
        transformer.transform(data);
        double transformedInput = (double) data.get(sparkModel.getOutputCol());

        assertTrue((transformedInput >= 0) && (transformedInput <= 1));
        assertEquals(transformedInput, sparkOp, 0.01);
        assertEquals(transformedInput, expectedBuckets[r.getInt(0)], 0.01);
    }
}
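
The cr(...) helper used to build the rows is not part of this snippet; presumably it is just shorthand for RowFactory.create, along the lines of:

// Assumed helper, not shown in the original source file:
private static Row cr(Object... values) {
    return RowFactory.create(values);
}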
 
Example #8
Source File: BucketizerBridgeTest.java    From spark-transformers with Apache License 2.0
@Test
public void bucketizerTest() {
    double[] validData = {-0.5, -0.3, 0.0, 0.2};
    double[] expectedBuckets = {0.0, 0.0, 1.0, 1.0};
    double[] splits = {-0.5, 0.0, 0.5};

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
    });
    List<Row> trainingData = Arrays.asList(
            cr(0, validData[0]),
            cr(1, validData[1]),
            cr(2, validData[2]),
            cr(3, validData[3]));

    DataFrame df = sqlContext.createDataFrame(trainingData, schema);

    Bucketizer sparkModel = new Bucketizer()
            .setInputCol("feature")
            .setOutputCol("result")
            .setSplits(splits);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel, df);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = sparkModel.transform(df).orderBy("id").select("id", "feature", "result").collect();

    for (Row r : sparkOutput) {
        double input = r.getDouble(1);
        double sparkOp = r.getDouble(2);

        Map<String, Object> data = new HashMap<String, Object>();
        data.put(sparkModel.getInputCol(), input);
        transformer.transform(data);
        double transformedInput = (double) data.get(sparkModel.getOutputCol());

        assertTrue((transformedInput >= 0) && (transformedInput <= 1));
        assertEquals(transformedInput, sparkOp, EPSILON);
        assertEquals(transformedInput, expectedBuckets[r.getInt(0)], EPSILON);
    }
}
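
This variant of the test targets the Spark 1.x API line: it builds a DataFrame through a SQLContext instead of a Dataset<Row> through a SparkSession, passes the DataFrame to ModelExporter.export, and collects the results to a Row[] rather than a List<Row>. Example #7 above is the Spark 2.x counterpart of the same round-trip test.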