Java Code Examples for org.apache.spark.ml.feature.StringIndexerModel#transform()

The following examples show how to use org.apache.spark.ml.feature.StringIndexerModel#transform(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You can also check out related API usage in the sidebar.

Example 1

Source File: JavaOneHotEncoderExample.java From SparkDemo with MIT License

5 votes

/**
 * Demonstrates one-hot encoding of a string column.
 *
 * <p>The "category" column of a small in-memory data set is indexed with a
 * {@code StringIndexer} into "categoryIndex", which a {@code OneHotEncoder} then
 * encodes into "categoryVec". The resulting data set is printed with {@code show()}.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaOneHotEncoderExample")
    .getOrCreate();

  // Stop the session even if one of the steps below throws.
  try {
    // $example on$
    List<Row> data = Arrays.asList(
      RowFactory.create(0, "a"),
      RowFactory.create(1, "b"),
      RowFactory.create(2, "c"),
      RowFactory.create(3, "a"),
      RowFactory.create(4, "a"),
      RowFactory.create(5, "c")
    );

    StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("category", DataTypes.StringType, false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(data, schema);

    // Fit the indexer on the data, then add the "categoryIndex" column.
    StringIndexerModel indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df);
    Dataset<Row> indexed = indexer.transform(df);

    // The encoder reads the index column produced above.
    OneHotEncoder encoder = new OneHotEncoder()
      .setInputCol("categoryIndex")
      .setOutputCol("categoryVec");

    Dataset<Row> encoded = encoder.transform(indexed);
    encoded.show();
    // $example off$
  } finally {
    spark.stop();
  }
}

Example 2

Source File: JavaIndexToStringExample.java From SparkDemo with MIT License

4 votes

/**
 * Demonstrates a round trip between string labels and indices.
 *
 * <p>The "category" column is indexed with a {@code StringIndexer} into
 * "categoryIndex"; the attribute metadata attached to that output column is printed;
 * then an {@code IndexToString} converter maps the indices back into
 * "originalCategory".
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaIndexToStringExample")
    .getOrCreate();

  // Stop the session even if one of the steps below throws.
  try {
    // $example on$
    List<Row> data = Arrays.asList(
      RowFactory.create(0, "a"),
      RowFactory.create(1, "b"),
      RowFactory.create(2, "c"),
      RowFactory.create(3, "a"),
      RowFactory.create(4, "a"),
      RowFactory.create(5, "c")
    );
    StructType schema = new StructType(new StructField[]{
      new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
      new StructField("category", DataTypes.StringType, false, Metadata.empty())
    });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    StringIndexerModel indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
      .fit(df);
    Dataset<Row> indexed = indexer.transform(df);

    System.out.println("Transformed string column '" + indexer.getInputCol() + "' " +
        "to indexed column '" + indexer.getOutputCol() + "'");
    indexed.show();

    // Look up the schema field of the indexer's OUTPUT column; its metadata is
    // what gets printed below.
    StructField outputColSchema = indexed.schema().apply(indexer.getOutputCol());
    System.out.println("StringIndexer will store labels in output column metadata: " +
        Attribute.fromStructField(outputColSchema).toString() + "\n");

    IndexToString converter = new IndexToString()
      .setInputCol("categoryIndex")
      .setOutputCol("originalCategory");
    Dataset<Row> converted = converter.transform(indexed);

    System.out.println("Transformed indexed column '" + converter.getInputCol() + "' back to " +
        "original string column '" + converter.getOutputCol() + "' using labels in metadata");
    converted.select("id", "categoryIndex", "originalCategory").show();

    // $example off$
  } finally {
    spark.stop();
  }
}

Example 3

Source File: OneHotEncoderBridgeTest.java From spark-transformers with Apache License 2.0

4 votes

/**
 * Verifies that a {@code OneHotEncoder} exported with {@code ModelExporter} and
 * re-imported with {@code ModelImporter} produces, for every row, the same encoded
 * vector as Spark's own {@code transform} (within {@code EPSILON}).
 */
@Test
public void testOneHotEncoding() {
    // prepare data
    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create(0d, "a"),
            RowFactory.create(1d, "b"),
            RowFactory.create(2d, "c"),
            RowFactory.create(3d, "a"),
            RowFactory.create(4d, "a"),
            RowFactory.create(5d, "c")
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("category", DataTypes.StringType, false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);
    StringIndexerModel indexer = new StringIndexer()
            .setInputCol("category")
            .setOutputCol("categoryIndex")
            .fit(df);
    DataFrame indexed = indexer.transform(df);

    OneHotEncoder sparkModel = new OneHotEncoder()
            .setInputCol("categoryIndex")
            .setOutputCol("categoryVec");

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel, indexed);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Column names do not change between rows, so read them once.
    String inputCol = sparkModel.getInputCol();
    String outputCol = sparkModel.getOutputCol();

    //compare predictions
    Row[] sparkOutput = sparkModel.transform(indexed).orderBy("id").select("id", "categoryIndex", "categoryVec").collect();
    for (Row row : sparkOutput) {
        // Feed the category index (selected column 1) to the imported transformer;
        // it writes its result back into the same map under the output column name.
        Map<String, Object> data = new HashMap<String, Object>();
        data.put(inputCol, row.getDouble(1));
        transformer.transform(data);
        double[] transformedOp = (double[]) data.get(outputCol);

        // Spark's own encoding (selected column 2) is the reference value, so it is
        // passed as the "expected" argument.
        double[] sparkOp = ((Vector) row.get(2)).toArray();
        assertArrayEquals(sparkOp, transformedOp, EPSILON);
    }
}

Example 4

Source File: CustomOneHotEncoderBridgeTest.java From spark-transformers with Apache License 2.0

4 votes

/**
 * Verifies two things about {@code CustomOneHotEncoder}, row by row and within
 * {@code EPSILON}: the model re-imported through {@code ModelExporter} /
 * {@code ModelImporter} matches the fitted model's own Spark output, and that
 * output in turn matches Spark's built-in {@code OneHotEncoder}.
 */
@Test
public void testCustomOneHotEncoding() {
    // prepare data
    JavaRDD<Row> jrdd = sc.parallelize(Arrays.asList(
            RowFactory.create(0d, "a"),
            RowFactory.create(1d, "b"),
            RowFactory.create(2d, "c"),
            RowFactory.create(3d, "a"),
            RowFactory.create(4d, "a"),
            RowFactory.create(5d, "c")
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("category", DataTypes.StringType, false, Metadata.empty())
    });

    DataFrame df = sqlContext.createDataFrame(jrdd, schema);
    StringIndexerModel indexer = new StringIndexer()
            .setInputCol("category")
            .setOutputCol("categoryIndex")
            .fit(df);
    DataFrame indexed = indexer.transform(df);

    CustomOneHotEncoderModel sparkModel = new CustomOneHotEncoder()
            .setInputCol("categoryIndex")
            .setOutputCol("categoryVec")
            .fit(indexed);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkModel, indexed);

    //Create spark's OneHotEncoder
    OneHotEncoder sparkOneHotModel = new OneHotEncoder()
            .setInputCol("categoryIndex")
            .setOutputCol("categoryVec");

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    Row[] sparkOutput = sparkModel.transform(indexed).orderBy("id").select("id", "categoryIndex", "categoryVec").collect();
    Row[] sparkOneHotOutput = sparkOneHotModel.transform(indexed).orderBy("id").select("id", "categoryIndex", "categoryVec").collect();

    //Compare Spark's OneHotEncoder with CustomOneHotEncoder
    //Both must yield the same number of rows; the built-in encoder is the reference
    assertEquals(sparkOneHotOutput.length, sparkOutput.length);

    // Column names do not change between rows, so read them once.
    String inputCol = sparkModel.getInputCol();
    String outputCol = sparkModel.getOutputCol();

    for (int i = 0; i < sparkOutput.length; i++) {
        Row row = sparkOutput[i];

        // Feed the category index (selected column 1) to the imported transformer;
        // it writes its result back into the same map under the output column name.
        Map<String, Object> data = new HashMap<String, Object>();
        data.put(inputCol, row.getDouble(1));
        transformer.transform(data);
        double[] transformedOp = (double[]) data.get(outputCol);

        double[] sparkOp = ((Vector) row.get(2)).toArray();
        //get spark's OneHotEncoder output
        double[] sparkOneHotOp = ((Vector) sparkOneHotOutput[i].get(2)).toArray();

        // Imported transformer vs. the fitted custom model (expected first).
        assertArrayEquals(sparkOp, transformedOp, EPSILON);
        // Fitted custom model vs. Spark's built-in encoder (expected first).
        assertArrayEquals(sparkOneHotOp, sparkOp, EPSILON);
    }
}

Example 5

Source File: RandomForestClassificationModelInfoAdapterBridgeTest.java From spark-transformers with Apache License 2.0

4 votes

/**
 * Verifies that a {@code RandomForestClassificationModel} exported with
 * {@code ModelExporter} and re-imported with {@code ModelImporter} reproduces
 * Spark's prediction, raw prediction and probability for every held-out row
 * (within {@code EPSILON}).
 */
@Test
public void testRandomForestClassification() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm");

    StringIndexerModel stringIndexerModel = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex")
            .fit(data);

    data = stringIndexerModel.transform(data);

    // Split the data into training and test sets (30% held out for testing).
    // A fixed seed keeps the split, and therefore any failure, reproducible.
    final long splitSeed = 42L;
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3}, splitSeed);
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a RandomForest model.
    RandomForestClassificationModel classificationModel = new RandomForestClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features")
            .setPredictionCol("prediction")
            .setRawPredictionCol("rawPrediction")
            .setProbabilityCol("probability")
            .fit(trainingData);

    byte[] exportedModel = ModelExporter.export(classificationModel, null);

    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = classificationModel.transform(testData).select("features", "prediction", "rawPrediction", "probability").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        // Selected columns: 0 = features, 1 = prediction, 2 = rawPrediction, 3 = probability.
        // Spark's values are the reference the imported transformer must match.
        Vector features = (Vector) row.get(0);
        double expectedPrediction = row.getDouble(1);
        double[] expectedRaw = ((Vector) row.get(2)).toArray();
        double[] expectedProbability = ((Vector) row.get(3)).toArray();

        // The transformer reads the features under its first input key and writes
        // its outputs back into the same map.
        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put(transformer.getInputKeys().iterator().next(), features.toArray());
        transformer.transform(inputData);
        double predicted = (double) inputData.get("prediction");
        double[] probability = (double[]) inputData.get("probability");
        double[] rawPrediction = (double[]) inputData.get("rawPrediction");

        assertEquals(expectedPrediction, predicted, EPSILON);
        assertArrayEquals(expectedProbability, probability, EPSILON);
        assertArrayEquals(expectedRaw, rawPrediction, EPSILON);
    }
}

Example 6

Source File: DecisionTreeClassificationModelBridgeTest.java From spark-transformers with Apache License 2.0

4 votes

/**
 * Verifies that a {@code DecisionTreeClassificationModel} exported with
 * {@code ModelExporter} and re-imported with {@code ModelImporter} reproduces
 * Spark's prediction and raw prediction for every held-out row (within
 * {@code EPSILON}).
 */
@Test
public void testDecisionTreeClassificationRawPrediction() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm");

    StringIndexerModel stringIndexerModel = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex")
            .fit(data);

    data = stringIndexerModel.transform(data);

    // Split the data into training and test sets (30% held out for testing).
    // A fixed seed keeps the split, and therefore any failure, reproducible.
    final long splitSeed = 42L;
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3}, splitSeed);
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a DecisionTree model.
    DecisionTreeClassificationModel classificationModel = new DecisionTreeClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features")
            .setRawPredictionCol("rawPrediction")
            .setPredictionCol("prediction")
            .fit(trainingData);

    byte[] exportedModel = ModelExporter.export(classificationModel, null);

    // The cast is kept on purpose: it fails the test with a ClassCastException if
    // the importer returns anything other than a DecisionTreeTransformer.
    Transformer transformer = (DecisionTreeTransformer) ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = classificationModel.transform(testData).select("features", "prediction", "rawPrediction").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        // Selected columns: 0 = features, 1 = prediction, 2 = rawPrediction.
        // Spark's values are the reference the imported transformer must match.
        Vector features = (Vector) row.get(0);
        double expectedPrediction = row.getDouble(1);
        double[] expectedRaw = ((Vector) row.get(2)).toArray();

        Map<String, Object> inputData = new HashMap<>();
        inputData.put(transformer.getInputKeys().iterator().next(), features.toArray());
        transformer.transform(inputData);
        // NOTE(review): assumes the first output key is the prediction column - confirm
        // against DecisionTreeTransformer.getOutputKeys().
        double predicted = (double) inputData.get(transformer.getOutputKeys().iterator().next());
        double[] rawPrediction = (double[]) inputData.get("rawPrediction");

        assertEquals(expectedPrediction, predicted, EPSILON);
        assertArrayEquals(expectedRaw, rawPrediction, EPSILON);
    }
}