org.apache.spark.ml.linalg.DenseVector Java Examples

The following examples show how to use org.apache.spark.ml.linalg.DenseVector. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: MLContextTest.java — from systemds (Apache License 2.0), 5 votes
/**
 * Runs a DML script that defines a 2x2 matrix, retrieves it as a DataFrame
 * sorted by the ID column, and checks both the column types and the
 * per-row contents.
 */
@Test
public void testOutputDataFrameOfVectorsDML() {
	System.out.println("MLContextTest - output DataFrame of vectors DML");

	Script script = dml("m=matrix('1 2 3 4',rows=2,cols=2);").out("m");
	Dataset<Row> sortedDF = ml.execute(script)
			.getDataFrame("m", true)
			.sort(RDDConverterUtils.DF_ID_COLUMN);

	// verify column types: first column is the double ID, second holds the vector
	StructField[] columns = sortedDF.schema().fields();
	StructField idColumn = columns[0];
	StructField vectorColumn = columns[1];
	Assert.assertTrue(idColumn.dataType() instanceof DoubleType);
	Assert.assertTrue(vectorColumn.dataType() instanceof VectorUDT);

	// verify contents: row i carries ID i+1 and the i-th matrix row as a dense vector
	double[][] expectedVectors = { { 1.0, 2.0 }, { 3.0, 4.0 } };
	List<Row> rows = sortedDF.collectAsList();
	for (int i = 0; i < expectedVectors.length; i++) {
		Row row = rows.get(i);
		Assert.assertEquals(i + 1.0, row.getDouble(0), 0.0);
		Vector vector = (DenseVector) row.get(1);
		Assert.assertArrayEquals(expectedVectors[i], vector.toArray(), 0.0);
	}
}
 
Example #2
Source File: VectorUtil.java — from jpmml-sparkml (GNU Affero General Public License v3.0), 5 votes
/**
 * Returns the values of the given vector as a list of doubles.
 *
 * The vector is first converted to its dense form, and the resulting
 * value array is handed to {@code Doubles.asList}.
 *
 * @param vector the vector whose values are wanted
 * @return the vector's values as a {@code List<Double>}
 */
static
public List<Double> toList(Vector vector){
	return Doubles.asList(vector.toDense().values());
}
 
Example #3
Source File: DecisionTreeClassificationModelBridgeTest.java — from spark-transformers (Apache License 2.0), 5 votes
/**
 * Trains a Spark decision tree classifier, exports it, imports it back as a
 * {@code DecisionTreeTransformer}, and checks that the imported transformer
 * yields the same prediction and raw prediction as Spark for every test row.
 */
@Test
public void testDecisionTreeClassificationPrediction() {
    // Load the data stored in LIBSVM format as a DataFrame.
    String datapath = "src/test/resources/classification_test.libsvm";
    Dataset<Row> data = spark.read().format("libsvm").load(datapath);

    // Split the data into training and test sets (30% held out for testing).
    // A fixed seed keeps the split, and therefore the test, reproducible across runs.
    Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}, 12345L);
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];

    // Train a DecisionTree model.
    DecisionTreeClassificationModel classifierModel = new DecisionTreeClassifier().fit(trainingData);
    trainingData.printSchema();

    // Spark's own output is the reference the imported transformer is compared against.
    List<Row> output = classifierModel.transform(testData).select("features", "prediction","rawPrediction").collectAsList();
    byte[] exportedModel = ModelExporter.export(classifierModel);

    DecisionTreeTransformer transformer = (DecisionTreeTransformer) ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    for (Row row : output) {
        double[] expectedRawPrediction = ((DenseVector) row.get(2)).toArray();

        Map<String, Object> data_ = new HashMap<>();
        data_.put("features", ((SparseVector) row.get(0)).toArray());
        transformer.transform(data_);
        System.out.println(data_);
        System.out.println(data_.get("prediction"));

        // JUnit convention: expected (Spark) first, actual (imported transformer) second.
        assertEquals((double) row.get(1), (double) data_.get("prediction"), EPSILON);
        assertArrayEquals(expectedRawPrediction, (double[]) data_.get("rawPrediction"), EPSILON);
    }
}
 
Example #4
Source File: MLContextTest.java — from systemds (Apache License 2.0), 5 votes
/**
 * Executes a DML script producing a 2x2 matrix, reads the result back as a
 * DataFrame ordered by the ID column, and asserts the schema types as well
 * as the ID and vector values of both rows.
 */
@Test
public void testOutputDataFrameOfVectorsDML() {
	System.out.println("MLContextTest - output DataFrame of vectors DML");

	final String dmlSource = "m=matrix('1 2 3 4',rows=2,cols=2);";
	final MLResults results = ml.execute(dml(dmlSource).out("m"));
	final Dataset<Row> sortedDF = results.getDataFrame("m", true).sort(RDDConverterUtils.DF_ID_COLUMN);

	// verify column types
	final StructField[] schemaFields = sortedDF.schema().fields();
	final StructField idField = schemaFields[0];
	final StructField vectorField = schemaFields[1];
	Assert.assertTrue(idField.dataType() instanceof DoubleType);
	Assert.assertTrue(vectorField.dataType() instanceof VectorUDT);

	final List<Row> collected = sortedDF.collectAsList();

	// first row: ID 1.0 with vector (1.0, 2.0)
	final Row first = collected.get(0);
	Assert.assertEquals(1.0, first.getDouble(0), 0.0);
	final Vector firstVector = (DenseVector) first.get(1);
	Assert.assertArrayEquals(new double[] { 1.0, 2.0 }, firstVector.toArray(), 0.0);

	// second row: ID 2.0 with vector (3.0, 4.0)
	final Row second = collected.get(1);
	Assert.assertEquals(2.0, second.getDouble(0), 0.0);
	final Vector secondVector = (DenseVector) second.get(1);
	Assert.assertArrayEquals(new double[] { 3.0, 4.0 }, secondVector.toArray(), 0.0);
}
 
Example #5
Source File: DatasetClassifier.java — from mmtf-spark (Apache License 2.0), 4 votes
/**
 * Reads a parquet dataset, balances it by downsampling, and fits four
 * classifiers (decision tree, random forest, logistic regression and a
 * multilayer perceptron) on it, printing the metrics returned by each fit
 * and the total elapsed time.
 *
 * @param args args[0] path to parquet file, args[1] name of classification column
 * @throws IOException declared by the signature; kept for callers
 */
public static void main(String[] args) throws IOException {

	if (args.length != 2) {
		System.err.println("Usage: " + DatasetClassifier.class.getSimpleName() + " <parquet file> <classification column name>");
		System.exit(1);
	}

	// name of the class label
	String label = args[1];
	
	long start = System.nanoTime();

	SparkSession spark = SparkSession
			.builder()
			.master("local[*]")
			.appName(DatasetClassifier.class.getSimpleName())
			.getOrCreate();

	Dataset<Row> data = spark.read().parquet(args[0]).cache();
	
	// The feature count feeds the input layer of the neural network below, so it
	// must be the vector's dimension. size() is the dimension for both vector
	// kinds; numActives() only counts stored entries, which for a SparseVector
	// can be smaller than the dimension.
	int featureCount = 0;
	Object vector = data.first().getAs("features");
	if (vector instanceof DenseVector) {
	   featureCount = ((DenseVector)vector).size();
	} else if (vector instanceof SparseVector) {
	   featureCount = ((SparseVector)vector).size();
	}
	
	System.out.println("Feature count            : "  + featureCount);
	
	int classCount = (int)data.select(label).distinct().count();
	System.out.println("Class count              : " + classCount);

	System.out.println("Dataset size (unbalanced): " + data.count());
	data.groupBy(label).count().show(classCount);

	data = DatasetBalancer.downsample(data, label, 1);
	
	System.out.println("Dataset size (balanced)  : " + data.count());
	data.groupBy(label).count().show(classCount);

	// the same test fraction and seed are passed to every classifier wrapper below
	double testFraction = 0.3;
	long seed = 123;

	SparkMultiClassClassifier mcc;
	Map<String, String> metrics;

	DecisionTreeClassifier dtc = new DecisionTreeClassifier();
	mcc = new SparkMultiClassClassifier(dtc, label, testFraction, seed);
	metrics = mcc.fit(data);
	System.out.println(metrics);

	RandomForestClassifier rfc = new RandomForestClassifier();
	mcc = new SparkMultiClassClassifier(rfc, label, testFraction, seed);
	metrics = mcc.fit(data);
	System.out.println(metrics);

	LogisticRegression lr = new LogisticRegression();
	mcc = new SparkMultiClassClassifier(lr, label, testFraction, seed);
	metrics = mcc.fit(data);
	System.out.println(metrics);

	// specify layers for the neural network
	//    input layer: dimension of feature vector
	//    output layer: number of classes
	int[] layers = new int[] {featureCount, 10, classCount};
	MultilayerPerceptronClassifier mpc = new MultilayerPerceptronClassifier()
			.setLayers(layers)
			.setBlockSize(128)
			.setSeed(1234L)
			.setMaxIter(200);

	mcc = new SparkMultiClassClassifier(mpc, label, testFraction, seed);
	metrics = mcc.fit(data);
	System.out.println(metrics);

	long end = System.nanoTime();

	System.out.println((end-start)/1E9 + " sec");
}
 
Example #6
Source File: DatasetRegressor.java — from mmtf-spark (Apache License 2.0), 4 votes
/**
 * Reads a parquet dataset and fits three regressors on it (linear
 * regression, gradient-boosted trees and a generalized linear model),
 * printing the result of each fit followed by the total elapsed time.
 *
 * @param args args[0] path to parquet file, args[1] name of the prediction column
 * @throws IOException declared by the signature
 */
public static void main(String[] args) throws IOException {

	if (args.length != 2) {
		System.err.println("Usage: " + DatasetRegressor.class.getSimpleName() + " <parquet file> <prediction column name>");
		System.exit(1);
	}

	// name of the prediction column
	final String label = args[1];

	final long startNanos = System.nanoTime();

	final SparkSession spark = SparkSession.builder()
			.master("local[*]")
			.appName(DatasetRegressor.class.getSimpleName())
			.getOrCreate();

	final Dataset<Row> data = spark.read().parquet(args[0]).cache();

	// NOTE(review): the cast assumes the "features" column holds dense vectors — confirm against the input data
	final DenseVector firstFeatures = (DenseVector) data.first().getAs("features");
	final int featureCount = firstFeatures.numActives();
	System.out.println("Feature count: "  + featureCount);

	System.out.println("Dataset size : " + data.count());

	// every regressor below is wrapped with the same test fraction and seed
	final double testFraction = 0.3;
	final long seed = 123;

	// linear regression
	final LinearRegression linear = new LinearRegression()
			.setLabelCol(label)
			.setFeaturesCol("features");
	System.out.println(new SparkRegressor(linear, label, testFraction, seed).fit(data));

	// gradient-boosted trees
	final GBTRegressor boostedTrees = new GBTRegressor()
			.setLabelCol(label)
			.setFeaturesCol("features");
	System.out.println(new SparkRegressor(boostedTrees, label, testFraction, seed).fit(data));

	// generalized linear regression (gaussian family, identity link)
	final GeneralizedLinearRegression generalized = new GeneralizedLinearRegression()
			.setLabelCol(label)
			.setFeaturesCol("features")
			.setFamily("gaussian")
			.setLink("identity")
			.setMaxIter(10)
			.setRegParam(0.3);
	System.out.println(new SparkRegressor(generalized, label, testFraction, seed).fit(data));

	final long endNanos = System.nanoTime();

	System.out.println((endNanos-startNanos)/1E9 + " sec");
}
 
Example #7
Source File: DecisionTreeClassificationModelBridgePipelineTest.java — from spark-transformers (Apache License 2.0), 4 votes
/**
 * Fits a pipeline of a string indexer followed by a decision tree
 * classifier, exports and re-imports the fitted pipeline, and checks that
 * the imported transformer yields the same prediction and raw prediction as
 * Spark for every test row.
 */
@Test
public void testDecisionTreeClassificationWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    String datapath = "src/test/resources/classification_test.libsvm";
    Dataset<Row> data = spark.read().format("libsvm").load(datapath);

    // Split the data into training and test sets (30% held out for testing).
    // A fixed seed keeps the split, and therefore the test, reproducible across runs.
    Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}, 12345L);

    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];

    StringIndexer indexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex");

    // Train a DecisionTree model.
    DecisionTreeClassifier classificationModel = new DecisionTreeClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features");

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{indexer, classificationModel});

    // Train model.  This also runs the indexer.
    PipelineModel sparkPipeline = pipeline.fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipeline);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // Spark's own output is the reference the imported transformer is compared against.
    List<Row> output = sparkPipeline.transform(testData).select("features", "label","prediction","rawPrediction").collectAsList();

    //compare predictions
    for (Row row : output) {
        double[] expectedRawPrediction = ((DenseVector) row.get(3)).toArray();

        Map<String, Object> data_ = new HashMap<>();
        data_.put("features", ((SparseVector) row.get(0)).toArray());
        data_.put("label", (row.get(1)).toString());
        transformer.transform(data_);
        System.out.println(data_);
        System.out.println(data_.get("prediction"));

        // JUnit convention: expected (Spark) first, actual (imported transformer) second.
        assertEquals((double) row.get(2), (double) data_.get("prediction"), EPSILON);
        assertArrayEquals(expectedRawPrediction, (double[]) data_.get("rawPrediction"), EPSILON);
    }
}
 
Example #8
Source File: VectorAssemblerBridgeTest.java — from spark-transformers (Apache License 2.0), 4 votes
/**
 * Exports a {@code VectorAssembler} that combines a double column and a
 * vector column, imports it back as a transformer, and checks that the
 * imported transformer produces the same assembled vector as Spark for
 * every input row.
 */
@Test
public void testVectorAssembler() {
    // prepare data

    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
            RowFactory.create(0d, 1d, new DenseVector(new double[]{2d, 3d})),
            RowFactory.create(1d, 2d, new DenseVector(new double[]{3d, 4d})),
            RowFactory.create(2d, 3d, new DenseVector(new double[]{4d, 5d})),
            RowFactory.create(3d, 4d, new DenseVector(new double[]{5d, 6d})),
            RowFactory.create(4d, 5d, new DenseVector(new double[]{6d, 7d}))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("vector1", new VectorUDT(), false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(jrdd, schema);
    VectorAssembler vectorAssembler = new VectorAssembler()
            .setInputCols(new String[]{"value1", "vector1"})
            .setOutputCol("feature");

    //Export this model
    byte[] exportedModel = ModelExporter.export(vectorAssembler);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    List<Row> sparkOutput = vectorAssembler.transform(df).orderBy("id").select("id", "value1", "vector1", "feature").collectAsList();
    for (Row row : sparkOutput) {
        // Feed the imported transformer the same two inputs Spark saw for this row.
        Map<String, Object> data = new HashMap<>();
        data.put(vectorAssembler.getInputCols()[0], row.get(1));
        data.put(vectorAssembler.getInputCols()[1], ((DenseVector) row.get(2)).toArray());
        transformer.transform(data);

        double[] expected = ((DenseVector) row.get(3)).toArray();
        double[] actual = (double[]) data.get(vectorAssembler.getOutputCol());
        // JUnit convention: expected (Spark) first, actual (imported transformer) second.
        assertArrayEquals(expected, actual, 0d);
    }
}
 
Example #9
Source File: ChiSqSelectorBridgeTest.java — from spark-transformers (Apache License 2.0), 4 votes
/**
 * Fits a {@code ChiSqSelector} that keeps the single top feature, exports
 * and re-imports the fitted model, and checks that the imported transformer
 * produces the same selected-feature output as Spark for every input row.
 */
@Test
public void testChiSqSelector() {
    // prepare data

    List<Row> inputData = Arrays.asList(
            RowFactory.create(0d, 0d, new DenseVector(new double[]{8d, 7d, 0d})),
            RowFactory.create(1d, 1d, new DenseVector(new double[]{0d, 9d, 6d})),
            RowFactory.create(2d, 1d, new DenseVector(new double[]{0.0d, 9.0d, 8.0d})),
            RowFactory.create(3d, 2d, new DenseVector(new double[]{8.0d, 9.0d, 5.0d}))
    );

    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(inputData, schema);
    ChiSqSelector chiSqSelector = new ChiSqSelector();
    chiSqSelector.setNumTopFeatures(1);
    chiSqSelector.setFeaturesCol("features");
    chiSqSelector.setLabelCol("label");
    chiSqSelector.setOutputCol("output");

    ChiSqSelectorModel chiSqSelectorModel = chiSqSelector.fit(df);

    //Export this model
    byte[] exportedModel = ModelExporter.export(chiSqSelectorModel);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //compare predictions
    List<Row> sparkOutput = chiSqSelectorModel.transform(df).orderBy("id").select("id", "label", "features", "output").collectAsList();
    for (Row row : sparkOutput) {
        // Feed the imported transformer the same feature array Spark saw for this row.
        Map<String, Object> data = new HashMap<>();
        data.put(chiSqSelectorModel.getFeaturesCol(), ((DenseVector) row.get(2)).toArray());
        transformer.transform(data);

        double[] expected = ((DenseVector) row.get(3)).toArray();
        double[] actual = (double[]) data.get(chiSqSelectorModel.getOutputCol());
        System.out.println(Arrays.toString(actual));
        // JUnit convention: expected (Spark) first, actual (imported transformer) second.
        assertArrayEquals(expected, actual, 0d);
    }
}