org.apache.spark.ml.linalg.DenseVector Java Examples
The following examples show how to use
org.apache.spark.ml.linalg.DenseVector.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: MLContextTest.java From systemds with Apache License 2.0 | 5 votes |
@Test public void testOutputDataFrameOfVectorsDML() { System.out.println("MLContextTest - output DataFrame of vectors DML"); String s = "m=matrix('1 2 3 4',rows=2,cols=2);"; Script script = dml(s).out("m"); MLResults results = ml.execute(script); Dataset<Row> df = results.getDataFrame("m", true); Dataset<Row> sortedDF = df.sort(RDDConverterUtils.DF_ID_COLUMN); // verify column types StructType schema = sortedDF.schema(); StructField[] fields = schema.fields(); StructField idColumn = fields[0]; StructField vectorColumn = fields[1]; Assert.assertTrue(idColumn.dataType() instanceof DoubleType); Assert.assertTrue(vectorColumn.dataType() instanceof VectorUDT); List<Row> list = sortedDF.collectAsList(); Row row1 = list.get(0); Assert.assertEquals(1.0, row1.getDouble(0), 0.0); Vector v1 = (DenseVector) row1.get(1); double[] arr1 = v1.toArray(); Assert.assertArrayEquals(new double[] { 1.0, 2.0 }, arr1, 0.0); Row row2 = list.get(1); Assert.assertEquals(2.0, row2.getDouble(0), 0.0); Vector v2 = (DenseVector) row2.get(1); double[] arr2 = v2.toArray(); Assert.assertArrayEquals(new double[] { 3.0, 4.0 }, arr2, 0.0); }
Example #2
Source File: VectorUtil.java From jpmml-sparkml with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Exposes the values of a vector as a list of doubles.
 *
 * @param vector the vector to read; it is converted with {@code toDense()} first
 * @return the dense value array of {@code vector} wrapped by {@code Doubles.asList}
 */
static public List<Double> toList(Vector vector){
	return Doubles.asList(vector.toDense().values());
}
Example #3
Source File: DecisionTreeClassificationModelBridgeTest.java From spark-transformers with Apache License 2.0 | 5 votes |
@Test public void testDecisionTreeClassificationPrediction() { // Load the data stored in LIBSVM format as a DataFrame. String datapath = "src/test/resources/classification_test.libsvm"; Dataset<Row> data = spark.read().format("libsvm").load(datapath); // Split the data into training and test sets (30% held out for testing) Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; // Train a DecisionTree model. DecisionTreeClassificationModel classifierModel = new DecisionTreeClassifier().fit(trainingData); trainingData.printSchema(); List<Row> output = classifierModel.transform(testData).select("features", "prediction","rawPrediction").collectAsList(); byte[] exportedModel = ModelExporter.export(classifierModel); DecisionTreeTransformer transformer = (DecisionTreeTransformer) ModelImporter.importAndGetTransformer(exportedModel); //compare predictions for (Row row : output) { Map<String, Object> data_ = new HashMap<>(); double [] actualRawPrediction = ((DenseVector) row.get(2)).toArray(); data_.put("features", ((SparseVector) row.get(0)).toArray()); transformer.transform(data_); System.out.println(data_); System.out.println(data_.get("prediction")); assertEquals((double)data_.get("prediction"), (double)row.get(1), EPSILON); assertArrayEquals((double[]) data_.get("rawPrediction"), actualRawPrediction, EPSILON); } }
Example #4
Source File: MLContextTest.java From systemds with Apache License 2.0 | 5 votes |
@Test public void testOutputDataFrameOfVectorsDML() { System.out.println("MLContextTest - output DataFrame of vectors DML"); String s = "m=matrix('1 2 3 4',rows=2,cols=2);"; Script script = dml(s).out("m"); MLResults results = ml.execute(script); Dataset<Row> df = results.getDataFrame("m", true); Dataset<Row> sortedDF = df.sort(RDDConverterUtils.DF_ID_COLUMN); // verify column types StructType schema = sortedDF.schema(); StructField[] fields = schema.fields(); StructField idColumn = fields[0]; StructField vectorColumn = fields[1]; Assert.assertTrue(idColumn.dataType() instanceof DoubleType); Assert.assertTrue(vectorColumn.dataType() instanceof VectorUDT); List<Row> list = sortedDF.collectAsList(); Row row1 = list.get(0); Assert.assertEquals(1.0, row1.getDouble(0), 0.0); Vector v1 = (DenseVector) row1.get(1); double[] arr1 = v1.toArray(); Assert.assertArrayEquals(new double[] { 1.0, 2.0 }, arr1, 0.0); Row row2 = list.get(1); Assert.assertEquals(2.0, row2.getDouble(0), 0.0); Vector v2 = (DenseVector) row2.get(1); double[] arr2 = v2.toArray(); Assert.assertArrayEquals(new double[] { 3.0, 4.0 }, arr2, 0.0); }
Example #5
Source File: DatasetClassifier.java From mmtf-spark with Apache License 2.0 | 4 votes |
/** * @param args args[0] path to parquet file, args[1] name of classification column * @throws IOException * @throws StructureException */ public static void main(String[] args) throws IOException { if (args.length != 2) { System.err.println("Usage: " + DatasetClassifier.class.getSimpleName() + " <parquet file> <classification column name>"); System.exit(1); } // name of the class label String label = args[1]; long start = System.nanoTime(); SparkSession spark = SparkSession .builder() .master("local[*]") .appName(DatasetClassifier.class.getSimpleName()) .getOrCreate(); Dataset<Row> data = spark.read().parquet(args[0]).cache(); int featureCount = 0; Object vector = data.first().getAs("features"); if (vector instanceof DenseVector) { featureCount = ((DenseVector)vector).numActives(); } else if (vector instanceof SparseVector) { featureCount = ((SparseVector)vector).numActives(); } System.out.println("Feature count : " + featureCount); int classCount = (int)data.select(label).distinct().count(); System.out.println("Class count : " + classCount); System.out.println("Dataset size (unbalanced): " + data.count()); data.groupBy(label).count().show(classCount); data = DatasetBalancer.downsample(data, label, 1); System.out.println("Dataset size (balanced) : " + data.count()); data.groupBy(label).count().show(classCount); double testFraction = 0.3; long seed = 123; SparkMultiClassClassifier mcc; Map<String, String> metrics; DecisionTreeClassifier dtc = new DecisionTreeClassifier(); mcc = new SparkMultiClassClassifier(dtc, label, testFraction, seed); metrics = mcc.fit(data); System.out.println(metrics); RandomForestClassifier rfc = new RandomForestClassifier(); mcc = new SparkMultiClassClassifier(rfc, label, testFraction, seed); metrics = mcc.fit(data); System.out.println(metrics); LogisticRegression lr = new LogisticRegression(); mcc = new SparkMultiClassClassifier(lr, label, testFraction, seed); metrics = mcc.fit(data); System.out.println(metrics); // specify layers for 
the neural network // input layer: dimension of feature vector // output layer: number of classes int[] layers = new int[] {featureCount, 10, classCount}; MultilayerPerceptronClassifier mpc = new MultilayerPerceptronClassifier() .setLayers(layers) .setBlockSize(128) .setSeed(1234L) .setMaxIter(200); mcc = new SparkMultiClassClassifier(mpc, label, testFraction, seed); metrics = mcc.fit(data); System.out.println(metrics); long end = System.nanoTime(); System.out.println((end-start)/1E9 + " sec"); }
Example #6
Source File: DatasetRegressor.java From mmtf-spark with Apache License 2.0 | 4 votes |
/** * @param args args[0] path to parquet file, args[1] name of the prediction column * @throws IOException * @throws StructureException */ public static void main(String[] args) throws IOException { if (args.length != 2) { System.err.println("Usage: " + DatasetRegressor.class.getSimpleName() + " <parquet file> <prediction column name>"); System.exit(1); } // name of the prediction column String label = args[1]; long start = System.nanoTime(); SparkSession spark = SparkSession .builder() .master("local[*]") .appName(DatasetRegressor.class.getSimpleName()) .getOrCreate(); Dataset<Row> data = spark.read().parquet(args[0]).cache(); int featureCount = ((DenseVector)data.first().getAs("features")).numActives(); System.out.println("Feature count: " + featureCount); System.out.println("Dataset size : " + data.count()); double testFraction = 0.3; long seed = 123; LinearRegression lr = new LinearRegression() .setLabelCol(label) .setFeaturesCol("features"); SparkRegressor reg = new SparkRegressor(lr, label, testFraction, seed); System.out.println(reg.fit(data)); GBTRegressor gbt = new GBTRegressor() .setLabelCol(label) .setFeaturesCol("features"); reg = new SparkRegressor(gbt, label, testFraction, seed); System.out.println(reg.fit(data)); GeneralizedLinearRegression glr = new GeneralizedLinearRegression() .setLabelCol(label) .setFeaturesCol("features") .setFamily("gaussian") .setLink("identity") .setMaxIter(10) .setRegParam(0.3); reg = new SparkRegressor(glr, label, testFraction, seed); System.out.println(reg.fit(data)); long end = System.nanoTime(); System.out.println((end-start)/1E9 + " sec"); }
Example #7
Source File: DecisionTreeClassificationModelBridgePipelineTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testDecisionTreeClassificationWithPipeline() { // Load the data stored in LIBSVM format as a DataFrame. String datapath = "src/test/resources/classification_test.libsvm"; Dataset<Row> data = spark.read().format("libsvm").load(datapath); // Split the data into training and test sets (30% held out for testing) Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3}); Dataset<Row> trainingData = splits[0]; Dataset<Row> testData = splits[1]; StringIndexer indexer = new StringIndexer() .setInputCol("label") .setOutputCol("labelIndex"); // Train a DecisionTree model. DecisionTreeClassifier classificationModel = new DecisionTreeClassifier() .setLabelCol("labelIndex") .setFeaturesCol("features"); Pipeline pipeline = new Pipeline() .setStages(new PipelineStage[]{indexer, classificationModel}); // Train model. This also runs the indexer. PipelineModel sparkPipeline = pipeline.fit(trainingData); //Export this model byte[] exportedModel = ModelExporter.export(sparkPipeline); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); List<Row> output = sparkPipeline.transform(testData).select("features", "label","prediction","rawPrediction").collectAsList(); //compare predictions for (Row row : output) { Map<String, Object> data_ = new HashMap<>(); double [] actualRawPrediction = ((DenseVector) row.get(3)).toArray(); data_.put("features", ((SparseVector) row.get(0)).toArray()); data_.put("label", (row.get(1)).toString()); transformer.transform(data_); System.out.println(data_); System.out.println(data_.get("prediction")); assertEquals((double)data_.get("prediction"), (double)row.get(2), EPSILON); assertArrayEquals((double[]) data_.get("rawPrediction"), actualRawPrediction, EPSILON); } }
Example #8
Source File: VectorAssemblerBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testVectorAssembler() { // prepare data JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList( RowFactory.create(0d, 1d, new DenseVector(new double[]{2d, 3d})), RowFactory.create(1d, 2d, new DenseVector(new double[]{3d, 4d})), RowFactory.create(2d, 3d, new DenseVector(new double[]{4d, 5d})), RowFactory.create(3d, 4d, new DenseVector(new double[]{5d, 6d})), RowFactory.create(4d, 5d, new DenseVector(new double[]{6d, 7d})) )); StructType schema = new StructType(new StructField[]{ new StructField("id", DataTypes.DoubleType, false, Metadata.empty()), new StructField("value1", DataTypes.DoubleType, false, Metadata.empty()), new StructField("vector1", new VectorUDT(), false, Metadata.empty()) }); Dataset<Row> df = spark.createDataFrame(jrdd, schema); VectorAssembler vectorAssembler = new VectorAssembler() .setInputCols(new String[]{"value1", "vector1"}) .setOutputCol("feature"); //Export this model byte[] exportedModel = ModelExporter.export(vectorAssembler); String exportedModelJson = new String(exportedModel); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); //compare predictions List<Row> sparkOutput = vectorAssembler.transform(df).orderBy("id").select("id", "value1", "vector1", "feature").collectAsList(); for (Row row : sparkOutput) { Map<String, Object> data = new HashMap<>(); data.put(vectorAssembler.getInputCols()[0], row.get(1)); data.put(vectorAssembler.getInputCols()[1], ((DenseVector) row.get(2)).toArray()); transformer.transform(data); double[] output = (double[]) data.get(vectorAssembler.getOutputCol()); assertArrayEquals(output, ((DenseVector) row.get(3)).toArray(), 0d); } }
Example #9
Source File: ChiSqSelectorBridgeTest.java From spark-transformers with Apache License 2.0 | 4 votes |
@Test public void testChiSqSelector() { // prepare data List<Row> inputData = Arrays.asList( RowFactory.create(0d, 0d, new DenseVector(new double[]{8d, 7d, 0d})), RowFactory.create(1d, 1d, new DenseVector(new double[]{0d, 9d, 6d})), RowFactory.create(2d, 1d, new DenseVector(new double[]{0.0d, 9.0d, 8.0d})), RowFactory.create(3d, 2d, new DenseVector(new double[]{8.0d, 9.0d, 5.0d})) ); double[] preFilteredData = {0.0d, 6.0d, 8.0d, 5.0d}; StructType schema = new StructType(new StructField[]{ new StructField("id", DataTypes.DoubleType, false, Metadata.empty()), new StructField("label", DataTypes.DoubleType, false, Metadata.empty()), new StructField("features", new VectorUDT(), false, Metadata.empty()) }); Dataset<Row> df = spark.createDataFrame(inputData, schema); ChiSqSelector chiSqSelector = new ChiSqSelector(); chiSqSelector.setNumTopFeatures(1); chiSqSelector.setFeaturesCol("features"); chiSqSelector.setLabelCol("label"); chiSqSelector.setOutputCol("output"); ChiSqSelectorModel chiSqSelectorModel = chiSqSelector.fit(df); //Export this model byte[] exportedModel = ModelExporter.export(chiSqSelectorModel); String exportedModelJson = new String(exportedModel); //Import and get Transformer Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel); //compare predictions List<Row> sparkOutput = chiSqSelectorModel.transform(df).orderBy("id").select("id", "label", "features", "output").collectAsList(); for (Row row : sparkOutput) { Map<String, Object> data = new HashMap<>(); data.put(chiSqSelectorModel.getFeaturesCol(), ((DenseVector) row.get(2)).toArray()); transformer.transform(data); double[] output = (double[]) data.get(chiSqSelectorModel.getOutputCol()); System.out.println(Arrays.toString(output)); assertArrayEquals(output, ((DenseVector) row.get(3)).toArray(), 0d); } }