org.apache.spark.ml.linalg.VectorUDT Java Examples
The following examples show how to use org.apache.spark.ml.linalg.VectorUDT, the Spark SQL data type that allows org.apache.spark.ml.linalg.Vector values to be stored in DataFrame columns. Each example is taken from an open-source project; the source file and license are noted above each listing.
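Before the project examples, here is a minimal, self-contained sketch (not drawn from any of the projects below) of the basic pattern: declare a schema field whose data type is new VectorUDT() and populate the column with ml Vector values.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class VectorUDTMinimalExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("VectorUDTMinimalExample").master("local").getOrCreate();

        // A VectorUDT column stores org.apache.spark.ml.linalg.Vector values in a DataFrame.
        StructType schema = new StructType(new StructField[] {
                new StructField("features", new VectorUDT(), false, Metadata.empty())
        });

        // Both dense and sparse vectors fit in the same VectorUDT column.
        List<Row> rows = Arrays.asList(
                RowFactory.create(Vectors.dense(1.0, 2.0)),
                RowFactory.create(Vectors.sparse(2, new int[] { 0 }, new double[] { 3.0 })));

        Dataset<Row> df = spark.createDataFrame(rows, schema);
        df.printSchema(); // features is reported as type "vector"
        df.show();

        spark.stop();
    }
}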
Example #1
Source File: RDDConverterUtils.java From systemds with Apache License 2.0

public static Dataset<Row> binaryBlockToDataFrame(SparkSession sparkSession,
    JavaPairRDD<MatrixIndexes, MatrixBlock> in, DataCharacteristics mc, boolean toVector)
{
    if( !mc.colsKnown() )
        throw new RuntimeException("Number of columns needed to convert binary block to data frame.");

    //slice blocks into rows, align and convert into data frame rows
    JavaRDD<Row> rowsRDD = in
        .flatMapToPair(new SliceBinaryBlockToRowsFunction(mc.getBlocksize()))
        .groupByKey().map(new ConvertRowBlocksToRows((int)mc.getCols(), mc.getBlocksize(), toVector));

    //create data frame schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(DF_ID_COLUMN, DataTypes.DoubleType, false));
    if( toVector )
        fields.add(DataTypes.createStructField("C1", new VectorUDT(), false));
    else { // row
        for(int i = 1; i <= mc.getCols(); i++)
            fields.add(DataTypes.createStructField("C"+i, DataTypes.DoubleType, false));
    }

    //rdd to data frame conversion
    return sparkSession.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));
}
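Note the design choice: with toVector=true the entire matrix row is packed into a single VectorUDT column named C1, while toVector=false produces one DoubleType column (C1, C2, ...) per matrix column; in both cases an ID column is prepended so row order can be reconstructed.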
Example #2
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLVectorWithIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with ID column");

    List<Tuple2<Double, Vector>> list = new ArrayList<>();
    list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());

    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
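The MatrixMetadata argument tells SystemDS how to interpret the DataFrame: DF_VECTOR_WITH_INDEX marks a vector column accompanied by an ID column, DF_VECTOR a vector column alone. The "no format specified" variants below omit the metadata and rely on automatic format detection (see Example #10).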
Example #3
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLMllibVectorWithIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, mllib vector with ID column");

    List<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> list = new ArrayList<>();
    list.add(new Tuple2<>(1.0, org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<>(2.0, org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<>(3.0, org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> javaRddTuple = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleMllibVectorRow());

    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #4
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLVectorWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column");

    List<Vector> list = new ArrayList<>();
    list.add(Vectors.dense(1.0, 2.0, 3.0));
    list.add(Vectors.dense(4.0, 5.0, 6.0));
    list.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Vector> javaRddVector = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());

    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #5
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLMllibVectorWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, mllib vector with no ID column");

    List<org.apache.spark.mllib.linalg.Vector> list = new ArrayList<>();
    list.add(org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0));
    list.add(org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0));
    list.add(org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<org.apache.spark.mllib.linalg.Vector> javaRddVector = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddVector.map(new MllibVectorRow());

    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #6
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLVectorWithIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with ID column, no format specified");

    List<Tuple2<Double, Vector>> list = new ArrayList<>();
    list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());

    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #7
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumDMLVectorWithNoIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column, no format specified");

    List<Vector> list = new ArrayList<>();
    list.add(Vectors.dense(1.0, 2.0, 3.0));
    list.add(Vectors.dense(4.0, 5.0, 6.0));
    list.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Vector> javaRddVector = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());

    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #8
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumPYDMLVectorWithNoIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum PYDML, vector with no ID column, no format specified");

    List<Vector> list = new ArrayList<>();
    list.add(Vectors.dense(1.0, 2.0, 3.0));
    list.add(Vectors.dense(4.0, 5.0, 6.0));
    list.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Vector> javaRddVector = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());

    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #9
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");

    List<Tuple2<Double, Vector>> list = new ArrayList<>();
    list.add(new Tuple2<>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());

    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Example #10
Source File: MLContextConversionUtil.java From systemds with Apache License 2.0

/**
 * If the MatrixFormat of the DataFrame has not been explicitly specified,
 * attempt to determine the proper MatrixFormat.
 *
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param matrixMetadata
 *            the matrix metadata, if available
 */
public static void determineMatrixFormatIfNeeded(Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
    if (matrixMetadata == null) {
        return;
    }
    MatrixFormat matrixFormat = matrixMetadata.getMatrixFormat();
    if (matrixFormat != null) {
        return;
    }
    StructType schema = dataFrame.schema();
    boolean hasID = false;
    try {
        schema.fieldIndex(RDDConverterUtils.DF_ID_COLUMN);
        hasID = true;
    } catch (IllegalArgumentException iae) {
    }
    StructField[] fields = schema.fields();
    MatrixFormat mf = null;
    if (hasID) {
        if (fields[1].dataType() instanceof VectorUDT) {
            mf = MatrixFormat.DF_VECTOR_WITH_INDEX;
        } else {
            mf = MatrixFormat.DF_DOUBLES_WITH_INDEX;
        }
    } else {
        if (fields[0].dataType() instanceof VectorUDT) {
            mf = MatrixFormat.DF_VECTOR;
        } else {
            mf = MatrixFormat.DF_DOUBLES;
        }
    }
    if (mf == null) {
        throw new MLContextException("DataFrame format not recognized as an accepted SystemDS MatrixFormat");
    }
    matrixMetadata.setMatrixFormat(mf);
}
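A hedged usage sketch of the detection logic above (it assumes MatrixMetadata has a no-argument constructor that leaves the format null; the rest follows directly from the method body):

// Sketch: dataFrame has the (RDDConverterUtils.DF_ID_COLUMN double, C1 vector)
// schema used in Example #2; MatrixMetadata() is assumed to leave the format unset.
MatrixMetadata mm = new MatrixMetadata();
MLContextConversionUtil.determineMatrixFormatIfNeeded(dataFrame, mm);
// The schema has an ID column and a VectorUDT second column, so the method
// should set mm.getMatrixFormat() to MatrixFormat.DF_VECTOR_WITH_INDEX.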
Example #11
Source File: FrameRDDConverterUtils.java From systemds with Apache License 2.0

/**
 * Obtain column vector from DataFrame schema
 *
 * @param dfschema schema as StructType
 * @param containsID if true, contains ID column
 * @return 0-based column index of vector column, -1 if no vector.
 */
private static int getColVectFromDFSchema(StructType dfschema, boolean containsID) {
    int off = containsID ? 1 : 0;
    for( int i=off; i<dfschema.fields().length; i++ ) {
        StructField structType = dfschema.apply(i);
        if(structType.dataType() instanceof VectorUDT)
            return i-off;
    }
    return -1;
}
Example #12
Source File: MLContextTest.java From systemds with Apache License 2.0

@Test
public void testOutputDataFrameOfVectorsDML() {
    System.out.println("MLContextTest - output DataFrame of vectors DML");

    String s = "m=matrix('1 2 3 4',rows=2,cols=2);";
    Script script = dml(s).out("m");
    MLResults results = ml.execute(script);
    Dataset<Row> df = results.getDataFrame("m", true);
    Dataset<Row> sortedDF = df.sort(RDDConverterUtils.DF_ID_COLUMN);

    // verify column types
    StructType schema = sortedDF.schema();
    StructField[] fields = schema.fields();
    StructField idColumn = fields[0];
    StructField vectorColumn = fields[1];
    Assert.assertTrue(idColumn.dataType() instanceof DoubleType);
    Assert.assertTrue(vectorColumn.dataType() instanceof VectorUDT);

    List<Row> list = sortedDF.collectAsList();

    Row row1 = list.get(0);
    Assert.assertEquals(1.0, row1.getDouble(0), 0.0);
    Vector v1 = (DenseVector) row1.get(1);
    double[] arr1 = v1.toArray();
    Assert.assertArrayEquals(new double[] { 1.0, 2.0 }, arr1, 0.0);

    Row row2 = list.get(1);
    Assert.assertEquals(2.0, row2.getDouble(0), 0.0);
    Vector v2 = (DenseVector) row2.get(1);
    double[] arr2 = v2.toArray();
    Assert.assertArrayEquals(new double[] { 3.0, 4.0 }, arr2, 0.0);
}
Example #13
Source File: MinMaxScalerBridgeTest.java From spark-transformers with Apache License 2.0

@Test
public void testMinMaxScaler() {
    // prepare data (the data and expected arrays are fields defined elsewhere in this test class)
    JavaRDD<Row> jrdd = jsc.parallelize(Arrays.asList(
            RowFactory.create(1.0, Vectors.dense(data[0])),
            RowFactory.create(2.0, Vectors.dense(data[1])),
            RowFactory.create(3.0, Vectors.dense(data[2])),
            RowFactory.create(4.0, Vectors.dense(data[3]))
    ));

    StructType schema = new StructType(new StructField[]{
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });

    Dataset<Row> df = spark.createDataFrame(jrdd, schema);

    // train model in spark
    MinMaxScalerModel sparkModel = new MinMaxScaler()
            .setInputCol("features")
            .setOutputCol("scaled")
            .setMin(-5)
            .setMax(5)
            .fit(df);

    // export model, import it back and get transformer
    byte[] exportedModel = ModelExporter.export(sparkModel);
    final Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    // compare predictions
    List<Row> sparkOutput = sparkModel.transform(df).orderBy("label").select("features", "scaled").collectAsList();
    assertCorrectness(sparkOutput, expected, transformer);
}
Example #14
Source File: SimplePredictionFromTextFile.java From net.jgp.labs.spark with Apache License 2.0

private void start() {
    SparkSession spark = SparkSession.builder()
            .appName("Simple prediction from Text File")
            .master("local")
            .getOrCreate();

    spark.udf().register("vectorBuilder", new VectorBuilder(), new VectorUDT());

    String filename = "data/tuple-data-file.csv";
    StructType schema = new StructType(new StructField[] {
            new StructField("_c0", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("_c1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), true, Metadata.empty()),
    });

    Dataset<Row> df = spark.read().format("csv").schema(schema)
            .option("header", "false")
            .load(filename);
    df = df.withColumn("valuefeatures", df.col("_c0")).drop("_c0");
    df = df.withColumn("label", df.col("_c1")).drop("_c1");
    df.printSchema();

    df = df.withColumn("features", callUDF("vectorBuilder", df.col("valuefeatures")));
    df.printSchema();
    df.show();

    LinearRegression lr = new LinearRegression().setMaxIter(20); // .setRegParam(1).setElasticNetParam(1);

    // Fit the model to the data.
    LinearRegressionModel model = lr.fit(df);

    // Given a dataset, predict each point's label, and show the results.
    model.transform(df).show();

    LinearRegressionTrainingSummary trainingSummary = model.summary();
    System.out.println("numIterations: " + trainingSummary.totalIterations());
    System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory()));
    trainingSummary.residuals().show();
    System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
    System.out.println("r2: " + trainingSummary.r2());

    double intercept = model.intercept();
    System.out.println("Intercept: " + intercept);
    double regParam = model.getRegParam();
    System.out.println("Regression parameter: " + regParam);
    double tol = model.getTol();
    System.out.println("Tol: " + tol);

    Double feature = 7.0;
    Vector features = Vectors.dense(feature);
    double p = model.predict(features);
    System.out.println("Prediction for feature " + feature + " is " + p);
    System.out.println(8 * regParam + intercept);
}
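The VectorBuilder class registered above is not shown in this listing. A plausible implementation, consistent only with how it is registered here (a UDF taking the double valuefeatures column and returning a value of type new VectorUDT()), might look like the following sketch; the project's actual source may differ.

import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.sql.api.java.UDF1;

// Wraps a single double value into a 1-element dense ml Vector,
// so callUDF("vectorBuilder", col) can produce a VectorUDT column.
public class VectorBuilder implements UDF1<Double, Vector> {
    private static final long serialVersionUID = 1L;

    @Override
    public Vector call(Double value) throws Exception {
        return Vectors.dense(value);
    }
}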
Example #15
Source File: JavaElementwiseProductExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
            .builder()
            .appName("JavaElementwiseProductExample")
            .getOrCreate();

    // $example on$
    // Create some vector data; also works for sparse vectors
    List<Row> data = Arrays.asList(
            RowFactory.create("a", Vectors.dense(1.0, 2.0, 3.0)),
            RowFactory.create("b", Vectors.dense(4.0, 5.0, 6.0))
    );

    List<StructField> fields = new ArrayList<StructField>(2);
    fields.add(DataTypes.createStructField("id", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("vector", new VectorUDT(), false));
    StructType schema = DataTypes.createStructType(fields);

    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    Vector transformingVector = Vectors.dense(0.0, 1.0, 2.0);

    ElementwiseProduct transformer = new ElementwiseProduct()
            .setScalingVec(transformingVector)
            .setInputCol("vector")
            .setOutputCol("transformedVector");

    // Batch transform the vectors to create new column:
    transformer.transform(dataFrame).show();
    // $example off$

    spark.stop();
}
Example #16
Source File: JavaPolynomialExpansionExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
            .builder()
            .appName("JavaPolynomialExpansionExample")
            .getOrCreate();

    // $example on$
    PolynomialExpansion polyExpansion = new PolynomialExpansion()
            .setInputCol("features")
            .setOutputCol("polyFeatures")
            .setDegree(3);

    List<Row> data = Arrays.asList(
            RowFactory.create(Vectors.dense(2.0, 1.0)),
            RowFactory.create(Vectors.dense(0.0, 0.0)),
            RowFactory.create(Vectors.dense(3.0, -1.0))
    );
    StructType schema = new StructType(new StructField[]{
            new StructField("features", new VectorUDT(), false, Metadata.empty()),
    });
    Dataset<Row> df = spark.createDataFrame(data, schema);

    Dataset<Row> polyDF = polyExpansion.transform(df);
    polyDF.show(false);
    // $example off$

    spark.stop();
}
Example #17
Source File: JavaAFTSurvivalRegressionExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
            .builder()
            .appName("JavaAFTSurvivalRegressionExample")
            .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
            RowFactory.create(1.218, 1.0, Vectors.dense(1.560, -0.605)),
            RowFactory.create(2.949, 0.0, Vectors.dense(0.346, 2.158)),
            RowFactory.create(3.627, 0.0, Vectors.dense(1.380, 0.231)),
            RowFactory.create(0.273, 1.0, Vectors.dense(0.520, 1.151)),
            RowFactory.create(4.199, 0.0, Vectors.dense(0.795, -0.226))
    );
    StructType schema = new StructType(new StructField[]{
            new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("censor", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });
    Dataset<Row> training = spark.createDataFrame(data, schema);

    double[] quantileProbabilities = new double[]{0.3, 0.6};
    AFTSurvivalRegression aft = new AFTSurvivalRegression()
            .setQuantileProbabilities(quantileProbabilities)
            .setQuantilesCol("quantiles");

    AFTSurvivalRegressionModel model = aft.fit(training);

    // Print the coefficients, intercept and scale parameter for AFT survival regression
    System.out.println("Coefficients: " + model.coefficients());
    System.out.println("Intercept: " + model.intercept());
    System.out.println("Scale: " + model.scale());
    model.transform(training).show(false);
    // $example off$

    spark.stop();
}
Example #18
Source File: JavaNormalizerExample.java From SparkDemo with MIT License

public static void main(String[] args) {
    SparkSession spark = SparkSession
            .builder()
            .appName("JavaNormalizerExample")
            .getOrCreate();

    // $example on$
    List<Row> data = Arrays.asList(
            RowFactory.create(0, Vectors.dense(1.0, 0.1, -8.0)),
            RowFactory.create(1, Vectors.dense(2.0, 1.0, -4.0)),
            RowFactory.create(2, Vectors.dense(4.0, 10.0, 8.0))
    );
    StructType schema = new StructType(new StructField[]{
            new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), false, Metadata.empty())
    });
    Dataset<Row> dataFrame = spark.createDataFrame(data, schema);

    // Normalize each Vector using $L^1$ norm.
    Normalizer normalizer = new Normalizer()
            .setInputCol("features")
            .setOutputCol("normFeatures")
            .setP(1.0);

    Dataset<Row> l1NormData = normalizer.transform(dataFrame);
    l1NormData.show();

    // Normalize each Vector using $L^\infty$ norm.
    Dataset<Row> lInfNormData =
            normalizer.transform(dataFrame, normalizer.p().w(Double.POSITIVE_INFINITY));
    lInfNormData.show();
    // $example off$

    spark.stop();
}
Example #19
Source File: ProteinSequenceEncoder.java From mmtf-spark with Apache License 2.0

/**
 * One-hot encodes a protein sequence. The one-hot encoding
 * encodes the 20 natural amino acids, plus X for any other
 * residue, for a total of 21 elements per residue.
 *
 * @return dataset with feature vector appended
 */
public Dataset<Row> oneHotEncode() {
    SparkSession session = data.sparkSession();
    int maxLength = getMaxSequenceLength(data);

    session.udf().register("encoder", new UDF1<String, Vector>() {
        private static final long serialVersionUID = -6095318836772114908L;

        @Override
        public Vector call(String s) throws Exception {
            int len = AMINO_ACIDS21.size();
            double[] values = new double[len * maxLength];
            char[] seq = s.toCharArray();
            for (int i = 0; i < seq.length; i++) {
                int index = AMINO_ACIDS21.indexOf(seq[i]);
                // replace any non-matching code, e.g., U, with X
                if (index == -1) {
                    index = AMINO_ACIDS21.indexOf('X');
                }
                values[i * len + index] = 1;
            }
            return Vectors.dense(values);
        }
    }, new VectorUDT());

    // append feature column
    data.createOrReplaceTempView("table");
    data = session.sql("SELECT *, encoder(" + inputCol + ") AS " + outputCol + " from table");

    return data;
}
Example #20
Source File: ProteinSequenceEncoder.java From mmtf-spark with Apache License 2.0

/**
 * Encodes a protein sequence by 7 physicochemical
 * properties.
 *
 * <p> See: Meiler, J., Müller, M., Zeidler, A. et al. J Mol Model (2001) 7: 360. doi:
 * <a href="https://link.springer.com/article/10.1007/s008940100038">10.1007/s008940100038</a>
 *
 * @return dataset with feature vector appended
 */
public Dataset<Row> propertyEncode() {
    SparkSession session = data.sparkSession();
    int maxLength = getMaxSequenceLength(data);

    session.udf().register("encoder", new UDF1<String, Vector>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Vector call(String s) throws Exception {
            double[] values = new double[7 * maxLength];
            for (int i = 0, k = 0; i < s.length(); i++) {
                double[] property = properties.get(s.charAt(i));
                if (property != null) {
                    for (double p : property) {
                        values[k++] = p;
                    }
                }
            }
            return Vectors.dense(values);
        }
    }, new VectorUDT());

    // append feature column
    data.createOrReplaceTempView("table");
    data = session.sql("SELECT *, encoder(" + inputCol + ") AS " + outputCol + " from table");

    return data;
}
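Both ProteinSequenceEncoder methods follow the same pattern for producing a VectorUDT column without a typed Dataset API: register a String-to-Vector UDF whose return type is declared as new VectorUDT(), then append the encoded column with a Spark SQL SELECT. The inputCol, outputCol, properties, and AMINO_ACIDS21 identifiers are fields of the enclosing class, defined elsewhere in the source file.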