org.apache.spark.ml.feature.PCA Scala Examples
The following examples show how to use org.apache.spark.ml.feature.PCA.
Each example is taken from an open-source project; the project name and license are noted above each listing.
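Before the project-specific listings, here is a minimal, self-contained sketch of the pattern they all share: fit a PCA estimator on a DataFrame with a Vector-typed "features" column, then transform the DataFrame to append the projected column. The toy data, the k = 2 setting, and the PCAQuickStart object name are illustrative only and are not taken from any of the projects below.

import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object PCAQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("PCAQuickStart")
      .master("local[*]")
      .getOrCreate()

    // a DataFrame with a single column of ml.linalg.Vector rows
    val df = spark.createDataFrame(Seq(
      Tuple1(Vectors.dense(1.0, 0.0, 7.0)),
      Tuple1(Vectors.dense(2.0, 3.0, 4.0)),
      Tuple1(Vectors.dense(4.0, 6.0, 7.0))
    )).toDF("features")

    // PCA is an Estimator: fit() learns the principal components and returns a PCAModel
    val model = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(2)                     // number of principal components to keep
      .fit(df)

    // transform() projects each feature vector onto the top-k components
    model.transform(df).select("pcaFeatures").show(false)

    // fraction of variance explained by each retained component
    println(model.explainedVariance)

    spark.stop()
  }
}

The examples that follow vary this pattern: some wrap the PCA stage in a Pipeline with other stages, some tune k via cross-validation, and some test model serialization.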
Example 1
Source File: PCAExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object PCAExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("PCAExample")
      .getOrCreate()

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(3)
      .fit(df)

    val result = pca.transform(df).select("pcaFeatures")
    result.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 2
Source File: PcaParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.feature

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{PCA, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class PcaParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("dti", "loan_amount")).
      setOutputCol("features"),
    new PCA().
      setInputCol("features").
      setOutputCol("pca_features").
      setK(2))).fit(dataset)

  override val unserializedParams = Set("k")
}
Example 3
Source File: MNIST.scala From spark-knn with Apache License 2.0
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.KNNClassifier
import org.apache.spark.ml.feature.PCA
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.log4j

object MNIST {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    // read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, "data/mnist/mnist.bz2")
      .toDF()
    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset = MLUtils.convertVectorColumnsToML(rawDataset)

    // split training and testing
    val Array(train, test) = dataset
      .randomSplit(Array(0.7, 0.3), seed = 1234L)
      .map(_.cache())

    // create PCA matrix to reduce feature dimensions
    val pca = new PCA()
      .setInputCol("features")
      .setK(50)
      .setOutputCol("pcaFeatures")
    val knn = new KNNClassifier()
      .setTopTreeSize(dataset.count().toInt / 500)
      .setFeaturesCol("pcaFeatures")
      .setPredictionCol("predicted")
      .setK(1)
    val pipeline = new Pipeline()
      .setStages(Array(pca, knn))
      .fit(train)

    val insample = validate(pipeline.transform(train))
    val outofsample = validate(pipeline.transform(test))

    // reference accuracy: in-sample 95%, out-of-sample 94%
    logger.info(s"In-sample: $insample, Out-of-sample: $outofsample")
  }

  private[this] def validate(results: DataFrame): Double = {
    results
      .selectExpr("SUM(CASE WHEN label = predicted THEN 1.0 ELSE 0.0 END) / COUNT(1)")
      .collect()
      .head
      .getDecimal(0)
      .doubleValue()
  }
}
Example 4
Source File: MNISTCrossValidation.scala From spark-knn with Apache License 2.0
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.KNNClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.log4j

object MNISTCrossValidation {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    // read in raw label and features
    val dataset = MLUtils.loadLibSVMFile(sc, "data/mnist/mnist.bz2")
      .toDF()
      //.limit(10000)

    // split training and testing
    val Array(train, test) = dataset.randomSplit(Array(0.7, 0.3), seed = 1234L).map(_.cache())

    // create PCA matrix to reduce feature dimensions
    val pca = new PCA()
      .setInputCol("features")
      .setK(50)
      .setOutputCol("pcaFeatures")
    val knn = new KNNClassifier()
      .setTopTreeSize(50)
      .setFeaturesCol("pcaFeatures")
      .setPredictionCol("prediction")
      .setK(1)

    val pipeline = new Pipeline()
      .setStages(Array(pca, knn))

    val paramGrid = new ParamGridBuilder()
      //.addGrid(knn.k, 1 to 20)
      .addGrid(pca.k, 10 to 100 by 10)
      .build()

    val cv = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(5)

    val cvModel = cv.fit(train)

    val insample = validate(cvModel.transform(train))
    val outofsample = validate(cvModel.transform(test))

    // reference accuracy: in-sample 95%, out-of-sample 94%
    logger.info(s"In-sample: $insample, Out-of-sample: $outofsample")
    logger.info(s"Cross-validated: ${cvModel.avgMetrics.toSeq}")
  }

  private[this] def validate(results: DataFrame): Double = {
    results
      .selectExpr("SUM(CASE WHEN label = prediction THEN 1.0 ELSE 0.0 END) / COUNT(1)")
      .collect()
      .head
      .getDecimal(0)
      .doubleValue()
  }
}
Example 5
Source File: PCAExample.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object PCAExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("PCAExample")
      .getOrCreate()

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(3)
      .fit(df)

    val result = pca.transform(df).select("pcaFeatures")
    result.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 6
Source File: PCAExample.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object PCAExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("PCAExample")
      .getOrCreate()

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(3)
      .fit(df)

    val result = pca.transform(df).select("pcaFeatures")
    result.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 7
Source File: PCAExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object PCAExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("PCAExample")
      .getOrCreate()

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(3)
      .fit(df)

    val result = pca.transform(df).select("pcaFeatures")
    result.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 8
Source File: PCAExample.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object PCAExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCAExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(3)
      .fit(df)
    val pcaDF = pca.transform(df)
    val result = pcaDF.select("pcaFeatures")
    result.show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 9
Source File: PCAExample.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License
package com.chapter11.SparkMachineLearning

import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object PCASampleDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[4]")
      .appName("PCAExample")
      .getOrCreate()

    val data = Array(
      Vectors.dense(3.5, 2.0, 5.0, 6.3, 5.60, 2.4),
      Vectors.dense(4.40, 0.10, 3.0, 9.0, 7.0, 8.75),
      Vectors.dense(3.20, 2.40, 0.0, 6.0, 7.4, 3.34)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
    df.show(false)

    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(4)
      .fit(df)

    val result = pca.transform(df).select("pcaFeatures")
    result.show(false)

    spark.stop()
  }
}
Example 10
Source File: PCAModelSuite.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class PCAModelSuite extends SparkFeaturePFASuiteBase[PCAModelResult] {
  implicit val enc = ExpressionEncoder[Vector]()

  val inputPath = "data/sample_lda_libsvm_data.txt"
  val dataset = spark.read.format("libsvm").load(inputPath)
  val pca = new PCA()
    .setInputCol("features")
    .setOutputCol("pcaFeatures")
    .setK(3)

  override val sparkTransformer = pca.fit(dataset)

  val result = sparkTransformer.transform(dataset)
  override val input = withColumnAsArray(result, pca.getInputCol).toJSON.collect()
  override val expectedOutput = withColumnAsArray(result, pca.getOutputCol).toJSON.collect()
}

case class PCAModelResult(pcaFeatures: Seq[Double]) extends Result
Example 11
Source File: MyPCA.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter11

import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object MyPCA {

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("MyPCA")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val dataFile = "../data/sparkml2/chapter11/processed.cleveland.data"
    val rawdata = spark.sparkContext.textFile(dataFile).map(_.trim)
    println(rawdata.count())

    val data = rawdata.filter(text => !(text.isEmpty || text.indexOf("?") > -1))
      .map { line =>
        val values = line.split(',').map(_.toDouble)
        Vectors.dense(values)
      }

    println(data.count())
    data.take(2).foreach(println)

    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(4)
      .fit(df)

    val pcaDF = pca.transform(df)
    val result = pcaDF.select("pcaFeatures")
    result.show(false)

    spark.stop()
  }
}