org.apache.spark.annotation.Experimental Scala Examples
The following examples show how to use org.apache.spark.annotation.Experimental.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: ChiSqSelector.scala From spark1.52 with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the enclosing `class ChiSqSelector` declaration was
// stripped; `fit` and `numTopFeatures` belong to that class. fit() runs a per-feature
// chi-squared test against the labels, keeps the `numTopFeatures` features with the largest
// test statistic, and returns their (ascending) column indices wrapped in a model.
// The inner `indices` in the map pattern shadows the outer `val indices` — confusing but harmless.
package org.apache.spark.mllib.feature import scala.collection.mutable.ArrayBuilder import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD @Since("1.3.0") def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = { val indices = Statistics.chiSqTest(data) .zipWithIndex.sortBy { case (res, _) => -res.statistic } .take(numTopFeatures) .map { case (_, indices) => indices } .sorted new ChiSqSelectorModel(indices) } }
Example 2
Source File: ChiSquareTest.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the enclosing `object ChiSquareTest` declaration was
// stripped. test() validates the feature/label columns, converts rows to old-style mllib
// LabeledPoints, delegates to mllib Statistics.chiSqTest, and returns a single-row DataFrame
// with the per-feature p-values, degrees of freedom, and statistics.
package org.apache.spark.ml.stat import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.util.SchemaUtils import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} import org.apache.spark.mllib.stat.{Statistics => OldStatistics} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.col @Since("2.2.0") def test(dataset: DataFrame, featuresCol: String, labelCol: String): DataFrame = { val spark = dataset.sparkSession import spark.implicits._ SchemaUtils.checkColumnType(dataset.schema, featuresCol, new VectorUDT) SchemaUtils.checkNumericType(dataset.schema, labelCol) val rdd = dataset.select(col(labelCol).cast("double"), col(featuresCol)).as[(Double, Vector)] .rdd.map { case (label, features) => OldLabeledPoint(label, OldVectors.fromML(features)) } val testResults = OldStatistics.chiSqTest(rdd) val pValues: Vector = Vectors.dense(testResults.map(_.pValue)) val degreesOfFreedom: Array[Int] = testResults.map(_.degreesOfFreedom) val statistics: Vector = Vectors.dense(testResults.map(_.statistic)) spark.createDataFrame(Seq(ChiSquareResult(pValues, degreesOfFreedom, statistics))) } }
Example 3
Source File: RegressionEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the `class RegressionEvaluator(...)` header and its
// Param declarations (metricName, etc.) were stripped; only trailing methods plus the
// companion object survive. evaluate() casts prediction/label to Double and dispatches on
// metricName (rmse/mse/r2/mae); an unknown metric throws MatchError — presumably guarded
// by a ParamValidator on the missing param declaration. TODO confirm.
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
Example 4
Source File: MulticlassClassificationEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the `class MulticlassClassificationEvaluator(...)`
// header and its metricName Param declaration were stripped. evaluate() builds
// (prediction, label) pairs and dispatches on metricName (f1/weightedPrecision/
// weightedRecall/accuracy); all supported metrics are larger-is-better.
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 5
Source File: BinaryClassificationEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the `class BinaryClassificationEvaluator(...)` header
// and its Param declarations were stripped. evaluate() accepts either a raw-score Double
// column or a 2-element probability Vector (element 1 is used as the positive-class score),
// then computes areaUnderROC or areaUnderPR and unpersists the metrics' cached RDDs.
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
Example 6
Source File: ColumnProfilerRunner.scala From deequ with Apache License 2.0 | 5 votes |
// NOTE(review): this example is split across two physical lines mid-parameter-list
// (`fileOutputOptions: ` continues on the next line) — an artifact of the extraction.
// ColumnProfilerRunner is the entry point for deequ column profiling: run() optionally
// caches the input, delegates to ColumnProfiler.profile with repository/KLL options,
// writes the profiles as JSON to DFS when a session+path are configured, then unpersists.
package com.amazon.deequ.profiles import com.amazon.deequ.analyzers.{DataTypeInstances, KLLParameters} import com.amazon.deequ.io.DfsUtils import com.amazon.deequ.repository.{MetricsRepository, ResultKey} import org.apache.spark.annotation.Experimental import org.apache.spark.sql.{DataFrame, SparkSession} private[profiles] case class ColumnProfilerRunBuilderMetricsRepositoryOptions( metricsRepository: Option[MetricsRepository], reuseExistingResultsKey: Option[ResultKey], failIfResultsForReusingMissing: Boolean, saveOrAppendResultsKey: Option[ResultKey]) private[profiles] case class ColumnProfilerRunBuilderFileOutputOptions( session: Option[SparkSession], saveColumnProfilesJsonToPath: Option[String], overwriteResults: Boolean) @Experimental class ColumnProfilerRunner { def onData(data: DataFrame): ColumnProfilerRunBuilder = { new ColumnProfilerRunBuilder(data) } private[profiles] def run( data: DataFrame, restrictToColumns: Option[Seq[String]], lowCardinalityHistogramThreshold: Int, printStatusUpdates: Boolean, cacheInputs: Boolean, fileOutputOptions: ColumnProfilerRunBuilderFileOutputOptions, metricsRepositoryOptions: ColumnProfilerRunBuilderMetricsRepositoryOptions, kllProfiling: Boolean, kllParameters: Option[KLLParameters], predefinedTypes: Map[String, DataTypeInstances.Value]) : ColumnProfiles = { if (cacheInputs) { data.cache() } val columnProfiles = ColumnProfiler .profile( data, restrictToColumns, printStatusUpdates, lowCardinalityHistogramThreshold, metricsRepositoryOptions.metricsRepository, metricsRepositoryOptions.reuseExistingResultsKey, metricsRepositoryOptions.failIfResultsForReusingMissing, metricsRepositoryOptions.saveOrAppendResultsKey, kllProfiling, kllParameters, predefinedTypes ) saveColumnProfilesJsonToFileSystemIfNecessary( fileOutputOptions, printStatusUpdates, columnProfiles ) if (cacheInputs) { data.unpersist() } columnProfiles } private[this] def saveColumnProfilesJsonToFileSystemIfNecessary( fileOutputOptions: 
ColumnProfilerRunBuilderFileOutputOptions, printStatusUpdates: Boolean, columnProfiles: ColumnProfiles) : Unit = { fileOutputOptions.session.foreach { session => fileOutputOptions.saveColumnProfilesJsonToPath.foreach { profilesOutput => if (printStatusUpdates) { println(s"### WRITING COLUMN PROFILES TO $profilesOutput") } DfsUtils.writeToTextFileOnDfs(session, profilesOutput, overwrite = fileOutputOptions.overwriteResults) { writer => writer.append(ColumnProfiles.toJson(columnProfiles.profiles.values.toSeq).toString) writer.newLine() } } } } } object ColumnProfilerRunner { def apply(): ColumnProfilerRunner = { new ColumnProfilerRunner() } }
Example 7
Source File: IsotonicRegression.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.regression.IsotonicRegressionModel
import org.apache.spark.ml.util._
import org.apache.spark.mllib.odkl.{IsotonicRegression => MLlibIsotonicRegression}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.storage.StorageLevel

/**
 * Thin subclass of spark.ml's IsotonicRegression that routes training through the
 * odkl fork of the mllib implementation while keeping the standard ml API surface.
 */
@Since("1.5.0")
@Experimental
class IsotonicRegression @Since("1.5.0")(@Since("1.5.0") override val uid: String)
  extends org.apache.spark.ml.regression.IsotonicRegression(uid) {

  @Since("1.5.0")
  def this() = this(Identifiable.randomUID("isoReg"))

  /**
   * Fits an isotonic regression model. The input is persisted for the duration of
   * training only when the source dataset is not already cached.
   */
  @Since("1.5.0")
  override def fit(dataset: Dataset[_]): IsotonicRegressionModel = {
    validateAndTransformSchema(dataset.schema, fitting = true)
    // Extract (label, feature, weight) triples from the configured columns.
    val points = extractWeightedLabeledPoints(dataset)
    val needsCaching = dataset.rdd.getStorageLevel == StorageLevel.NONE
    if (needsCaching) {
      points.persist(StorageLevel.MEMORY_AND_DISK)
    }
    // Delegate the actual PAV fit to the odkl mllib implementation.
    val trainer = new MLlibIsotonicRegression().setIsotonic($(isotonic))
    val legacyModel = trainer.run(points)
    copyValues(new IsotonicRegressionModel(uid, legacyModel).setParent(this))
  }
}

/** Companion providing MLReader support via DefaultParamsReadable. */
@Since("1.6.0")
object IsotonicRegression extends DefaultParamsReadable[IsotonicRegression] {

  @Since("1.6.0")
  override def load(path: String): IsotonicRegression = super.load(path)
}
Example 8
Source File: Broker.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

import org.apache.spark.annotation.Experimental

/**
 * Factory and extractor methods for the Kafka `Broker` host/port holder
 * (the `Broker` class itself is declared elsewhere in this package).
 */
@Experimental
object Broker {

  /** Java-friendly factory. */
  def create(host: String, port: Int): Broker = new Broker(host, port)

  /** Scala `apply` factory. */
  def apply(host: String, port: Int): Broker = new Broker(host, port)

  /** Extractor; `Option(...)` maps a null broker to None, matching the original null check. */
  def unapply(broker: Broker): Option[(String, Int)] =
    Option(broker).map(b => (b.host, b.port))
}
Example 9
Source File: TestResult.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.test

import org.apache.spark.annotation.{Experimental, Since}

/**
 * Result container for the one-sample Kolmogorov-Smirnov test.
 * The KS test has no degrees-of-freedom concept, so the field is fixed at 0.
 */
@Experimental
@Since("1.5.0")
class KolmogorovSmirnovTestResult private[stat] (
    @Since("1.5.0") override val pValue: Double,
    @Since("1.5.0") override val statistic: Double,
    @Since("1.5.0") override val nullHypothesis: String) extends TestResult[Int] {

  @Since("1.5.0")
  override val degreesOfFreedom = 0

  /** Prefixes the generic TestResult summary with the test name. */
  override def toString: String =
    s"Kolmogorov-Smirnov test summary:\n${super.toString}"
}
Example 10
Source File: KernelDensity.scala From spark1.52 with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the enclosing `object KernelDensity` (or similar)
// declaration was stripped; the trailing `}` closes it. normPdf evaluates the Gaussian
// density at x in log space for numerical stability: the caller precomputes
// log(sd) + 0.5*log(2*pi) so the per-point work is just the quadratic term and one exp.
package org.apache.spark.mllib.stat import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.rdd.RDD def normPdf( mean: Double, standardDeviation: Double, logStandardDeviationPlusHalfLog2Pi: Double, x: Double): Double = { val x0 = x - mean val x1 = x0 / standardDeviation val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi math.exp(logDensity) } }
Example 11
Source File: Algo.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.configuration

import org.apache.spark.annotation.{Experimental, Since}

/**
 * Enumeration of the learning tasks supported by the tree algorithms.
 * (Retains scala.Enumeration to preserve the public API of this object.)
 */
@Since("1.0.0")
@Experimental
object Algo extends Enumeration {

  @Since("1.0.0")
  type Algo = Value

  @Since("1.0.0")
  val Classification, Regression = Value

  /**
   * Parses an algorithm name; accepts exactly the lower-case and capitalized
   * spellings and throws IllegalArgumentException for anything else.
   */
  private[mllib] def fromString(name: String): Algo =
    name match {
      case "classification" | "Classification" => Classification
      case "regression" | "Regression"         => Regression
      case other =>
        throw new IllegalArgumentException(s"Did not recognize Algo name: $other")
    }
}
Example 12
Source File: BoostingStrategy.scala From spark1.52 with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the enclosing `object BoostingStrategy` declaration
// was stripped. defaultParams builds a default boosting configuration for the given task:
// a depth-3 tree strategy, LogLoss with 2 classes for classification, SquaredError for
// regression; any other Algo value is rejected.
package org.apache.spark.mllib.tree.configuration import scala.beans.BeanProperty import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError, Loss} @Since("1.3.0") def defaultParams(algo: Algo): BoostingStrategy = { val treeStrategy = Strategy.defaultStrategy(algo) treeStrategy.maxDepth = 3 algo match { case Algo.Classification => treeStrategy.numClasses = 2 new BoostingStrategy(treeStrategy, LogLoss) case Algo.Regression => new BoostingStrategy(treeStrategy, SquaredError) case _ => throw new IllegalArgumentException(s"$algo is not supported by boosting.") } } }
Example 13
Source File: Broker.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

import org.apache.spark.annotation.Experimental

/**
 * Factory and extractor methods for the legacy Kafka 0.8 `Broker` host/port holder
 * (the `Broker` class itself is declared elsewhere in this package).
 */
@Experimental
@deprecated("Update to Kafka 0.10 integration", "2.3.0")
object Broker {

  /** Java-friendly factory. */
  def create(host: String, port: Int): Broker = new Broker(host, port)

  /** Scala `apply` factory. */
  def apply(host: String, port: Int): Broker = new Broker(host, port)

  /** Extractor; `Option(...)` maps a null broker to None, matching the original null check. */
  def unapply(broker: Broker): Option[(String, Int)] =
    Option(broker).map(b => (b.host, b.port))
}
Example 14
Source File: Normalizer.scala From spark1.52 with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the enclosing `class Normalizer(p: Double)` declaration
// was stripped; `p` is the norm order. transform divides every component by the vector's
// p-norm, cloning the value arrays so the input vector is never mutated; a zero-norm vector
// is returned unchanged (all-zero input).
package org.apache.spark.mllib.feature import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.size var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.size var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
Example 15
Source File: ElementwiseProduct.scala From spark1.52 with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the enclosing `class ElementwiseProduct(scalingVec: Vector)`
// declaration was stripped. transform performs a Hadamard (element-wise) product with the
// scaling vector; for sparse input only the stored values are multiplied, indexed through
// `indices` into the scaling vector, and value arrays are cloned so the input is not mutated.
package org.apache.spark.mllib.feature import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.mllib.linalg._ @Since("1.4.0") override def transform(vector: Vector): Vector = { require(vector.size == scalingVec.size, s"vector sizes do not match: Expected ${scalingVec.size} but found ${vector.size}") vector match { case dv: DenseVector => val values: Array[Double] = dv.values.clone() val dim = scalingVec.size var i = 0 while (i < dim) { values(i) *= scalingVec(i) i += 1 } Vectors.dense(values) case SparseVector(size, indices, vs) => val values = vs.clone() val dim = values.length var i = 0 while (i < dim) { values(i) *= scalingVec(indices(i)) i += 1 } Vectors.sparse(size, indices, values) case v => throw new IllegalArgumentException("Does not support vector type " + v.getClass) } } }
Example 16
Source File: ParamGridBuilder.scala From spark1.52 with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the enclosing `class ParamGridBuilder` declaration and
// the `paramGrid` field were stripped. build() computes the cross product of all registered
// param/value lists: starting from one empty ParamMap, each (param, values) entry multiplies
// the current maps by every value, copying each map before inserting.
package org.apache.spark.ml.tuning import scala.annotation.varargs import scala.collection.mutable import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param._ def build(): Array[ParamMap] = { var paramMaps = Array(new ParamMap) paramGrid.foreach { case (param, values) => val newParamMaps = values.flatMap { v => paramMaps.map(_.copy.put(param.asInstanceOf[Param[Any]], v)) } paramMaps = newParamMaps.toArray } paramMaps } }
Example 17
Source File: RegressionEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the `class RegressionEvaluator(...)` header and its
// metricName Param declaration were stripped. Spark 1.x variant: evaluates rmse/mse/r2/mae
// over (prediction, label) pairs; only r2 is larger-is-better. (Original Chinese comments
// translated to English below.)
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.DoubleType def setLabelCol(value: String): this.type = set(labelCol, value) // default metric: root mean squared error setDefault(metricName -> "rmse") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) val predictionAndLabels = dataset.select($(predictionCol), $(labelCol)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { // root mean squared error case "rmse" => metrics.rootMeanSquaredError // mean squared error case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 // mean absolute error case "mae" => metrics.meanAbsoluteError } metric } override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false// root mean squared error case "mse" => false// mean squared error case "r2" => true// coefficient of determination (R²) case "mae" => false// mean absolute error } override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) }
Example 18
Source File: MulticlassClassificationEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the class header and metricName Param declaration were
// stripped. Spark 1.x variant supporting f1/precision/recall/weightedPrecision/weightedRecall;
// every supported metric is larger-is-better. (Original Chinese comments translated below.)
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{SchemaUtils, Identifiable} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Row, DataFrame} import org.apache.spark.sql.types.DoubleType def setLabelCol(value: String): this.type = set(labelCol, value) // F1-Measure is a combined metric derived from both precision and recall setDefault(metricName -> "f1") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) val predictionAndLabels = dataset.select($(predictionCol), $(labelCol)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { // F1-Measure is a combined metric derived from both precision and recall case "f1" => metrics.weightedFMeasure case "precision" => metrics.precision// precision case "recall" => metrics.recall// recall case "weightedPrecision" => metrics.weightedPrecision// weighted precision case "weightedRecall" => metrics.weightedRecall// weighted recall } metric } override def isLargerBetter: Boolean = $(metricName) match { case "f1" => true// F1-Measure combines precision and recall case "precision" => true// precision case "recall" => true// recall case "weightedPrecision" => true// weighted precision case "weightedRecall" => true// weighted recall } override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) }
Example 19
Source File: BinaryClassificationEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the class header and Param declarations were stripped.
// Spark 1.x variant: rawPredictionCol must be a Vector here (no Double-score branch, unlike
// the 2.x version above); supports areaUnderROC and areaUnderPR, both larger-is-better.
// (Original Chinese comments translated below.)
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.DoubleType def setLabelCol(value: String): this.type = set(labelCol, value) // default metric: area under the ROC curve setDefault(metricName -> "areaUnderROC") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol)) .map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { // an AUC of 1.0 indicates a perfect classifier case "areaUnderROC" => metrics.areaUnderROC() // area under the precision-recall curve case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true// AUC of 1.0 means a perfect classifier, 0.5 means random performance case "areaUnderPR" => true // precision-recall curve area } override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) }
Example 20
// NOTE(review): extraction fragment — the `class PCAModel(uid, pcaModel)` header and its
// inputCol/outputCol Param declarations were stripped. transform applies the fitted mllib
// PCA model to the input vector column via a UDF; transformSchema validates the input is a
// vector column, rejects an existing output column, and appends a non-nullable vector field.
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{StructField, StructType} override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) val pcaOp = udf { pcaModel.transform _ } dataset.withColumn($(outputCol), pcaOp(col($(inputCol)))) } override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[VectorUDT], s"Input column ${$(inputCol)} must be a vector column") require(!schema.fieldNames.contains($(outputCol)), s"Output column ${$(outputCol)} already exists.") val outputFields = schema.fields :+ StructField($(outputCol), new VectorUDT, false) StructType(outputFields) } override def copy(extra: ParamMap): PCAModel = { val copied = new PCAModel(uid, pcaModel) copyValues(copied, extra).setParent(parent) } }
Example 21
// NOTE(review): extraction fragment — the `class DCT extends UnaryTransformer` header and the
// `inverse` BooleanParam declaration were stripped. The transform applies a scaled 1-D
// discrete cosine transform (JTransforms DoubleDCT_1D) to each vector, in forward or inverse
// direction per the `inverse` param; note `vec.toArray` is transformed in place by JTransforms.
package org.apache.spark.ml.feature import edu.emory.mathcs.jtransforms.dct._ import org.apache.spark.annotation.Experimental import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors} import org.apache.spark.sql.types.DataType def getInverse: Boolean = $(inverse) setDefault(inverse -> false) override protected def createTransformFunc: Vector => Vector = { vec => val result = vec.toArray val jTransformer = new DoubleDCT_1D(result.length) if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.") } override protected def outputDataType: DataType = new VectorUDT }
Example 22
Source File: Binarizer.scala From spark1.52 with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the `class Binarizer` header and the `threshold` Param
// declaration were stripped. transform maps each Double strictly greater than the threshold
// to 1.0 and everything else (including values equal to the threshold) to 0.0, attaching
// BinaryAttribute metadata to the output column.
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.BinaryAttribute import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) val td = $(threshold) val binarizer = udf { in: Double => if (in > td) 1.0 else 0.0 } val outputColName = $(outputCol) val metadata = BinaryAttribute.defaultAttr.withName(outputColName).toMetadata() dataset.select(col("*"), binarizer(col($(inputCol))).as(outputColName, metadata)) } override def transformSchema(schema: StructType): StructType = { SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType) val inputFields = schema.fields val outputColName = $(outputCol) require(inputFields.forall(_.name != outputColName), s"Output column $outputColName already exists.") val attr = BinaryAttribute.defaultAttr.withName(outputColName) val outputFields = inputFields :+ attr.toStructField() StructType(outputFields) } override def copy(extra: ParamMap): Binarizer = defaultCopy(extra) }
Example 23
// NOTE(review): extraction fragment — the `class IDFModel(uid, idfModel)` header and its Param
// declarations were stripped. transform applies the fitted mllib IDF model to the input
// vector column through a UDF; transformSchema delegates to the (unseen) shared
// validateAndTransformSchema helper.
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml._ import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StructType def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) val idf = udf { vec: Vector => idfModel.transform(vec) } dataset.withColumn($(outputCol), idf(col($(inputCol)))) } override def transformSchema(schema: StructType): StructType = { validateAndTransformSchema(schema) } override def copy(extra: ParamMap): IDFModel = { val copied = new IDFModel(uid, idfModel) copyValues(copied, extra).setParent(parent) } }
Example 24
Source File: MulticlassClassificationEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
// NOTE(review): extraction fragment — the class header and metricName Param declaration were
// stripped. Spark 1.5/1.6-era variant: supports f1/precision/recall/weightedPrecision/
// weightedRecall over (prediction, label) pairs; all metrics larger-is-better. The companion
// object provides MLReader support.
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, SchemaUtils, Identifiable} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Row, DataFrame} import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("1.5.0") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) val predictionAndLabels = dataset.select($(predictionCol), $(labelCol)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "precision" => metrics.precision case "recall" => metrics.recall case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall } metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "f1" => true case "precision" => true case "recall" => true case "weightedPrecision" => true case "weightedRecall" => true } @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 25
Source File: EquilibratedUpdater.scala From zen with Apache License 2.0 | 5 votes |
// Equilibrated SGD-style updater: accumulates squared (noise-perturbed) gradients in etaSum,
// scales each gradient component by gamma / (epsilon + sqrt(etaSum/iter)), optionally applies
// momentum, then takes a plain SGD step (weights -= stepSize * gradient). l2 is a no-op hook
// returning 0 regularization.
// NOTE(review): the update is nondeterministic (Utils.random.nextGaussian per component), and
// `grad` is mutated inside `etaSum.synchronized` while `e` was updated outside it — the
// synchronization scope looks inconsistent; verify intended thread-safety with the caller.
package com.github.cloudml.zen.ml.neuralNetwork import com.github.cloudml.zen.ml.linalg.BLAS import com.github.cloudml.zen.ml.util.SparkUtils._ import com.github.cloudml.zen.ml.util.Utils import org.apache.spark.annotation.Experimental import org.apache.spark.mllib.linalg.{Vector => SV, DenseVector => SDV, Vectors} import com.github.cloudml.zen.ml.optimization._ @Experimental class EquilibratedUpdater( val epsilon: Double, val gamma: Double, val momentum: Double) extends Updater { require(momentum >= 0 && momentum < 1) @transient private var etaSum: SDV = null @transient private var momentumSum: SDV = null protected def l2( weightsOld: SV, gradient: SV, stepSize: Double, iter: Int, regParam: Double): Double = { 0D } override def compute( weightsOld: SV, gradient: SV, stepSize: Double, iter: Int, regParam: Double): (SV, Double) = { if (etaSum == null) etaSum = new SDV(new Array[Double](weightsOld.size)) val reg = l2(weightsOld, gradient, stepSize, iter, regParam) val grad = toBreeze(gradient) val e = toBreeze(etaSum) for (i <- 0 until grad.length) { e(i) += math.pow(grad(i) * Utils.random.nextGaussian(), 2) } etaSum.synchronized { for (i <- 0 until grad.length) { grad(i) = gamma * grad(i) / (epsilon + math.sqrt(etaSum(i) / iter)) } } if (momentum > 0) { if (momentumSum == null) momentumSum = new SDV(new Array[Double](weightsOld.size)) momentumSum.synchronized { BLAS.axpy(momentum, momentumSum, gradient) BLAS.copy(gradient, momentumSum) } } BLAS.axpy(-stepSize, gradient, weightsOld) (weightsOld, reg) } }
Example 26
package com.github.cloudml.zen.ml.neuralNetwork

import com.github.cloudml.zen.ml.util.Logging
import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.{Vector => SV}
import org.apache.spark.rdd.RDD

/**
 * Deep belief network: a stack of RBMs pretrained layer-wise, then converted to
 * an MLP with a softmax output layer for supervised fine-tuning.
 */
@Experimental
class DBN(val stackedRBM: StackedRBM) extends Logging with Serializable {

  // MLP view of the pretrained stack: the last layer's weights are re-initialized
  // (uniform, scale 0.01) and the layer is replaced by a softmax output layer.
  lazy val mlp: MLPModel = {
    val nn = stackedRBM.toMLP()
    val lastLayer = nn.innerLayers(nn.numLayer - 1)
    NNUtil.initUniformDistWeight(lastLayer.weight, 0.01)
    nn.innerLayers(nn.numLayer - 1) = new SoftMaxLayer(lastLayer.weight, lastLayer.bias)
    nn
  }

  /** Convenience constructor building the RBM stack from a layer-size topology. */
  def this(topology: Array[Int]) {
    this(new StackedRBM(topology))
  }
}

@Experimental
object DBN extends Logging {

  /** Full training pipeline: unsupervised pretraining then supervised fine-tuning. */
  def train(
    data: RDD[(SV, SV)],
    batchSize: Int,
    numIteration: Int,
    topology: Array[Int],
    fraction: Double,
    learningRate: Double,
    weightCost: Double): DBN = {
    val dbn = new DBN(topology)
    pretrain(data, batchSize, numIteration, dbn, fraction, learningRate, weightCost)
    finetune(data, batchSize, numIteration, dbn, fraction, learningRate, weightCost)
    dbn
  }

  /**
   * Layer-wise RBM pretraining on the features only (labels ignored).
   * Only the first numLayer - 1 RBMs are trained — presumably the last layer is
   * reserved for the softmax replacement in `mlp`; confirm against StackedRBM.train.
   */
  def pretrain(
    data: RDD[(SV, SV)],
    batchSize: Int,
    numIteration: Int,
    dbn: DBN,
    fraction: Double,
    learningRate: Double,
    weightCost: Double): DBN = {
    val stackedRBM = dbn.stackedRBM
    val numLayer = stackedRBM.innerRBMs.length
    StackedRBM.train(data.map(_._1), batchSize, numIteration, stackedRBM,
      fraction, learningRate, weightCost, numLayer - 1)
    dbn
  }

  /** Supervised fine-tuning of the derived MLP on (features, labels) pairs. */
  def finetune(data: RDD[(SV, SV)],
    batchSize: Int,
    numIteration: Int,
    dbn: DBN,
    fraction: Double,
    learningRate: Double,
    weightCost: Double): DBN = {
    MLP.train(data, batchSize, numIteration, dbn.mlp,
      fraction, learningRate, weightCost)
    dbn
  }
}
Example 27
Source File: MomentumUpdater.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.neuralNetwork

import com.github.cloudml.zen.ml.linalg.BLAS
import com.github.cloudml.zen.ml.util.SparkUtils
import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.{Vector => SV, DenseVector => SDV}
import com.github.cloudml.zen.ml.optimization._

/**
 * Classical-momentum gradient updater:
 * v = momentum * v + gradient; weights -= stepSize * v.
 *
 * @param momentum momentum coefficient, required to be in (0, 1)
 */
@Experimental
class MomentumUpdater(val momentum: Double) extends Updater {
  // Fix: validate the constructor argument with require (throws
  // IllegalArgumentException, never elided by -Xdisable-assertions) instead of
  // assert, matching the sibling updaters in this package (EquilibratedUpdater,
  // AdaDeltaUpdater, AdaGradUpdater) which all use require.
  require(momentum > 0 && momentum < 1)

  // Momentum accumulator; lazily allocated, not serialized.
  @transient private var momentumSum: SDV = null

  // L2 regularization hook; this updater applies none and reports a 0 penalty.
  protected def l2(
    weightsOld: SV,
    gradient: SV,
    stepSize: Double,
    iter: Int,
    regParam: Double): Double = {
    0D
  }

  /**
   * Updates `weightsOld` in place and returns it with the regularization value.
   * NOTE: both `gradient` and `weightsOld` are mutated.
   */
  override def compute(
    weightsOld: SV,
    gradient: SV,
    stepSize: Double,
    iter: Int,
    regParam: Double): (SV, Double) = {
    if (momentumSum == null) {
      momentumSum = new SDV(new Array[Double](weightsOld.size))
    }
    val reg = l2(weightsOld, gradient, stepSize, iter, regParam)
    // momentum > 0 is guaranteed by the constructor check, so the momentum step
    // is unconditional (the original `if (momentum > 0)` guard was unreachable
    // once validation cannot be elided).
    // gradient += momentum * momentumSum, then remember it for the next step.
    BLAS.axpy(momentum, momentumSum, gradient)
    this.synchronized {
      BLAS.copy(gradient, momentumSum)
    }
    // weightsOld -= stepSize * gradient
    BLAS.axpy(-stepSize, gradient, weightsOld)
    (weightsOld, reg)
  }
}
Example 28
Source File: AdaDeltaUpdater.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.neuralNetwork

import com.github.cloudml.zen.ml.linalg.BLAS
import com.github.cloudml.zen.ml.util.SparkUtils
import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.{Vector => SV, DenseVector => SDV}
import com.github.cloudml.zen.ml.optimization._

/**
 * AdaDelta updater: scales each gradient coordinate by the ratio of the RMS of
 * accumulated parameter deltas to the RMS of accumulated squared gradients,
 * optionally applying classical momentum first.
 *
 * @param rho      decay rate of both running averages, required in (0, 1)
 * @param epsilon  smoothing constant added inside the square roots
 * @param momentum momentum coefficient, required in [0, 1)
 */
@Experimental
private[ml] class AdaDeltaUpdater(
  val rho: Double,
  val epsilon: Double,
  val momentum: Double) extends Updater {

  require(rho > 0 && rho < 1)
  require(momentum >= 0 && momentum < 1)

  // Running average of squared gradients; lazily allocated, not serialized.
  @transient private var gradientSum: SDV = null
  // Running average of squared parameter deltas.
  @transient private var deltaSum: SDV = null
  // Momentum accumulator, allocated only when momentum > 0.
  @transient private var momentumSum: SDV = null

  // L2 regularization hook; this updater applies none and reports a 0 penalty.
  protected def l2(
    weightsOld: SV,
    gradient: SV,
    stepSize: Double,
    iter: Int,
    regParam: Double): Double = {
    0D
  }

  /**
   * Updates `weightsOld` in place and returns it with the regularization value.
   * NOTE(review): `gradient` is mutated — SparkUtils.toBreeze presumably wraps
   * the same backing array, so in-place edits of `grad` flow back; confirm.
   */
  override def compute(
    weightsOld: SV,
    gradient: SV,
    stepSize: Double,
    iter: Int,
    regParam: Double): (SV, Double) = {
    if (momentum > 0 && momentumSum == null) {
      momentumSum = new SDV(new Array[Double](weightsOld.size))
    }
    if (deltaSum == null) {
      deltaSum = new SDV(new Array[Double](weightsOld.size))
      gradientSum = new SDV(new Array[Double](weightsOld.size))
    }

    val reg = l2(weightsOld, gradient, stepSize, iter, regParam)
    if (momentum > 0) {
      // gradient += momentum * momentumSum, then store it as the new momentum.
      BLAS.axpy(momentum, momentumSum, gradient)
      this.synchronized {
        BLAS.copy(gradient, momentumSum)
      }
    }

    val grad = SparkUtils.toBreeze(gradient)
    val g2 = grad :* grad
    this.synchronized {
      // gradientSum = rho * gradientSum + (1 - rho) * grad^2
      BLAS.scal(rho, gradientSum)
      BLAS.axpy(1 - rho, SparkUtils.fromBreeze(g2), gradientSum)
    }

    // Scale each coordinate by RMS(accumulated delta) / RMS(accumulated grad).
    for (i <- 0 until grad.length) {
      val rmsDelta = math.sqrt(epsilon + deltaSum(i))
      val rmsGrad = math.sqrt(epsilon + gradientSum(i))
      grad(i) *= rmsDelta / rmsGrad
    }

    val d2 = grad :* grad
    this.synchronized {
      // deltaSum = rho * deltaSum + (1 - rho) * delta^2
      BLAS.scal(rho, deltaSum)
      BLAS.axpy(1 - rho, SparkUtils.fromBreeze(d2), deltaSum)
    }

    // weightsOld -= stepSize * gradient
    BLAS.axpy(-stepSize, gradient, weightsOld)
    (weightsOld, reg)
  }
}
Example 29
Source File: AdaGradUpdater.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.neuralNetwork

import com.github.cloudml.zen.ml.linalg.BLAS
import com.github.cloudml.zen.ml.util.SparkUtils
import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.{Vector => SV, DenseVector => SDV}
import com.github.cloudml.zen.ml.optimization._

/**
 * AdaGrad-style updater: divides each gradient coordinate by the square root
 * of the accumulated squared gradients (optionally decayed by rho, which makes
 * it RMSProp-like), with optional classical momentum applied first.
 *
 * @param rho      decay applied to the accumulator, required in [0, 1); 0 disables decay
 * @param epsilon  stability constant added to the root
 * @param gamma    global learning-rate multiplier
 * @param momentum momentum coefficient, required in [0, 1)
 */
@Experimental
class AdaGradUpdater(
  val rho: Double,
  val epsilon: Double,
  val gamma: Double,
  val momentum: Double) extends Updater {

  require(rho >= 0 && rho < 1)
  require(momentum >= 0 && momentum < 1)

  // Accumulator of squared gradients; lazily allocated, not serialized.
  @transient private var etaSum: SDV = null
  // Momentum accumulator, allocated only when momentum > 0.
  @transient private var momentumSum: SDV = null

  // L2 regularization hook; this updater applies none and reports a 0 penalty.
  protected def l2(
    weightsOld: SV,
    gradient: SV,
    stepSize: Double,
    iter: Int,
    regParam: Double): Double = {
    0D
  }

  /** Updates `weightsOld` in place and returns it with the regularization value. */
  override def compute(
    weightsOld: SV,
    gradient: SV,
    stepSize: Double,
    iter: Int,
    regParam: Double): (SV, Double) = {
    if (momentum > 0 && momentumSum == null) {
      momentumSum = new SDV(new Array[Double](weightsOld.size))
    }
    if (etaSum == null) {
      etaSum = new SDV(new Array[Double](weightsOld.size))
    }
    val reg = l2(weightsOld, gradient, stepSize, iter, regParam)
    if (momentum > 0) {
      // gradient += momentum * momentumSum, then store it as the new momentum.
      BLAS.axpy(momentum, momentumSum, gradient)
      this.synchronized {
        BLAS.copy(gradient, momentumSum)
      }
    }

    val grad = SparkUtils.toBreeze(gradient)
    val g2 = grad :* grad
    this.synchronized {
      // Decay the accumulator only when rho is strictly inside (0, 1).
      if (rho > 0D && rho < 1D) {
        BLAS.scal(rho, etaSum)
      }
      BLAS.axpy(1D, SparkUtils.fromBreeze(g2), etaSum)
    }

    // Per-coordinate learning-rate scaling.
    for (i <- 0 until grad.length) {
      grad(i) *= gamma / (epsilon + math.sqrt(etaSum(i)))
    }
    // weightsOld -= stepSize * scaledGradient
    BLAS.axpy(-stepSize, SparkUtils.fromBreeze(grad), weightsOld)
    (weightsOld, reg)
  }
}
Example 30
Source File: Broker.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

import org.apache.spark.annotation.Experimental

/**
 * Factory and extractor for Kafka [[Broker]] host/port pairs.
 */
@Experimental
object Broker {

  /** Java-friendly factory method. */
  def create(host: String, port: Int): Broker = new Broker(host, port)

  /** Scala apply-syntax factory. */
  def apply(host: String, port: Int): Broker = new Broker(host, port)

  /** Extractor for pattern matching; a null broker yields None. */
  def unapply(broker: Broker): Option[(String, Int)] =
    Option(broker).map(b => (b.host, b.port))
}
Example 31
Source File: TestResult.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.test

import org.apache.spark.annotation.{Experimental, Since}

/**
 * Result of a streaming significance test: carries the p-value, degrees of
 * freedom, test statistic, the name of the test method, and the null
 * hypothesis being tested.
 */
@Experimental
@Since("1.6.0")
private[stat] class StreamingTestResult @Since("1.6.0") (
    @Since("1.6.0") override val pValue: Double,
    @Since("1.6.0") override val degreesOfFreedom: Double,
    @Since("1.6.0") override val statistic: Double,
    @Since("1.6.0") val method: String,
    @Since("1.6.0") override val nullHypothesis: String)
  extends TestResult[Double] with Serializable {

  // Prepends a header and the method name to the generic TestResult summary.
  override def toString: String = {
    "Streaming test summary:\n" +
      s"method: $method\n" +
      super.toString
  }
}
Example 32
Source File: Algo.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.configuration

import org.apache.spark.annotation.{Experimental, Since}

/**
 * Enumeration selecting the learning task: classification or regression.
 */
@Since("1.0.0")
@Experimental
object Algo extends Enumeration {

  @Since("1.0.0")
  type Algo = Value

  @Since("1.0.0")
  val Classification, Regression = Value

  /** Parses the two accepted capitalizations of each task name. */
  private[mllib] def fromString(name: String): Algo = {
    if (name == "classification" || name == "Classification") {
      Classification
    } else if (name == "regression" || name == "Regression") {
      Regression
    } else {
      throw new IllegalArgumentException(s"Did not recognize Algo name: $name")
    }
  }
}
Example 33
Source File: ParamGridBuilder.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tuning

import scala.annotation.varargs
import scala.collection.mutable

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param._

// NOTE(review): the enclosing ParamGridBuilder class header was stripped by the
// example extractor; build() below is a member of it.

/**
 * Builds the cross product of all parameter values added to this grid:
 * one ParamMap per combination of (param, value) choices.
 */
@Since("1.2.0")
def build(): Array[ParamMap] = {
  // Start from a single empty map; each grid entry multiplies the map count.
  var paramMaps = Array(new ParamMap)
  paramGrid.foreach { case (param, values) =>
    // Fan out every existing map with every candidate value for this param.
    val newParamMaps = values.flatMap { v =>
      paramMaps.map(_.copy.put(param.asInstanceOf[Param[Any]], v))
    }
    paramMaps = newParamMaps.toArray
  }
  paramMaps
}
}
Example 34
Source File: RegressionEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}

// NOTE(review): the enclosing RegressionEvaluator class header was stripped by
// the example extractor; the members below belong to it.

/** Sets the column holding the true labels. */
@Since("1.4.0")
def setLabelCol(value: String): this.type = set(labelCol, value)

// Root-mean-squared error is the default metric.
setDefault(metricName -> "rmse")

/**
 * Computes the configured regression metric over (prediction, label) pairs.
 * Accepts Float or Double columns and casts both to Double.
 */
@Since("1.4.0")
override def evaluate(dataset: DataFrame): Double = {
  val schema = dataset.schema
  val predictionColName = $(predictionCol)
  val predictionType = schema($(predictionCol)).dataType
  require(predictionType == FloatType || predictionType == DoubleType,
    s"Prediction column $predictionColName must be of type float or double, " +
      s" but not $predictionType")
  val labelColName = $(labelCol)
  val labelType = schema($(labelCol)).dataType
  require(labelType == FloatType || labelType == DoubleType,
    s"Label column $labelColName must be of type float or double, but not $labelType")

  val predictionAndLabels = dataset
    .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
    .map { case Row(prediction: Double, label: Double) =>
      (prediction, label)
    }
  val metrics = new RegressionMetrics(predictionAndLabels)
  // NOTE(review): an unrecognized metric name raises a MatchError here;
  // presumably metricName is validated by the (stripped) param declaration.
  val metric = $(metricName) match {
    case "rmse" => metrics.rootMeanSquaredError
    case "mse" => metrics.meanSquaredError
    case "r2" => metrics.r2
    case "mae" => metrics.meanAbsoluteError
  }
  metric
}

/** Only r2 improves as it grows; the error metrics shrink when better. */
@Since("1.4.0")
override def isLargerBetter: Boolean = $(metricName) match {
  case "rmse" => false
  case "mse" => false
  case "r2" => true
  case "mae" => false
}

@Since("1.5.0")
override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

/** Companion providing DefaultParamsReadable-style loading. */
@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
}
Example 35
Source File: HashingTF.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

// NOTE(review): the enclosing HashingTF class header was stripped by the
// example extractor; the members below belong to it.

/** Sets the dimensionality of the hashed feature vector. */
def setNumFeatures(value: Int): this.type = set(numFeatures, value)

/**
 * Hashes each term of the input array column into a fixed-size term-frequency
 * vector, delegating to mllib's HashingTF, and appends it as the output column
 * with attribute-group metadata.
 */
override def transform(dataset: DataFrame): DataFrame = {
  val outputSchema = transformSchema(dataset.schema)
  val hashingTF = new feature.HashingTF($(numFeatures))
  val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
  val metadata = outputSchema($(outputCol)).metadata
  dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
}

/** Validates that the input is an array column and appends the output field. */
override def transformSchema(schema: StructType): StructType = {
  val inputType = schema($(inputCol)).dataType
  require(inputType.isInstanceOf[ArrayType],
    s"The input column must be ArrayType, but got $inputType.")
  val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
  SchemaUtils.appendColumn(schema, attrGroup.toStructField())
}

override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
Example 36
Source File: BinaryClassificationEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

// NOTE(review): the enclosing BinaryClassificationEvaluator class header was
// stripped by the example extractor; the members below belong to it.

/** Sets the column holding the true labels. */
@Since("1.2.0")
def setLabelCol(value: String): this.type = set(labelCol, value)

// Area under the ROC curve is the default metric.
setDefault(metricName -> "areaUnderROC")

/**
 * Computes areaUnderROC / areaUnderPR from (score, label) pairs; the score is
 * the second element of the raw prediction vector.
 */
@Since("1.2.0")
override def evaluate(dataset: DataFrame): Double = {
  val schema = dataset.schema
  SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT)
  SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

  // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
  val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol))
    .map { case Row(rawPrediction: Vector, label: Double) =>
      (rawPrediction(1), label)
    }
  val metrics = new BinaryClassificationMetrics(scoreAndLabels)
  val metric = $(metricName) match {
    case "areaUnderROC" => metrics.areaUnderROC()
    case "areaUnderPR" => metrics.areaUnderPR()
  }
  // Release the RDDs cached internally by BinaryClassificationMetrics.
  metrics.unpersist()
  metric
}

/** Both supported area metrics improve as they grow. */
@Since("1.5.0")
override def isLargerBetter: Boolean = $(metricName) match {
  case "areaUnderROC" => true
  case "areaUnderPR" => true
}

@Since("1.4.1")
override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}

/** Companion providing DefaultParamsReadable-style loading. */
@Since("1.6.0")
object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): BinaryClassificationEvaluator = super.load(path)
}
Example 37
Source File: NGram.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

// NOTE(review): the enclosing NGram class header was stripped by the example
// extractor; the members below belong to it.

/** Gets the n-gram length. */
def getN: Int = $(n)

// Bigrams by default.
setDefault(n -> 2)

// Slides a window of n tokens over the input, joining each window with spaces.
// withPartial(false) drops trailing windows shorter than n, so inputs with
// fewer than n tokens yield an empty sequence.
override protected def createTransformFunc: Seq[String] => Seq[String] = {
  _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq
}

override protected def validateInputType(inputType: DataType): Unit = {
  require(inputType.sameType(ArrayType(StringType)),
    s"Input type must be ArrayType(StringType) but got $inputType.")
}

// Output is an array of non-null strings (containsNull = false).
override protected def outputDataType: DataType = new ArrayType(StringType, false)
}

/** Companion providing DefaultParamsReadable-style loading. */
@Since("1.6.0")
object NGram extends DefaultParamsReadable[NGram] {

  @Since("1.6.0")
  override def load(path: String): NGram = super.load(path)
}
Example 38
Source File: DCT.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.sql.types.DataType

// NOTE(review): the enclosing DCT class header was stripped by the example
// extractor; the members below belong to it.

/** Whether the inverse transform is applied. */
def getInverse: Boolean = $(inverse)

// Forward DCT by default.
setDefault(inverse -> false)

// Applies jtransforms' 1-D DCT (forward or inverse depending on the param),
// with scaling enabled (the `true` argument), on a copy of the vector's values.
override protected def createTransformFunc: Vector => Vector = { vec =>
  val result = vec.toArray
  val jTransformer = new DoubleDCT_1D(result.length)
  if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
  Vectors.dense(result)
}

override protected def validateInputType(inputType: DataType): Unit = {
  require(inputType.isInstanceOf[VectorUDT],
    s"Input type must be VectorUDT but got $inputType.")
}

override protected def outputDataType: DataType = new VectorUDT
}

/** Companion providing DefaultParamsReadable-style loading. */
@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
}
Example 39
Source File: Binarizer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.BinaryAttribute
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, StructType}

// NOTE(review): the enclosing Binarizer class header was stripped by the
// example extractor; the members below belong to it.

/** Sets the name of the output column. */
def setOutputCol(value: String): this.type = set(outputCol, value)

/**
 * Maps the input double column to 1.0 where value > threshold, else 0.0,
 * appending the result as the output column with binary-attribute metadata.
 */
override def transform(dataset: DataFrame): DataFrame = {
  transformSchema(dataset.schema, logging = true)
  // Capture the threshold locally so the udf closure does not capture `this`.
  val td = $(threshold)
  val binarizer = udf { in: Double => if (in > td) 1.0 else 0.0 }
  val outputColName = $(outputCol)
  val metadata = BinaryAttribute.defaultAttr.withName(outputColName).toMetadata()
  dataset.select(col("*"), binarizer(col($(inputCol))).as(outputColName, metadata))
}

/** Validates the input column type and rejects output-name collisions. */
override def transformSchema(schema: StructType): StructType = {
  SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType)

  val inputFields = schema.fields
  val outputColName = $(outputCol)

  require(inputFields.forall(_.name != outputColName),
    s"Output column $outputColName already exists.")

  val attr = BinaryAttribute.defaultAttr.withName(outputColName)
  val outputFields = inputFields :+ attr.toStructField()
  StructType(outputFields)
}

override def copy(extra: ParamMap): Binarizer = defaultCopy(extra)
}

/** Companion providing DefaultParamsReadable-style loading. */
@Since("1.6.0")
object Binarizer extends DefaultParamsReadable[Binarizer] {

  @Since("1.6.0")
  override def load(path: String): Binarizer = super.load(path)
}
Example 40
Source File: Normalizer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.{DoubleParam, ParamValidators}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.types.DataType

// NOTE(review): the enclosing Normalizer class header was stripped by the
// example extractor; the members below belong to it.

/** Sets the p-norm used for row-wise normalization. */
def setP(value: Double): this.type = set(p, value)

// Delegates per-row p-norm normalization to mllib's Normalizer.
override protected def createTransformFunc: Vector => Vector = {
  val normalizer = new feature.Normalizer($(p))
  normalizer.transform
}

override protected def outputDataType: DataType = new VectorUDT()
}

/** Companion providing DefaultParamsReadable-style loading. */
@Since("1.6.0")
object Normalizer extends DefaultParamsReadable[Normalizer] {

  @Since("1.6.0")
  override def load(path: String): Normalizer = super.load(path)
}
Example 41
Source File: SQLTransformer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.param.{ParamMap, Param}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util._
import org.apache.spark.sql.{SQLContext, DataFrame, Row}
import org.apache.spark.sql.types.StructType

// NOTE(review): the enclosing SQLTransformer class header was stripped by the
// example extractor; the members below belong to it.

/** Gets the SQL statement to run against the input dataset. */
@Since("1.6.0")
def getStatement: String = $(statement)

// Placeholder that users write in the statement to refer to the input dataset.
private val tableIdentifier: String = "__THIS__"

/**
 * Registers the dataset under a random temp-table name, substitutes that name
 * for __THIS__ in the statement, and returns the result of running the SQL.
 */
@Since("1.6.0")
override def transform(dataset: DataFrame): DataFrame = {
  val tableName = Identifiable.randomUID(uid)
  dataset.registerTempTable(tableName)
  val realStatement = $(statement).replace(tableIdentifier, tableName)
  val outputDF = dataset.sqlContext.sql(realStatement)
  outputDF
}

/**
 * Derives the output schema by running the statement against a single-row
 * dummy DataFrame carrying the input schema.
 * NOTE(review): registers the dummy table under the literal name __THIS__.
 */
@Since("1.6.0")
override def transformSchema(schema: StructType): StructType = {
  val sc = SparkContext.getOrCreate()
  val sqlContext = SQLContext.getOrCreate(sc)
  val dummyRDD = sc.parallelize(Seq(Row.empty))
  val dummyDF = sqlContext.createDataFrame(dummyRDD, schema)
  dummyDF.registerTempTable(tableIdentifier)
  val outputSchema = sqlContext.sql($(statement)).schema
  outputSchema
}

@Since("1.6.0")
override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}

/** Companion providing DefaultParamsReadable-style loading. */
@Since("1.6.0")
object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {

  @Since("1.6.0")
  override def load(path: String): SQLTransformer = super.load(path)
}
Example 42
Source File: HashingTF.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

// NOTE(review): the enclosing HashingTF class header was stripped by the
// example extractor; the members below belong to it.

/** Sets the dimensionality of the hashed feature vector. */
def setNumFeatures(value: Int): this.type = set(numFeatures, value)

/**
 * Hashes each term of the input array column into a fixed-size term-frequency
 * vector, delegating to mllib's HashingTF, and appends it as the output column
 * with attribute-group metadata.
 */
override def transform(dataset: DataFrame): DataFrame = {
  val outputSchema = transformSchema(dataset.schema)
  val hashingTF = new feature.HashingTF($(numFeatures))
  val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
  val metadata = outputSchema($(outputCol)).metadata
  dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
}

/** Validates that the input is an array column and appends the output field. */
override def transformSchema(schema: StructType): StructType = {
  val inputType = schema($(inputCol)).dataType
  require(inputType.isInstanceOf[ArrayType],
    s"The input column must be ArrayType, but got $inputType.")
  val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
  SchemaUtils.appendColumn(schema, attrGroup.toStructField())
}

override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

/** Companion providing DefaultParamsReadable-style loading. */
@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
}
Example 43
Source File: ContinuousTrigger.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous

import java.util.concurrent.TimeUnit

import scala.concurrent.duration.Duration

import org.apache.commons.lang3.StringUtils

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.streaming.{ProcessingTime, Trigger}
import org.apache.spark.unsafe.types.CalendarInterval

/**
 * Trigger that runs a continuous query with the given checkpoint interval,
 * expressed in milliseconds.
 */
@InterfaceStability.Evolving
case class ContinuousTrigger(intervalMs: Long) extends Trigger {
  require(intervalMs >= 0, "the interval of trigger should not be negative")
}

private[sql] object ContinuousTrigger {

  /**
   * Parses an interval string such as "10 seconds" (with or without the
   * leading "interval" keyword) into a trigger.
   */
  def apply(interval: String): ContinuousTrigger = {
    if (StringUtils.isBlank(interval)) {
      throw new IllegalArgumentException(
        "interval cannot be null or blank.")
    }
    // CalendarInterval.fromString expects the "interval" keyword; prepend it
    // when the caller omitted it.
    val normalized =
      if (interval.startsWith("interval")) interval
      else "interval " + interval
    val cal = CalendarInterval.fromString(normalized)
    if (cal == null) {
      throw new IllegalArgumentException(s"Invalid interval: $interval")
    }
    // Month/year components have no fixed millisecond length.
    if (cal.months > 0) {
      throw new IllegalArgumentException(s"Doesn't support month or year interval: $interval")
    }
    new ContinuousTrigger(cal.microseconds / 1000)
  }

  /** Converts a duration into a trigger. */
  def apply(interval: Duration): ContinuousTrigger = ContinuousTrigger(interval.toMillis)

  /** Java-friendly variant of apply(String). */
  def create(interval: String): ContinuousTrigger = apply(interval)

  /** Java-friendly variant taking a count and time unit. */
  def create(interval: Long, unit: TimeUnit): ContinuousTrigger =
    ContinuousTrigger(unit.toMillis(interval))
}
Example 44
Source File: Aggregator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression

// NOTE(review): the enclosing Aggregator class header was stripped by the
// example extractor; toColumn below is a member of it.

/**
 * Wraps this aggregator as a TypedColumn usable in Dataset.select / agg.
 * The aggregate runs in Complete mode and is not distinct.
 */
def toColumn: TypedColumn[IN, OUT] = {
  // Bring the aggregator's encoders into implicit scope for expression building.
  implicit val bEncoder = bufferEncoder
  implicit val cEncoder = outputEncoder

  val expr = AggregateExpression(
    TypedAggregateExpression(this),
    Complete,
    isDistinct = false)

  new TypedColumn[IN, OUT](expr, encoderFor[OUT])
}
}
Example 45
Source File: HiveSessionStateBuilder.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.analysis.Analyzer
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.SparkPlanner
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.hive.client.HiveClient
import org.apache.spark.sql.internal.{BaseSessionStateBuilder, SessionResourceLoader, SessionState}

// NOTE(review): the enclosing HiveSessionStateBuilder class header was
// stripped by the example extractor; the members below belong to it.

// Planner extended with Hive-specific strategies (table scans, script transforms)
// on top of any user-registered custom strategies.
override protected def planner: SparkPlanner = {
  new SparkPlanner(session.sparkContext, conf, experimentalMethods) with HiveStrategies {
    override val sparkSession: SparkSession = session

    override def extraPlanningStrategies: Seq[Strategy] =
      super.extraPlanningStrategies ++ customPlanningStrategies ++ Seq(HiveTableScans, Scripts)
  }
}

// Ensures cloned sessions keep building Hive-aware session state.
override protected def newBuilder: NewBuilder = new HiveSessionStateBuilder(_, _)
}

/**
 * Session resource loader that also registers added jars with the Hive client,
 * so Hive-side operations can see them.
 */
class HiveSessionResourceLoader(
    session: SparkSession,
    client: HiveClient)
  extends SessionResourceLoader(session) {
  override def addJar(path: String): Unit = {
    client.addJar(path)
    super.addJar(path)
  }
}
Example 46
Source File: BinaryClassificationEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType

// NOTE(review): the enclosing BinaryClassificationEvaluator class header was
// stripped by the example extractor; the members below belong to it.

/** Sets the column holding the true labels. */
@Since("1.2.0")
def setLabelCol(value: String): this.type = set(labelCol, value)

// Area under the ROC curve is the default metric.
setDefault(metricName -> "areaUnderROC")

/**
 * Computes areaUnderROC / areaUnderPR. The raw prediction column may be either
 * a score/probability vector (second element used as the score) or a plain
 * double score; labels are cast to double.
 */
@Since("2.0.0")
override def evaluate(dataset: Dataset[_]): Double = {
  val schema = dataset.schema
  SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT))
  SchemaUtils.checkNumericType(schema, $(labelCol))

  // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
  val scoreAndLabels =
    dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
      case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label)
      case Row(rawPrediction: Double, label: Double) => (rawPrediction, label)
    }
  val metrics = new BinaryClassificationMetrics(scoreAndLabels)
  val metric = $(metricName) match {
    case "areaUnderROC" => metrics.areaUnderROC()
    case "areaUnderPR" => metrics.areaUnderPR()
  }
  // Release the RDDs cached internally by BinaryClassificationMetrics.
  metrics.unpersist()
  metric
}

/** Both supported area metrics improve as they grow. */
@Since("1.5.0")
override def isLargerBetter: Boolean = $(metricName) match {
  case "areaUnderROC" => true
  case "areaUnderPR" => true
}

@Since("1.4.1")
override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}

/** Companion providing DefaultParamsReadable-style loading. */
@Since("1.6.0")
object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): BinaryClassificationEvaluator = super.load(path)
}
Example 47
Source File: DiscretizationUtils.scala From spark-MDLP-discretization with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg._

import scala.collection.mutable

// NOTE(review): the enclosing object/trait header was stripped by the example
// extractor; entropy below is a member of it, and log2 is presumably a helper
// defined alongside it.

/**
 * Shannon entropy (base 2) of a frequency histogram.
 *
 * @param frequencies counts per bin; zero counts contribute nothing
 * @param n           total number of observations
 */
def entropy(frequencies: Seq[Long], n: Long): Double = {
  val weightedLogSum = frequencies.foldLeft(0.0) { (acc, freq) =>
    if (freq == 0) {
      acc
    } else {
      val p = freq.toDouble / n
      acc + p * log2(p)
    }
  }
  -weightedLogSum
}
}
Example 48
Source File: URAEType.scala From ScalaNetwork with GNU General Public License v2.0 | 5 votes |
package kr.ac.kaist.ir.deep.train

import kr.ac.kaist.ir.deep.fn._
import kr.ac.kaist.ir.deep.network.{AutoEncoder, Network}
import kr.ac.kaist.ir.deep.rec.BinaryTree
import org.apache.spark.annotation.Experimental

// NOTE(review): the enclosing URAEType class header was stripped by the
// example extractor; stringOf below is a member of it.

/**
 * Renders a human-readable trace of reconstructing the given binary tree with
 * an unfolding recursive auto-encoder: one "IN ... OUT ..." segment per leaf.
 * Non-autoencoder networks yield a placeholder string.
 */
def stringOf(net: Network, pair: (BinaryTree, Null)): String =
  net match {
    case net: AutoEncoder ⇒
      val string = StringBuilder.newBuilder
      val in = pair._1
      // Encode phrase of Reconstruction
      val out = in forward net.apply

      // Decode phrase of reconstruction
      var terminals = in.backward(out, net.reconstruct)
      // Walk the leaf list, appending input/output pairs for each terminal.
      while (terminals.nonEmpty) {
        val leaf = terminals.head
        terminals = terminals.tail

        string append s"IN: ${leaf.x.mkString} URAE → OUT: ${leaf.out.mkString};"
      }
      string.mkString
    case _ ⇒
      "NOT AN AUTOENCODER"
  }
}
Example 49
Source File: MultivariateStudentsT.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.probability.distributions

import breeze.numerics._

import math.Pi
import breeze.linalg._
import breeze.stats.distributions._
import org.apache.spark.annotation.Experimental

import scala.runtime.ScalaRunTime

/**
 * Multivariate Student's T distribution with location `mean`, scale matrix
 * `covariance` and degrees of freedom `mu`; mu must exceed 2 so the variance
 * exists.
 */
case class MultivariateStudentsT(
  mu: Double,
  mean: DenseVector[Double],
  covariance: DenseMatrix[Double])(implicit rand: RandBasis = Rand)
  extends AbstractContinuousDistr[DenseVector[Double]]
    with Moments[DenseVector[Double], DenseMatrix[Double]]
    with HasErrorBars[DenseVector[Double]] {

  assert(mu > 2.0, "Parameter mu in Multivariate Students T must be greater than 2.0")

  // Chi-squared source used to produce the scale mixture in draw().
  private val chisq = new ChiSquared(mu)

  // Sample: scale a standard normal vector by sqrt(mu / chi2 draw), map it
  // through the Cholesky factor and shift by the mean (in place via +=).
  def draw() = {
    val w = math.sqrt(mu / chisq.draw())
    val z: DenseVector[Double] = DenseVector.rand(mean.length, rand.gaussian(0.0, 1.0)) * w
    (root * z) += mean
  }

  // Cholesky factor of the scale matrix; used for sampling, the normalizer
  // and confidence intervals.
  private val root: DenseMatrix[Double] = cholesky(covariance)

  override def toString() = ScalaRunTime._toString(this)

  // Log density up to the normalizing constant.
  override def unnormalizedLogPdf(t: DenseVector[Double]) = {
    val centered = t - mean
    val slv = covariance \ centered

    -0.5 * (mu + mean.length) * log(1.0 + ((slv dot centered) / mu))
  }

  override lazy val logNormalizer = {
    // determinant of the cholesky decomp is the sqrt of the determinant of the cov matrix
    // this is the log det of the cholesky decomp
    val det = sum(log(diag(root)))
    ((mean.length / 2) * (log(mu) + log(Pi))) + 0.5 * det + lgamma(mu / 2.0) -
      lgamma((mu + mean.length) / 2.0)
  }

  // Valid because mu > 2 is asserted in the constructor.
  def variance = covariance * (mu / (mu - 2.0))

  def mode = mean

  //TODO: Check and correct calculation of entropy for Mult Students T
  @Experimental
  lazy val entropy = {
    sum(log(diag(root))) + (mean.length / 2.0) * log(mu * Pi) +
      lbeta(mean.length / 2.0, mu / 2.0) - lgamma(mean.length / 2.0) +
      (digamma((mu + mean.length) / 2.0) - digamma(mu / 2.0)) * (mu + mean.length) / 2.0
  }

  // Symmetric error bars at |s| standard deviations along the Cholesky factor;
  // a negative s flips the sign so the multiplier stays positive.
  override def confidenceInterval(s: Double) = {
    val signFlag = if (s < 0) -1.0 else 1.0
    val ones = DenseVector.ones[Double](mean.length)
    val multiplier = signFlag * s
    val bar: DenseVector[Double] = root * (ones * (multiplier * math.sqrt(mu / (mu - 2.0))))
    (mean - bar, mean + bar)
  }
}
Example 50
Source File: BoostingStrategy.scala From mllib_subpackage with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.configuration

import scala.beans.BeanProperty

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError, Loss}

// NOTE(review): the enclosing LambdaBoostingStrategy companion header was
// stripped by the example extractor; defaultParams below belongs to it.

/**
 * Returns default boosting parameters for the given task: depth-3 trees,
 * log loss with two classes for classification, squared error for regression.
 */
@Since("1.3.0")
def defaultParams(algo: Algo): LambdaBoostingStrategy = {
  val treeStrategy = LambdaStrategy.defaultStrategy(algo)
  treeStrategy.maxDepth = 3
  algo match {
    case Algo.Classification =>
      treeStrategy.numClasses = 2
      new LambdaBoostingStrategy(treeStrategy, LogLoss)
    case Algo.Regression =>
      new LambdaBoostingStrategy(treeStrategy, SquaredError)
    case _ =>
      throw new IllegalArgumentException(s"$algo is not supported by boosting.")
  }
}
}
Example 51
Source File: FeatureSelectionUtils.scala From spark-infotheoretic-feature-selection with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Experimental import org.apache.spark.mllib.linalg._ import scala.collection.mutable private[feature] def compress(features: Vector, filterIndices: Array[Int]): Vector = { features match { case v: SparseVector => val newSize = filterIndices.length val newValues = new mutable.ArrayBuilder.ofDouble val newIndices = new mutable.ArrayBuilder.ofInt var i = 0 var j = 0 var indicesIdx = 0 var filterIndicesIdx = 0 while (i < v.indices.length && j < filterIndices.length) { indicesIdx = v.indices(i) filterIndicesIdx = filterIndices(j) if (indicesIdx == filterIndicesIdx) { newIndices += j newValues += v.values(i) j += 1 i += 1 } else { if (indicesIdx > filterIndicesIdx) { j += 1 } else { i += 1 } } } // TODO: Sparse representation might be ineffective if (newSize ~= newValues.size) Vectors.sparse(newSize, newIndices.result(), newValues.result()) case v: DenseVector => Vectors.dense(filterIndices.map(i => v.values(i))) case other => throw new UnsupportedOperationException( s"Only sparse and dense vectors are supported but got ${other.getClass}." ) } } }
Example 52
Source File: PartialResult.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import org.apache.spark.annotation.Experimental

// NOTE(review): several members referenced below (getFinalValue(), onComplete,
// onFail) are declared in parts of the original file lost during extraction;
// this listing is incomplete.

/**
 * A result that starts out as an approximation (`initialVal`) and may later be
 * replaced by an exact final value or a failure, exactly once each.
 * All state transitions are guarded by `synchronized` on this instance, and
 * `notifyAll()` wakes threads blocked waiting for the final value.
 */
@Experimental
class PartialResult[R](initialVal: R, isFinal: Boolean) {
  // Present iff the final value has been computed (immediately so when isFinal).
  private var finalValue: Option[R] = if (isFinal) Some(initialVal) else None
  private var failure: Option[Exception] = None
  // Callbacks registered elsewhere; invoked at most once when the result settles.
  private var completionHandler: Option[R => Unit] = None
  private var failureHandler: Option[Exception => Unit] = None

  /** The (possibly approximate) initial value. */
  def initialValue: R = initialVal

  /** Whether the initial value is already exact. */
  def isInitialValueFinal: Boolean = isFinal

  /**
   * Transform this PartialResult with `f`. The returned result delegates
   * completion/failure handling to the original, applying `f` lazily.
   */
  def map[T](f: R => T) : PartialResult[T] = {
    new PartialResult[T](f(initialVal), isFinal) {
      override def getFinalValue() : T = synchronized {
        f(PartialResult.this.getFinalValue())
      }
      override def onComplete(handler: T => Unit): PartialResult[T] = synchronized {
        // Register on the parent, adapting the handler; re-map so the caller
        // keeps working with the transformed result.
        PartialResult.this.onComplete(handler.compose(f)).map(f)
      }
      override def onFail(handler: Exception => Unit) {
        synchronized {
          PartialResult.this.onFail(handler)
        }
      }
      override def toString : String = synchronized {
        PartialResult.this.getFinalValueInternal() match {
          case Some(value) => "(final: " + f(value) + ")"
          case None => "(partial: " + initialValue + ")"
        }
      }
      def getFinalValueInternal() = PartialResult.this.getFinalValueInternal().map(f)
    }
  }

  // Producer-side: publish the exact value. Throws if called twice; fires the
  // completion handler and wakes any thread blocked in getFinalValue().
  private[spark] def setFinalValue(value: R) {
    synchronized {
      if (finalValue.isDefined) {
        throw new UnsupportedOperationException("setFinalValue called twice on a PartialResult")
      }
      finalValue = Some(value)
      // Call the completion handler if it was set
      completionHandler.foreach(h => h(value))
      // Notify any threads that may be calling getFinalValue()
      this.notifyAll()
    }
  }

  private def getFinalValueInternal() = finalValue

  // Producer-side: publish a failure. Mirrors setFinalValue().
  private[spark] def setFailure(exception: Exception) {
    synchronized {
      if (failure.isDefined) {
        throw new UnsupportedOperationException("setFailure called twice on a PartialResult")
      }
      failure = Some(exception)
      // Call the failure handler if it was set
      failureHandler.foreach(h => h(exception))
      // Notify any threads that may be calling getFinalValue()
      this.notifyAll()
    }
  }

  override def toString: String = synchronized {
    finalValue match {
      case Some(value) => "(final: " + value + ")"
      case None => "(partial: " + initialValue + ")"
    }
  }
}
Example 53
Source File: Aggregator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql.{Dataset, Encoder, TypedColumn} import org.apache.spark.sql.catalyst.encoders.encoderFor import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete} import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression def toColumn: TypedColumn[IN, OUT] = { implicit val bEncoder = bufferEncoder implicit val cEncoder = outputEncoder val expr = AggregateExpression( TypedAggregateExpression(this), Complete, isDistinct = false) new TypedColumn[IN, OUT](expr, encoderFor[OUT]) } }
Example 54
Source File: Broker.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka import org.apache.spark.annotation.Experimental @Experimental object Broker { def create(host: String, port: Int): Broker = new Broker(host, port) def apply(host: String, port: Int): Broker = new Broker(host, port) def unapply(broker: Broker): Option[(String, Int)] = { if (broker == null) { None } else { Some((broker.host, broker.port)) } } }
Example 55
Source File: RegressionEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
Example 56
Source File: MulticlassClassificationEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 57
Source File: BinaryClassificationEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
Example 58
Source File: ContinuousTrigger.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import java.util.concurrent.TimeUnit import scala.concurrent.duration.Duration import org.apache.commons.lang3.StringUtils import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql.streaming.{ProcessingTime, Trigger} import org.apache.spark.unsafe.types.CalendarInterval @InterfaceStability.Evolving case class ContinuousTrigger(intervalMs: Long) extends Trigger { require(intervalMs >= 0, "the interval of trigger should not be negative") } private[sql] object ContinuousTrigger { def apply(interval: String): ContinuousTrigger = { if (StringUtils.isBlank(interval)) { throw new IllegalArgumentException( "interval cannot be null or blank.") } val cal = if (interval.startsWith("interval")) { CalendarInterval.fromString(interval) } else { CalendarInterval.fromString("interval " + interval) } if (cal == null) { throw new IllegalArgumentException(s"Invalid interval: $interval") } if (cal.months > 0) { throw new IllegalArgumentException(s"Doesn't support month or year interval: $interval") } new ContinuousTrigger(cal.microseconds / 1000) } def apply(interval: Duration): ContinuousTrigger = { ContinuousTrigger(interval.toMillis) } def create(interval: String): ContinuousTrigger = { apply(interval) } def create(interval: Long, unit: TimeUnit): ContinuousTrigger = { ContinuousTrigger(unit.toMillis(interval)) } }
Example 59
Source File: Aggregator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql.{Dataset, Encoder, TypedColumn} import org.apache.spark.sql.catalyst.encoders.encoderFor import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete} import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression def toColumn: TypedColumn[IN, OUT] = { implicit val bEncoder = bufferEncoder implicit val cEncoder = outputEncoder val expr = AggregateExpression( TypedAggregateExpression(this), Complete, isDistinct = false) new TypedColumn[IN, OUT](expr, encoderFor[OUT]) } }
Example 60
Source File: HiveSessionStateBuilder.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.Analyzer import org.apache.spark.sql.catalyst.catalog.ExternalCatalogWithListener import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.SparkPlanner import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.hive.client.HiveClient import org.apache.spark.sql.internal.{BaseSessionStateBuilder, SessionResourceLoader, SessionState} override protected def planner: SparkPlanner = { new SparkPlanner(session.sparkContext, conf, experimentalMethods) with HiveStrategies { override val sparkSession: SparkSession = session override def extraPlanningStrategies: Seq[Strategy] = super.extraPlanningStrategies ++ customPlanningStrategies ++ Seq(HiveTableScans, Scripts) } } override protected def newBuilder: NewBuilder = new HiveSessionStateBuilder(_, _) } class HiveSessionResourceLoader( session: SparkSession, clientBuilder: () => HiveClient) extends SessionResourceLoader(session) { private lazy val client = clientBuilder() override def addJar(path: String): Unit = { client.addJar(path) super.addJar(path) } }
Example 61
Source File: StaticSources.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.metrics.source import com.codahale.metrics.MetricRegistry import org.apache.spark.annotation.Experimental private[spark] object StaticSources { def reset(): Unit = { METRIC_PARTITIONS_FETCHED.dec(METRIC_PARTITIONS_FETCHED.getCount()) METRIC_FILES_DISCOVERED.dec(METRIC_FILES_DISCOVERED.getCount()) METRIC_FILE_CACHE_HITS.dec(METRIC_FILE_CACHE_HITS.getCount()) } // clients can use these to avoid classloader issues with the codahale classes def incrementFetchedPartitions(n: Int): Unit = METRIC_PARTITIONS_FETCHED.inc(n) def incrementFilesDiscovered(n: Int): Unit = METRIC_FILES_DISCOVERED.inc(n) def incrementFileCacheHits(n: Int): Unit = METRIC_FILE_CACHE_HITS.inc(n) }
Example 62
Source File: StreamingQueryException.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming import org.apache.spark.annotation.Experimental import org.apache.spark.sql.execution.streaming.{Offset, StreamExecution} val time: Long = System.currentTimeMillis override def toString(): String = { val causeStr = s"${cause.getMessage} ${cause.getStackTrace.take(10).mkString("", "\n|\t", "\n")}" s""" |$causeStr | |${query.asInstanceOf[StreamExecution].toDebugString} """.stripMargin } }
Example 63
Source File: SourceStatus.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming import java.{util => ju} import scala.collection.JavaConverters._ import org.json4s._ import org.json4s.JsonAST.JValue import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.annotation.Experimental import org.apache.spark.sql.streaming.StreamingQueryStatus.indent import org.apache.spark.util.JsonProtocol private[sql] object SourceStatus { def apply( desc: String, offsetDesc: String, inputRate: Double, processingRate: Double, triggerDetails: Map[String, String]): SourceStatus = { new SourceStatus(desc, offsetDesc, inputRate, processingRate, triggerDetails.asJava) } }
Example 64
Source File: Aggregator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions import org.apache.spark.annotation.{Experimental, InterfaceStability} import org.apache.spark.sql.{Dataset, Encoder, TypedColumn} import org.apache.spark.sql.catalyst.encoders.encoderFor import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete} import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression def toColumn: TypedColumn[IN, OUT] = { implicit val bEncoder = bufferEncoder implicit val cEncoder = outputEncoder val expr = AggregateExpression( TypedAggregateExpression(this), Complete, isDistinct = false) new TypedColumn[IN, OUT](expr, encoderFor[OUT]) } }
Example 65
Source File: Broker.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka import org.apache.spark.annotation.Experimental @Experimental object Broker { def create(host: String, port: Int): Broker = new Broker(host, port) def apply(host: String, port: Int): Broker = new Broker(host, port) def unapply(broker: Broker): Option[(String, Int)] = { if (broker == null) { None } else { Some((broker.host, broker.port)) } } }
Example 66
Source File: RegressionEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
Example 67
Source File: MulticlassClassificationEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 68
Source File: ParamGridBuilder.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tuning import scala.annotation.varargs import scala.collection.mutable import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param._ def build(): Array[ParamMap] = { var paramMaps = Array(new ParamMap) paramGrid.foreach { case (param, values) => val newParamMaps = values.flatMap { v => paramMaps.map(_.copy.put(param.asInstanceOf[Param[Any]], v)) } paramMaps = newParamMaps.toArray } paramMaps } }
Example 69
Source File: Tokenizer.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types.{ArrayType, DataType, StringType} def getPattern: String = $(pattern) setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+") override protected def createTransformFunc: String => Seq[String] = { str => val re = $(pattern).r val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq val minLength = $(minTokenLength) tokens.filter(_.length >= minLength) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType == StringType, s"Input type must be string type but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, true) override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra) }
Example 70
Source File: PartialResult.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import org.apache.spark.annotation.Experimental @Experimental class PartialResult[R](initialVal: R, isFinal: Boolean) { private var finalValue: Option[R] = if (isFinal) Some(initialVal) else None private var failure: Option[Exception] = None private var completionHandler: Option[R => Unit] = None private var failureHandler: Option[Exception => Unit] = None def initialValue: R = initialVal def isInitialValueFinal: Boolean = isFinal def map[T](f: R => T) : PartialResult[T] = { new PartialResult[T](f(initialVal), isFinal) { override def getFinalValue() : T = synchronized { f(PartialResult.this.getFinalValue()) } override def onComplete(handler: T => Unit): PartialResult[T] = synchronized { PartialResult.this.onComplete(handler.compose(f)).map(f) } override def onFail(handler: Exception => Unit) { synchronized { PartialResult.this.onFail(handler) } } override def toString : String = synchronized { PartialResult.this.getFinalValueInternal() match { case Some(value) => "(final: " + f(value) + ")" case None => "(partial: " + initialValue + ")" } } def getFinalValueInternal(): Option[T] = PartialResult.this.getFinalValueInternal().map(f) } } private[spark] def setFinalValue(value: R) { synchronized { if (finalValue.isDefined) { throw new UnsupportedOperationException("setFinalValue called twice on a PartialResult") } finalValue = Some(value) // Call the completion handler if it was set completionHandler.foreach(h => h(value)) // Notify any threads that may be calling getFinalValue() this.notifyAll() } } private def getFinalValueInternal() = finalValue private[spark] def setFailure(exception: Exception) { synchronized { if (failure.isDefined) { throw new UnsupportedOperationException("setFailure called twice on a PartialResult") } failure = Some(exception) // Call the failure handler if it was set failureHandler.foreach(h => h(exception)) // Notify any threads that may be calling getFinalValue() this.notifyAll() } } override def toString: 
String = synchronized { finalValue match { case Some(value) => "(final: " + value + ")" case None => "(partial: " + initialValue + ")" } } }
Example 71
Source File: CreateTableAsSelect.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import org.apache.spark.annotation.Experimental import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan} import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.hive.client.{HiveTable, HiveColumn} import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation, HiveMetastoreTypes} private[hive] case class CreateTableAsSelect( tableDesc: HiveTable, query: LogicalPlan, allowExisting: Boolean) extends RunnableCommand { def database: String = tableDesc.database def tableName: String = tableDesc.name override def run(sqlContext: SQLContext): Seq[Row] = { val hiveContext = sqlContext.asInstanceOf[HiveContext] lazy val metastoreRelation: MetastoreRelation = { import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.TextInputFormat val withSchema = tableDesc.copy( schema = query.output.map(c => HiveColumn(c.name, HiveMetastoreTypes.toMetastoreType(c.dataType), null)), inputFormat = tableDesc.inputFormat.orElse(Some(classOf[TextInputFormat].getName)), outputFormat = tableDesc.outputFormat .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)), serde = tableDesc.serde.orElse(Some(classOf[LazySimpleSerDe].getName()))) hiveContext.catalog.client.createTable(withSchema) // Get the Metastore Relation hiveContext.catalog.lookupRelation(Seq(database, tableName), None) match { case r: MetastoreRelation => r } } // TODO ideally, we should get the output data ready first and then // add the relation into catalog, just in case of failure occurs while data // processing. 
if (hiveContext.catalog.tableExists(Seq(database, tableName))) { if (allowExisting) { // table already exists, will do nothing, to keep consistent with Hive } else { throw new AnalysisException(s"$database.$tableName already exists.") } } else { hiveContext.executePlan(InsertIntoTable(metastoreRelation, Map(), query, true, false)).toRdd } Seq.empty[Row] } override def argString: String = { s"[Database:$database, TableName: $tableName, InsertIntoHiveTable]\n" + query.toString } }
Example 72
Source File: Broker.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka import org.apache.spark.annotation.Experimental @Experimental object Broker { def create(host: String, port: Int): Broker = new Broker(host, port) def apply(host: String, port: Int): Broker = new Broker(host, port) def unapply(broker: Broker): Option[(String, Int)] = { if (broker == null) { None } else { Some((broker.host, broker.port)) } } }
Example 73
Source File: TestResult.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.test import org.apache.spark.annotation.Experimental @Experimental class ChiSqTestResult private[stat] (override val pValue: Double, override val degreesOfFreedom: Int, override val statistic: Double, val method: String, override val nullHypothesis: String) extends TestResult[Int] { override def toString: String = { "Chi squared test summary:\n" + s"method: $method\n" + super.toString } }
Example 74
Source File: KernelDensity.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaRDD import org.apache.spark.rdd.RDD def normPdf( mean: Double, standardDeviation: Double, logStandardDeviationPlusHalfLog2Pi: Double, x: Double): Double = { val x0 = x - mean val x1 = x0 / standardDeviation val logDensity = -0.5 * x1 * x1 - logStandardDeviationPlusHalfLog2Pi math.exp(logDensity) } }
Example 75
Source File: Algo.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.configuration import org.apache.spark.annotation.Experimental @Experimental object Algo extends Enumeration { type Algo = Value val Classification, Regression = Value private[mllib] def fromString(name: String): Algo = name match { case "classification" | "Classification" => Classification case "regression" | "Regression" => Regression case _ => throw new IllegalArgumentException(s"Did not recognize Algo name: $name") } }
Example 76
Source File: BoostingStrategy.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.configuration import scala.beans.BeanProperty import org.apache.spark.annotation.Experimental import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredError, Loss} def defaultParams(algo: Algo): BoostingStrategy = { val treeStrategy = Strategy.defaultStategy(algo) treeStrategy.maxDepth = 3 algo match { case Algo.Classification => treeStrategy.numClasses = 2 new BoostingStrategy(treeStrategy, LogLoss) case Algo.Regression => new BoostingStrategy(treeStrategy, SquaredError) case _ => throw new IllegalArgumentException(s"$algo is not supported by boosting.") } } }
Example 77
Source File: Normalizer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Experimental import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.size var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.size var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
Example 78
Source File: ElementwiseProduct.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg._

  /**
   * Multiplies the input vector element-wise by `scalingVec` (Hadamard product).
   * The input and scaling vectors must have the same size; the input is never
   * mutated (values are cloned before scaling).
   */
  override def transform(vector: Vector): Vector = {
    require(vector.size == scalingVec.size,
      s"vector sizes do not match: Expected ${scalingVec.size} but found ${vector.size}")
    vector match {
      case dv: DenseVector =>
        // Dense path: scale every position by its matching factor.
        val values: Array[Double] = dv.values.clone()
        val dim = scalingVec.size
        var i = 0
        while (i < dim) {
          values(i) *= scalingVec(i)
          i += 1
        }
        Vectors.dense(values)
      case SparseVector(size, indices, vs) =>
        // Sparse path: only stored entries change; each stored value is scaled
        // by the factor at its *original* index, so the index array is reused.
        val values = vs.clone()
        val dim = values.length
        var i = 0
        while (i < dim) {
          values(i) *= scalingVec(indices(i))
          i += 1
        }
        Vectors.sparse(size, indices, values)
      case v =>
        throw new IllegalArgumentException("Does not support vector type " + v.getClass)
    }
  }
}
Example 79
Source File: BinaryClassificationEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType

  /** Sets the label column name. */
  @Since("1.2.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "areaUnderROC")

  /**
   * Computes the configured binary-classification metric over the dataset.
   *
   * The raw-prediction column may be either a vector — element 1 is used as the
   * score (presumably the positive-class score; confirm upstream contract) — or
   * a plain Double score. The label column is cast to Double.
   */
  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
    val scoreAndLabels =
      dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label)
        case Row(rawPrediction: Double, label: Double) => (rawPrediction, label)
      }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    // No default case: an unknown metric name would throw MatchError —
    // presumably ruled out by a ParamValidators check on metricName declared
    // outside this fragment (TODO confirm).
    val metric = $(metricName) match {
      case "areaUnderROC" => metrics.areaUnderROC()
      case "areaUnderPR" => metrics.areaUnderPR()
    }
    // Release the RDDs cached internally by BinaryClassificationMetrics.
    metrics.unpersist()
    metric
  }

  /** Both supported metrics are larger-is-better. */
  @Since("1.5.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "areaUnderROC" => true
    case "areaUnderPR" => true
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] {

  /** Loads a previously saved evaluator from `path`. */
  @Since("1.6.0")
  override def load(path: String): BinaryClassificationEvaluator = super.load(path)
}
Example 80
Source File: RegressionEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

  /** Sets the label column name. */
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  /**
   * Computes the selected regression metric over (prediction, label) pairs.
   *
   * NOTE(review): rmse/mse/mae are returned NEGATED (r2 is not) — presumably so
   * that larger values always mean "better" for model selection in this Spark
   * version. Confirm callers expect the sign flip before changing it.
   */
  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    // Both columns must already be DoubleType; no casting is performed here.
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
    val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
      .map { case Row(prediction: Double, label: Double) =>
        (prediction, label)
      }
    val metrics = new RegressionMetrics(predictionAndLabels)
    // Unknown metric names throw MatchError — presumably prevented by a
    // validator on metricName declared outside this fragment (TODO confirm).
    val metric = $(metricName) match {
      case "rmse" => -metrics.rootMeanSquaredError
      case "mse" => -metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => -metrics.meanAbsoluteError
    }
    metric
  }

  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}
Example 81
Source File: BinaryClassificationEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

  /** Sets the label column name. */
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "areaUnderROC")

  /**
   * Computes areaUnderROC or areaUnderPR from (score, label) pairs.
   * The score is element 1 of the raw-prediction vector — presumably the
   * positive-class score/probability (TODO confirm upstream contract).
   */
  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(rawPredictionCol), new VectorUDT)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2.
    val scoreAndLabels = dataset.select($(rawPredictionCol), $(labelCol))
      .map { case Row(rawPrediction: Vector, label: Double) =>
        (rawPrediction(1), label)
      }
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val metric = $(metricName) match {
      case "areaUnderROC" => metrics.areaUnderROC()
      case "areaUnderPR" => metrics.areaUnderPR()
      case other => throw new IllegalArgumentException(s"Does not support metric $other.")
    }
    // Release the RDDs cached internally by BinaryClassificationMetrics.
    metrics.unpersist()
    metric
  }

  override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra)
}
Example 82
Source File: Binarizer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.BinaryAttribute
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, StructType}

  /** Sets the output column name. */
  def setOutputCol(value: String): this.type = set(outputCol, value)

  /**
   * Appends a binary 0/1 column to the dataset: 1.0 where the input value is
   * strictly greater than the threshold, 0.0 otherwise.
   */
  override def transform(dataset: DataFrame): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val cutoff = $(threshold)
    val binarize = udf { value: Double => if (value > cutoff) 1.0 else 0.0 }
    val outName = $(outputCol)
    val meta = BinaryAttribute.defaultAttr.withName(outName).toMetadata()
    dataset.select(col("*"), binarize(col($(inputCol))).as(outName, meta))
  }

  /**
   * Validates that the input column is DoubleType and that the output column
   * does not already exist, then appends the binary output field.
   */
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.checkColumnType(schema, $(inputCol), DoubleType)
    val outName = $(outputCol)
    require(!schema.fields.exists(_.name == outName),
      s"Output column $outName already exists.")
    val outField = BinaryAttribute.defaultAttr.withName(outName).toStructField()
    StructType(schema.fields :+ outField)
  }

  override def copy(extra: ParamMap): Binarizer = defaultCopy(extra)
}
Example 83
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml._
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StructType

  /** Sets the output column name. */
  def setOutputCol(value: String): this.type = set(outputCol, value)

  /**
   * Adds a column with each feature vector rescaled by the fitted
   * inverse-document-frequency weights.
   */
  override def transform(dataset: DataFrame): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val applyIdf = udf { features: Vector => idfModel.transform(features) }
    dataset.withColumn($(outputCol), applyIdf(col($(inputCol))))
  }

  override def transformSchema(schema: StructType): StructType =
    validateAndTransformSchema(schema)

  /** Copies this model, carrying over the underlying mllib IDF model and params. */
  override def copy(extra: ParamMap): IDFModel =
    copyValues(new IDFModel(uid, idfModel), extra)
}
Example 84
Source File: HashingTF.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  /** Sets the number of hash buckets (output feature dimension). */
  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  /**
   * Hashes each term sequence into a fixed-size term-frequency vector using the
   * mllib HashingTF, appending the result as the output column.
   */
  override def transform(dataset: DataFrame): DataFrame = {
    val schemaOut = transformSchema(dataset.schema)
    val hasher = new feature.HashingTF($(numFeatures))
    val hashUdf = udf { terms: Seq[_] => hasher.transform(terms) }
    val meta = schemaOut($(outputCol)).metadata
    dataset.select(col("*"), hashUdf(col($(inputCol))).as($(outputCol), meta))
  }

  /**
   * Requires an ArrayType input column and appends an attribute-group field of
   * size numFeatures for the output.
   */
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
Example 85
Source File: Tokenizer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

  /** Returns the current tokenization regex pattern. */
  def getPattern: String = $(pattern)

  setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+")

  /**
   * Tokenizes a string with the configured regex, dropping tokens shorter than
   * the minimum length.
   */
  override protected def createTransformFunc: String => Seq[String] = { str =>
    val shortest = $(minTokenLength)
    val regex = $(pattern).r
    // gaps = true: the pattern marks separators; gaps = false: it marks tokens.
    val pieces =
      if ($(gaps)) regex.split(str).toSeq
      else regex.findAllIn(str).toSeq
    pieces.filter(_.length >= shortest)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType, s"Input type must be string type but got $inputType.")
  }

  /** Output is a non-nullable-element array of strings. */
  override protected def outputDataType: DataType = new ArrayType(StringType, false)

  override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra)
}
Example 86
Source File: Aggregator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression

  /**
   * Wraps this aggregator as a TypedColumn so it can be used in
   * Dataset.select / groupByKey(...).agg.
   */
  def toColumn: TypedColumn[IN, OUT] = {
    // Bring the buffer/output encoders into implicit scope; TypedAggregateExpression
    // resolves them implicitly, so these vals must precede its construction.
    implicit val bEncoder = bufferEncoder
    implicit val cEncoder = outputEncoder

    // Complete mode, non-distinct aggregation over the typed expression.
    val expr = AggregateExpression(
      TypedAggregateExpression(this),
      Complete,
      isDistinct = false)

    new TypedColumn[IN, OUT](expr, encoderFor[OUT])
  }
}
Example 87
Source File: Broker.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

import org.apache.spark.annotation.Experimental

/**
 * Factory and extractor for Broker host/port endpoints.
 */
@Experimental
object Broker {

  /** Java-friendly factory. */
  def create(host: String, port: Int): Broker = new Broker(host, port)

  /** Scala sugar: Broker(host, port). */
  def apply(host: String, port: Int): Broker = new Broker(host, port)

  /**
   * Extractor for pattern matching; a null broker yields None so matches
   * never throw NullPointerException.
   */
  def unapply(broker: Broker): Option[(String, Int)] =
    Option(broker).map(b => (b.host, b.port))
}
Example 88
Source File: RegressionEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}

  /** Sets the label column name. */
  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  /**
   * Computes the selected regression metric. Predictions may be Double or Float
   * (cast to Double); labels may be any numeric type (cast to Double).
   * Metrics are returned unmodified — direction is reported by isLargerBetter.
   */
  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SchemaUtils.checkNumericType(schema, $(labelCol))
    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    // Unknown metric names throw MatchError — presumably prevented by a
    // validator on metricName declared outside this fragment (TODO confirm).
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  /** Error metrics (rmse/mse/mae) are smaller-is-better; r2 is larger-is-better. */
  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  /** Loads a previously saved evaluator from `path`. */
  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
}
Example 89
Source File: MulticlassClassificationEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType

  /** Sets the label column name. */
  @Since("1.5.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "f1")

  /**
   * Computes the selected multiclass metric (weighted F1, weighted precision,
   * weighted recall, or accuracy). Predictions must be DoubleType; labels may
   * be any numeric type (cast to Double).
   */
  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkNumericType(schema, $(labelCol))
    val predictionAndLabels =
      dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(prediction: Double, label: Double) => (prediction, label)
      }
    val metrics = new MulticlassMetrics(predictionAndLabels)
    // Unknown metric names throw MatchError — presumably prevented by a
    // validator on metricName declared outside this fragment (TODO confirm).
    val metric = $(metricName) match {
      case "f1" => metrics.weightedFMeasure
      case "weightedPrecision" => metrics.weightedPrecision
      case "weightedRecall" => metrics.weightedRecall
      case "accuracy" => metrics.accuracy
    }
    metric
  }

  /** All four supported metrics are larger-is-better. */
  @Since("1.5.0")
  override def isLargerBetter: Boolean = true

  @Since("1.5.0")
  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object MulticlassClassificationEvaluator
  extends DefaultParamsReadable[MulticlassClassificationEvaluator] {

  /** Loads a previously saved evaluator from `path`. */
  @Since("1.6.0")
  override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
}