scala.collection.mutable.ArrayBuilder Scala Examples
The following examples show how to use scala.collection.mutable.ArrayBuilder.
Each example is taken from an open-source project; the project and source file are noted above the code.
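For context, scala.collection.mutable.ArrayBuilder accumulates elements and then produces a primitive array. A minimal usage sketch, independent of Spark, using the same make/result() calls that appear in the examples below:

import scala.collection.mutable.ArrayBuilder

val builder = ArrayBuilder.make[Int]    // builder for a primitive Array[Int]
builder += 1
builder += 2
builder ++= Array(3, 4)
val arr: Array[Int] = builder.result()  // Array(1, 2, 3, 4)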
Example 1
Source File: Binarizer.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature

import scala.collection.mutable.ArrayBuilder

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.BinaryAttribute
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

// NOTE: the Binarizer class declaration and its earlier members (threshold,
// setInputCol, etc.) are omitted from this excerpt; the closing brace after
// copy() ends the class.

  @Since("1.4.0")
  def setOutputCol(value: String): this.type = set(outputCol, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema, logging = true)
    val schema = dataset.schema
    val inputType = schema($(inputCol)).dataType
    val td = $(threshold)

    val binarizerDouble = udf { in: Double => if (in > td) 1.0 else 0.0 }
    val binarizerVector = udf { (data: Vector) =>
      val indices = ArrayBuilder.make[Int]
      val values = ArrayBuilder.make[Double]

      data.foreachActive { (index, value) =>
        if (value > td) {
          indices += index
          values += 1.0
        }
      }

      Vectors.sparse(data.size, indices.result(), values.result()).compressed
    }

    val metadata = outputSchema($(outputCol)).metadata

    inputType match {
      case DoubleType =>
        dataset.select(col("*"), binarizerDouble(col($(inputCol))).as($(outputCol), metadata))
      case _: VectorUDT =>
        dataset.select(col("*"), binarizerVector(col($(inputCol))).as($(outputCol), metadata))
    }
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    val outputColName = $(outputCol)

    val outCol: StructField = inputType match {
      case DoubleType =>
        BinaryAttribute.defaultAttr.withName(outputColName).toStructField()
      case _: VectorUDT =>
        StructField(outputColName, new VectorUDT)
      case _ =>
        throw new IllegalArgumentException(s"Data type $inputType is not supported.")
    }

    if (schema.fieldNames.contains(outputColName)) {
      throw new IllegalArgumentException(s"Output column $outputColName already exists.")
    }
    StructType(schema.fields :+ outCol)
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): Binarizer = defaultCopy(extra)
}

@Since("1.6.0")
object Binarizer extends DefaultParamsReadable[Binarizer] {

  @Since("1.6.0")
  override def load(path: String): Binarizer = super.load(path)
}
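As a rough usage sketch of the transformer above (assuming a SparkSession named spark; setInputCol and setThreshold are defined in the part of the class elided from this excerpt):

import org.apache.spark.ml.feature.Binarizer

val df = spark.createDataFrame(Seq((0, 0.1), (1, 0.8), (2, 0.3))).toDF("id", "feature")

val binarizer = new Binarizer()
  .setInputCol("feature")
  .setOutputCol("binarized_feature")
  .setThreshold(0.5)

// Doubles strictly greater than the threshold become 1.0, everything else 0.0;
// vector columns are binarized entry-wise by the ArrayBuilder-based udf above.
binarizer.transform(df).show()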
Example 2
Source File: NumericParser.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.mllib.util

import java.util.StringTokenizer

import scala.collection.mutable.{ArrayBuilder, ListBuffer}

import org.apache.spark.SparkException

// NOTE: the enclosing NumericParser object declaration is omitted from this
// excerpt; the final closing brace below ends that object.

  def parse(s: String): Any = {
    val tokenizer = new StringTokenizer(s, "()[],", true)
    if (tokenizer.hasMoreTokens()) {
      val token = tokenizer.nextToken()
      if (token == "(") {
        parseTuple(tokenizer)
      } else if (token == "[") {
        parseArray(tokenizer)
      } else {
        // expecting a number
        parseDouble(token)
      }
    } else {
      throw new SparkException(s"Cannot find any token from the input string.")
    }
  }

  private def parseArray(tokenizer: StringTokenizer): Array[Double] = {
    val values = ArrayBuilder.make[Double]
    var parsing = true
    var allowComma = false
    var token: String = null
    while (parsing && tokenizer.hasMoreTokens()) {
      token = tokenizer.nextToken()
      if (token == "]") {
        parsing = false
      } else if (token == ",") {
        if (allowComma) {
          allowComma = false
        } else {
          throw new SparkException("Found a ',' at a wrong position.")
        }
      } else {
        // expecting a number
        values += parseDouble(token)
        allowComma = true
      }
    }
    if (parsing) {
      throw new SparkException(s"An array must end with ']'.")
    }
    values.result()
  }

  private def parseTuple(tokenizer: StringTokenizer): Seq[_] = {
    val items = ListBuffer.empty[Any]
    var parsing = true
    var allowComma = false
    var token: String = null
    while (parsing && tokenizer.hasMoreTokens()) {
      token = tokenizer.nextToken()
      if (token == "(") {
        items.append(parseTuple(tokenizer))
        allowComma = true
      } else if (token == "[") {
        items.append(parseArray(tokenizer))
        allowComma = true
      } else if (token == ",") {
        if (allowComma) {
          allowComma = false
        } else {
          throw new SparkException("Found a ',' at a wrong position.")
        }
      } else if (token == ")") {
        parsing = false
      } else if (token.trim.isEmpty) {
        // ignore whitespaces between delim chars, e.g. ", ["
      } else {
        // expecting a number
        items.append(parseDouble(token))
        allowComma = true
      }
    }
    if (parsing) {
      throw new SparkException(s"A tuple must end with ')'.")
    }
    items
  }

  private def parseDouble(s: String): Double = {
    try {
      java.lang.Double.parseDouble(s)
    } catch {
      case e: NumberFormatException =>
        throw new SparkException(s"Cannot parse a double from: $s", e)
    }
  }
}
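A sketch of the grammar parse accepts. Note that in the Spark codebase NumericParser is an internal, package-private helper, so a direct call like the one below only compiles from code placed in the same package:

// A parenthesized tuple becomes a Seq[Any], a bracketed list becomes an
// Array[Double], and a bare token becomes a Double.
val parsed = NumericParser.parse("(1.0,[2.0,3.0],4.0)")
// parsed is a Seq containing 1.0, Array(2.0, 3.0) and 4.0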
Example 3
Source File: Binarizer.scala From sparkoscope with Apache License 2.0
(Identical to the Binarizer.scala code shown in Example 1; not repeated here.)
Example 4
Source File: NumericParser.scala From sparkoscope with Apache License 2.0
(Identical to the NumericParser.scala code shown in Example 2; not repeated here.)
Example 5
Source File: Binarizer.scala From multi-tenancy-spark with Apache License 2.0
(Identical to the Binarizer.scala code shown in Example 1; not repeated here.)
Example 6
Source File: NumericParser.scala From multi-tenancy-spark with Apache License 2.0
(Identical to the NumericParser.scala code shown in Example 2; not repeated here.)
Example 7
Source File: NumericParser.scala From iolap with Apache License 2.0
(Identical to the NumericParser.scala code shown in Example 2; not repeated here.)
Example 8
Source File: ChiSqSelector.scala From spark1.52 with Apache License 2.0
package org.apache.spark.mllib.feature

import scala.collection.mutable.ArrayBuilder

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

// NOTE: the ChiSqSelector class declaration and the rest of the file (where the
// imported ArrayBuilder and linalg types are actually used) are omitted from
// this excerpt; the closing brace below ends the class.

  @Since("1.3.0")
  def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
    val indices = Statistics.chiSqTest(data)
      .zipWithIndex.sortBy { case (res, _) => -res.statistic }
      .take(numTopFeatures)
      .map { case (_, indices) => indices }
      .sorted
    new ChiSqSelectorModel(indices)
  }
}
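A usage sketch for the selector (assuming a SparkContext named sc; the numTopFeatures constructor parameter is defined in the elided class declaration):

import org.apache.spark.mllib.feature.ChiSqSelector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

val data = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(0.0, 1.0, 3.0)),
  LabeledPoint(1.0, Vectors.dense(2.0, 0.0, 1.0)),
  LabeledPoint(1.0, Vectors.dense(2.0, 1.0, 1.0))
))

// fit() chi-square-tests each feature against the label and keeps the indices
// of the most predictive features (returned sorted in ascending order).
val model = new ChiSqSelector(2).fit(data)
val reduced = data.map(lp => LabeledPoint(lp.label, model.transform(lp.features)))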
Example 9
Source File: NumericParser.scala From spark1.52 with Apache License 2.0
(Identical to the NumericParser.scala code shown in Example 2; not repeated here.)
Example 10
Source File: Binarizer.scala From Spark-2.3.1 with Apache License 2.0
(Identical to the Binarizer.scala code shown in Example 1; not repeated here.)
Example 11
Source File: NumericParser.scala From Spark-2.3.1 with Apache License 2.0
(Identical to the NumericParser.scala code shown in Example 2; not repeated here.)
Example 12
Source File: NumericParser.scala From BigDatalog with Apache License 2.0
(Identical to the NumericParser.scala code shown in Example 2; not repeated here.)
Example 13
Source File: SimpleVectorAssembler.scala From albedo with MIT License
package org.apache.spark.ml.feature

import org.apache.spark.SparkException
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row}

import scala.collection.mutable.ArrayBuilder

// NOTE: the SimpleVectorAssembler class declaration and its inputCols setter are
// omitted from this excerpt; the closing brace after copy() ends the class.

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)

    val schema = dataset.schema
    val assembleFunc = udf { r: Row =>
      SimpleVectorAssembler.assemble(r.toSeq: _*)
    }
    val args = $(inputCols).map { c =>
      schema(c).dataType match {
        case DoubleType => dataset(c)
        case _: VectorUDT => dataset(c)
        case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid")
      }
    }

    dataset.select(col("*"), assembleFunc(struct(args: _*)).as($(outputCol)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputColNames = $(inputCols)
    val outputColName = $(outputCol)
    val inputDataTypes = inputColNames.map(name => schema(name).dataType)
    inputDataTypes.foreach {
      case _: NumericType | BooleanType =>
      case t if t.isInstanceOf[VectorUDT] =>
      case other =>
        throw new IllegalArgumentException(s"Data type $other is not supported.")
    }
    if (schema.fieldNames.contains(outputColName)) {
      throw new IllegalArgumentException(s"Output column $outputColName already exists.")
    }
    StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true))
  }

  override def copy(extra: ParamMap): SimpleVectorAssembler = defaultCopy(extra)
}

object SimpleVectorAssembler extends DefaultParamsReadable[SimpleVectorAssembler] {

  override def load(path: String): SimpleVectorAssembler = super.load(path)

  def assemble(vv: Any*): Vector = {
    val indices = ArrayBuilder.make[Int]
    val values = ArrayBuilder.make[Double]
    var cur = 0
    vv.foreach {
      case v: Double =>
        if (v != 0.0) {
          indices += cur
          values += v
        }
        cur += 1
      case vec: Vector =>
        vec.foreachActive { case (i, v) =>
          if (v != 0.0) {
            indices += cur + i
            values += v
          }
        }
        cur += vec.size
      case null =>
        // TODO: output Double.NaN?
        throw new SparkException("Values to assemble cannot be null.")
      case o =>
        throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.")
    }
    Vectors.sparse(cur, indices.result(), values.result()).compressed
  }
}
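Since assemble is shown in full above, a small sketch of what it produces:

import org.apache.spark.ml.linalg.Vectors

// Scalars and vectors are concatenated positionally; zeros are skipped while
// building, and .compressed chooses the cheaper of dense or sparse storage.
val v = SimpleVectorAssembler.assemble(1.0, Vectors.dense(0.0, 2.0), 3.0)
// v represents [1.0, 0.0, 2.0, 3.0]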