scala.beans.BeanInfo Scala Examples
The following examples show how to use scala.beans.BeanInfo.
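Before the examples, a brief note on what the annotation does: @BeanInfo tells the Scala compiler to emit a companion java.beans.BeanInfo class so that JavaBean introspection sees the annotated class's vals and vars as properties, which is why these Spark projects attach it to the small case classes they pass into DataFrame creation. The annotation was deprecated in later Scala 2.12 releases and dropped in Scala 2.13, so the snippets below only compile against older Scala versions. A minimal sketch, not taken from any of the projects below (the Point class and field names are illustrative):

import scala.beans.BeanInfo

// The compiler also emits a PointBeanInfo class next to Point, which
// java.beans.Introspector picks up by naming convention.
@BeanInfo
case class Point(label: Double, text: String)

object BeanInfoDemo {
  def main(args: Array[String]): Unit = {
    val info = java.beans.Introspector.getBeanInfo(classOf[Point])
    // Prints the discovered bean properties, e.g. "label" and "text".
    info.getPropertyDescriptors.foreach(pd => println(pd.getName))
  }
}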
Example 1
Source File: LabeledPoint.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

object LabeledPoint {
  // Note: the @BeanInfo case class LabeledPoint(label: Double, features: Vector) that this
  // companion object accompanies is collapsed in this listing; only the parser is shown.

  @Since("1.1.0")
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other => throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }
}
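A quick usage sketch of the parse method shown above (the input strings are made-up values; LabeledPoint is the mllib case class that the listing collapses):

import org.apache.spark.mllib.regression.LabeledPoint

object ParseDemo {
  def main(args: Array[String]): Unit = {
    // "(label,[v1,v2,...])" form, as produced by LabeledPoint.toString since v1.0
    val p1 = LabeledPoint.parse("(1.0,[0.5,0.0,2.5])")
    // "label, v1 v2 v3" dense form used before v1.0
    val p2 = LabeledPoint.parse("1.0, 0.5 0.0 2.5")
    println(p1)
    println(s"${p2.label} / ${p2.features}")
  }
}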
Example 2
Source File: LabeledPoint.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

object LabeledPoint {
  // Note: the @BeanInfo case class LabeledPoint(label: Double, features: Vector) that this
  // companion object accompanies is collapsed in this listing; only the parser is shown.

  @Since("1.1.0")
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other => throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }
}
Example 3
Source File: DCTSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class DCTTestData(vec: Vector, wantedVec: Vector)

class DCTSuite extends SparkFunSuite with MLlibTestSparkContext {

  // Forward discrete cosine transform should match the jTransforms result
  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  }

  // Inverse discrete cosine transform should match the jTransforms result
  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)
  }

  private def testDCT(data: Vector, inverse: Boolean): Unit = {
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      (new DoubleDCT_1D(data.size)).inverse(expectedResultBuffer, true)
    } else {
      (new DoubleDCT_1D(data.size)).forward(expectedResultBuffer, true)
    }
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = sqlContext.createDataFrame(Seq(
      DCTTestData(data, expectedResult)
    ))

    val transformer = new DCT()
      .setInputCol("vec")
      .setOutputCol("resultVec")
      .setInverse(inverse)

    // transform() converts one DataFrame into another DataFrame
    transformer.transform(dataset)
      .select("resultVec", "wantedVec")
      .collect()
      .foreach { case Row(resultVec: Vector, wantedVec: Vector) =>
        assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
      }
  }
}
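Outside of a test harness, the DCT transformer exercised above can be applied directly to a DataFrame of vectors. A minimal standalone sketch, written against the Spark 2.x org.apache.spark.ml.linalg API used by the later examples (column names and data are illustrative):

import org.apache.spark.ml.feature.DCT
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object DCTDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("DCTDemo").getOrCreate()

    val df = spark.createDataFrame(Seq(
      Tuple1(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
      Tuple1(Vectors.dense(-1.0, 2.0, 4.0, -7.0))
    )).toDF("features")

    new DCT()
      .setInputCol("features")
      .setOutputCol("featuresDCT")
      .setInverse(false)      // set to true for the inverse transform
      .transform(df)
      .select("featuresDCT")
      .show(truncate = false)

    spark.stop()
  }
}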
Example 4
Source File: TokenizerSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext {
  import org.apache.spark.ml.feature.RegexTokenizerSuite._

  test("RegexTokenizer") {
    // The opening of this test (the tokenizer0 / dataset0 / dataset1 setup) is collapsed in
    // this listing; the other TokenizerSuite examples on this page show the full test.
    dataset1.show()
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization.")),
      TokenizerTestData("Te,st. punct", Array("Te,st.", "punct"))
    ))
    testRegexTokenizer(tokenizer2, dataset2)
  }
}

object RegexTokenizerSuite extends SparkFunSuite {

  def testRegexTokenizer(t: RegexTokenizer, dataset: DataFrame): Unit = {
    // transform() converts one DataFrame into another DataFrame
    t.transform(dataset)
      .select("tokens", "wantedTokens")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
      }
  }
}
Example 5
Source File: LabeledPoint.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.SparkException
import org.apache.spark.annotation.Since
import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.NumericParser

object LabeledPoint {
  // Note: the @BeanInfo case class LabeledPoint(label: Double, features: Vector) that this
  // companion object accompanies is collapsed in this listing.

  @Since("1.1.0")
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other => throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }

  private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = {
    LabeledPoint(point.label, Vectors.fromML(point.features))
  }
}
Example 6
Source File: DCTSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.sql.Row

@BeanInfo
case class DCTTestData(vec: Vector, wantedVec: Vector)

class DCTSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  }

  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)
  }

  test("read/write") {
    val t = new DCT()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setInverse(true)
    testDefaultReadWrite(t)
  }

  private def testDCT(data: Vector, inverse: Boolean): Unit = {
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true)
    } else {
      new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true)
    }
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = Seq(DCTTestData(data, expectedResult)).toDF()

    val transformer = new DCT()
      .setInputCol("vec")
      .setOutputCol("resultVec")
      .setInverse(inverse)

    testTransformer[(Vector, Vector)](dataset, transformer, "resultVec", "wantedVec") {
      case Row(resultVec: Vector, wantedVec: Vector) =>
        assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
    }
  }
}
Example 7
Source File: TokenizerSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class TokenizerSuite extends MLTest with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new Tokenizer)
  }

  test("read/write") {
    val t = new Tokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    testDefaultReadWrite(t)
  }
}

class RegexTokenizerSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  def testRegexTokenizer(t: RegexTokenizer, dataframe: DataFrame): Unit = {
    testTransformer[(String, Seq[String])](dataframe, t, "tokens", "wantedTokens") {
      case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
    }
  }

  test("params") {
    ParamsSuite.checkParams(new RegexTokenizer)
  }

  test("RegexTokenizer") {
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset0 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer0, dataset0)

    val dataset1 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")),
      TokenizerTestData("Te,st. punct", Array("punct"))
    ).toDF()
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")),
      TokenizerTestData("Te,st. punct", Array("te,st.", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer2, dataset2)
  }

  test("RegexTokenizer with toLowercase false") {
    val tokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setToLowercase(false)
    val dataset = Seq(
      TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")),
      TokenizerTestData("java scala", Array("java", "scala"))
    ).toDF()
    testRegexTokenizer(tokenizer, dataset)
  }

  test("read/write") {
    val t = new RegexTokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMinTokenLength(2)
      .setGaps(false)
      .setPattern("hi")
      .setToLowercase(false)
    testDefaultReadWrite(t)
  }
}
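As a companion to the suites, a standalone sketch of the two tokenizers being tested, configured with the same pattern the suites use (assumes a local SparkSession; the data values are illustrative):

import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
import org.apache.spark.sql.SparkSession

object TokenizerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("TokenizerDemo").getOrCreate()
    import spark.implicits._

    val df = Seq((0, "Test for tokenization."), (1, "Te,st. punct")).toDF("id", "rawText")

    // Tokenizer lowercases and splits on whitespace.
    new Tokenizer()
      .setInputCol("rawText")
      .setOutputCol("words")
      .transform(df)
      .select("words")
      .show(truncate = false)

    // RegexTokenizer with gaps=false treats the pattern as the tokens to keep.
    new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .transform(df)
      .select("tokens")
      .show(truncate = false)

    spark.stop()
  }
}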
Example 8
Source File: NGramSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String])

class NGramSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  test("default behavior yields bigram features") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
    val dataset = Seq(NGramTestData(
      Array("Test", "for", "ngram", "."),
      Array("Test for", "for ngram", "ngram .")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("NGramLength=4 yields length 4 n-grams") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array("a b c d", "b c d e")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("empty input yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(Array(), Array())).toDF()
    testNGram(nGram, dataset)
  }

  test("input array < n yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(6)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array()
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("read/write") {
    val t = new NGram()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setN(3)
    testDefaultReadWrite(t)
  }

  def testNGram(t: NGram, dataFrame: DataFrame): Unit = {
    testTransformer[(Seq[String], Seq[String])](dataFrame, t, "nGrams", "wantedNGrams") {
      case Row(actualNGrams: Seq[_], wantedNGrams: Seq[_]) =>
        assert(actualNGrams === wantedNGrams)
    }
  }
}
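The NGram transformer behaves the same outside the suite; a minimal sketch (local SparkSession assumed, token arrays are illustrative):

import org.apache.spark.ml.feature.NGram
import org.apache.spark.sql.SparkSession

object NGramDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("NGramDemo").getOrCreate()
    import spark.implicits._

    val df = Seq(
      (0, Seq("Test", "for", "ngram", ".")),
      (1, Seq("a"))
    ).toDF("id", "inputTokens")

    // With n = 2 (the default) each row yields its bigrams; rows shorter than n yield an empty array.
    new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(2)
      .transform(df)
      .select("nGrams")
      .show(truncate = false)

    spark.stop()
  }
}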
Example 9
Source File: SimpleTextClassificationPipeline.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

import scala.beans.BeanInfo

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}

@BeanInfo
case class LabeledDocument(id: Long, text: String, label: Double)

@BeanInfo
case class Document(id: Long, text: String)

object SimpleTextClassificationPipeline {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Prepare training documents, which are labeled.
    val training = sc.parallelize(Seq(
      LabeledDocument(0L, "a b c d e spark", 1.0),
      LabeledDocument(1L, "b d", 0.0),
      LabeledDocument(2L, "spark f g h", 1.0),
      LabeledDocument(3L, "hadoop mapreduce", 0.0)))

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.001)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training.toDF())

    // Prepare test documents, which are unlabeled.
    val test = sc.parallelize(Seq(
      Document(4L, "spark i j k"),
      Document(5L, "l m n"),
      Document(6L, "spark hadoop spark"),
      Document(7L, "apache hadoop")))

    // Make predictions on test documents.
    model.transform(test.toDF())
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }

    sc.stop()
  }
}
// scalastyle:on println
Example 10
Source File: TokenizerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class TokenizerSuite extends SparkFunSuite {

  test("params") {
    ParamsSuite.checkParams(new Tokenizer)
  }
}

class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext {
  import org.apache.spark.ml.feature.RegexTokenizerSuite._

  test("params") {
    ParamsSuite.checkParams(new RegexTokenizer)
  }

  test("RegexTokenizer") {
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset0 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("Te", ",", "st", ".", "punct"))
    ))
    testRegexTokenizer(tokenizer0, dataset0)

    val dataset1 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization")),
      TokenizerTestData("Te,st. punct", Array("punct"))
    ))
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("Test", "for", "tokenization.")),
      TokenizerTestData("Te,st. punct", Array("Te,st.", "punct"))
    ))
    testRegexTokenizer(tokenizer2, dataset2)
  }
}

object RegexTokenizerSuite extends SparkFunSuite {

  def testRegexTokenizer(t: RegexTokenizer, dataset: DataFrame): Unit = {
    t.transform(dataset)
      .select("tokens", "wantedTokens")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
      }
  }
}
Example 11
Source File: DCTSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class DCTTestData(vec: Vector, wantedVec: Vector)

class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  }

  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)
  }

  test("read/write") {
    val t = new DCT()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setInverse(true)
    testDefaultReadWrite(t)
  }

  private def testDCT(data: Vector, inverse: Boolean): Unit = {
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      (new DoubleDCT_1D(data.size)).inverse(expectedResultBuffer, true)
    } else {
      (new DoubleDCT_1D(data.size)).forward(expectedResultBuffer, true)
    }
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = sqlContext.createDataFrame(Seq(
      DCTTestData(data, expectedResult)
    ))

    val transformer = new DCT()
      .setInputCol("vec")
      .setOutputCol("resultVec")
      .setInverse(inverse)

    transformer.transform(dataset)
      .select("resultVec", "wantedVec")
      .collect()
      .foreach { case Row(resultVec: Vector, wantedVec: Vector) =>
        assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
      }
  }
}
Example 12
Source File: TokenizerSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class TokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new Tokenizer)
  }

  test("read/write") {
    val t = new Tokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    testDefaultReadWrite(t)
  }
}

class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
  import org.apache.spark.ml.feature.RegexTokenizerSuite._

  test("params") {
    ParamsSuite.checkParams(new RegexTokenizer)
  }

  test("RegexTokenizer") {
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset0 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct"))
    ))
    testRegexTokenizer(tokenizer0, dataset0)

    val dataset1 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")),
      TokenizerTestData("Te,st. punct", Array("punct"))
    ))
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = sqlContext.createDataFrame(Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")),
      TokenizerTestData("Te,st. punct", Array("te,st.", "punct"))
    ))
    testRegexTokenizer(tokenizer2, dataset2)
  }

  test("RegexTokenizer with toLowercase false") {
    val tokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setToLowercase(false)
    val dataset = sqlContext.createDataFrame(Seq(
      TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")),
      TokenizerTestData("java scala", Array("java", "scala"))
    ))
    testRegexTokenizer(tokenizer, dataset)
  }

  test("read/write") {
    val t = new RegexTokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMinTokenLength(2)
      .setGaps(false)
      .setPattern("hi")
      .setToLowercase(false)
    testDefaultReadWrite(t)
  }
}

object RegexTokenizerSuite extends SparkFunSuite {

  def testRegexTokenizer(t: RegexTokenizer, dataset: DataFrame): Unit = {
    t.transform(dataset)
      .select("tokens", "wantedTokens")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
      }
  }
}
Example 13
Source File: NGramSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String])

class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
  import org.apache.spark.ml.feature.NGramSuite._

  test("default behavior yields bigram features") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
    val dataset = sqlContext.createDataFrame(Seq(
      NGramTestData(
        Array("Test", "for", "ngram", "."),
        Array("Test for", "for ngram", "ngram .")
      )))
    testNGram(nGram, dataset)
  }

  test("NGramLength=4 yields length 4 n-grams") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = sqlContext.createDataFrame(Seq(
      NGramTestData(
        Array("a", "b", "c", "d", "e"),
        Array("a b c d", "b c d e")
      )))
    testNGram(nGram, dataset)
  }

  test("empty input yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = sqlContext.createDataFrame(Seq(
      NGramTestData(
        Array(),
        Array()
      )))
    testNGram(nGram, dataset)
  }

  test("input array < n yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(6)
    val dataset = sqlContext.createDataFrame(Seq(
      NGramTestData(
        Array("a", "b", "c", "d", "e"),
        Array()
      )))
    testNGram(nGram, dataset)
  }

  test("read/write") {
    val t = new NGram()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setN(3)
    testDefaultReadWrite(t)
  }
}

object NGramSuite extends SparkFunSuite {

  def testNGram(t: NGram, dataset: DataFrame): Unit = {
    t.transform(dataset)
      .select("nGrams", "wantedNGrams")
      .collect()
      .foreach { case Row(actualNGrams, wantedNGrams) =>
        assert(actualNGrams === wantedNGrams)
      }
  }
}
Example 14
Source File: LabeledPoint.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature

import org.apache.spark.SparkException
import org.apache.spark.linalg.{NumericParser, Vector, Vectors}

import scala.beans.BeanInfo

/**
 * Class that represents the features and label of a data point.
 *
 * @param label    Label for this data point.
 * @param features List of features for this data point.
 */
@BeanInfo
case class LabeledPoint(label: Double, features: Vector) extends Serializable {
  override def toString: String = {
    s"($label,$features)"
  }
}

object LabeledPoint {

  /**
   * Parses a string resulted from `LabeledPoint#toString` into
   * an [[LabeledPoint]].
   */
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other => throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }
}
Example 15
Source File: DCTSuite.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.linalg
import org.apache.spark.linalg.Vectors
import com.tencent.angel.sona.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.sql.Row

@BeanInfo
case class DCTTestData(vec: linalg.Vector, wantedVec: linalg.Vector)

class DCTSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  }

  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)
  }

  test("read/write") {
    val t = new DCT()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setInverse(true)
    testDefaultReadWrite(t)
  }

  private def testDCT(data: linalg.Vector, inverse: Boolean): Unit = {
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      new DoubleDCT_1D(data.size.toInt).inverse(expectedResultBuffer, true)
    } else {
      new DoubleDCT_1D(data.size.toInt).forward(expectedResultBuffer, true)
    }
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = Seq(DCTTestData(data, expectedResult)).toDF()

    val transformer = new DCT()
      .setInputCol("vec")
      .setOutputCol("resultVec")
      .setInverse(inverse)

    testTransformer[(linalg.Vector, linalg.Vector)](dataset, transformer, "resultVec", "wantedVec") {
      case Row(resultVec: linalg.Vector, wantedVec: linalg.Vector) =>
        assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
    }
  }
}
Example 16
Source File: TokenizerSuite.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature

import com.tencent.angel.sona.ml.util.{DefaultReadWriteTest, MLTest}

import scala.beans.BeanInfo

import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class TokenizerSuite extends MLTest with DefaultReadWriteTest {

  test("read/write") {
    val t = new Tokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    testDefaultReadWrite(t)
  }
}

class RegexTokenizerSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  def testRegexTokenizer(t: RegexTokenizer, dataframe: DataFrame): Unit = {
    testTransformer[(String, Seq[String])](dataframe, t, "tokens", "wantedTokens") {
      case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
    }
  }

  test("RegexTokenizer") {
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset0 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer0, dataset0)

    val dataset1 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")),
      TokenizerTestData("Te,st. punct", Array("punct"))
    ).toDF()
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")),
      TokenizerTestData("Te,st. punct", Array("te,st.", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer2, dataset2)
  }

  test("RegexTokenizer with toLowercase false") {
    val tokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setToLowercase(false)
    val dataset = Seq(
      TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")),
      TokenizerTestData("java scala", Array("java", "scala"))
    ).toDF()
    testRegexTokenizer(tokenizer, dataset)
  }

  test("read/write") {
    val t = new RegexTokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMinTokenLength(2)
      .setGaps(false)
      .setPattern("hi")
      .setToLowercase(false)
    testDefaultReadWrite(t)
  }
}
Example 17
Source File: NGramSuite.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature

import com.tencent.angel.sona.ml.util.{DefaultReadWriteTest, MLTest}

import scala.beans.BeanInfo

import org.apache.spark.sql.{DataFrame, Row}

@BeanInfo
case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String])

class NGramSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  test("default behavior yields bigram features") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
    val dataset = Seq(NGramTestData(
      Array("Test", "for", "ngram", "."),
      Array("Test for", "for ngram", "ngram .")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("NGramLength=4 yields length 4 n-grams") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array("a b c d", "b c d e")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("empty input yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(Array(), Array())).toDF()
    testNGram(nGram, dataset)
  }

  test("input array < n yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(6)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array()
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("read/write") {
    val t = new NGram()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setN(3)
    testDefaultReadWrite(t)
  }

  def testNGram(t: NGram, dataFrame: DataFrame): Unit = {
    testTransformer[(Seq[String], Seq[String])](dataFrame, t, "nGrams", "wantedNGrams") {
      case Row(actualNGrams: Seq[_], wantedNGrams: Seq[_]) =>
        assert(actualNGrams === wantedNGrams)
    }
  }
}
Example 18
Source File: LabeledPoint.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.annotation.Since
import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

object LabeledPoint {
  // Note: the @BeanInfo case class LabeledPoint(label: Double, features: Vector) that this
  // companion object accompanies is collapsed in this listing.

  @Since("1.1.0")
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other => throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }

  private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = {
    LabeledPoint(point.label, Vectors.fromML(point.features))
  }
}
Example 19
Source File: LabeledPoint.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

object LabeledPoint {
  // Note: the @BeanInfo case class LabeledPoint(label: Double, features: Vector) that this
  // companion object accompanies is collapsed in this listing; only the parser is shown.

  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other => throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }
}
Example 20
Source File: SimpleTextClassificationPipeline.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.ml

import scala.beans.BeanInfo

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}

@BeanInfo
case class LabeledDocument(id: Long, text: String, label: Double)

@BeanInfo
case class Document(id: Long, text: String)

object SimpleTextClassificationPipeline {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SimpleTextClassificationPipeline")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Prepare training documents, which are labeled.
    val training = sc.parallelize(Seq(
      LabeledDocument(0L, "a b c d e spark", 1.0),
      LabeledDocument(1L, "b d", 0.0),
      LabeledDocument(2L, "spark f g h", 1.0),
      LabeledDocument(3L, "hadoop mapreduce", 0.0)))

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.001)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to training documents.
    val model = pipeline.fit(training.toDF())

    // Prepare test documents, which are unlabeled.
    val test = sc.parallelize(Seq(
      Document(4L, "spark i j k"),
      Document(5L, "l m n"),
      Document(6L, "spark hadoop spark"),
      Document(7L, "apache hadoop")))

    // Make predictions on test documents.
    model.transform(test.toDF())
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }

    sc.stop()
  }
}
Example 21
Source File: NGramSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

@BeanInfo
case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String])

class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import org.apache.spark.ml.feature.NGramSuite._
  import testImplicits._

  test("default behavior yields bigram features") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
    val dataset = Seq(NGramTestData(
      Array("Test", "for", "ngram", "."),
      Array("Test for", "for ngram", "ngram .")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("NGramLength=4 yields length 4 n-grams") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array("a b c d", "b c d e")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("empty input yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(Array(), Array())).toDF()
    testNGram(nGram, dataset)
  }

  test("input array < n yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(6)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array()
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("read/write") {
    val t = new NGram()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setN(3)
    testDefaultReadWrite(t)
  }
}

object NGramSuite extends SparkFunSuite {

  def testNGram(t: NGram, dataset: Dataset[_]): Unit = {
    t.transform(dataset)
      .select("nGrams", "wantedNGrams")
      .collect()
      .foreach { case Row(actualNGrams, wantedNGrams) =>
        assert(actualNGrams === wantedNGrams)
      }
  }
}
Example 22
Source File: TokenizerSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class TokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new Tokenizer)
  }

  test("read/write") {
    val t = new Tokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    testDefaultReadWrite(t)
  }
}

class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import org.apache.spark.ml.feature.RegexTokenizerSuite._
  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new RegexTokenizer)
  }

  test("RegexTokenizer") {
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset0 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer0, dataset0)

    val dataset1 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")),
      TokenizerTestData("Te,st. punct", Array("punct"))
    ).toDF()
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")),
      TokenizerTestData("Te,st. punct", Array("te,st.", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer2, dataset2)
  }

  test("RegexTokenizer with toLowercase false") {
    val tokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setToLowercase(false)
    val dataset = Seq(
      TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")),
      TokenizerTestData("java scala", Array("java", "scala"))
    ).toDF()
    testRegexTokenizer(tokenizer, dataset)
  }

  test("read/write") {
    val t = new RegexTokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMinTokenLength(2)
      .setGaps(false)
      .setPattern("hi")
      .setToLowercase(false)
    testDefaultReadWrite(t)
  }
}

object RegexTokenizerSuite extends SparkFunSuite {

  def testRegexTokenizer(t: RegexTokenizer, dataset: Dataset[_]): Unit = {
    t.transform(dataset)
      .select("tokens", "wantedTokens")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
      }
  }
}
Example 23
Source File: DCTSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row

@BeanInfo
case class DCTTestData(vec: Vector, wantedVec: Vector)

class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  }

  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)
  }

  test("read/write") {
    val t = new DCT()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setInverse(true)
    testDefaultReadWrite(t)
  }

  private def testDCT(data: Vector, inverse: Boolean): Unit = {
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true)
    } else {
      new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true)
    }
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = Seq(DCTTestData(data, expectedResult)).toDF()

    val transformer = new DCT()
      .setInputCol("vec")
      .setOutputCol("resultVec")
      .setInverse(inverse)

    transformer.transform(dataset)
      .select("resultVec", "wantedVec")
      .collect()
      .foreach { case Row(resultVec: Vector, wantedVec: Vector) =>
        assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
      }
  }
}
Example 24
Source File: LabeledPoint.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.annotation.Since
import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

object LabeledPoint {
  // Note: the @BeanInfo case class LabeledPoint(label: Double, features: Vector) that this
  // companion object accompanies is collapsed in this listing.

  @Since("1.1.0")
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other => throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }

  private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = {
    LabeledPoint(point.label, Vectors.fromML(point.features))
  }
}
Example 25
Source File: NGramSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

@BeanInfo
case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String])

class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import org.apache.spark.ml.feature.NGramSuite._
  import testImplicits._

  test("default behavior yields bigram features") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
    val dataset = Seq(NGramTestData(
      Array("Test", "for", "ngram", "."),
      Array("Test for", "for ngram", "ngram .")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("NGramLength=4 yields length 4 n-grams") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array("a b c d", "b c d e")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("empty input yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(Array(), Array())).toDF()
    testNGram(nGram, dataset)
  }

  test("input array < n yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(6)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array()
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("read/write") {
    val t = new NGram()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setN(3)
    testDefaultReadWrite(t)
  }
}

object NGramSuite extends SparkFunSuite {

  def testNGram(t: NGram, dataset: Dataset[_]): Unit = {
    t.transform(dataset)
      .select("nGrams", "wantedNGrams")
      .collect()
      .foreach { case Row(actualNGrams, wantedNGrams) =>
        assert(actualNGrams === wantedNGrams)
      }
  }
}
Example 26
Source File: TokenizerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class TokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new Tokenizer)
  }

  test("read/write") {
    val t = new Tokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    testDefaultReadWrite(t)
  }
}

class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import org.apache.spark.ml.feature.RegexTokenizerSuite._
  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new RegexTokenizer)
  }

  test("RegexTokenizer") {
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset0 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer0, dataset0)

    val dataset1 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")),
      TokenizerTestData("Te,st. punct", Array("punct"))
    ).toDF()
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")),
      TokenizerTestData("Te,st. punct", Array("te,st.", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer2, dataset2)
  }

  test("RegexTokenizer with toLowercase false") {
    val tokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setToLowercase(false)
    val dataset = Seq(
      TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")),
      TokenizerTestData("java scala", Array("java", "scala"))
    ).toDF()
    testRegexTokenizer(tokenizer, dataset)
  }

  test("read/write") {
    val t = new RegexTokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMinTokenLength(2)
      .setGaps(false)
      .setPattern("hi")
      .setToLowercase(false)
    testDefaultReadWrite(t)
  }
}

object RegexTokenizerSuite extends SparkFunSuite {

  def testRegexTokenizer(t: RegexTokenizer, dataset: Dataset[_]): Unit = {
    t.transform(dataset)
      .select("tokens", "wantedTokens")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
      }
  }
}
Example 27
Source File: DCTSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row

@BeanInfo
case class DCTTestData(vec: Vector, wantedVec: Vector)

class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  }

  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)
  }

  test("read/write") {
    val t = new DCT()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setInverse(true)
    testDefaultReadWrite(t)
  }

  private def testDCT(data: Vector, inverse: Boolean): Unit = {
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true)
    } else {
      new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true)
    }
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = Seq(DCTTestData(data, expectedResult)).toDF()

    val transformer = new DCT()
      .setInputCol("vec")
      .setOutputCol("resultVec")
      .setInverse(inverse)

    transformer.transform(dataset)
      .select("resultVec", "wantedVec")
      .collect()
      .foreach { case Row(resultVec: Vector, wantedVec: Vector) =>
        assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
      }
  }
}
Example 28
Source File: LabeledPoint.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.annotation.Since
import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

object LabeledPoint {
  // Note: the @BeanInfo case class LabeledPoint(label: Double, features: Vector) that this
  // companion object accompanies is collapsed in this listing.

  @Since("1.1.0")
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other => throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }

  private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = {
    LabeledPoint(point.label, Vectors.fromML(point.features))
  }
}
Example 29
Source File: PipeClassificationSvm.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification

import scala.beans.BeanInfo

import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import org.apache.spark.mllib.classification.SVMWithSGD

class PipeClassificationSvm(numIterations: Int = 100) extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("numIterations", numIterations))

  def trainModelAndClassify(
      trainingData: RDD[LabeledPoint],
      symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {

    val model = SVMWithSGD.train(trainingData, numIterations)

    log.debug("Classification Model:" + model)

    // Marking Missing Values as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}

object PipeClassificationSvm {
  def apply(numIterations: Int = 100) = {
    new PipeClassificationSvm(numIterations)
  }
}
Example 30
Source File: PipeClassificationNaiveBayes.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification

import scala.beans.BeanInfo

import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import org.apache.spark.mllib.classification.NaiveBayesModel

class PipeClassificationNaiveBayes(lambda: Double = 1.0) extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("lambda", lambda))

  def trainModelAndClassify(
      trainingData: RDD[LabeledPoint],
      symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {

    val model = NaiveBayes.train(trainingData, lambda)

    log.debug("Classification Model:" + model)
    log.debug("Classification Model labels :" + model.labels.mkString(" "))
    log.debug("Classification Model pi: " + model.pi.mkString(" "))
    // Build the theta string explicitly; the original used foreach, which returns Unit and logs "()".
    log.debug("Classification Model theta: " + model.theta.map(_.mkString(" ")).mkString("; "))

    // Marking Missing Values as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}

object PipeClassificationNaiveBayes {
  def apply(lambda: Double = 1.0) = {
    new PipeClassificationNaiveBayes(lambda)
  }
}
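Both sddf pipes above delegate the actual training to Spark MLlib's RDD-based API. A minimal standalone sketch of those two calls on toy data (the SparkContext setup and the data are illustrative, not taken from sddf):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.{NaiveBayes, SVMWithSGD}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object ToyClassifiers {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("ToyClassifiers"))

    val training = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 0.9)),
      LabeledPoint(1.0, Vectors.dense(0.8, 1.0)),
      LabeledPoint(0.0, Vectors.dense(0.1, 0.0)),
      LabeledPoint(0.0, Vectors.dense(0.0, 0.2))
    ))

    // The same trainers the pipes wrap: lambda-smoothed Naive Bayes and an SVM trained with SGD.
    val nbModel = NaiveBayes.train(training, 1.0)
    val svmModel = SVMWithSGD.train(training, 100)

    println(nbModel.predict(Vectors.dense(0.9, 0.8)))
    println(svmModel.predict(Vectors.dense(0.9, 0.8)))

    sc.stop()
  }
}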
Example 31
Source File: NGramSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

@BeanInfo
case class NGramTestData(inputTokens: Array[String], wantedNGrams: Array[String])

class NGramSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import org.apache.spark.ml.feature.NGramSuite._
  import testImplicits._

  test("default behavior yields bigram features") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
    val dataset = Seq(NGramTestData(
      Array("Test", "for", "ngram", "."),
      Array("Test for", "for ngram", "ngram .")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("NGramLength=4 yields length 4 n-grams") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array("a b c d", "b c d e")
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("empty input yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(4)
    val dataset = Seq(NGramTestData(Array(), Array())).toDF()
    testNGram(nGram, dataset)
  }

  test("input array < n yields empty output") {
    val nGram = new NGram()
      .setInputCol("inputTokens")
      .setOutputCol("nGrams")
      .setN(6)
    val dataset = Seq(NGramTestData(
      Array("a", "b", "c", "d", "e"),
      Array()
    )).toDF()
    testNGram(nGram, dataset)
  }

  test("read/write") {
    val t = new NGram()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setN(3)
    testDefaultReadWrite(t)
  }
}

object NGramSuite extends SparkFunSuite {

  def testNGram(t: NGram, dataset: Dataset[_]): Unit = {
    t.transform(dataset)
      .select("nGrams", "wantedNGrams")
      .collect()
      .foreach { case Row(actualNGrams, wantedNGrams) =>
        assert(actualNGrams === wantedNGrams)
      }
  }
}
Example 32
Source File: TokenizerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

@BeanInfo
case class TokenizerTestData(rawText: String, wantedTokens: Array[String])

class TokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new Tokenizer)
  }

  test("read/write") {
    val t = new Tokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
    testDefaultReadWrite(t)
  }
}

class RegexTokenizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import org.apache.spark.ml.feature.RegexTokenizerSuite._
  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new RegexTokenizer)
  }

  test("RegexTokenizer") {
    val tokenizer0 = new RegexTokenizer()
      .setGaps(false)
      .setPattern("\\w+|\\p{Punct}")
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset0 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization", ".")),
      TokenizerTestData("Te,st. punct", Array("te", ",", "st", ".", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer0, dataset0)

    val dataset1 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization")),
      TokenizerTestData("Te,st. punct", Array("punct"))
    ).toDF()
    tokenizer0.setMinTokenLength(3)
    testRegexTokenizer(tokenizer0, dataset1)

    val tokenizer2 = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
    val dataset2 = Seq(
      TokenizerTestData("Test for tokenization.", Array("test", "for", "tokenization.")),
      TokenizerTestData("Te,st. punct", Array("te,st.", "punct"))
    ).toDF()
    testRegexTokenizer(tokenizer2, dataset2)
  }

  test("RegexTokenizer with toLowercase false") {
    val tokenizer = new RegexTokenizer()
      .setInputCol("rawText")
      .setOutputCol("tokens")
      .setToLowercase(false)
    val dataset = Seq(
      TokenizerTestData("JAVA SCALA", Array("JAVA", "SCALA")),
      TokenizerTestData("java scala", Array("java", "scala"))
    ).toDF()
    testRegexTokenizer(tokenizer, dataset)
  }

  test("read/write") {
    val t = new RegexTokenizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setMinTokenLength(2)
      .setGaps(false)
      .setPattern("hi")
      .setToLowercase(false)
    testDefaultReadWrite(t)
  }
}

object RegexTokenizerSuite extends SparkFunSuite {

  def testRegexTokenizer(t: RegexTokenizer, dataset: Dataset[_]): Unit = {
    t.transform(dataset)
      .select("tokens", "wantedTokens")
      .collect()
      .foreach { case Row(tokens, wantedTokens) =>
        assert(tokens === wantedTokens)
      }
  }
}
Example 33
Source File: DCTSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import scala.beans.BeanInfo

import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Row

@BeanInfo
case class DCTTestData(vec: Vector, wantedVec: Vector)

class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("forward transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = false

    testDCT(data, inverse)
  }

  test("inverse transform of discrete cosine matches jTransforms result") {
    val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray)
    val inverse = true

    testDCT(data, inverse)
  }

  test("read/write") {
    val t = new DCT()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setInverse(true)
    testDefaultReadWrite(t)
  }

  private def testDCT(data: Vector, inverse: Boolean): Unit = {
    val expectedResultBuffer = data.toArray.clone()
    if (inverse) {
      new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true)
    } else {
      new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true)
    }
    val expectedResult = Vectors.dense(expectedResultBuffer)

    val dataset = Seq(DCTTestData(data, expectedResult)).toDF()

    val transformer = new DCT()
      .setInputCol("vec")
      .setOutputCol("resultVec")
      .setInverse(inverse)

    transformer.transform(dataset)
      .select("resultVec", "wantedVec")
      .collect()
      .foreach { case Row(resultVec: Vector, wantedVec: Vector) =>
        assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6)
      }
  }
}