org.apache.spark.sql.types.ArrayType Scala Examples
The following examples show how to use org.apache.spark.sql.types.ArrayType.
The original project and source file are noted in the header above each example.
Example 1
Source File: NGram.scala From BigDatalog with Apache License 2.0

package org.apache.spark.ml.feature

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

  def getN: Int = $(n)

  setDefault(n -> 2)

  override protected def createTransformFunc: Seq[String] => Seq[String] = {
    _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.sameType(ArrayType(StringType)),
      s"Input type must be ArrayType(StringType) but got $inputType.")
  }

  override protected def outputDataType: DataType = new ArrayType(StringType, false)
}

@Since("1.6.0")
object NGram extends DefaultParamsReadable[NGram] {

  @Since("1.6.0")
  override def load(path: String): NGram = super.load(path)
}
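For orientation, here is a minimal sketch of how an NGram transformer like the one above is typically applied to an ArrayType(StringType) column. The SparkSession setup, column names, and sample data are illustrative assumptions, not part of the original source.

import org.apache.spark.ml.feature.NGram
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("ngram-sketch").getOrCreate()
import spark.implicits._

// The input column must be ArrayType(StringType), e.g. the output of a tokenizer.
val tokens = Seq(
  (0, Seq("hi", "i", "heard", "about", "spark")),
  (1, Seq("logistic", "regression", "models", "are", "neat"))
).toDF("id", "words")

val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams")
ngram.transform(tokens).select("ngrams").show(truncate = false)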
Example 2
Source File: CountVectorizerExample.scala From seahorse with Apache License 2.0

package ai.deepsense.deeplang.doperations.examples

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType}

import ai.deepsense.deeplang.doperables.dataframe.{DataFrame, DataFrameBuilder}
import ai.deepsense.deeplang.doperations.spark.wrappers.estimators.CountVectorizer

class CountVectorizerExample extends AbstractOperationExample[CountVectorizer] {

  override def dOperation: CountVectorizer = {
    val op = new CountVectorizer()
    op.estimator
      .setInputColumn("lines")
      .setNoInPlace("lines_out")
      .setMinTF(3)
    op.set(op.estimator.extractParamMap())
  }

  override def inputDataFrames: Seq[DataFrame] = {
    val rows = Seq(
      Row("a a a b b c c c d ".split(" ").toSeq),
      Row("c c c c c c".split(" ").toSeq),
      Row("a".split(" ").toSeq),
      Row("e e e e e".split(" ").toSeq))
    val rdd = sparkContext.parallelize(rows)
    val schema = StructType(Seq(StructField("lines", ArrayType(StringType, containsNull = true))))
    Seq(DataFrameBuilder(sparkSQLSession).buildDataFrame(schema, rdd))
  }
}
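The Seahorse operation above wraps a CountVectorizer estimator around an ArrayType(StringType) column. For comparison, a hedged sketch of the same idea using plain Spark ML's CountVectorizer directly; the data and column names are made up, and an active SparkSession named `spark` is assumed.

import org.apache.spark.ml.feature.CountVectorizer
import spark.implicits._  // assumes an active SparkSession named `spark`

val lines = Seq(
  Seq("a", "a", "a", "b", "b", "c", "c", "c", "d"),
  Seq("c", "c", "c", "c", "c", "c")
).toDF("lines")  // "lines" is ArrayType(StringType)

val cvModel = new CountVectorizer()
  .setInputCol("lines")
  .setOutputCol("lines_out")
  .setMinTF(3)
  .fit(lines)

cvModel.transform(lines).select("lines_out").show(truncate = false)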
Example 3
Source File: StringTokenizerSmokeTest.scala From seahorse with Apache License 2.0

package ai.deepsense.deeplang.doperables.spark.wrappers.transformers

import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice
import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice
import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection

class StringTokenizerSmokeTest
  extends AbstractTransformerWrapperSmokeTest[StringTokenizer]
  with MultiColumnTransformerWrapperTestSupport {

  override def transformerWithParams: StringTokenizer = {
    val inPlace = NoInPlaceChoice()
      .setOutputColumn("tokenized")
    val single = SingleColumnChoice()
      .setInputColumn(NameSingleColumnSelection("s"))
      .setInPlace(inPlace)

    val transformer = new StringTokenizer()
    transformer.set(Seq(
      transformer.singleOrMultiChoiceParam -> single
    ): _*)
  }

  override def testValues: Seq[(Any, Any)] = {
    val strings = Seq(
      "this is a test",
      "this values should be separated",
      "Bla bla bla!"
    )

    val tokenized = strings.map { _.toLowerCase.split("\\s") }
    strings.zip(tokenized)
  }

  override def inputType: DataType = StringType

  override def outputType: DataType = new ArrayType(StringType, true)
}
Example 4
Source File: RegexTokenizerSmokeTest.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection class RegexTokenizerSmokeTest extends AbstractTransformerWrapperSmokeTest[RegexTokenizer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: RegexTokenizer = { val inPlace = NoInPlaceChoice() .setOutputColumn("tokenized") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("s")) .setInPlace(inPlace) val transformer = new RegexTokenizer() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single, transformer.gaps -> false, transformer.minTokenLength -> 1, transformer.pattern -> "\\d+" ): _*) } override def testValues: Seq[(Any, Any)] = { val strings = Seq( "100 200 300", "400 500 600", "700 800 900" ) val tokenized = strings.map { _.toLowerCase.split(" ") } strings.zip(tokenized) } override def inputType: DataType = StringType override def outputType: DataType = new ArrayType(StringType, true) }
Example 5
Source File: NGramTransformerSmokeTest.scala From seahorse with Apache License 2.0

package ai.deepsense.deeplang.doperables.spark.wrappers.transformers

import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice
import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice
import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection

class NGramTransformerSmokeTest
  extends AbstractTransformerWrapperSmokeTest[NGramTransformer]
  with MultiColumnTransformerWrapperTestSupport {

  override def transformerWithParams: NGramTransformer = {
    val inPlace = NoInPlaceChoice()
      .setOutputColumn("ngrams")
    val single = SingleColumnChoice()
      .setInputColumn(NameSingleColumnSelection("as"))
      .setInPlace(inPlace)

    val transformer = new NGramTransformer()
    transformer.set(Seq(
      transformer.singleOrMultiChoiceParam -> single,
      transformer.n -> 2
    ): _*)
  }

  override def testValues: Seq[(Any, Any)] = {
    val strings = Seq(
      Array("a", "b", "c"),
      Array("d", "e", "f")
    )
    val ngrams = Seq(
      Array("a b", "b c"),
      Array("d e", "e f")
    )
    strings.zip(ngrams)
  }

  override def inputType: DataType = new ArrayType(StringType, true)

  override def outputType: DataType = new ArrayType(StringType, false)
}
Example 6
Source File: StopWordsRemoverSmokeTest.scala From seahorse with Apache License 2.0

package ai.deepsense.deeplang.doperables.spark.wrappers.transformers

import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice
import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice
import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection

class StopWordsRemoverSmokeTest
  extends AbstractTransformerWrapperSmokeTest[StopWordsRemover]
  with MultiColumnTransformerWrapperTestSupport {

  override def transformerWithParams: StopWordsRemover = {
    val inPlace = NoInPlaceChoice()
      .setOutputColumn("stopWordsRemoverOutput")
    val single = SingleColumnChoice()
      .setInputColumn(NameSingleColumnSelection("as"))
      .setInPlace(inPlace)

    val stopWordsRemover = new StopWordsRemover()
    stopWordsRemover.set(
      stopWordsRemover.singleOrMultiChoiceParam -> single,
      stopWordsRemover.caseSensitive -> false)
  }

  override def testValues: Seq[(Any, Any)] = {
    val inputNumbers = Seq(Array("a", "seahorse", "The", "Horseshoe", "Crab"))
    val outputNumbers = Seq(Array("seahorse", "Horseshoe", "Crab"))
    inputNumbers.zip(outputNumbers)
  }

  override def inputType: DataType = ArrayType(StringType)

  override def outputType: DataType = ArrayType(StringType)
}
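The smoke test exercises a Seahorse wrapper; the underlying Spark ML transformer can be used directly on an ArrayType(StringType) column, as in the sketch below. The data and column names are illustrative assumptions, and an active SparkSession named `spark` is assumed.

import org.apache.spark.ml.feature.StopWordsRemover
import spark.implicits._  // assumes an active SparkSession named `spark`

val df = Seq(
  (0, Seq("a", "seahorse", "The", "Horseshoe", "Crab"))
).toDF("id", "as")

val remover = new StopWordsRemover()
  .setInputCol("as")
  .setOutputCol("filtered")
  .setCaseSensitive(false)

remover.transform(df).select("filtered").show(truncate = false)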
Example 7
Source File: Tokenizer.scala From spark1.52 with Apache License 2.0

package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

  def getPattern: String = $(pattern)

  setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+")

  override protected def createTransformFunc: String => Seq[String] = { str =>
    val re = $(pattern).r
    val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq
    val minLength = $(minTokenLength)
    tokens.filter(_.length >= minLength)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType, s"Input type must be string type but got $inputType.")
  }

  override protected def outputDataType: DataType = new ArrayType(StringType, true)

  override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra)
}
Example 8
Source File: HashingTF.scala From spark1.52 with Apache License 2.0

package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
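transformSchema above requires the input column to be an ArrayType. A minimal sketch of feeding HashingTF from a tokenizer, shown with the modern SparkSession API; the data and column names are assumptions for illustration.

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import spark.implicits._  // assumes an active SparkSession named `spark`

val sentences = Seq(
  (0, "spark handles array type columns"),
  (1, "hashing tf maps terms to indices")
).toDF("id", "sentence")

// Tokenizer produces an ArrayType(StringType) column, which is what HashingTF requires.
val words = new Tokenizer().setInputCol("sentence").setOutputCol("words").transform(sentences)

val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("features")
  .setNumFeatures(1 << 10)

hashingTF.transform(words).select("features").show(truncate = false)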
Example 9
Source File: NullableColumnAccessorSuite.scala From spark1.52 with Apache License 2.0

package org.apache.spark.sql.columnar

import java.nio.ByteBuffer

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
import org.apache.spark.sql.types.{StringType, ArrayType, DataType}

// Test accessor for nullable columns
class TestNullableColumnAccessor[JvmType](
    buffer: ByteBuffer,
    columnType: ColumnType[JvmType])
  extends BasicColumnAccessor(buffer, columnType)
  with NullableColumnAccessor

// Companion object for the test accessor
object TestNullableColumnAccessor {
  def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType])
    : TestNullableColumnAccessor[JvmType] = {
    // Skips the column type ID
    buffer.getInt()
    new TestNullableColumnAccessor(buffer, columnType)
  }
}

// Nullable column accessor test suite
class NullableColumnAccessorSuite extends SparkFunSuite {
  import ColumnarTestUtils._

  Seq(
    BOOLEAN, BYTE, SHORT, INT, DATE, LONG, TIMESTAMP, FLOAT, DOUBLE,
    STRING, BINARY, FIXED_DECIMAL(15, 10), GENERIC(ArrayType(StringType)))
    .foreach {
      testNullableColumnAccessor(_)
    }

  // Exercises the nullable column accessor for a given column type
  def testNullableColumnAccessor[JvmType](
      columnType: ColumnType[JvmType]): Unit = {
    // stripSuffix drops the trailing "$" from the class name
    val typeName = columnType.getClass.getSimpleName.stripSuffix("$")
    val nullRow = makeNullRow(1)  // a row holding a single null value

    // Empty column
    test(s"Nullable $typeName column accessor: empty column") {
      val builder = TestNullableColumnBuilder(columnType)
      val accessor = TestNullableColumnAccessor(builder.build(), columnType)
      assert(!accessor.hasNext)
    }

    // Access null values
    test(s"Nullable $typeName column accessor: access null values") {
      val builder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)

      (0 until 4).foreach { _ =>
        builder.appendFrom(randomRow, 0)
        builder.appendFrom(nullRow, 0)
      }

      val accessor = TestNullableColumnAccessor(builder.build(), columnType)
      val row = new GenericMutableRow(1)

      (0 until 4).foreach { _ =>
        assert(accessor.hasNext)
        accessor.extractTo(row, 0)
        assert(row.get(0, columnType.dataType) === randomRow.get(0, columnType.dataType))

        assert(accessor.hasNext)
        accessor.extractTo(row, 0)
        assert(row.isNullAt(0))
      }

      assert(!accessor.hasNext)
    }
  }
}
Example 10
Source File: LanguageAwareAnalyzer.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts import org.apache.lucene.analysis.util.StopwordAnalyzerBase import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.shared.HasOutputCol import org.apache.spark.ml.param.{Param, ParamMap, Params} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{ArrayType, StringType, StructType} def setOutputCol(value: String): this.type = set(outputCol, value) override def copy(extra: ParamMap): Transformer = { defaultCopy(extra) } def this() = this(Identifiable.randomUID("languageAnalyzer")) override def transform(dataset: Dataset[_]): DataFrame = { dataset.withColumn($(outputCol), stemmTextUDF(dataset.col($(inputColLang)), dataset.col($(inputColText)))).toDF } @DeveloperApi override def transformSchema(schema: StructType): StructType = { if ($(inputColText) equals $(outputCol)) { val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputColText))) SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), ArrayType(StringType, true)) } else { SchemaUtils.appendColumn(schema, $(outputCol), ArrayType(StringType, true)) } } } object LanguageAwareAnalyzer extends DefaultParamsReadable[LanguageAwareAnalyzer] { override def load(path: String): LanguageAwareAnalyzer = super.load(path) }
Example 11
Source File: NGramExtractor.scala From pravda-ml with Apache License 2.0

package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamPair, ParamValidators, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}

  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(new ParamPair[Int](upperN, 2), new ParamPair[Int](lowerN, 1))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val lowerBound = $(lowerN)
    val upperBound = $(upperN)
    val nGramUDF = udf[Seq[String], Seq[String]](NGramUtils.nGramFun(_, lowerBound, upperBound))
    dataset.withColumn($(outputCol), nGramUDF(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) != $(outputCol)) {
      schema.add($(outputCol), new ArrayType(StringType, true))
    } else {
      schema
    }
  }
}

object NGramExtractor extends DefaultParamsReadable[NGramExtractor] {
  override def load(path: String): NGramExtractor = super.load(path)
}
Example 12
Source File: NGramExtractorSpec.scala From pravda-ml with Apache License 2.0
package odkl.analysis.spark.texts import odkl.analysis.spark.TestEnv import org.apache.spark.ml.odkl.texts.NGramExtractor import org.apache.spark.sql.Row import org.apache.spark.sql.types.{ArrayType, StringType, StructType} import org.scalatest.FlatSpec class NGramExtractorSpec extends FlatSpec with TestEnv with org.scalatest.Matchers { "NGramExtractor" should "extract NGrams upTo=true" in { val nGramExtractor = new NGramExtractor() .setUpperN(2) .setInputCol("textTokenized") .setOutputCol("nGram") val schema = new StructType().add("textTokenized",ArrayType(StringType,true)) val inDF = sqlc.createDataFrame( sc.parallelize(Seq(Seq[String]("ab","bc","cd"),Seq[String]("a","b"))) .map(f => {Row(f)}), schema) val outDF = nGramExtractor.transform(inDF) val outArrays = outDF.collect().map(_.getAs[Seq[String]]("nGram")).toSeq val correctArrays = Seq(Seq("ab","bc","cd","ab bc","bc cd"),Seq("a","b", "a b")) assertResult(correctArrays)(outArrays) } "NGramExtractor" should "extract NGrams upTo=false" in { val nGramExtractor = new NGramExtractor() .setUpperN(3) .setLowerN(3) .setInputCol("textTokenized") .setOutputCol("nGram") val schema = new StructType().add("textTokenized",ArrayType(StringType,true)) val inDF = sqlc.createDataFrame( sc.parallelize(Seq(Seq[String]("a","b","c","d")).map(f => {Row(f)})), schema) val outDF = nGramExtractor.transform(inDF) val outArrays = outDF.collect().map(_.getAs[Seq[String]]("nGram")).toSeq val correctArrays = Seq(Seq("a b c", "b c d")) assertResult(correctArrays)(outArrays) } "NGramExtractor" should "extract NGrams with the same col" in { val nGramExtractor = new NGramExtractor() .setUpperN(3) .setLowerN(3) .setInputCol("textTokenized") .setOutputCol("textTokenized") val schema = new StructType().add("textTokenized",ArrayType(StringType,true)) val inDF = sqlc.createDataFrame( sc.parallelize(Seq(Seq[String]("a","b","c","d")).map(f => {Row(f)})), schema) val outDF = nGramExtractor.transform(inDF) val outArrays = outDF.collect().map(_.getAs[Seq[String]]("textTokenized")).toSeq val correctArrays = Seq(Seq("a b c", "b c d")) assertResult(correctArrays)(outArrays) } }
Example 13
Source File: FreqStatsTransformerSpec.scala From pravda-ml with Apache License 2.0
package odkl.analysis.spark.texts import odkl.analysis.spark.TestEnv import org.apache.spark.ml.odkl.texts.FreqStatsTransformer import org.apache.spark.sql.Row import org.apache.spark.sql.types.{ArrayType, LongType, StringType, StructType} import org.scalatest.FlatSpec class FreqStatsTransformerSpec extends FlatSpec with TestEnv with org.scalatest.Matchers { "FreqStatsTransformer" should "count freq" in { val fTransformer = new FreqStatsTransformer() .setInputDataCol("data") .setOutputColFreq("Freq") .setOutputColTerm("Term") val schema = new StructType().add("data",ArrayType(StringType,true)) val inDF = sqlc.createDataFrame( sc.parallelize(Seq(Seq[String]("a","b","c"),Seq[String]("a","b","a"))) .map(f => {Row(f)}), schema) val correctAns = Array[(String,Double)](("a",2D/5D),("b",2D/5D),("c",1D/5D)) val realAns = fTransformer.transform(inDF).sort("Term").collect().map(f =>{(f.getAs[String]("Term"),f.getAs[Double]("Freq"))}) assertResult(correctAns)(realAns) } "FreqStatsTransformer" should "filter freq by uni and bi treshold" in { val fTransformer = new FreqStatsTransformer() .setInputDataCol("data") .setOutputColFreq("Freq") .setOutputColTerm("Term") .setTresholdArr(Array[Double](1.5D/8D,1.1D/8D)) val schema = new StructType().add("data",ArrayType(StringType,true)) val inDF = sqlc.createDataFrame( sc.parallelize(Seq(Seq[String]("a","b","c","c a", "c a"),Seq[String]("a","b","a", "c a", "a b"))) .map(f => {Row(f)}), schema) val correctAns = Array[(String,Double)](("a",2D/8D),("b",2D/8D),("c a",2D/8D)) val realAnsDF = fTransformer.transform(inDF).sort("Term") val realAns = realAnsDF.collect().map(f =>{(f.getAs[String]("Term"),f.getAs[Double]("Freq"))}) assertResult(correctAns)(realAns) } "FreqStatsTransformer" should "extract max timestamp by term" in { val fTransformer = new FreqStatsTransformer() .setInputDataCol("data") .setOutputColFreq("Freq") .setOutputColTerm("Term") .setWithTimestamp(true) .setTimestampColumnName("timestamp") .setTresholdArr(Array[Double](1D/8D,1.1D/8D)) val schema = new StructType().add("data",ArrayType(StringType,true)).add("timestamp",LongType) val inDF = sqlc.createDataFrame( sc.parallelize(Seq(Seq(Seq[String]("a","c","c a", "c a"),100L),Seq(Seq[String]("c a", "a b"),150L),Seq(Seq[String]("b"),200L))) .map(f => {Row.fromSeq(f)}), schema) inDF.collect() val correctAns = Array[(String,Double,Long)](("a",1D/6D,100L),("a b",1D/6D, 150L),("b",1D/6D,200L), ("c",1D/6D, 100L),("c a",2D/6D, 150L)) val realAns = fTransformer.transform(inDF).sort("Term").collect().map(f =>{(f.getAs[String]("Term"),f.getAs[Double]("Freq"),f.getAs[Long]("timestamp"))}) assertResult(correctAns)(realAns) assertResult(correctAns(1))(realAns(1)) } }
Example 14
Source File: HashingTF.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} @Since("2.0.0") def setBinary(value: Boolean): this.type = set(binary, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion. val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } @Since("1.4.1") override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
Example 15
Source File: NGram.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} @Since("1.5.0") def getN: Int = $(n) setDefault(n -> 2) override protected def createTransformFunc: Seq[String] => Seq[String] = { _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.sameType(ArrayType(StringType)), s"Input type must be ArrayType(StringType) but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, false) } @Since("1.6.0") object NGram extends DefaultParamsReadable[NGram] { @Since("1.6.0") override def load(path: String): NGram = super.load(path) }
Example 16
Source File: HashingTF.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} def setNumFeatures(value: Int): this.type = set(numFeatures, value) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)) val t = udf { terms: Seq[_] => hashingTF.transform(terms) } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
Example 17
Source File: FrequentItems.scala From iolap with Apache License 2.0
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.Logging import org.apache.spark.sql.{Column, DataFrame, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types.{ArrayType, StructField, StructType} private[sql] object FrequentItems extends Logging { private[sql] def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4, s"support ($support) must be greater than 1e-4.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) } val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toSeq) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes new DataFrame(df.sqlContext, LocalRelation(schema, Seq(resultRow))) } }
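singlePassFreqItems is an internal helper; the supported entry point is DataFrame.stat.freqItems, which returns one non-nullable ArrayType column per input column, named "<col>_freqItems". A small sketch with made-up data, shown with the modern SparkSession API and assuming an active session named `spark`.

import spark.implicits._  // assumes an active SparkSession named `spark`

val df = Seq((1, "a"), (1, "b"), (2, "a"), (1, "a"), (3, "b")).toDF("num", "letter")

// Items occurring in at least ~40% of rows are returned (false positives are possible).
val freq = df.stat.freqItems(Seq("num", "letter"), 0.4)
freq.printSchema()  // num_freqItems: array<int>, letter_freqItems: array<string>
freq.show(truncate = false)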
Example 18
Source File: MongodbSchemaIT.scala From Spark-MongoDB with Apache License 2.0
package com.stratio.datasource.mongodb.schema import java.text.SimpleDateFormat import java.util.Locale import com.stratio.datasource.MongodbTestConstants import com.stratio.datasource.mongodb.config.{MongodbConfig, MongodbConfigBuilder} import com.stratio.datasource.mongodb.partitioner.MongodbPartitioner import com.stratio.datasource.mongodb.rdd.MongodbRDD import com.stratio.datasource.mongodb._ import org.apache.spark.sql.mongodb.{TemporaryTestSQLContext, TestSQLContext} import org.apache.spark.sql.types.{ArrayType, StringType, StructField, TimestampType} import org.junit.runner.RunWith import org.scalatest._ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class MongodbSchemaIT extends FlatSpec with Matchers with MongoEmbedDatabase with TestBsonData with MongodbTestConstants { private val host: String = "localhost" private val collection: String = "testCol" private val readPreference = "secondaryPreferred" val testConfig = MongodbConfigBuilder() .set(MongodbConfig.Host,List(host + ":" + mongoPort)) .set(MongodbConfig.Database,db) .set(MongodbConfig.Collection,collection) .set(MongodbConfig.SamplingRatio,1.0) .set(MongodbConfig.ReadPreference, readPreference) .build() val mongodbPartitioner = new MongodbPartitioner(testConfig) val mongodbRDD = new MongodbRDD(TemporaryTestSQLContext, testConfig, mongodbPartitioner) behavior of "A schema" it should "be inferred from rdd with primitives" + scalaBinaryVersion in { withEmbedMongoFixture(primitiveFieldAndType) { mongodProc => val schema = MongodbSchema(mongodbRDD, 1.0).schema() schema.fields should have size 7 schema.fieldNames should contain allOf("string", "integer", "long", "double", "boolean", "null") schema.printTreeString() } } it should "be inferred from rdd with complex fields" + scalaBinaryVersion in { withEmbedMongoFixture(complexFieldAndType1) { mongodProc => val schema = MongodbSchema(mongodbRDD, 1.0).schema() schema.fields should have size 13 schema.fields filter { case StructField(name, ArrayType(StringType, _), _, _) => Set("arrayOfNull", "arrayEmpty") contains name case _ => false } should have size 2 schema.printTreeString() } } it should "resolve type conflicts between fields" + scalaBinaryVersion in { withEmbedMongoFixture(primitiveFieldValueTypeConflict) { mongodProc => val schema = MongodbSchema(mongodbRDD, 1.0).schema() schema.fields should have size 7 schema.printTreeString() } } it should "be inferred from rdd with more complex fields" + scalaBinaryVersion in { withEmbedMongoFixture(complexFieldAndType2) { mongodProc => val schema = MongodbSchema(mongodbRDD, 1.0).schema() schema.fields should have size 5 schema.printTreeString() } } it should "read java.util.Date fields as timestamptype" + scalaBinaryVersion in { val dfunc = (s: String) => new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy", Locale.ENGLISH).parse(s) import com.mongodb.casbah.Imports.DBObject val stringAndDate = List(DBObject("string" -> "this is a simple string.", "date" -> dfunc("Mon Aug 10 07:52:49 EDT 2015"))) withEmbedMongoFixture(stringAndDate) { mongodProc => val schema = MongodbSchema(mongodbRDD, 1.0).schema() schema.fields should have size 3 schema.fields.filter(_.name == "date").head.dataType should equal(TimestampType) schema.printTreeString() } } }
Example 19
Source File: SnowballStemmer.scala From albedo with MIT License

package ws.vinta.albedo.transformers

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
import org.tartarus.snowball.ext.EnglishStemmer

class SnowballStemmer(override val uid: String)
  extends UnaryTransformer[Seq[String], Seq[String], SnowballStemmer] with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("snowballStemmer"))
  }

  override def createTransformFunc: Seq[String] => Seq[String] = { strings =>
    val stemmer = new EnglishStemmer()

    strings.map((str: String) => {
      try {
        stemmer.setCurrent(str)
        stemmer.stem()
        stemmer.getCurrent()
      } catch {
        case _: Exception => str
      }
    })
  }

  override def validateInputType(inputType: DataType): Unit = {
    require(inputType == ArrayType(StringType),
      s"Input type must be ArrayType(StringType) but got $inputType.")
  }

  override def outputDataType: DataType = {
    ArrayType(StringType)
  }

  override def copy(extra: ParamMap): SnowballStemmer = {
    defaultCopy(extra)
  }
}

object SnowballStemmer extends DefaultParamsReadable[SnowballStemmer]
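A sketch of chaining this custom UnaryTransformer after a tokenizer, so that both its input and output are ArrayType(StringType) columns. The data and column names are assumptions, and an active SparkSession named `spark` is assumed.

import org.apache.spark.ml.feature.Tokenizer
import ws.vinta.albedo.transformers.SnowballStemmer
import spark.implicits._  // assumes an active SparkSession named `spark`

val docs = Seq((0, "running runners ran quickly")).toDF("id", "text")

val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("tokens")
val stemmer = new SnowballStemmer().setInputCol("tokens").setOutputCol("stemmed")

// "tokens" and "stemmed" are both ArrayType(StringType); each token is mapped to its English stem.
stemmer.transform(tokenizer.transform(docs)).select("tokens", "stemmed").show(truncate = false)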
Example 20
Source File: DatasetUtil.scala From sona with Apache License 2.0
package org.apache.spark.util import org.apache.spark.linalg.{VectorUDT, Vectors} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata} import org.apache.spark.sql.{Column, DataFrame, Dataset} object DatasetUtil { def withColumns[T](ds: Dataset[T], colNames: Seq[String], cols: Seq[Column], metadata: Seq[Metadata]): DataFrame = { require(colNames.size == cols.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of columns: ${cols.size}") require(colNames.size == metadata.size, s"The size of column names: ${colNames.size} isn't equal to " + s"the size of metadata elements: ${metadata.size}") val sparkSession = ds.sparkSession val queryExecution = ds.queryExecution val resolver = sparkSession.sessionState.analyzer.resolver val output = queryExecution.analyzed.output checkColumnNameDuplication(colNames, "in given column names", sparkSession.sessionState.conf.caseSensitiveAnalysis) val columnMap = colNames.zip(cols).zip(metadata).map { case ((colName: String, col: Column), metadata: Metadata) => colName -> col.as(colName, metadata) }.toMap val replacedAndExistingColumns = output.map { field => columnMap.find { case (colName, _) => resolver(field.name, colName) } match { case Some((colName: String, col: Column)) => col.as(colName) case _ => new Column(field) } } val newColumns = columnMap.filter { case (colName, col) => !output.exists(f => resolver(f.name, colName)) }.map { case (colName, col) => col.as(colName) } ds.select(replacedAndExistingColumns ++ newColumns: _*) } def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = { withColumns(ds, Seq(colName), Seq(col), Seq(metadata)) } private def checkColumnNameDuplication(columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = { val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase) if (names.distinct.length != names.length) { val duplicateColumns = names.groupBy(identity).collect { case (x, ys) if ys.length > 1 => s"`$x`" } throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}") } } /** * Cast a column in a Dataset to Vector type. * * The supported data types of the input column are * - Vector * - float/double type Array. * * Note: The returned column does not have Metadata. * * @param dataset input DataFrame * @param colName column name. * @return Vector column */ def columnToVector(dataset: Dataset[_], colName: String): Column = { val columnDataType = dataset.schema(colName).dataType columnDataType match { case _: VectorUDT => col(colName) case fdt: ArrayType => val transferUDF = fdt.elementType match { case _: FloatType => udf(f = (vector: Seq[Float]) => { val inputArray = Array.fill[Double](vector.size)(0.0) vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble) Vectors.dense(inputArray) }) case _: DoubleType => udf((vector: Seq[Double]) => { Vectors.dense(vector.toArray) }) case other => throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector") } transferUDF(col(colName)) case other => throw new IllegalArgumentException(s"$other column cannot be cast to Vector") } } }
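A hedged sketch of columnToVector, which matches on the ArrayType element type (float or double) and builds a dense Vector per row. The DataFrame and column names are invented for illustration, and an active SparkSession named `spark` is assumed.

import org.apache.spark.util.DatasetUtil
import spark.implicits._  // assumes an active SparkSession named `spark`

val df = Seq(
  Tuple1(Seq(1.0, 2.0, 3.0)),
  Tuple1(Seq(4.0, 5.0, 6.0))
).toDF("features_arr")  // ArrayType(DoubleType)

// The returned Column has Vector type but carries no Metadata (see the note above).
val withVectors = df.withColumn("features", DatasetUtil.columnToVector(df, "features_arr"))
withVectors.printSchema()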
Example 21
Source File: DataTypeUtil.scala From sona with Apache License 2.0
package org.apache.spark.util import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructType} object DataTypeUtil { def sameType(left: DataType, right: DataType): Boolean = if (SQLConf.get.caseSensitiveAnalysis) { equalsIgnoreNullability(left, right) } else { equalsIgnoreCaseAndNullability(left, right) } private def equalsIgnoreNullability(left: DataType, right: DataType): Boolean = { (left, right) match { case (ArrayType(leftElementType, _), ArrayType(rightElementType, _)) => equalsIgnoreNullability(leftElementType, rightElementType) case (MapType(leftKeyType, leftValueType, _), MapType(rightKeyType, rightValueType, _)) => equalsIgnoreNullability(leftKeyType, rightKeyType) && equalsIgnoreNullability(leftValueType, rightValueType) case (StructType(leftFields), StructType(rightFields)) => leftFields.length == rightFields.length && leftFields.zip(rightFields).forall { case (l, r) => l.name == r.name && equalsIgnoreNullability(l.dataType, r.dataType) } case (l, r) => l == r } } private def equalsIgnoreCaseAndNullability(from: DataType, to: DataType): Boolean = { (from, to) match { case (ArrayType(fromElement, _), ArrayType(toElement, _)) => equalsIgnoreCaseAndNullability(fromElement, toElement) case (MapType(fromKey, fromValue, _), MapType(toKey, toValue, _)) => equalsIgnoreCaseAndNullability(fromKey, toKey) && equalsIgnoreCaseAndNullability(fromValue, toValue) case (StructType(fromFields), StructType(toFields)) => fromFields.length == toFields.length && fromFields.zip(toFields).forall { case (l, r) => l.name.equalsIgnoreCase(r.name) && equalsIgnoreCaseAndNullability(l.dataType, r.dataType) } case (fromDataType, toDataType) => fromDataType == toDataType } } }
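The practical effect for ArrayType is that containsNull is ignored when comparing types; only the element types have to match. A small sketch of calling sameType:

import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType}
import org.apache.spark.util.DataTypeUtil

// containsNull differs, but the element types match, so these count as the same type.
val a = ArrayType(StringType, containsNull = true)
val b = ArrayType(StringType, containsNull = false)
assert(DataTypeUtil.sameType(a, b))

// Different element types are never the same.
assert(!DataTypeUtil.sameType(ArrayType(StringType), ArrayType(IntegerType)))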
Example 22
Source File: NGram.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.feature import com.tencent.angel.sona.ml.UnaryTransformer import com.tencent.angel.sona.ml.param.{IntParam, ParamValidators} import com.tencent.angel.sona.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import com.tencent.angel.sona.ml.util.DefaultParamsReadable import org.apache.spark.util.DataTypeUtil /** * A feature transformer that converts the input array of strings into an array of n-grams. Null * values in the input array are ignored. * It returns an array of n-grams where each n-gram is represented by a space-separated string of * words. * * When the input is empty, an empty array is returned. * When the input array length is less than n (number of elements per n-gram), no n-grams are * returned. */ class NGram(override val uid: String) extends UnaryTransformer[Seq[String], Seq[String], NGram] with DefaultParamsWritable { def this() = this(Identifiable.randomUID("ngram")) /** * Minimum n-gram length, greater than or equal to 1. * Default: 2, bigram features * * @group param */ val n: IntParam = new IntParam(this, "n", "number elements per n-gram (>=1)", ParamValidators.gtEq(1)) def setN(value: Int): this.type = set(n, value) def getN: Int = $(n) setDefault(n -> 2) override protected def createTransformFunc: Seq[String] => Seq[String] = { _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq } override protected def validateInputType(inputType: DataType): Unit = { require(DataTypeUtil.sameType(inputType, ArrayType(StringType)), s"Input type must be ${ArrayType(StringType).catalogString} but got " + inputType.catalogString) } override protected def outputDataType: DataType = new ArrayType(StringType, false) } object NGram extends DefaultParamsReadable[NGram] { override def load(path: String): NGram = super.load(path) }
Example 23
Source File: AnnotationUtils.scala From glow with Apache License 2.0
package io.projectglow.vcf import org.apache.spark.sql.types.{ArrayType, DataType, IntegerType, StringType, StructField, StructType} // Unified VCF annotation representation, used by SnpEff and VEP object AnnotationUtils { // Delimiter between annotation fields val annotationDelimiter = "|" val annotationDelimiterRegex = "\\|" // Fractional delimiter for struct subfields val structDelimiter = "/" val structDelimiterRegex = "\\/" // Delimiter for array subfields val arrayDelimiter = "&" // Struct subfield schemas private val rankTotalStruct = StructType( Seq(StructField("rank", IntegerType), StructField("total", IntegerType))) private val posLengthStruct = StructType( Seq(StructField("pos", IntegerType), StructField("length", IntegerType))) private val referenceVariantStruct = StructType( Seq(StructField("reference", StringType), StructField("variant", StringType))) // Special schemas for SnpEff subfields private val snpEffFieldsToSchema: Map[String, DataType] = Map( "Annotation" -> ArrayType(StringType), "Rank" -> rankTotalStruct, "cDNA_pos/cDNA_length" -> posLengthStruct, "CDS_pos/CDS_length" -> posLengthStruct, "AA_pos/AA_length" -> posLengthStruct, "Distance" -> IntegerType ) // Special schemas for VEP subfields private val vepFieldsToSchema: Map[String, DataType] = Map( "Consequence" -> ArrayType(StringType), "EXON" -> rankTotalStruct, "INTRON" -> rankTotalStruct, "cDNA_position" -> IntegerType, "CDS_position" -> IntegerType, "Protein_position" -> IntegerType, "Amino_acids" -> referenceVariantStruct, "Codons" -> referenceVariantStruct, "Existing_variation" -> ArrayType(StringType), "DISTANCE" -> IntegerType, "STRAND" -> IntegerType, "FLAGS" -> ArrayType(StringType) ) // Special schemas for LOFTEE (as VEP plugin) subfields private val lofteeFieldsToSchema: Map[String, DataType] = Map( "LoF_filter" -> ArrayType(StringType), "LoF_flags" -> ArrayType(StringType), "LoF_info" -> ArrayType(StringType) ) // Default string schema for annotation subfield val allFieldsToSchema: Map[String, DataType] = (snpEffFieldsToSchema ++ vepFieldsToSchema ++ lofteeFieldsToSchema).withDefaultValue(StringType) }
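To make the delimiters concrete, here is a purely illustrative sketch of splitting a VEP-style annotation string with the public constants above. The annotation text itself is made up.

import io.projectglow.vcf.AnnotationUtils

// A hypothetical VEP-style entry: fields separated by "|", multi-valued fields by "&", structs by "/".
val rawAnnotation = "missense_variant&splice_region_variant|12/20|123|A/T"

val fields = rawAnnotation.split(AnnotationUtils.annotationDelimiterRegex)

// Fields such as "Consequence" map to ArrayType(StringType) and are split on "&".
val consequences = fields(0).split(AnnotationUtils.arrayDelimiter)

// Fields such as "EXON" map to the rank/total struct and are split on "/".
val Array(rank, total) = fields(1).split(AnnotationUtils.structDelimiterRegex)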
Example 24
Source File: VCFWriterUtils.scala From glow with Apache License 2.0
package io.projectglow.vcf import htsjdk.variant.variantcontext.{VariantContext, VariantContextBuilder} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{ArrayType, StructType} import io.projectglow.common.GlowLogging object VCFWriterUtils extends GlowLogging { def throwMixedSamplesFailure(): Unit = { throw new IllegalArgumentException("Cannot mix missing and non-missing sample IDs.") } def throwSampleInferenceFailure(): Unit = { throw new IllegalArgumentException( "Cannot infer sample ids because they are not the same in every row.") } def inferSampleIdsIfPresent(data: DataFrame): SampleIdInfo = { val genotypeSchemaOpt = data .schema .find(_.name == "genotypes") .map(_.dataType.asInstanceOf[ArrayType].elementType.asInstanceOf[StructType]) if (genotypeSchemaOpt.isEmpty) { logger.info("No genotypes column, no sample IDs will be inferred.") return SampleIds(Seq.empty) } val genotypeSchema = genotypeSchemaOpt.get import data.sparkSession.implicits._ val hasSampleIdsColumn = genotypeSchema.exists(_.name == "sampleId") if (hasSampleIdsColumn) { val distinctSampleIds = data .selectExpr("explode(genotypes.sampleId)") .distinct() .as[String] .collect val numPresentSampleIds = distinctSampleIds.count(!sampleIsMissing(_)) if (numPresentSampleIds > 0) { if (numPresentSampleIds < distinctSampleIds.length) { throwMixedSamplesFailure() } return SampleIds(distinctSampleIds) } } val numGenotypesPerRow = data .selectExpr("size(genotypes)") .distinct() .as[Int] .collect if (numGenotypesPerRow.length > 1) { throw new IllegalArgumentException( "Rows contain varying number of missing samples; cannot infer sample IDs.") } logger.warn("Detected missing sample IDs, inferring sample IDs.") InferSampleIds } def sampleIsMissing(s: String): Boolean = { s == null || s.isEmpty } def convertVcAttributesToStrings(vc: VariantContext): VariantContextBuilder = { val vcBuilder = new VariantContextBuilder(vc) val iterator = vc.getAttributes.entrySet().iterator() while (iterator.hasNext) { // parse to string, then write, as the VCF encoder messes up double precisions val entry = iterator.next() vcBuilder.attribute( entry.getKey, VariantContextToInternalRowConverter.parseObjectAsString(entry.getValue)) } vcBuilder } } case class SampleIds(unsortedSampleIds: Seq[String]) extends SampleIdInfo { val sortedSampleIds: Seq[String] = unsortedSampleIds.sorted } case object InferSampleIds extends SampleIdInfo { def fromNumberMissing(numMissingSamples: Int): Seq[String] = { (1 to numMissingSamples).map { idx => "sample_" + idx } } } sealed trait SampleIdInfo
Example 25
Source File: PlinkRowToInternalRowConverter.scala From glow with Apache License 2.0
package io.projectglow.plink import org.apache.spark.sql.SQLUtils.structFieldsEqualExceptNullability import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.types.{ArrayType, StructType} import org.apache.spark.unsafe.types.UTF8String import io.projectglow.common.{GlowLogging, VariantSchemas} import io.projectglow.sql.util.RowConverter class PlinkRowToInternalRowConverter(schema: StructType) extends GlowLogging { private val homAlt = new GenericArrayData(Array(1, 1)) private val missing = new GenericArrayData(Array(-1, -1)) private val het = new GenericArrayData(Array(0, 1)) private val homRef = new GenericArrayData(Array(0, 0)) private def twoBitsToCalls(twoBits: Int): GenericArrayData = { twoBits match { case 0 => homAlt // Homozygous for first (alternate) allele case 1 => missing // Missing genotype case 2 => het // Heterozygous case 3 => homRef // Homozygous for second (reference) allele } } private val converter = { val fns = schema.map { field => val fn: RowConverter.Updater[(Array[UTF8String], Array[Byte])] = field match { case f if f.name == VariantSchemas.genotypesFieldName => val gSchema = f.dataType.asInstanceOf[ArrayType].elementType.asInstanceOf[StructType] val converter = makeGenotypeConverter(gSchema) (samplesAndBlock, r, i) => { val genotypes = new Array[Any](samplesAndBlock._1.length) var sampleIdx = 0 while (sampleIdx < genotypes.length) { val sample = samplesAndBlock._1(sampleIdx) // Get the relevant 2 bits for the sample from the block // The i-th sample's call bits are the (i%4)-th pair within the (i/4)-th block val twoBits = samplesAndBlock._2(sampleIdx / 4) >> (2 * (sampleIdx % 4)) & 3 genotypes(sampleIdx) = converter((sample, twoBits)) sampleIdx += 1 } r.update(i, new GenericArrayData(genotypes)) } case _ => // BED file only contains genotypes (_, _, _) => () } fn } new RowConverter[(Array[UTF8String], Array[Byte])](schema, fns.toArray) } private def makeGenotypeConverter(gSchema: StructType): RowConverter[(UTF8String, Int)] = { val functions = gSchema.map { field => val fn: RowConverter.Updater[(UTF8String, Int)] = field match { case f if structFieldsEqualExceptNullability(f, VariantSchemas.sampleIdField) => (sampleAndTwoBits, r, i) => { r.update(i, sampleAndTwoBits._1) } case f if structFieldsEqualExceptNullability(f, VariantSchemas.callsField) => (sampleAndTwoBits, r, i) => r.update(i, twoBitsToCalls(sampleAndTwoBits._2)) case f => logger.info( s"Genotype field $f cannot be derived from PLINK files. It will be null " + s"for each sample." ) (_, _, _) => () } fn } new RowConverter[(UTF8String, Int)](gSchema, functions.toArray) } def convertRow( bimRow: InternalRow, sampleIds: Array[UTF8String], gtBlock: Array[Byte]): InternalRow = { converter((sampleIds, gtBlock), bimRow) } }
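The bit manipulation above is compact; a standalone sketch of the same two-bit decoding, outside of Spark, may help. The mapping mirrors the code above (PLINK .bed encoding, with the first allele treated as the alternate).

// Decode the genotype calls packed into one PLINK .bed byte (4 samples per byte).
def decodeBlock(block: Byte): Seq[Seq[Int]] = {
  (0 until 4).map { sampleIdx =>
    val twoBits = (block >> (2 * sampleIdx)) & 3
    twoBits match {
      case 0 => Seq(1, 1)   // homozygous for the first (alternate) allele
      case 1 => Seq(-1, -1) // missing genotype
      case 2 => Seq(0, 1)   // heterozygous
      case 3 => Seq(0, 0)   // homozygous for the second (reference) allele
    }
  }
}

decodeBlock(0xDC.toByte)  // Seq(Seq(1, 1), Seq(0, 0), Seq(-1, -1), Seq(0, 0))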
Example 26
Source File: MeanSubstitute.scala From glow with Apache License 2.0
package io.projectglow.sql.expressions import org.apache.spark.sql.SQLUtils import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.Average import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.util.ArrayData import org.apache.spark.sql.types.{ArrayType, NumericType, StringType, StructType} import org.apache.spark.unsafe.types.UTF8String import io.projectglow.sql.dsl._ import io.projectglow.sql.util.RewriteAfterResolution case class MeanSubstitute(array: Expression, missingValue: Expression) extends RewriteAfterResolution { override def children: Seq[Expression] = Seq(array, missingValue) def this(array: Expression) = { this(array, Literal(-1)) } private lazy val arrayElementType = array.dataType.asInstanceOf[ArrayType].elementType // A value is considered missing if it is NaN, null or equal to the missing value parameter def isMissing(arrayElement: Expression): Predicate = IsNaN(arrayElement) || IsNull(arrayElement) || arrayElement === missingValue def createNamedStruct(sumValue: Expression, countValue: Expression): Expression = { val sumName = Literal(UTF8String.fromString("sum"), StringType) val countName = Literal(UTF8String.fromString("count"), StringType) namedStruct(sumName, sumValue, countName, countValue) } // Update sum and count with array element if not missing def updateSumAndCountConditionally( stateStruct: Expression, arrayElement: Expression): Expression = { If( isMissing(arrayElement), // If value is missing, do not update sum and count stateStruct, // If value is not missing, add to sum and increment count createNamedStruct( stateStruct.getField("sum") + arrayElement, stateStruct.getField("count") + 1) ) } // Calculate mean for imputation def calculateMean(stateStruct: Expression): Expression = { If( stateStruct.getField("count") > 0, // If non-missing values were found, calculate the average stateStruct.getField("sum") / stateStruct.getField("count"), // If all values were missing, substitute with missing value missingValue ) } lazy val arrayMean: Expression = { // Sum and count of non-missing values array.aggregate( createNamedStruct(Literal(0d), Literal(0L)), updateSumAndCountConditionally, calculateMean ) } def substituteWithMean(arrayElement: Expression): Expression = { If(isMissing(arrayElement), arrayMean, arrayElement) } override def rewrite: Expression = { if (!array.dataType.isInstanceOf[ArrayType] || !arrayElementType.isInstanceOf[NumericType]) { throw SQLUtils.newAnalysisException( s"Can only perform mean substitution on numeric array; provided type is ${array.dataType}.") } if (!missingValue.dataType.isInstanceOf[NumericType]) { throw SQLUtils.newAnalysisException( s"Missing value must be of numeric type; provided type is ${missingValue.dataType}.") } // Replace missing values with the provided strategy array.arrayTransform(substituteWithMean(_)) } }
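Glow exposes this expression through its SQL function registry; the function name and registration call in the sketch below (mean_substitute, Glow.register) reflect Glow's documented public API rather than anything shown in this file, so treat them as assumptions. The data is illustrative and an active SparkSession named `spark` is assumed.

import io.projectglow.Glow

val sess = Glow.register(spark)  // assumes an active SparkSession named `spark`
import sess.implicits._

val df = Seq(
  Seq(1.0, -1.0, 3.0),    // -1.0 marks a missing entry; it is replaced by the mean of 1.0 and 3.0
  Seq(-1.0, -1.0, -1.0)   // all entries missing: the mean falls back to the sentinel itself
).toDF("values")

df.selectExpr("mean_substitute(values, -1.0) as imputed").show(truncate = false)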
Example 27
Source File: CountVectorizerExample.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperations.examples import org.apache.spark.sql.Row import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType} import io.deepsense.deeplang.doperables.dataframe.{DataFrame, DataFrameBuilder} import io.deepsense.deeplang.doperations.spark.wrappers.estimators.CountVectorizer class CountVectorizerExample extends AbstractOperationExample[CountVectorizer]{ override def dOperation: CountVectorizer = { val op = new CountVectorizer() op.estimator .setInputColumn("lines") .setNoInPlace("lines_out") .setMinTF(3) op.set(op.estimator.extractParamMap()) } override def inputDataFrames: Seq[DataFrame] = { val rows = Seq( Row("a a a b b c c c d ".split(" ").toSeq), Row("c c c c c c".split(" ").toSeq), Row("a".split(" ").toSeq), Row("e e e e e".split(" ").toSeq)) val rdd = sparkContext.parallelize(rows) val schema = StructType(Seq(StructField("lines", ArrayType(StringType, containsNull = true)))) Seq(DataFrameBuilder(sparkSQLSession).buildDataFrame(schema, rdd)) } }
Example 28
Source File: StringTokenizerSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class StringTokenizerSmokeTest extends AbstractTransformerWrapperSmokeTest[StringTokenizer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: StringTokenizer = { val inPlace = NoInPlaceChoice() .setOutputColumn("tokenized") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("s")) .setInPlace(inPlace) val transformer = new StringTokenizer() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single ): _*) } override def testValues: Seq[(Any, Any)] = { val strings = Seq( "this is a test", "this values should be separated", "Bla bla bla!" ) val tokenized = strings.map { _.toLowerCase.split("\\s") } strings.zip(tokenized) } override def inputType: DataType = StringType override def outputType: DataType = new ArrayType(StringType, true) }
Example 29
Source File: RegexTokenizerSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class RegexTokenizerSmokeTest extends AbstractTransformerWrapperSmokeTest[RegexTokenizer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: RegexTokenizer = { val inPlace = NoInPlaceChoice() .setOutputColumn("tokenized") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("s")) .setInPlace(inPlace) val transformer = new RegexTokenizer() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single, transformer.gaps -> false, transformer.minTokenLength -> 1, transformer.pattern -> "\\d+" ): _*) } override def testValues: Seq[(Any, Any)] = { val strings = Seq( "100 200 300", "400 500 600", "700 800 900" ) val tokenized = strings.map { _.toLowerCase.split(" ") } strings.zip(tokenized) } override def inputType: DataType = StringType override def outputType: DataType = new ArrayType(StringType, true) }
Example 30
Source File: NGramTransformerSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class NGramTransformerSmokeTest extends AbstractTransformerWrapperSmokeTest[NGramTransformer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: NGramTransformer = { val inPlace = NoInPlaceChoice() .setOutputColumn("ngrams") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("as")) .setInPlace(inPlace) val transformer = new NGramTransformer() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single, transformer.n -> 2 ): _*) } override def testValues: Seq[(Any, Any)] = { val strings = Seq( Array("a", "b", "c"), Array("d", "e", "f") ) val ngrams = Seq( Array("a b", "b c"), Array("d e", "e f") ) strings.zip(ngrams) } override def inputType: DataType = new ArrayType(StringType, true) override def outputType: DataType = new ArrayType(StringType, false) }
Example 31
Source File: StopWordsRemoverSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class StopWordsRemoverSmokeTest extends AbstractTransformerWrapperSmokeTest[StopWordsRemover] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: StopWordsRemover = { val inPlace = NoInPlaceChoice() .setOutputColumn("stopWordsRemoverOutput") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("as")) .setInPlace(inPlace) val stopWordsRemover = new StopWordsRemover() stopWordsRemover.set( stopWordsRemover.singleOrMultiChoiceParam -> single, stopWordsRemover.caseSensitive -> false) } override def testValues: Seq[(Any, Any)] = { val inputNumbers = Seq(Array("a", "seahorse", "The", "Horseshoe", "Crab")) val outputNumbers = Seq(Array("seahorse", "Horseshoe", "Crab")) inputNumbers.zip(outputNumbers) } override def inputType: DataType = ArrayType(StringType) override def outputType: DataType = ArrayType(StringType) }
Example 32
Source File: NGram.scala From sparkoscope with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} @Since("1.5.0") def getN: Int = $(n) setDefault(n -> 2) override protected def createTransformFunc: Seq[String] => Seq[String] = { _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.sameType(ArrayType(StringType)), s"Input type must be ArrayType(StringType) but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, false) } @Since("1.6.0") object NGram extends DefaultParamsReadable[NGram] { @Since("1.6.0") override def load(path: String): NGram = super.load(path) }
Example 33
Source File: cogroup.scala From spark-tools with Apache License 2.0 | 5 votes |
package io.univalence.plumbus import org.apache.spark.Partitioner import org.apache.spark.rdd.{ CoGroupedRDD, RDD } import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{ ArrayType, StructField } import org.apache.spark.sql.{ types, DataFrame, Dataset, Encoder, KeyValueGroupedDataset, Row } import scala.reflect.ClassTag import scala.util.Try object cogroup { implicit class KVGD[K, A](val kvgd: KeyValueGroupedDataset[K, A]) { def cogroup[B](right: KeyValueGroupedDataset[K, B]): Dataset[(K, Seq[A], Seq[B])] = //Use SparkAddOn ? ??? } def apply[A, B, K](left: Dataset[A], right: Dataset[B])(keyLeft: A => K, keyRight: B => K)( implicit encA: Encoder[A], encB: Encoder[B], encC: Encoder[K], enc: Encoder[(K, Seq[A], Seq[B])], ca: ClassTag[A], ck: ClassTag[K], cb: ClassTag[B] ): Dataset[(K, Seq[A], Seq[B])] = left.sparkSession.implicits .rddToDatasetHolder( RDD .rddToPairRDDFunctions(left.rdd.keyBy(keyLeft)) .cogroup(right.rdd.keyBy(keyRight)) .map({ case (k, (ia, ib)) => (k, ia.toSeq, ib.toSeq) }) ) .toDS def cogroupDf(group: DataFrame, namedSubGroup: (String, DataFrame)*)( byKey: String, partitioner: Partitioner = Partitioner.defaultPartitioner(group.rdd, namedSubGroup.map(_._2.rdd): _*) ): Try[DataFrame] = Try { val subGroup: Seq[DataFrame] = namedSubGroup.map(_._2) val allFrames: Seq[DataFrame] = group +: subGroup val allFramesKeyed: Seq[RDD[(String, Row)]] = allFrames.map(df => { val idx = df.columns.indexOf(byKey) df.rdd.keyBy(_.get(idx).toString) }) val cogroupRdd: CoGroupedRDD[String] = new CoGroupedRDD[String](allFramesKeyed, partitioner) val rowRdd: RDD[Row] = cogroupRdd.map(x => { val rows: Array[Seq[Row]] = x._2.asInstanceOf[Array[Iterable[Row]]].map(_.toSeq) val seq = rows.head.head.toSeq ++ rows.tail new GenericRowWithSchema(seq.toArray, null).asInstanceOf[Row] }) val schema = types.StructType( group.schema.fields ++ namedSubGroup.map { case (name, df) => StructField(name, ArrayType(df.schema)) } ) group.sparkSession.createDataFrame(rowRdd, schema) } }
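If the cogroup helper above is on the classpath, usage might look roughly like the following. The case classes, keys, and data are purely illustrative, and the implicit Encoders and ClassTags are assumed to be supplied by spark.implicits and the compiler:

import io.univalence.plumbus.cogroup
import org.apache.spark.sql.SparkSession

case class Click(userId: String, url: String)
case class Purchase(userId: String, amount: Double)

object CogroupUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("cogroup-sketch").getOrCreate()
    import spark.implicits._

    val clicks    = Seq(Click("u1", "/a"), Click("u1", "/b"), Click("u2", "/c")).toDS()
    val purchases = Seq(Purchase("u1", 9.99)).toDS()

    // One row per key: (key, all left rows for that key, all right rows for that key)
    val grouped = cogroup(clicks, purchases)(_.userId, _.userId)
    grouped.show(truncate = false)

    spark.stop()
  }
}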
Example 34
Source File: SparkLensTest.scala From spark-tools with Apache License 2.0 | 5 votes |
package io.univalence import org.apache.spark.SparkConf import org.apache.spark.sql.DataFrame import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.ArrayType import org.apache.spark.sql.types.StringType import io.univalence.SparkLens._ import org.scalatest.FunSuite case class Toto(name: String, age: Int) case class Tata(toto: Toto) class SparkLensTest extends FunSuite { val conf: SparkConf = new SparkConf() conf.setAppName("yo") conf.setMaster("local[*]") implicit val ss: SparkSession = SparkSession.builder.config(conf).getOrCreate import ss.implicits._ test("testLensRegExp change string") { assert(lensRegExp(ss.createDataFrame(Seq(Toto("a", 1))))({ case ("name", StringType) => true case _ => false }, { case (a: String, d) => a.toUpperCase }).as[Toto].first() == Toto("A", 1)) } test("change Int") { assert(lensRegExp(ss.createDataFrame(Seq(Tata(Toto("a", 1)))))({ case ("toto/age", _) => true case _ => false }, { case (a: Int, d) => a + 1 }).as[Tata].first() == Tata(Toto("a", 2))) } ignore("null to nil") { val df: DataFrame = ss.read.parquet("/home/phong/daily_gpp_20180705") val yoho: DataFrame = lensRegExp(df)({ case (_, ArrayType(_, _)) => true case _ => false }, (a, b) => if (a == null) Nil else a) } }
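Because ArrayType is a case class, the "case (_, ArrayType(_, _))" predicate above relies on its extractor. A small self-contained sketch of the same pattern applied to a schema's fields (the schema contents here are made up):

import org.apache.spark.sql.types._

object ArrayTypeMatchingSketch extends App {
  val schema = StructType(Seq(
    StructField("name", StringType),
    StructField("tags", ArrayType(StringType)),
    StructField("scores", ArrayType(DoubleType, containsNull = false))
  ))

  schema.fields.foreach {
    // ArrayType(elementType, containsNull) deconstructs the case class,
    // just like the lensRegExp predicate in the test above.
    case StructField(name, ArrayType(elementType, containsNull), _, _) =>
      println(s"$name: array of $elementType (containsNull = $containsNull)")
    case StructField(name, dataType, _, _) =>
      println(s"$name: $dataType")
  }
}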
Example 35
Source File: Tokenizer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} @Since("1.6.0") def getToLowercase: Boolean = $(toLowercase) setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+", toLowercase -> true) override protected def createTransformFunc: String => Seq[String] = { originStr => val re = $(pattern).r val str = if ($(toLowercase)) originStr.toLowerCase() else originStr val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq val minLength = $(minTokenLength) tokens.filter(_.length >= minLength) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType == StringType, s"Input type must be string type but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, true) @Since("1.4.1") override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra) } @Since("1.6.0") object RegexTokenizer extends DefaultParamsReadable[RegexTokenizer] { @Since("1.6.0") override def load(path: String): RegexTokenizer = super.load(path) }
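A hedged, standalone sketch of configuring the RegexTokenizer shown above; the input text and column names are made up:

import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.spark.sql.SparkSession

object RegexTokenizerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("tokenizer-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq("Spark makes Tokenizing TEXT easy").toDF("text")

    val tokenizer = new RegexTokenizer()
      .setInputCol("text")
      .setOutputCol("tokens")
      .setPattern("\\s+")        // split on whitespace (gaps = true)
      .setGaps(true)
      .setToLowercase(true)
      .setMinTokenLength(1)

    // The output column is ArrayType(StringType, containsNull = true),
    // as declared by outputDataType above.
    tokenizer.transform(df).show(truncate = false)

    spark.stop()
  }
}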
Example 36
Source File: HashingTF.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} @Since("2.0.0") def setBinary(value: Boolean): this.type = set(binary, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion. val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } @Since("1.4.1") override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
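A minimal sketch of applying the HashingTF transformer above to an already-tokenized column; note that transformSchema only requires the input column to be some ArrayType. The data and column names are illustrative:

import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.sql.SparkSession

object HashingTFSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("hashingtf-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq(Seq("spark", "hashing", "tf", "spark")).toDF("tokens")

    val hashingTF = new HashingTF()
      .setInputCol("tokens")      // must be an ArrayType column
      .setOutputCol("features")
      .setNumFeatures(1 << 10)
      .setBinary(false)           // term frequencies rather than 0/1 indicators

    hashingTF.transform(df).show(truncate = false)

    spark.stop()
  }
}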
Example 37
Source File: NGram.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} @Since("1.5.0") def getN: Int = $(n) setDefault(n -> 2) override protected def createTransformFunc: Seq[String] => Seq[String] = { _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.sameType(ArrayType(StringType)), s"Input type must be ArrayType(StringType) but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, false) } @Since("1.6.0") object NGram extends DefaultParamsReadable[NGram] { @Since("1.6.0") override def load(path: String): NGram = super.load(path) }
Example 38
Source File: SparkScoreDoc.scala From spark-lucenerdd with Apache License 2.0 | 5 votes |
package org.zouzias.spark.lucenerdd.models import org.apache.lucene.document.Document import org.apache.lucene.index.IndexableField import org.apache.lucene.search.{IndexSearcher, ScoreDoc} import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType} import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.inferNumericType import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.{DocIdField, ScoreField, ShardField} import scala.collection.JavaConverters._ sealed trait FieldType extends Serializable object TextType extends FieldType object IntType extends FieldType object DoubleType extends FieldType object LongType extends FieldType object FloatType extends FieldType private def inferNumericType(num: Number): FieldType = { num match { case _: java.lang.Double => DoubleType case _: java.lang.Long => LongType case _: java.lang.Integer => IntType case _: java.lang.Float => FloatType case _ => TextType } } }
Example 39
Source File: StackSummarizerFactory.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer import com.twosigma.flint.timeseries.summarize._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.types.{ ArrayType, StructField, StructType } case class StackSummarizerFactory(factories: Seq[SummarizerFactory]) extends SummarizerFactory { factories.foreach { case factory => require( !factory.isInstanceOf[OverlappableSummarizerFactory], "Stacking overlappable summarizers are not supported" ) } override val requiredColumns: ColumnList = factories.map(_.requiredColumns).reduce(_ ++ _) def apply(inputSchema: StructType): Summarizer = { val summarizers = factories.map(f => f.apply(inputSchema)) new StackSummarizer(inputSchema, prefixOpt, requiredColumns, summarizers) } } class StackSummarizer( override val inputSchema: StructType, override val prefixOpt: Option[String], override val requiredColumns: ColumnList, summarizers: Seq[Summarizer] ) extends Summarizer with InputAlwaysValid { override type T = InternalRow override type U = Seq[Any] override type V = Seq[InternalRow] require( summarizers.forall(s => s.outputSchema == summarizers.head.outputSchema), s"Summarizers must have identical schemas to be stacked: ${summarizers.map(_.outputSchema).mkString(" vs. ")}" ) override val schema: StructType = StructType( StructField(StackSummarizer.stackColumn, ArrayType(summarizers.head.outputSchema)) :: Nil ) override val summarizer = com.twosigma.flint.rdd.function.summarize.summarizer.StackSummarizer(summarizers) // Convert the output of `summarizer` to the InternalRow. override def fromV(v: V): InternalRow = InternalRow(new GenericArrayData(v)) // Convert the InternalRow to the type of row expected by the `summarizer`. override def toT(r: InternalRow): T = r } object StackSummarizer { val stackColumn = "stack" }
Example 40
Source File: ArrowSummarizer.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer import com.twosigma.flint.rdd.function.summarize.summarizer.{ ArrowSummarizerResult, ArrowSummarizerState, ArrowSummarizer => ArrowSum } import com.twosigma.flint.timeseries.row.Schema import com.twosigma.flint.timeseries.summarize.ColumnList import com.twosigma.flint.timeseries.summarize.{ ColumnList, InputAlwaysValid, Summarizer, SummarizerFactory } import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.types.{ ArrayType, BinaryType, StructType } object ArrowSummarizer { val baseRowsColumnName = "__baseRows" val arrowBatchColumnName = "arrow_bytes" } case class ArrowSummarizerFactory(columns: Seq[String], includeBaseRows: Boolean) extends SummarizerFactory { override val requiredColumns: ColumnList = if (includeBaseRows) { ColumnList.All } else { ColumnList.Sequence(columns) } override def apply(inputSchema: StructType): ArrowSummarizer = { val outputBatchSchema = StructType(columns.map(col => inputSchema(inputSchema.fieldIndex(col)))) ArrowSummarizer(inputSchema, outputBatchSchema, includeBaseRows, prefixOpt, requiredColumns) } } case class ArrowSummarizer( override val inputSchema: StructType, outputBatchSchema: StructType, includeBaseRows: Boolean, override val prefixOpt: Option[String], requiredColumns: ColumnList ) extends Summarizer with InputAlwaysValid { override type T = InternalRow override type U = ArrowSummarizerState override type V = ArrowSummarizerResult override val summarizer = ArrowSum(inputSchema, outputBatchSchema, includeBaseRows) override val schema: StructType = if (includeBaseRows) { Schema.of( ArrowSummarizer.baseRowsColumnName -> ArrayType(inputSchema), ArrowSummarizer.arrowBatchColumnName -> BinaryType ) } else { Schema.of( ArrowSummarizer.arrowBatchColumnName -> BinaryType ) } override def toT(r: InternalRow): T = r override def fromV(v: V): InternalRow = if (includeBaseRows) { InternalRow(new GenericArrayData(v.baseRows), v.arrowBatch) } else { InternalRow(v.arrowBatch) } }
Example 41
Source File: AssertEqualsSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries import com.twosigma.flint.timeseries.row.Schema import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.{ GenericRowWithSchema => SqlRow } import org.apache.spark.sql.types.{ ArrayType, DoubleType } import org.scalatest.exceptions.TestFailedException import scala.collection.mutable class AssertEqualsSpec extends TimeSeriesSuite { "TimeSeriesSuite" should "assertEquals for two sql rows of DoubleType correctly" in { val schema = Schema("x" -> DoubleType) val row1 = new SqlRow(Array(1L, 1.0), schema) val row2 = new SqlRow(Array(1L, 1.0 + defaultAdditivePrecision * 0.1), schema) val row3 = new SqlRow(Array(1L, 1.0 + defaultAdditivePrecision * 10.0), schema) assertAlmostEquals(row1, row2) intercept[TestFailedException] { assertAlmostEquals(row1, row3) } } it should "assertEquals for two sql rows of ArrayType(DoubleType) correctly" in { val schema = Schema("x" -> ArrayType(DoubleType)) val row1: Row = new SqlRow(Array(1L, mutable.WrappedArray.make(Array(1.0))), schema) val row2: Row = new SqlRow( Array(1L, mutable.WrappedArray.make(Array(1.0 + defaultAdditivePrecision * 0.1))), schema ) val row3: Row = new SqlRow( Array(1L, mutable.WrappedArray.make(Array(1.0 + defaultAdditivePrecision * 10.0))), schema ) assertAlmostEquals(row1, row2) intercept[TestFailedException] { assertAlmostEquals(row1, row3) } } it should "assertEquals for two sql rows of ArrayType(DoubleType) that contain NaN values correctly" in { val schema = Schema("x" -> ArrayType(DoubleType)) val row1 = new SqlRow(Array(1L, mutable.WrappedArray.make(Array(Double.NaN))), schema) val row2 = new SqlRow(Array(1L, mutable.WrappedArray.make(Array(Double.NaN))), schema) val row3 = new SqlRow(Array(1L, mutable.WrappedArray.make(Array(1.0))), schema) assertAlmostEquals(row1, row2) intercept[TestFailedException] { assertAlmostEquals(row1, row3) } } }
Example 42
Source File: XSDToSchemaSuite.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml.util import java.nio.file.Paths import org.apache.spark.sql.types.{ArrayType, StructField, StructType, StringType} import org.scalatest.funsuite.AnyFunSuite class XSDToSchemaSuite extends AnyFunSuite { test("Basic parsing") { val parsedSchema = XSDToSchema.read(Paths.get("src/test/resources/basket.xsd")) val expectedSchema = StructType(Array( StructField("basket", StructType(Array( StructField("entry", ArrayType( StructType(Array( StructField("key", StringType), StructField("value", StringType) ))) )) ))) ) assert(expectedSchema === parsedSchema) } }
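As a heavily hedged follow-on, an XSD-derived schema like the one parsed above might be fed into the spark-xml reader along these lines. The XML file path and rowTag value are assumptions, and extracting the nested "basket" struct as the row schema is an inference from the expected schema in the test, not something the test itself does:

import java.nio.file.Paths

import com.databricks.spark.xml.util.XSDToSchema
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType

object XsdSchemaUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("xsd-sketch").getOrCreate()

    val rootSchema = XSDToSchema.read(Paths.get("src/test/resources/basket.xsd"))

    // The XSD root element becomes a top-level StructField, so the row schema
    // for rowTag "basket" would be the nested struct:
    val basketSchema = rootSchema("basket").dataType.asInstanceOf[StructType]

    val df = spark.read
      .format("com.databricks.spark.xml")
      .option("rowTag", "basket")              // hypothetical row tag
      .schema(basketSchema)
      .load("src/test/resources/basket.xml")   // hypothetical data file

    df.printSchema()  // repeated <entry> elements surface as an ArrayType column
    spark.stop()
  }
}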
Example 43
Source File: ResolveLambdaVariablesSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.types.{ArrayType, IntegerType} class ResolveLambdaVariablesSuite extends PlanTest { import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ object Analyzer extends RuleExecutor[LogicalPlan] { val batches = Batch("Resolution", FixedPoint(4), ResolveLambdaVariables(conf)) :: Nil } private val key = 'key.int private val values1 = 'values1.array(IntegerType) private val values2 = 'values2.array(ArrayType(ArrayType(IntegerType))) private val data = LocalRelation(Seq(key, values1, values2)) private val lvInt = NamedLambdaVariable("x", IntegerType, nullable = true) private val lvHiddenInt = NamedLambdaVariable("col0", IntegerType, nullable = true) private val lvArray = NamedLambdaVariable("x", ArrayType(IntegerType), nullable = true) private def plan(e: Expression): LogicalPlan = data.select(e.as("res")) private def checkExpression(e1: Expression, e2: Expression): Unit = { comparePlans(Analyzer.execute(plan(e1)), plan(e2)) } private def lv(s: Symbol) = UnresolvedNamedLambdaVariable(Seq(s.name)) test("resolution - no op") { checkExpression(key, key) } test("resolution - simple") { val in = ArrayTransform(values1, LambdaFunction(lv('x) + 1, lv('x) :: Nil)) val out = ArrayTransform(values1, LambdaFunction(lvInt + 1, lvInt :: Nil)) checkExpression(in, out) } test("resolution - nested") { val in = ArrayTransform(values2, LambdaFunction( ArrayTransform(lv('x), LambdaFunction(lv('x) + 1, lv('x) :: Nil)), lv('x) :: Nil)) val out = ArrayTransform(values2, LambdaFunction( ArrayTransform(lvArray, LambdaFunction(lvInt + 1, lvInt :: Nil)), lvArray :: Nil)) checkExpression(in, out) } test("resolution - hidden") { val in = ArrayTransform(values1, key) val out = ArrayTransform(values1, LambdaFunction(key, lvHiddenInt :: Nil, hidden = true)) checkExpression(in, out) } test("fail - name collisions") { val p = plan(ArrayTransform(values1, LambdaFunction(lv('x) + lv('X), lv('x) :: lv('X) :: Nil))) val msg = intercept[AnalysisException](Analyzer.execute(p)).getMessage assert(msg.contains("arguments should not have names that are semantically the same")) } test("fail - lambda arguments") { val p = plan(ArrayTransform(values1, LambdaFunction(lv('x) + lv('y) + lv('z), lv('x) :: lv('y) :: lv('z) :: Nil))) val msg = intercept[AnalysisException](Analyzer.execute(p)).getMessage assert(msg.contains("does not match the number of arguments expected")) } }
Example 44
Source File: ExcelRelation.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel import scala.collection.JavaConversions._ import org.apache.spark.sql.sources.{ BaseRelation, TableScan } import org.apache.spark.sql.types.DataType import org.apache.spark.sql.types.ArrayType import org.apache.spark.sql.types.StringType import org.apache.spark.sql.types.StructField import org.apache.spark.sql.types.StructType import org.apache.spark.sql.SQLContext import org.apache.spark.sql._ import org.apache.spark.rdd.RDD import org.apache.hadoop.conf._ import org.apache.hadoop.mapreduce._ import org.apache.commons.logging.LogFactory import org.apache.commons.logging.Log import org.zuinnote.hadoop.office.format.common.dao._ import org.zuinnote.hadoop.office.format.mapreduce._ import org.zuinnote.spark.office.excel.util.ExcelFile override def buildScan: RDD[Row] = { // read ExcelRows val excelRowsRDD = ExcelFile.load(sqlContext, location, hadoopParams) // map to schema val schemaFields = schema.fields excelRowsRDD.flatMap(excelKeyValueTuple => { // map the Excel row data structure to a Spark SQL schema val rowArray = new Array[Any](excelKeyValueTuple._2.get.length) var i = 0; for (x <- excelKeyValueTuple._2.get) { // parse through the SpreadSheetCellDAO val spreadSheetCellDAOStructArray = new Array[String](schemaFields.length) val currentSpreadSheetCellDAO: Array[SpreadSheetCellDAO] = excelKeyValueTuple._2.get.asInstanceOf[Array[SpreadSheetCellDAO]] spreadSheetCellDAOStructArray(0) = currentSpreadSheetCellDAO(i).getFormattedValue spreadSheetCellDAOStructArray(1) = currentSpreadSheetCellDAO(i).getComment spreadSheetCellDAOStructArray(2) = currentSpreadSheetCellDAO(i).getFormula spreadSheetCellDAOStructArray(3) = currentSpreadSheetCellDAO(i).getAddress spreadSheetCellDAOStructArray(4) = currentSpreadSheetCellDAO(i).getSheetName // add row representing one Excel row rowArray(i) = spreadSheetCellDAOStructArray i += 1 } Some(Row.fromSeq(rowArray)) }) } }
Example 45
Source File: WrappersSpec.scala From sparksql-scalapb with Apache License 2.0 | 5 votes |
package scalapb.spark import com.example.protos.wrappers._ import org.apache.spark.sql.SparkSession import org.apache.hadoop.io.ArrayPrimitiveWritable import scalapb.GeneratedMessageCompanion import org.apache.spark.sql.types.IntegerType import org.apache.spark.sql.types.ArrayType import org.apache.spark.sql.types.StructField import org.apache.spark.sql.types.StructType import org.apache.spark.sql.types.StringType import org.apache.spark.sql.Row import org.scalatest.BeforeAndAfterAll import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.must.Matchers class WrappersSpec extends AnyFlatSpec with Matchers with BeforeAndAfterAll { val spark: SparkSession = SparkSession .builder() .appName("ScalaPB Demo") .master("local[2]") .getOrCreate() import spark.implicits.StringToColumn val data = Seq( PrimitiveWrappers( intValue = Option(45), stringValue = Option("boo"), ints = Seq(17, 19, 25), strings = Seq("foo", "bar") ), PrimitiveWrappers( intValue = None, stringValue = None, ints = Seq(17, 19, 25), strings = Seq("foo", "bar") ) ) "converting df with primitive wrappers" should "work with primitive implicits" in { import ProtoSQL.withPrimitiveWrappers.implicits._ val df = ProtoSQL.withPrimitiveWrappers.createDataFrame(spark, data) df.schema.fields.map(_.dataType).toSeq must be( Seq( IntegerType, StringType, ArrayType(IntegerType, false), ArrayType(StringType, false) ) ) df.collect must contain theSameElementsAs ( Seq( Row(45, "boo", Seq(17, 19, 25), Seq("foo", "bar")), Row(null, null, Seq(17, 19, 25), Seq("foo", "bar")) ) ) } "converting df with primitive wrappers" should "work with default implicits" in { import ProtoSQL.implicits._ val df = ProtoSQL.createDataFrame(spark, data) df.schema.fields.map(_.dataType).toSeq must be( Seq( StructType(Seq(StructField("value", IntegerType, true))), StructType(Seq(StructField("value", StringType, true))), ArrayType( StructType(Seq(StructField("value", IntegerType, true))), false ), ArrayType( StructType(Seq(StructField("value", StringType, true))), false ) ) ) df.collect must contain theSameElementsAs ( Seq( Row( Row(45), Row("boo"), Seq(Row(17), Row(19), Row(25)), Seq(Row("foo"), Row("bar")) ), Row( null, null, Seq(Row(17), Row(19), Row(25)), Seq(Row("foo"), Row("bar")) ) ) ) } }
Example 46
Source File: DataTypeMapping.scala From azure-kusto-spark with Apache License 2.0 | 5 votes |
package com.microsoft.kusto.spark.utils import org.apache.spark.sql.types.DataTypes._ import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, MapType, StructType} object DataTypeMapping { val kustoTypeToSparkTypeMap: Map[String, DataType] = Map( "string" -> StringType, "long" -> LongType, "datetime" -> TimestampType,// Kusto datetime is equivalent to TimestampType "timespan" -> StringType, "bool" -> BooleanType, "real" -> DoubleType, // Can be partitioned differently between precision and scale, total must be 34 to match .Net SqlDecimal "decimal" -> DataTypes.createDecimalType(20,14), "guid" -> StringType, "int" -> IntegerType, "dynamic" -> StringType ) val kustoJavaTypeToSparkTypeMap: Map[String, DataType] = Map( "string" -> StringType, "int64" -> LongType, "datetime" -> TimestampType, "timespan" -> StringType, "sbyte" -> BooleanType, "double" -> DoubleType, "sqldecimal" -> DataTypes.createDecimalType(20,14), "guid" -> StringType, "int32" -> IntegerType, "object" -> StringType ) val sparkTypeToKustoTypeMap: Map[DataType, String] = Map( StringType -> "string", BooleanType -> "bool", DateType -> "datetime", TimestampType -> "datetime", DataTypes.createDecimalType() -> "decimal", DoubleType -> "real", FloatType -> "real", ByteType -> "int", IntegerType -> "int", LongType -> "long", ShortType -> "int" ) def getSparkTypeToKustoTypeMap(fieldType: DataType): String ={ if(fieldType.isInstanceOf[DecimalType]) "decimal" else if (fieldType.isInstanceOf[ArrayType] || fieldType.isInstanceOf[StructType] || fieldType.isInstanceOf[MapType]) "dynamic" else DataTypeMapping.sparkTypeToKustoTypeMap.getOrElse(fieldType, "string") } }
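A quick illustrative check, not part of the connector itself, of how the helper above maps nested Spark types to Kusto's dynamic type:

import com.microsoft.kusto.spark.utils.DataTypeMapping
import org.apache.spark.sql.types._

object KustoTypeMappingSketch extends App {
  // Collection-like Spark types all collapse to Kusto's "dynamic" column type.
  println(DataTypeMapping.getSparkTypeToKustoTypeMap(ArrayType(StringType)))          // dynamic
  println(DataTypeMapping.getSparkTypeToKustoTypeMap(MapType(StringType, LongType)))  // dynamic
  println(DataTypeMapping.getSparkTypeToKustoTypeMap(StructType(Nil)))                // dynamic
  println(DataTypeMapping.getSparkTypeToKustoTypeMap(DecimalType(20, 14)))            // decimal
  println(DataTypeMapping.getSparkTypeToKustoTypeMap(IntegerType))                    // int
}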
Example 47
Source File: HashingTF.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} @Since("2.0.0") def setBinary(value: Boolean): this.type = set(binary, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion. val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } @Since("1.4.1") override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
Example 48
Source File: GenerateSQL.scala From spark-tools with Apache License 2.0 | 5 votes |
package io.univalence.plumbus.compress import io.univalence.plumbus.compress.CompressDump._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.{ ArrayType, StructType } import org.apache.spark.sql.{ DataFrame, Row } object GenerateSQL { def displayLigneFreqOverPkPerDump(df: DataFrame): Unit = { val pos = df.schema.fieldNames.indexOf(rowsName) val pos2 = df.schema .fields(pos) .dataType .asInstanceOf[ArrayType] .elementType .asInstanceOf[StructType] .fieldNames .indexOf(dtFinalColName) val rddF: RDD[(String, Int)] = df.rdd.flatMap( _.get(pos) .asInstanceOf[Seq[Row]] .flatMap( _.get(pos2).asInstanceOf[Seq[String]] ) .groupBy(x => x) .mapValues(_.size) ) rddF.countByValue().foreach(println) } def generateView(schema: StructType, tablename: String): String = { val keyFields: Vector[String] = schema.fieldNames.toVector.filterNot(_ == rowsName) val fields: Vector[String] = schema .fields(schema.fieldIndex(rowsName)) .dataType .asInstanceOf[ArrayType] .elementType .asInstanceOf[StructType] .fieldNames .filterNot(_ == dtFinalColName) .toVector val projectionFields: Seq[String] = keyFields ++ Seq("minDt", "maxDt", "minDt_prev", "maxDt_prev", "minDt_prev IS NULL as isInit") ++ fields.flatMap( name => { val name_prev = name + "_prev" Seq( s"""(minDt_prev IS NOT NULL) AND ( ($name <> $name_prev ) OR ($name_prev IS NULL AND $name IS NOT NULL ) OR ($name IS NULL AND $name_prev IS NOT NULL ) ) as ${name}_hasChanged""", name, name_prev ) } ) s""" select ${projectionFields.mkString(",\n")} from $tablename tbl, (select lineT2.*, LAG(minDt) OVER (order by minDt) as minDt_prev, LAG(maxDt) OVER (order by minDt) as maxDt_prev, ${fields.map(name => s"LAG($name) OVER (order by minDt) as ${name}_prev").mkString(",\n")} from (select lineT1.*, minDt, maxDt from tbl.$rowsName as lineT1, (select min(dts.item) as minDt, max(dts.item) as maxDt from lineT1.compressdumpdts as dts) as dtsE) as lineT2) as lineT3 """ } }
Example 49
Source File: XGBoost.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import eleflow.uberdata.models.UberXGBOOSTModel import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.scala.DMatrix import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{ArrayType, DoubleType, StructField, StructType} import scala.reflect.ClassTag class XGBoost[I](override val uid: String, val models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))])( implicit kt: ClassTag[I], ord: Ordering[I] = null) extends ForecastBaseModel[XGBoostSmallModel[I]] with HasInputCol with HasOutputCol with DefaultParamsWritable with HasFeaturesCol with HasNFutures with HasGroupByCol { def this( models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))] )(implicit kt: ClassTag[I], ord: Ordering[I] ) = this(Identifiable.randomUID("xgboost"), models) override def transform(dataSet: Dataset[_]): DataFrame = { val schema = dataSet.schema val predSchema = transformSchema(schema) val joined = models.join(dataSet.rdd.map{case (r: Row) => (r.getAs[I]($(groupByCol).get), r)}) val predictions = joined.map { case (id, ((bestModel, metrics), row)) => val features = row.getAs[Array[org.apache.spark.ml.linalg.Vector]]( IUberdataForecastUtil.FEATURES_COL_NAME ) val label = DataTransformer.toFloat(row.getAs($(featuresCol))) val labelPoint = features.map { vec => val array = vec.toArray.map(_.toFloat) LabeledPoint(label, null, array) } val matrix = new DMatrix(labelPoint.toIterator) val (ownFeaturesPrediction, forecast) = bestModel.boosterInstance .predict(matrix) .flatMap(_.map(_.toDouble)) .splitAt(features.length) Row( row.toSeq :+ Vectors .dense(forecast) :+ SupportedAlgorithm.XGBoostAlgorithm.toString :+ bestModel.params .map(f => f._1 -> f._2.toString) :+ Vectors.dense(ownFeaturesPrediction): _* ) } dataSet.sqlContext.createDataFrame(predictions, predSchema) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { schema.add(StructField($(outputCol), ArrayType(DoubleType))) } override def copy(extra: ParamMap): XGBoostSmallModel[I] = defaultCopy(extra) }
Example 50
Source File: XGBoostBaseBestModel.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import ml.dmlc.xgboost4j.scala.{Booster, DMatrix} import ml.dmlc.xgboost4j.LabeledPoint import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.evaluation.TimeSeriesEvaluator import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasGroupByCol import org.apache.spark.ml.linalg.VectorUDT import org.apache.spark.sql.Row import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType} trait BaseXGBoostBestModelFinder[G, M <: org.apache.spark.ml.ForecastBaseModel[M]] extends BestModelFinder[G, M] with HasGroupByCol { protected def buildTrainSchema(sparkContext: SparkContext): Broadcast[StructType] = sparkContext.broadcast { StructType( Seq( StructField($(groupByCol).get, FloatType), StructField(IUberdataForecastUtil.FEATURES_COL_NAME, ArrayType(new VectorUDT)))) } protected def xGBoostEvaluation(row: Row, model: Booster, broadcastEvaluator: Broadcast[TimeSeriesEvaluator[G]], id: G, parameters: ParamMap): ModelParamEvaluation[G] = { val featuresArray = row .getAs[Array[org.apache.spark.ml.linalg.Vector]](IUberdataForecastUtil.FEATURES_COL_NAME) .map { vec => val values = vec.toArray.map(DataTransformer.toFloat) LabeledPoint(values.head, null, values.tail) } val features = new DMatrix(featuresArray.toIterator) log.warn(s"Evaluating forecast for id $id, with xgboost") val prediction = model.predict(features).flatten val (forecastToBeValidated, _) = prediction.splitAt(featuresArray.length) val toBeValidated = featuresArray.zip(forecastToBeValidated) val metric = broadcastEvaluator.value.evaluate(toBeValidated.map(f => (f._1.label.toDouble, f._2.toDouble))) val metricName = broadcastEvaluator.value.getMetricName new ModelParamEvaluation[G]( id, metric, parameters, Some(metricName), SupportedAlgorithm.XGBoostAlgorithm) } }
Example 51
Source File: JsonNestedExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.nested import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.types.{ArrayType, DataType, StructField, StructType} import scala.collection.mutable object JsonNestedExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val isLocal = args(0).equalsIgnoreCase("l") val jsonPath = args(1) val outputTableName = args(2) val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") val jsonDf = sparkSession.read.json(jsonPath) val localJsonDf = jsonDf.collect() println("--Df") jsonDf.foreach(row => { println("row:" + row) }) println("--local") localJsonDf.foreach(row => { println("row:" + row) }) jsonDf.createOrReplaceTempView("json_table") println("--Tree Schema") jsonDf.schema.printTreeString() println("--") jsonDf.write.saveAsTable(outputTableName) sparkSession.sqlContext.sql("select * from " + outputTableName).take(10).foreach(println) println("--") sparkSession.stop() } def populatedFlattedHashMap(row:Row, schema:StructType, fields:Array[StructField], flattedMap:mutable.HashMap[(String, DataType), mutable.MutableList[Any]], parentFieldName:String): Unit = { fields.foreach(field => { println("field:" + field.dataType) if (field.dataType.isInstanceOf[ArrayType]) { val elementType = field.dataType.asInstanceOf[ArrayType].elementType if (elementType.isInstanceOf[StructType]) { val childSchema = elementType.asInstanceOf[StructType] val childRow = Row.fromSeq(row.getAs[mutable.WrappedArray[Any]](field.name).toSeq) populatedFlattedHashMap(childRow, childSchema, childSchema.fields, flattedMap, parentFieldName + field.name + ".") } } else { val fieldList = flattedMap.getOrElseUpdate((parentFieldName + field.name, field.dataType), new mutable.MutableList[Any]) fieldList.+=:(row.getAs[Any](schema.fieldIndex(field.name))) } }) } }
Example 52
Source File: NestedTableExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.nested import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} import org.apache.spark.sql.{Row, SparkSession} object NestedTableExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val spark = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .enableHiveSupport() .getOrCreate() spark.sql("create table IF NOT EXISTS nested_empty " + "( A int, " + " B string, " + " nested ARRAY<STRUCT< " + " nested_C: int," + " nested_D: string" + " >>" + ") ") val rowRDD = spark.sparkContext. parallelize(Array( Row(1, "foo", Seq(Row(1, "barA"),Row(2, "bar"))), Row(2, "foo", Seq(Row(1, "barB"),Row(2, "bar"))), Row(3, "foo", Seq(Row(1, "barC"),Row(2, "bar"))))) val emptyDf = spark.sql("select * from nested_empty limit 0") val tableSchema = emptyDf.schema val populated1Df = spark.sqlContext.createDataFrame(rowRDD, tableSchema) println("----") populated1Df.collect().foreach(r => println(" emptySchemaExample:" + r)) val nestedSchema = new StructType() .add("nested_C", IntegerType) .add("nested_D", StringType) val definedSchema = new StructType() .add("A", IntegerType) .add("B", StringType) .add("nested", ArrayType(nestedSchema)) val populated2Df = spark.sqlContext.createDataFrame(rowRDD, definedSchema) println("----") populated1Df.collect().foreach(r => println(" BuiltExample:" + r)) spark.stop() } }
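As an aside (assuming Spark 2.3 or later), the same nested schema built programmatically above can also be derived from the DDL used in the CREATE TABLE statement:

import org.apache.spark.sql.types.StructType

object DdlToSchemaSketch extends App {
  val ddl = "A INT, B STRING, nested ARRAY<STRUCT<nested_C: INT, nested_D: STRING>>"

  // StructType.fromDDL parses the ARRAY<STRUCT<...>> column into
  // ArrayType(StructType(...)), equivalent to the hand-built definedSchema above.
  val schema = StructType.fromDDL(ddl)
  schema.printTreeString()
}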
Example 53
Source File: PopulateHiveTable.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.nested import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType} object PopulateHiveTable { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val spark = SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() spark.sql("create table IF NOT EXISTS nested_empty " + "( A int, " + " B string, " + " nested ARRAY<STRUCT< " + " nested_C: int," + " nested_D: string" + " >>" + ") ") val rowRDD = spark.sparkContext. parallelize(Array( Row(1, "foo", Seq(Row(1, "barA"),Row(2, "bar"))), Row(2, "foo", Seq(Row(1, "barB"),Row(2, "bar"))), Row(3, "foo", Seq(Row(1, "barC"),Row(2, "bar"))))) val emptyDf = spark.sql("select * from nested_empty limit 0") val tableSchema = emptyDf.schema val populated1Df = spark.sqlContext.createDataFrame(rowRDD, tableSchema) populated1Df.repartition(2).write.saveAsTable("nested_populated") println("----") populated1Df.collect().foreach(r => println(" emptySchemaExample:" + r)) val nestedSchema = new StructType() .add("nested_C", IntegerType) .add("nested_D", StringType) val definedSchema = new StructType() .add("A", IntegerType) .add("B", StringType) .add("nested", ArrayType(nestedSchema)) val populated2Df = spark.sqlContext.createDataFrame(rowRDD, definedSchema) println("----") populated1Df.collect().foreach(r => println(" BuiltExample:" + r)) spark.stop() } }
Example 54
Source File: AnnotatorApproach.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import com.johnsnowlabs.storage.HasStorage import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.{Estimator, Model, PipelineModel, Transformer} import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.types.{ArrayType, MetadataBuilder, StructField, StructType} import org.apache.spark.ml.util.DefaultParamsWritable override final def transformSchema(schema: StructType): StructType = { require(validate(schema), s"Wrong or missing inputCols annotators in $uid.\n" + msgHelper(schema) + s"\nMake sure such annotators exist in your pipeline, " + s"with the right output names and that they have following annotator types: " + s"${inputAnnotatorTypes.mkString(", ")}") val metadataBuilder: MetadataBuilder = new MetadataBuilder() metadataBuilder.putString("annotatorType", outputAnnotatorType) val outputFields = schema.fields :+ StructField(getOutputCol, ArrayType(Annotation.dataType), nullable = false, metadataBuilder.build) StructType(outputFields) } }
Example 55
Source File: FunctionsTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import com.johnsnowlabs.nlp.annotator.{PerceptronApproach, Tokenizer} import com.johnsnowlabs.nlp.training.POS import com.johnsnowlabs.nlp.util.io.{ExternalResource, ReadAs, ResourceHelper} import org.apache.spark.ml.Pipeline import org.apache.spark.sql.types.ArrayType import org.scalatest._ class FunctionsTestSpec extends FlatSpec { "functions in functions" should "work successfully" in { import com.johnsnowlabs.nlp.util.io.ResourceHelper.spark.implicits._ val trainingPerceptronDF = POS().readDataset(ResourceHelper.spark, "src/test/resources/anc-pos-corpus-small/", "|", "tags") val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val tokenizer = new Tokenizer() .setInputCols(Array("document")) .setOutputCol("token") val pos = new PerceptronApproach() .setInputCols("document", "token") .setOutputCol("pos") .setPosColumn("tags") .setNIterations(3) val pipeline = new Pipeline() .setStages(Array( documentAssembler, tokenizer, pos )) val model = pipeline.fit(trainingPerceptronDF) val data = model.transform(Seq("Peter is a very good and compromised person.").toDF("text")) import functions._ val mapped = data.mapAnnotationsCol("pos", "modpos", (annotations: Seq[Annotation]) => { annotations.filter(_.result == "JJ") }) val modified = data.mapAnnotationsCol("pos", "modpos", (_: Seq[Annotation]) => { "hello world" }) val filtered = data.filterByAnnotationsCol("pos", (annotations: Seq[Annotation]) => { annotations.exists(_.result == "JJ") }) import org.apache.spark.sql.functions.col val udfed = data.select(mapAnnotations((annotations: Seq[Annotation]) => { annotations.filter(_.result == "JJ") }, ArrayType(Annotation.dataType))(col("pos"))) val udfed2 = data.select(mapAnnotationsStrict((annotations: Seq[Annotation]) => { annotations.filter(_.result == "JJ") })(col("pos"))) mapped.show(truncate = false) modified.show(truncate = false) filtered.show(truncate = false) udfed.show(truncate = false) udfed2.show(truncate = false) } }
Example 56
Source File: LemmatizerTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector import com.johnsnowlabs.nlp._ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.sql.types.ArrayType import org.apache.spark.sql.{Dataset, Row} import org.scalatest._ class LemmatizerTestSpec extends FlatSpec with LemmatizerBehaviors { require(Some(SparkAccessor).isDefined) val lemmatizer = new Lemmatizer "a lemmatizer" should s"be of type ${AnnotatorType.TOKEN}" in { assert(lemmatizer.outputAnnotatorType == AnnotatorType.TOKEN) } val latinBodyData: Dataset[Row] = DataBuilder.basicDataBuild(ContentProvider.latinBody) "A full Normalizer pipeline with latin content" should behave like fullLemmatizerPipeline(latinBodyData) "A lemmatizer" should "be readable and writable" taggedAs Tag("LinuxOnly") in { val lemmatizer = new Lemmatizer().setDictionary("src/test/resources/lemma-corpus-small/lemmas_small.txt", "->", "\t") val path = "./test-output-tmp/lemmatizer" try { lemmatizer.write.overwrite.save(path) val lemmatizerRead = Lemmatizer.read.load(path) assert(lemmatizer.getDictionary.path == lemmatizerRead.getDictionary.path) } catch { case _: java.io.IOException => succeed } } "A lemmatizer" should "work under a pipeline framework" in { val data = ContentProvider.parquetData.limit(1000) val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") val sentenceDetector = new SentenceDetector() .setInputCols(Array("document")) .setOutputCol("sentence") val tokenizer = new Tokenizer() .setInputCols(Array("sentence")) .setOutputCol("token") val lemmatizer = new Lemmatizer() .setInputCols(Array("token")) .setOutputCol("lemma") .setDictionary("src/test/resources/lemma-corpus-small/lemmas_small.txt", "->", "\t") val finisher = new Finisher() .setInputCols("lemma") val pipeline = new Pipeline() .setStages(Array( documentAssembler, sentenceDetector, tokenizer, lemmatizer, finisher )) val recursivePipeline = new RecursivePipeline() .setStages(Array( documentAssembler, sentenceDetector, tokenizer, lemmatizer, finisher )) val model = pipeline.fit(data) model.transform(data).show() val PIPE_PATH = "./tmp_pipeline" model.write.overwrite().save(PIPE_PATH) val loadedPipeline = PipelineModel.read.load(PIPE_PATH) loadedPipeline.transform(data).show val recursiveModel = recursivePipeline.fit(data) recursiveModel.transform(data).show() recursiveModel.write.overwrite().save(PIPE_PATH) val loadedRecPipeline = PipelineModel.read.load(PIPE_PATH) loadedRecPipeline.transform(data).show succeed } }
Example 57
Source File: YelpHelpers.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.integration.yelp import org.apache.spark.sql.types.{ArrayType, DateType, IntegerType, LongType} import org.apache.spark.sql.{Column, DataFrame, SparkSession, functions} import org.opencypher.morpheus.api.io.GraphElement.sourceIdKey import org.opencypher.morpheus.api.io.Relationship.{sourceEndNodeKey, sourceStartNodeKey} import org.opencypher.morpheus.impl.table.SparkTable._ import org.opencypher.morpheus.integration.yelp.YelpConstants._ object YelpHelpers { case class YelpTables( userDf: DataFrame, businessDf: DataFrame, reviewDf: DataFrame ) def loadYelpTables(inputPath: String)(implicit spark: SparkSession): YelpTables = { import spark.implicits._ log("read business.json", 2) val rawBusinessDf = spark.read.json(s"$inputPath/business.json") log("read review.json", 2) val rawReviewDf = spark.read.json(s"$inputPath/review.json") log("read user.json", 2) val rawUserDf = spark.read.json(s"$inputPath/user.json") val businessDf = rawBusinessDf.select($"business_id".as(sourceIdKey), $"business_id", $"name", $"address", $"city", $"state") val reviewDf = rawReviewDf.select($"review_id".as(sourceIdKey), $"user_id".as(sourceStartNodeKey), $"business_id".as(sourceEndNodeKey), $"stars", $"date".cast(DateType)) val userDf = rawUserDf.select( $"user_id".as(sourceIdKey), $"name", $"yelping_since".cast(DateType), functions.split($"elite", ",").cast(ArrayType(LongType)).as("elite")) YelpTables(userDf, businessDf, reviewDf) } def printYelpStats(inputPath: String)(implicit spark: SparkSession): Unit = { val rawBusinessDf = spark.read.json(s"$inputPath/business.json") val rawReviewDf = spark.read.json(s"$inputPath/review.json") import spark.implicits._ rawBusinessDf.select($"city", $"state").distinct().show() rawBusinessDf.withColumnRenamed("business_id", "id") .join(rawReviewDf, $"id" === $"business_id") .groupBy($"city", $"state") .count().as("count") .orderBy($"count".desc, $"state".asc) .show(100) } def extractYelpCitySubset(inputPath: String, outputPath: String, city: String)(implicit spark: SparkSession): Unit = { import spark.implicits._ def emailColumn(userId: String): Column = functions.concat($"$userId", functions.lit("@yelp.com")) val rawUserDf = spark.read.json(s"$inputPath/user.json") val rawReviewDf = spark.read.json(s"$inputPath/review.json") val rawBusinessDf = spark.read.json(s"$inputPath/business.json") val businessDf = rawBusinessDf.filter($"city" === city) val reviewDf = rawReviewDf .join(businessDf, Seq("business_id"), "left_semi") .withColumn("user_email", emailColumn("user_id")) .withColumnRenamed("stars", "stars_tmp") .withColumn("stars", $"stars_tmp".cast(IntegerType)) .drop("stars_tmp") val userDf = rawUserDf .join(reviewDf, Seq("user_id"), "left_semi") .withColumn("email", emailColumn("user_id")) val friendDf = userDf .select($"email".as("user1_email"), functions.explode(functions.split($"friends", ", ")).as("user2_id")) .withColumn("user2_email", emailColumn("user2_id")) .select(s"user1_email", s"user2_email") businessDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/business.json") reviewDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/review.json") userDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/user.json") friendDf.write.json(s"$outputPath/$cityGraphName/$yelpBookDB/friend.json") } implicit class DataFrameOps(df: DataFrame) { def prependIdColumn(idColumn: String, prefix: String): DataFrame = df.transformColumns(idColumn)(column => functions.concat(functions.lit(prefix), column).as(idColumn)) } }
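The elite column above is parsed with a split followed by a cast to ArrayType(LongType); here is a minimal standalone sketch of that pattern with made-up input:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.split
import org.apache.spark.sql.types.{ArrayType, LongType}

object SplitCastSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("split-cast-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq("2015,2016,2017", "2018").toDF("elite")

    // split yields array<string>; casting to ArrayType(LongType) converts each element.
    val parsed = df.withColumn("elite", split($"elite", ",").cast(ArrayType(LongType)))

    parsed.printSchema()          // elite: array<bigint>
    parsed.show(truncate = false)

    spark.stop()
  }
}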
Example 58
Source File: MorpheusRecordHeaderTest.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.impl.table import org.apache.spark.sql.types.{ArrayType, StringType, StructField} import org.opencypher.morpheus.impl.convert.SparkConversions._ import org.opencypher.okapi.api.types.{CTList, CTString} import org.opencypher.okapi.ir.api.expr.Var import org.opencypher.okapi.relational.impl.table.RecordHeader import org.opencypher.okapi.testing.BaseTestSuite class MorpheusRecordHeaderTest extends BaseTestSuite { it("computes a struct type from a given record header") { val header = RecordHeader.empty .withExpr(Var("a")(CTString)) .withExpr(Var("b")(CTString.nullable)) .withExpr(Var("c")(CTList(CTString.nullable))) header.toStructType.fields.toSet should equal(Set( StructField(header.column(Var("a")()), StringType, nullable = false), StructField(header.column(Var("b")()), StringType, nullable = true), StructField(header.column(Var("c")()), ArrayType(StringType, containsNull = true), nullable = false) )) } }
Example 59
Source File: HashingTF.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} @Since("2.0.0") def setBinary(value: Boolean): this.type = set(binary, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion. val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } @Since("1.4.1") override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
Example 60
Source File: NGram.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} @Since("1.5.0") def getN: Int = $(n) setDefault(n -> 2) override protected def createTransformFunc: Seq[String] => Seq[String] = { _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.sameType(ArrayType(StringType)), s"Input type must be ArrayType(StringType) but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, false) } @Since("1.6.0") object NGram extends DefaultParamsReadable[NGram] { @Since("1.6.0") override def load(path: String): NGram = super.load(path) }
Example 61
Source File: Tokenizer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types.{ArrayType, DataType, StringType} def getPattern: String = $(pattern) setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+") override protected def createTransformFunc: String => Seq[String] = { str => val re = $(pattern).r val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq val minLength = $(minTokenLength) tokens.filter(_.length >= minLength) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType == StringType, s"Input type must be string type but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, false) override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra) }
Example 62
Source File: HashingTF.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.feature import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} def setNumFeatures(value: Int): this.type = set(numFeatures, value) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)) val t = udf { terms: Seq[_] => hashingTF.transform(terms) } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) }