org.apache.spark.ml.linalg.SQLDataTypes.VectorType Scala Examples
The following examples show how to use org.apache.spark.ml.linalg.SQLDataTypes.VectorType.
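Before the project examples, here is a minimal, self-contained sketch of the basic pattern the examples build on: declare a schema column whose data type is VectorType, put org.apache.spark.ml.linalg.Vector values into the matching Row slots, and create a DataFrame. It is only an illustration and assumes an active SparkSession named spark, which is not part of any example below.

import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// A schema with a string id column and an ml Vector column.
val schema = StructType(Seq(
  StructField("id", StringType),
  StructField("features", VectorType)))

// Rows in the VectorType column must hold org.apache.spark.ml.linalg.Vector values.
val rows = Seq(
  Row("a", Vectors.dense(1.0, 2.0, 3.0)),
  Row("b", Vectors.sparse(3, Array(1), Array(4.0))))

val df = spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
df.printSchema()  // "features" is reported as a vector user-defined type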
Example 1
Source File: get_features_from_peinfo.scala From gsoc_relationship with Apache License 2.0
// Spark-shell style script: assumes an active spark-shell / notebook session where
// sc, spark and spark.implicits._ are in scope, and where keyspace, table names and
// output paths come from PreProcessingConfig.
import com.datastax.spark.connector._
import play.api.libs.json.Json
import play.api.libs.json._
import java.io.{ByteArrayOutputStream, ByteArrayInputStream}
import java.util.zip.{GZIPOutputStream, GZIPInputStream}
import Array.concat
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.desc
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.linalg._
import org.apache.spark.sql.Row
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.DenseVector
import PreProcessingConfig._

case class peinfo_results_by_service_name_class(service_name: String, sha256: String)
case class peinfo_results_by_sha256_class(sha256: String, service_name: String, results: Array[Byte])
case class peinfo_join_results_class(sha256: String, service_name: String, results: String)
case class peinfo_int_final_array_rdd_class(sha256: String, array_results: Array[Double])
case class peinfo_binaray_final_array_rdd_class(sha256: String, array_results: Array[Double])
case class peinfo_final_array_rdd_class(sha256: String, array_results: Array[Double])

// Decompress the gzipped result blob stored in Cassandra into a JSON string.
def unzip(x: Array[Byte]): String = {
  val inputStream = new GZIPInputStream(new ByteArrayInputStream(x))
  scala.io.Source.fromInputStream(inputStream).mkString
}

// Extract 16 numeric PE-section features (.text/.data/.rsrc/.rdata) plus the timestamp.
def findAllIntinpeinfo(peinfo_json_results: JsLookupResult, time: Double): Array[Double] = {
  val entropy = peinfo_json_results \\ "entropy"
  val virt_address = peinfo_json_results \\ "virt_address"
  val virt_size = peinfo_json_results \\ "virt_size"
  val size = peinfo_json_results \\ "size"
  var i = 0
  var List = Array.iterate(0.0, 17)(a => a * 0)
  for (k <- (peinfo_json_results \\ "section_name")) {
    k.as[String] match {
      case ".text\u0000\u0000\u0000" =>
        List(0) = entropy(i).as[Double]
        List(1) = Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble
        List(2) = virt_size(i).as[Double]
        List(3) = size(i).as[Double]
      case ".data\u0000\u0000\u0000" =>
        List(4) = entropy(i).as[Double]
        List(5) = Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble
        List(6) = virt_size(i).as[Double]
        List(7) = size(i).as[Double]
      case ".rsrc\u0000\u0000\u0000" =>
        List(8) = entropy(i).as[Double]
        List(9) = Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble
        List(10) = virt_size(i).as[Double]
        List(11) = size(i).as[Double]
      case ".rdata\u0000\u0000" =>
        List(12) = entropy(i).as[Double]
        List(13) = Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble
        List(14) = virt_size(i).as[Double]
        List(15) = size(i).as[Double]
      case other => {}
    }
    i = i + 1
  }
  List(16) = time
  List.toArray
}

val peinfo_results_by_service_name_meta =
  sc.cassandraTable[peinfo_results_by_service_name_class](keyspace, service_name_table).where("service_name=?", "peinfo")
val peinfo_results_by_service_name_rdd =
  peinfo_results_by_service_name_meta.keyBy(x => (x.sha256, x.service_name))
val peinfo_results_by_sha256_meta =
  sc.cassandraTable[peinfo_results_by_sha256_class](keyspace, sha256_table)
val peinfo_results_by_sha256_rdd =
  peinfo_results_by_sha256_meta.keyBy(x => (x.sha256, x.service_name))
val peinfo_join_results = peinfo_results_by_service_name_rdd
  .join(peinfo_results_by_sha256_rdd)
  .map(x => (new peinfo_join_results_class(x._1._1, x._1._2, unzip(x._2._2.results))))
  .distinct()
  .cache()

// Numeric (integer) features: PE sections plus timestamp.
val peinfo_int_final_array_rdd = peinfo_join_results
  .map(x => (x.sha256,
    (Json.parse(x.results) \ "pe_sections"),
    {if ((Json.parse(x.results) \ "timestamp").isInstanceOf[JsUndefined]) 0.0
     else (Json.parse(x.results) \ "timestamp" \\ "timestamp")(0).as[Double]}))
  .filter(x => !x._2.isInstanceOf[JsUndefined])
  .map(x => new peinfo_int_final_array_rdd_class(x._1, findAllIntinpeinfo(x._2, x._3)))

// Build the vocabulary of frequent dll.function imports (count > 10000).
val peinfo_dllfunction_list = peinfo_join_results
  .map(x => Json.parse(x.results) \ "imports")
  .filter(x => !x.isInstanceOf[JsUndefined])
  .flatMap(x => x.as[List[Map[String, String]]].map(x => (x("dll") + "." + x("function"))))
  .toDF("func_name")
  .groupBy("func_name").count
  .sort(desc("count"))
  .filter("count > 10000")
  .rdd.map(r => r.getString(0))
  .collect().toList

implicit def bool2int(b: Boolean) = if (b) 1 else 0

// Binary indicator vector: one slot per frequent dll.function import.
def findAllBininpeinfo_dllfunction(peinfo_dllfunction: Seq[String]): Array[Double] = {
  val forlist = for (family <- peinfo_dllfunction_list) yield {
    (peinfo_dllfunction.contains(family): Int).toDouble
  }
  forlist.toArray
}

val List502 = Array.iterate(0.0, 502)(a => 0.0)
val peinfo_binaray_final_array_rdd = peinfo_join_results
  .map(x => (x.sha256, (Json.parse(x.results) \ "imports")))
  .map(x => new peinfo_binaray_final_array_rdd_class(x._1,
    {if (x._2.isInstanceOf[JsUndefined]) List502
     else findAllBininpeinfo_dllfunction(x._2.as[Seq[Map[String, String]]].map(x => (x("dll") + "." + x("function"))))}))

// Concatenate the numeric and binary feature arrays per sha256.
val peinfo_int_final_array_rdd_before_join = peinfo_int_final_array_rdd.map(x => (x.sha256, x.array_results))
val peinfo_binaray_final_array_rdd_before_join = peinfo_binaray_final_array_rdd.map(x => (x.sha256, x.array_results))
val peinfo_array_rdd_by_join = peinfo_int_final_array_rdd_before_join
  .join(peinfo_binaray_final_array_rdd_before_join)
  .map(x => (x._1, concat(x._2._1, x._2._2)))
val peinfo_final_array_rdd = peinfo_array_rdd_by_join.map(x => new peinfo_final_array_rdd_class(x._1, x._2))

// Pack the features into a VectorType column and scale them with MinMaxScaler.
val peinfo_schema = new StructType().add("sha256", StringType).add("peinfo", VectorType)
val peinfo_vector_rdd = peinfo_final_array_rdd.map(x => (x.sha256, Vectors.dense(x.array_results)))
val peinfo_vector_rowrdd = peinfo_vector_rdd.map(p => Row(p._1, p._2))
val peinfo_vector_dataframe = spark.createDataFrame(peinfo_vector_rowrdd, peinfo_schema)
val peinfo_scaler = new MinMaxScaler()
  .setInputCol("peinfo")
  .setOutputCol("scaled_peinfo")
val peinfo_scalerModel = peinfo_scaler.fit(peinfo_vector_dataframe)
val peinfo_scaledData_df = peinfo_scalerModel.transform(peinfo_vector_dataframe)
val peinfo_scaledData_rdd = peinfo_scaledData_df.select("sha256", "scaled_peinfo").rdd
  .map(row => (row.getAs[String]("sha256"), row.getAs[DenseVector]("scaled_peinfo")))
  .map(x => new peinfo_final_array_rdd_class(x._1, x._2.toArray))
peinfo_scaledData_rdd.toDF().write.format("parquet").save(peinfo_final_array_file)
Example 2
Source File: FeaturePropSpec.scala From spark-tda with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.{Vector, Vectors, DenseVector}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.types.{
  StructField, IntegerType, DoubleType, BooleanType, StructType, StringType, ArrayType
}
import org.scalacheck.{Arbitrary, Gen}
import org.scalacheck.Arbitrary.arbitrary
import org.scalatest.PropSpec
import com.holdenkarau.spark.testing.{SharedSparkContext, DataframeGenerator, Column}

abstract class FeaturePropSpec
    extends PropSpec
    with SharedSparkContext
    with DefaultReadWriteTest {

  implicit def arbitraryDenseVector: Arbitrary[DenseVector] =
    Arbitrary {
      for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr)
    }

  implicit def arbitraryVector: Arbitrary[Vector] =
    Arbitrary(
      Gen.frequency(
        1 -> arbitrary[DenseVector]
      ))

  lazy val spark = SparkSession.builder().getOrCreate()

  def schema =
    StructType(
      List(
        StructField("integer", IntegerType),
        StructField("double", DoubleType),
        StructField("boolean", BooleanType),
        StructField("string", StringType)
      ))

  def integerGen = new Column("integer", Gen.choose(-100, 100))

  def doubleGen = new Column("double", Gen.choose(-100.0, 100.0))

  def stringGen =
    new Column("string", Gen.oneOf("A", "BC", "DEF", "GHIJ", "KLMNO"))

  def dataframeGen =
    DataframeGenerator.arbitraryDataFrameWithCustomFields(
      spark.sqlContext,
      schema)(integerGen, doubleGen, stringGen)

  def hasDistinctValues(df: DataFrame, columns: String*): Boolean = {
    columns.foldLeft(true) { (acc, col) =>
      acc && df.select(col).distinct.count() > 1
    }
  }
}
Example 3
Source File: MLUserDefinedType.scala From spark-testing-base with Apache License 2.0
package com.holdenkarau.spark.testing

import org.apache.spark.sql.types.DataType
import org.apache.spark.ml.linalg.SQLDataTypes.{MatrixType, VectorType}
import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
import org.scalacheck.{Arbitrary, Gen}

object MLUserDefinedType {
  def unapply(dataType: DataType): Option[Gen[Any]] =
    dataType match {
      case MatrixType => {
        val dense = for {
          rows <- Gen.choose(0, 20)
          cols <- Gen.choose(0, 20)
          values <- Gen.containerOfN[Array, Double](rows * cols, Arbitrary.arbitrary[Double])
        } yield new DenseMatrix(rows, cols, values)
        val sparse = dense.map(_.toSparse)
        Some(Gen.oneOf(dense, sparse))
      }
      case VectorType => {
        val dense = Arbitrary.arbitrary[Array[Double]].map(Vectors.dense)
        val sparse = for {
          indices <- Gen.nonEmptyContainerOf[Set, Int](Gen.choose(0, Int.MaxValue - 1))
          values <- Gen.listOfN(indices.size, Arbitrary.arbitrary[Double])
        } yield Vectors.sparse(indices.max + 1, indices.toSeq.zip(values))
        Some(Gen.oneOf(dense, sparse))
      }
      case _ => None
    }
}
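A quick usage sketch (my own illustration, not part of the original file): because MLUserDefinedType is an extractor returning Option[Gen[Any]], it can be pattern matched against a Spark DataType to obtain a ScalaCheck generator for vector or matrix columns.

import com.holdenkarau.spark.testing.MLUserDefinedType
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.scalacheck.Gen

// Ask the extractor for a generator of values that are valid for a VectorType column.
val vectorGen: Option[Gen[Any]] = VectorType match {
  case MLUserDefinedType(gen) => Some(gen)
  case _ => None
}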
Example 4
Source File: MLScalaCheckTest.scala From spark-testing-base with Apache License 2.0
package com.holdenkarau.spark.testing

import org.apache.spark.ml.linalg.SQLDataTypes.{MatrixType, VectorType}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.{StructField, StructType}
import org.scalacheck.Prop.forAll
import org.scalatest.FunSuite
import org.scalatest.prop.Checkers

class MLScalaCheckTest extends FunSuite with SharedSparkContext with Checkers {
  // re-use the spark context
  override implicit def reuseContextIfPossible: Boolean = false

  test("vector generation") {
    val schema = StructType(List(StructField("vector", VectorType)))
    val sqlContext = new SQLContext(sc)
    val dataframeGen = DataframeGenerator.arbitraryDataFrame(sqlContext, schema)

    val property = forAll(dataframeGen.arbitrary) { dataframe =>
      dataframe.schema === schema && dataframe.count >= 0
    }

    check(property)
  }

  test("matrix generation") {
    val schema = StructType(List(StructField("matrix", MatrixType)))
    val sqlContext = new SQLContext(sc)
    val dataframeGen = DataframeGenerator.arbitraryDataFrame(sqlContext, schema)

    val property = forAll(dataframeGen.arbitrary) { dataframe =>
      dataframe.schema === schema && dataframe.count >= 0
    }

    check(property)
  }
}
Example 5
Source File: udfs.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Column
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.DoubleType

import scala.collection.mutable

//scalastyle:off
object udfs {

  def get_value_at(colName: String, i: Int): Column = {
    udf({ vec: org.apache.spark.ml.linalg.Vector => vec(i) }, DoubleType)(col(colName))
  }

  val to_vector: UserDefinedFunction =
    udf({ arr: Seq[Double] => Vectors.dense(arr.toArray) }, VectorType)

  def to_vector(colName: String): Column = to_vector(col(colName))
}
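A short usage sketch (assumptions: a DataFrame named df with an array&lt;double&gt; column called arr; neither is part of the original file) showing how these helpers are applied as ordinary Column expressions:

import com.microsoft.ml.spark.stages.udfs.{get_value_at, to_vector}

// Convert an array<double> column into an ml Vector column typed with VectorType.
val withVec = df.withColumn("features", to_vector("arr"))

// Read a single element of the vector back out as a double.
val withFirst = withVec.withColumn("f0", get_value_at("features", 0))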
Example 6
Source File: VowpalWabbitInteractions.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.contracts.{HasInputCols, HasOutputCol, Wrappable}
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.functions.{col, struct, udf}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType

object VowpalWabbitInteractions extends ComplexParamsReadable[VowpalWabbitInteractions]

class VowpalWabbitInteractions(override val uid: String) extends Transformer
  with HasInputCols with HasOutputCol with HasNumBits with HasSumCollisions
  with Wrappable with ComplexParamsWritable {

  def this() = this(Identifiable.randomUID("VowpalWabbitInteractions"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val fieldSubset = dataset.schema.fields
      .filter(f => getInputCols.contains(f.name))

    val mask = getMask

    val mode = udf((r: Row) => {
      // compute the final number of features
      val numElems = (0 until r.length)
        .map(r.getAs[Vector](_).numNonzeros).product

      val newIndices = new Array[Int](numElems)
      val newValues = new Array[Double](numElems)

      // build interaction features using FNV-1
      val fnvPrime = 16777619
      var i = 0

      def interact(idx: Int, value: Double, ns: Int): Unit = {
        if (ns == r.size) {
          newIndices(i) += mask & idx
          newValues(i) += value
          i += 1
        }
        else {
          val idx1 = idx * fnvPrime
          r.getAs[Vector](ns).foreachActive { case (idx2, value2) =>
            interact(idx1 ^ idx2, value * value2, ns + 1)
          }
        }
      }

      // start the recursion
      interact(0, 1, 0)

      val (indicesSorted, valuesSorted) =
        VectorUtils.sortAndDistinct(newIndices, newValues, getSumCollisions)

      Vectors.sparse(1 << getNumBits, indicesSorted, valuesSorted)
    })

    dataset.toDF.withColumn(getOutputCol, mode.apply(struct(fieldSubset.map(f => col(f.name)): _*)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val fieldNames = schema.fields.map(_.name)
    for (f <- getInputCols)
      if (!fieldNames.contains(f))
        throw new IllegalArgumentException("missing input column " + f)
      else {
        val fieldType = schema.fields(schema.fieldIndex(f)).dataType
        if (fieldType != VectorType)
          throw new IllegalArgumentException("column " + f + " must be of type Vector but is " + fieldType.typeName)
      }

    schema.add(StructField(getOutputCol, VectorType, true))
  }

  override def copy(extra: ParamMap): VowpalWabbitInteractions = defaultCopy(extra)
}
Example 7
Source File: FeatureSparkTypeTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.features

import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.test.{TestCommon, TestSparkContext}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.types._
import org.junit.runner.RunWith
import org.scalatest.{Assertion, FlatSpec}
import org.scalatest.junit.JUnitRunner

import scala.reflect.runtime.universe._

@RunWith(classOf[JUnitRunner])
class FeatureSparkTypeTest extends FlatSpec with TestCommon {
  val primitiveTypes = Seq(
    (DoubleType, weakTypeTag[types.Real], DoubleType),
    (FloatType, weakTypeTag[types.Real], DoubleType),
    (LongType, weakTypeTag[types.Integral], LongType),
    (IntegerType, weakTypeTag[types.Integral], LongType),
    (ShortType, weakTypeTag[types.Integral], LongType),
    (ByteType, weakTypeTag[types.Integral], LongType),
    (DateType, weakTypeTag[types.Date], LongType),
    (TimestampType, weakTypeTag[types.DateTime], LongType),
    (StringType, weakTypeTag[types.Text], StringType),
    (BooleanType, weakTypeTag[types.Binary], BooleanType),
    (VectorType, weakTypeTag[types.OPVector], VectorType)
  )

  val nonNullable = Seq(
    (DoubleType, weakTypeTag[types.RealNN], DoubleType),
    (FloatType, weakTypeTag[types.RealNN], DoubleType)
  )

  private def mapType(v: DataType) = MapType(StringType, v, valueContainsNull = true)
  private def arrType(v: DataType) = ArrayType(v, containsNull = true)

  val collectionTypes = Seq(
    (arrType(LongType), weakTypeTag[types.DateList], arrType(LongType)),
    (arrType(DoubleType), weakTypeTag[types.Geolocation], arrType(DoubleType)),
    (arrType(StringType), weakTypeTag[types.TextList], arrType(StringType)),
    (mapType(StringType), weakTypeTag[types.TextMap], mapType(StringType)),
    (mapType(DoubleType), weakTypeTag[types.RealMap], mapType(DoubleType)),
    (mapType(LongType), weakTypeTag[types.IntegralMap], mapType(LongType)),
    (mapType(BooleanType), weakTypeTag[types.BinaryMap], mapType(BooleanType)),
    (mapType(arrType(StringType)), weakTypeTag[types.MultiPickListMap], mapType(arrType(StringType))),
    (mapType(arrType(DoubleType)), weakTypeTag[types.GeolocationMap], mapType(arrType(DoubleType)))
  )

  Spec(FeatureSparkTypes.getClass) should "assign appropriate feature type tags for valid types and vice versa" in {
    primitiveTypes.map(scala.Function.tupled(assertTypes()))
  }

  it should "assign appropriate feature type tags for valid non-nullable types and vice versa" in {
    nonNullable.map(scala.Function.tupled(assertTypes(isNullable = false)))
  }

  it should "assign appropriate feature type tags for collection types and vice versa" in {
    collectionTypes.map(scala.Function.tupled(assertTypes()))
  }

  it should "error for unsupported types" in {
    val error = intercept[IllegalArgumentException](FeatureSparkTypes.featureTypeTagOf(BinaryType, isNullable = false))
    error.getMessage shouldBe "Spark BinaryType is currently not supported"
  }

  it should "error for unknown types" in {
    val unknownType = NullType
    val error = intercept[IllegalArgumentException](FeatureSparkTypes.featureTypeTagOf(unknownType, isNullable = false))
    error.getMessage shouldBe s"No feature type tag mapping for Spark type $unknownType"
  }

  def assertTypes(
    isNullable: Boolean = true
  )(
    sparkType: DataType,
    featureType: WeakTypeTag[_ <: FeatureType],
    expectedSparkType: DataType
  ): Assertion = {
    FeatureSparkTypes.featureTypeTagOf(sparkType, isNullable) shouldBe featureType
    FeatureSparkTypes.sparkTypeOf(featureType) shouldBe expectedSparkType
  }
}
Example 8
Source File: ProtobufRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.protobuf.ProtobufConverter

class ProtobufRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val labelColumnName = "label"
  val featuresColumnName = "features"
  val schema = StructType(Array(
    StructField(labelColumnName, DoubleType),
    StructField(featuresColumnName, VectorType)))

  it should "serialize a dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "serialize a sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "fail to set schema on invalid features name" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    intercept[IllegalArgumentException] {
      val rrs = new ProtobufRequestRowSerializer(Some(schema), featuresColumnName = "doesNotExist")
    }
  }

  it should "fail on invalid types" in {
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new ProtobufRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new ProtobufRequestRowSerializer(Some(validSchema))
  }
}