org.apache.spark.ml.linalg.SQLDataTypes.VectorType Scala Examples

The following examples show how to use org.apache.spark.ml.linalg.SQLDataTypes.VectorType. Each example is taken from the open-source project and source file named in its header.
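
Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below; the SparkSession setup and column names are illustrative): VectorType is the Catalyst DataType that backs org.apache.spark.ml.linalg.Vector, so it is what you declare in a StructType when a DataFrame column holds ML vectors.

import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// A schema whose "features" column holds ML vectors.
val schema = StructType(Seq(
  StructField("id", StringType),
  StructField("features", VectorType)))

val rows = spark.sparkContext.parallelize(Seq(
  Row("a", Vectors.dense(1.0, 0.0, 3.0)),
  Row("b", Vectors.sparse(3, Seq((1, 2.0))))))

val df = spark.createDataFrame(rows, schema)
df.printSchema()  // features: vector (nullable = true)
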
Example 1
Source File: get_features_from_peinfo.scala    From gsoc_relationship   with Apache License 2.0
// Note: this script appears to be intended for an interactive Spark session (e.g. spark-shell),
// where `sc`, `spark`, spark.implicits._ (used by toDF) and org.apache.spark.sql.functions._
// (used by `desc`) are already in scope, and PreProcessingConfig supplies the keyspace, table
// and output path values used below.
import com.datastax.spark.connector._
import play.api.libs.json.Json
import play.api.libs.json._
import java.io.{ByteArrayOutputStream, ByteArrayInputStream}
import java.util.zip.{GZIPOutputStream, GZIPInputStream}
import Array.concat
import org.apache.spark.sql.types._
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType 
import org.apache.spark.ml.linalg._
import org.apache.spark.sql.Row
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.DenseVector
import PreProcessingConfig._

// Case classes mirroring the Cassandra tables and the intermediate RDD records.
case class peinfo_results_by_service_name_class(service_name: String, sha256: String)
case class peinfo_results_by_sha256_class(sha256: String, service_name: String, results: Array[Byte])
case class peinfo_join_results_class(sha256: String, service_name: String, results: String)
case class peinfo_int_final_array_rdd_class(sha256: String, array_results: Array[Double])
case class peinfo_binaray_final_array_rdd_class(sha256:String, array_results :Array[Double])
case class peinfo_final_array_rdd_class(sha256:String, array_results: Array[Double])

// Gunzip a byte array into its String contents.
def unzip(x: Array[Byte]): String = {
    val inputStream = new GZIPInputStream(new ByteArrayInputStream(x))
    scala.io.Source.fromInputStream(inputStream).mkString
}
// Collect 16 numeric features (entropy, virtual address, virtual size and raw size
// for each of the .text, .data, .rsrc and .rdata sections) plus the timestamp.
def findAllIntinpeinfo(peinfo_json_results: JsLookupResult, time: Double): Array[Double] = {
    val entropy = peinfo_json_results \\ "entropy"
    val virt_address = peinfo_json_results \\ "virt_address"
    val virt_size = peinfo_json_results \\ "virt_size"
    val size = peinfo_json_results \\ "size"
    var i = 0
    val List = Array.fill(17)(0.0)  // 16 section features + 1 timestamp slot
    for (k <- ( peinfo_json_results \\ "section_name")){
        k.as[String] match {
            case ".text\u0000\u0000\u0000" => { List(0)=entropy(i).as[Double]; List(1)=Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(2)=virt_size(i).as[Double]; List(3)=size(i).as[Double] }
            case ".data\u0000\u0000\u0000" => { List(4)=entropy(i).as[Double]; List(5)=Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(6)=virt_size(i).as[Double]; List(7)=size(i).as[Double] }
            case ".rsrc\u0000\u0000\u0000" => { List(8)=entropy(i).as[Double]; List(9)=Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(10)=virt_size(i).as[Double]; List(11)=size(i).as[Double] }
            case ".rdata\u0000\u0000" => { List(12)=entropy(i).as[Double]; List(13)=Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(14)=virt_size(i).as[Double]; List(15)=size(i).as[Double] }
            case other => {}
        }
        i = i + 1
    }
    List(16) = time
    List
}

// Load the two Cassandra tables and key both RDDs by (sha256, service_name) so they can be joined.
val peinfo_results_by_service_name_meta = sc.cassandraTable[peinfo_results_by_service_name_class](keyspace,service_name_table).where("service_name=?","peinfo")
val peinfo_results_by_service_name_rdd = peinfo_results_by_service_name_meta.keyBy(x=> (x.sha256,x.service_name))
val peinfo_results_by_sha256_meta = sc.cassandraTable[peinfo_results_by_sha256_class](keyspace,sha256_table)
val peinfo_results_by_sha256_rdd = peinfo_results_by_sha256_meta.keyBy(x => (x.sha256,x.service_name))
// Join the metadata with the raw results and gunzip the stored JSON payload.
val peinfo_join_results = peinfo_results_by_service_name_rdd.join(peinfo_results_by_sha256_rdd).map(x=> (new peinfo_join_results_class(x._1._1,x._1._2, unzip(x._2._2.results)))).distinct().cache()

// Parse the peinfo JSON and extract the 16 PE-section features plus the timestamp for each sha256.
val peinfo_int_final_array_rdd = peinfo_join_results.map(x=>(x.sha256,(Json.parse(x.results) \ "pe_sections"),{if ((Json.parse(x.results) \ "timestamp").isInstanceOf[JsUndefined]) 0.0 else (Json.parse(x.results) \ "timestamp" \\ "timestamp")(0).as[Double]})).filter(x=> !x._2.isInstanceOf[JsUndefined]).map(x=>new peinfo_int_final_array_rdd_class(x._1,findAllIntinpeinfo(x._2,x._3)))

// Build the vocabulary of dll.function imports that occur more than 10,000 times across all samples.
val peinfo_dllfunction_list = peinfo_join_results.map(x=>Json.parse(x.results) \ "imports").filter(x=> !x.isInstanceOf[JsUndefined]).flatMap(x=>x.as[List[Map[String, String]]].map(x=>(x("dll")+"."+x("function")))).toDF("func_name").groupBy("func_name").count.sort(desc("count")).filter("count > 10000").rdd.map(r => r.getString(0)).collect().toList
implicit def bool2int(b:Boolean) = if (b) 1 else 0
// One 0/1 flag per frequent dll.function import, in the order of peinfo_dllfunction_list.
def findAllBininpeinfo_dllfunction(peinfo_dllfunction: Seq[String]): Array[Double] = {
    val forlist = for (family <- peinfo_dllfunction_list) yield {
        (peinfo_dllfunction.contains(family): Int).toDouble
    }
    forlist.toArray
}
// 502-element zero vector used when a sample has no "imports" section.
val List502 = Array.fill(502)(0.0)
// One binary indicator per frequent dll.function import for each sample.
val peinfo_binaray_final_array_rdd = peinfo_join_results.map(x=>(x.sha256,(Json.parse(x.results) \ "imports"))).map(x=>new peinfo_binaray_final_array_rdd_class(x._1,{if (x._2.isInstanceOf[JsUndefined]) List502 else findAllBininpeinfo_dllfunction(x._2.as[Seq[Map[String, String]]].map(x=>(x("dll")+"."+x("function"))))}))

val peinfo_int_final_array_rdd_before_join = peinfo_int_final_array_rdd.map(x=>(x.sha256,x.array_results))
val peinfo_binaray_final_array_rdd_before_join = peinfo_binaray_final_array_rdd.map(x=>(x.sha256,x.array_results))
// Concatenate the integer and binary feature arrays for each sha256.
val peinfo_array_rdd_by_join = peinfo_int_final_array_rdd_before_join.join(peinfo_binaray_final_array_rdd_before_join).map(x=> (x._1,concat(x._2._1,x._2._2)))
val peinfo_final_array_rdd = peinfo_array_rdd_by_join.map(x=>new peinfo_final_array_rdd_class(x._1,x._2))

// Assemble a DataFrame with a VectorType "peinfo" column, scale it to [0, 1] with MinMaxScaler, and write the result as Parquet.
val peinfo_schema = new StructType().add("sha256", StringType).add("peinfo", VectorType)
val peinfo_vector_rdd = peinfo_final_array_rdd.map(x=>(x.sha256,Vectors.dense(x.array_results)))
val peinfo_vector_rowrdd = peinfo_vector_rdd.map(p => Row(p._1,p._2))
val peinfo_vector_dataframe = spark.createDataFrame(peinfo_vector_rowrdd, peinfo_schema)
val peinfo_scaler = new MinMaxScaler()
  .setInputCol("peinfo")
  .setOutputCol("scaled_peinfo")
val peinfo_scalerModel = peinfo_scaler.fit(peinfo_vector_dataframe)
val peinfo_scaledData_df = peinfo_scalerModel.transform(peinfo_vector_dataframe)
val peinfo_scaledData_rdd = peinfo_scaledData_df.select("sha256","scaled_peinfo").rdd.map(row=>(row.getAs[String]("sha256"),row.getAs[DenseVector]("scaled_peinfo"))).map(x=>new peinfo_final_array_rdd_class(x._1,x._2.toArray))
peinfo_scaledData_rdd.toDF().write.format("parquet").save(peinfo_final_array_file) 
Example 2
Source File: FeaturePropSpec.scala    From spark-tda   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.{Vector, Vectors, DenseVector}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.types.{
  StructField,
  IntegerType,
  DoubleType,
  BooleanType,
  StructType,
  StringType,
  ArrayType
}
import org.scalacheck.{Arbitrary, Gen}
import org.scalacheck.Arbitrary.arbitrary
import org.scalatest.PropSpec
import com.holdenkarau.spark.testing.{
  SharedSparkContext,
  DataframeGenerator,
  Column
}


abstract class FeaturePropSpec
    extends PropSpec
    with SharedSparkContext
    with DefaultReadWriteTest {
  implicit def arbitraryDenseVector: Arbitrary[DenseVector] =
    Arbitrary {
      for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr)
    }

  implicit def arbitraryVector: Arbitrary[Vector] =
    Arbitrary(
      Gen.frequency(
        1 -> arbitrary[DenseVector]
      ))

  lazy val spark = SparkSession.builder().getOrCreate()

  def schema =
    StructType(
      List(
        StructField("integer", IntegerType),
        StructField("double", DoubleType),
        StructField("boolean", BooleanType),
        StructField("string", StringType)
      ))

  def integerGen = new Column("integer", Gen.choose(-100, 100))

  def doubleGen = new Column("double", Gen.choose(-100.0, 100.0))

  def stringGen =
    new Column("string", Gen.oneOf("A", "BC", "DEF", "GHIJ", "KLMNO"))

  def dataframeGen =
    DataframeGenerator.arbitraryDataFrameWithCustomFields(
      spark.sqlContext,
      schema)(integerGen, doubleGen, stringGen)

  def hasDistinctValues(df: DataFrame, columns: String*): Boolean = {
    columns.foldLeft(true) { (acc, col) =>
      acc && df.select(col).distinct.count() > 1
    }
  }
} 
Example 3
Source File: MLUserDefinedType.scala    From spark-testing-base   with Apache License 2.0
package com.holdenkarau.spark.testing

import org.apache.spark.sql.types.DataType
import org.apache.spark.ml.linalg.SQLDataTypes.{MatrixType, VectorType}
import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}
import org.scalacheck.{Arbitrary, Gen}


object MLUserDefinedType {
  def unapply(dataType: DataType): Option[Gen[Any]] =
    dataType match {
      case MatrixType => {
        val dense = for {
          rows <- Gen.choose(0, 20)
          cols <- Gen.choose(0, 20)
          values <- Gen.containerOfN[Array, Double](rows * cols, Arbitrary.arbitrary[Double])
        } yield new DenseMatrix(rows, cols, values)
        val sparse = dense.map(_.toSparse)
        Some(Gen.oneOf(dense, sparse))
      }
      case VectorType => {
        val dense = Arbitrary.arbitrary[Array[Double]].map(Vectors.dense)
        val sparse = for {
          indices <- Gen.nonEmptyContainerOf[Set, Int](Gen.choose(0, Int.MaxValue - 1))
          values <- Gen.listOfN(indices.size, Arbitrary.arbitrary[Double])
        } yield Vectors.sparse(indices.max + 1, indices.toSeq.zip(values))
        Some(Gen.oneOf(dense, sparse))
      }
      case _ => None
    }
} 
Example 4
Source File: MLScalaCheckTest.scala    From spark-testing-base   with Apache License 2.0
package com.holdenkarau.spark.testing

import org.apache.spark.ml.linalg.SQLDataTypes.{MatrixType, VectorType}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.{StructField, StructType}
import org.scalacheck.Prop.forAll
import org.scalatest.FunSuite
import org.scalatest.prop.Checkers

class MLScalaCheckTest extends FunSuite with SharedSparkContext with Checkers {
  // do not re-use the Spark context between test suites
  override implicit def reuseContextIfPossible: Boolean = false

  test("vector generation") {
    val schema = StructType(List(StructField("vector", VectorType)))
    val sqlContext = new SQLContext(sc)
    val dataframeGen = DataframeGenerator.arbitraryDataFrame(sqlContext, schema)

    val property =
      forAll(dataframeGen.arbitrary) {
        dataframe => {
          dataframe.schema === schema && dataframe.count >= 0
        }
      }

    check(property)
  }

  test("matrix generation") {
    val schema = StructType(List(StructField("matrix", MatrixType)))
    val sqlContext = new SQLContext(sc)
    val dataframeGen = DataframeGenerator.arbitraryDataFrame(sqlContext, schema)

    val property =
      forAll(dataframeGen.arbitrary) {
        dataframe => {
          dataframe.schema === schema && dataframe.count >= 0
        }
      }

    check(property)
  }
} 
Example 5
Source File: udfs.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Column
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.DoubleType

import scala.collection.mutable

//scalastyle:off
object udfs {

  def get_value_at(colName: String, i: Int): Column = {
    udf({
      vec: org.apache.spark.ml.linalg.Vector => vec(i)
    }, DoubleType)(col(colName))
  }

  val to_vector: UserDefinedFunction = udf({
    arr: Seq[Double] => Vectors.dense(arr.toArray)
  }, VectorType)

  def to_vector(colName: String): Column = to_vector(col(colName))

} 
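
A minimal usage sketch for the udfs object above (the local SparkSession and the column names "arr" and "features" are illustrative, not part of the original source): to_vector turns an array-of-doubles column into an ML vector column typed as VectorType, and get_value_at reads one element of that vector back out as a Double.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// One column "arr" holding an array of doubles.
val df = Seq(Seq(1.0, 2.0, 3.0)).toDF("arr")

// Array column -> ML vector column, then read element 0 back as a Double.
val withVec = df.withColumn("features", udfs.to_vector("arr"))
withVec.withColumn("f0", udfs.get_value_at("features", 0)).show()
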
Example 6
Source File: VowpalWabbitInteractions.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.contracts.{HasInputCols, HasOutputCol, Wrappable}
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.functions.{col, struct, udf}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType

object VowpalWabbitInteractions extends ComplexParamsReadable[VowpalWabbitInteractions]


class VowpalWabbitInteractions(override val uid: String) extends Transformer
  with HasInputCols with HasOutputCol with HasNumBits with HasSumCollisions with Wrappable with ComplexParamsWritable
{
  def this() = this(Identifiable.randomUID("VowpalWabbitInteractions"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val fieldSubset = dataset.schema.fields
      .filter(f => getInputCols.contains(f.name))

    val mask = getMask

    val mode = udf((r: Row) => {

      // compute the final number of features
      val numElems = (0 until r.length)
        .map(r.getAs[Vector](_).numNonzeros).product

      val newIndices = new Array[Int](numElems)
      val newValues = new Array[Double](numElems)

      // build interaction features using FNV-1
      val fnvPrime = 16777619
      var i = 0

      def interact(idx: Int, value: Double, ns: Int): Unit = {
        if (ns == r.size) {
          newIndices(i) += mask & idx
          newValues(i) += value

          i += 1
        }
        else {
          val idx1 = idx * fnvPrime

          r.getAs[Vector](ns).foreachActive { case (idx2, value2) =>
            interact(idx1 ^ idx2, value * value2, ns + 1)
          }
        }
      }

      // start the recursion
      interact(0, 1, 0)

      val (indicesSorted, valuesSorted) = VectorUtils.sortAndDistinct(newIndices, newValues, getSumCollisions)

      Vectors.sparse(1 << getNumBits, indicesSorted, valuesSorted)
    })

    dataset.toDF.withColumn(getOutputCol, mode.apply(struct(fieldSubset.map(f => col(f.name)): _*)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val fieldNames = schema.fields.map(_.name)
    for (f <- getInputCols)
      if (!fieldNames.contains(f))
        throw new IllegalArgumentException("missing input column " + f)
      else {
        val fieldType = schema.fields(schema.fieldIndex(f)).dataType

        if (fieldType != VectorType)
          throw new IllegalArgumentException("column " + f + " must be of type Vector but is " + fieldType.typeName)
      }

    schema.add(StructField(getOutputCol, VectorType, true))
  }

  override def copy(extra: ParamMap): VowpalWabbitInteractions = defaultCopy(extra)
} 
Example 7
Source File: FeatureSparkTypeTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.features

import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.test.{TestCommon, TestSparkContext}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.types._
import org.junit.runner.RunWith
import org.scalatest.{Assertion, FlatSpec}
import org.scalatest.junit.JUnitRunner

import scala.reflect.runtime.universe._

@RunWith(classOf[JUnitRunner])
class FeatureSparkTypeTest extends FlatSpec with TestCommon {
  val primitiveTypes = Seq(
    (DoubleType, weakTypeTag[types.Real], DoubleType),
    (FloatType, weakTypeTag[types.Real], DoubleType),
    (LongType, weakTypeTag[types.Integral], LongType),
    (IntegerType, weakTypeTag[types.Integral], LongType),
    (ShortType, weakTypeTag[types.Integral], LongType),
    (ByteType, weakTypeTag[types.Integral], LongType),
    (DateType, weakTypeTag[types.Date], LongType),
    (TimestampType, weakTypeTag[types.DateTime], LongType),
    (StringType, weakTypeTag[types.Text], StringType),
    (BooleanType, weakTypeTag[types.Binary], BooleanType),
    (VectorType, weakTypeTag[types.OPVector], VectorType)
  )

  val nonNullable = Seq(
    (DoubleType, weakTypeTag[types.RealNN], DoubleType),
    (FloatType, weakTypeTag[types.RealNN], DoubleType)
  )

  private def mapType(v: DataType) = MapType(StringType, v, valueContainsNull = true)
  private def arrType(v: DataType) = ArrayType(v, containsNull = true)

  val collectionTypes = Seq(
    (arrType(LongType), weakTypeTag[types.DateList], arrType(LongType)),
    (arrType(DoubleType), weakTypeTag[types.Geolocation], arrType(DoubleType)),
    (arrType(StringType), weakTypeTag[types.TextList], arrType(StringType)),
    (mapType(StringType), weakTypeTag[types.TextMap], mapType(StringType)),
    (mapType(DoubleType), weakTypeTag[types.RealMap], mapType(DoubleType)),
    (mapType(LongType), weakTypeTag[types.IntegralMap], mapType(LongType)),
    (mapType(BooleanType), weakTypeTag[types.BinaryMap], mapType(BooleanType)),
    (mapType(arrType(StringType)), weakTypeTag[types.MultiPickListMap], mapType(arrType(StringType))),
    (mapType(arrType(DoubleType)), weakTypeTag[types.GeolocationMap], mapType(arrType(DoubleType)))
  )

  Spec(FeatureSparkTypes.getClass) should "assign appropriate feature type tags for valid types and vice versa" in {
    primitiveTypes.map(scala.Function.tupled(assertTypes()))
  }

  it should "assign appropriate feature type tags for valid non-nullable types and versa" in {
    nonNullable.map(scala.Function.tupled(assertTypes(isNullable = false)))
  }

  it should "assign appropriate feature type tags for collection types and versa" in {
    collectionTypes.map(scala.Function.tupled(assertTypes()))
  }

  it should "error for unsupported types" in {
    val error = intercept[IllegalArgumentException](FeatureSparkTypes.featureTypeTagOf(BinaryType, isNullable = false))
    error.getMessage shouldBe "Spark BinaryType is currently not supported"
  }

  it should "error for unknown types" in {
    val unknownType = NullType
    val error = intercept[IllegalArgumentException](FeatureSparkTypes.featureTypeTagOf(unknownType, isNullable = false))
    error.getMessage shouldBe s"No feature type tag mapping for Spark type $unknownType"
  }

  def assertTypes(
    isNullable: Boolean = true
  )(
    sparkType: DataType,
    featureType: WeakTypeTag[_ <: FeatureType],
    expectedSparkType: DataType
  ): Assertion = {
    FeatureSparkTypes.featureTypeTagOf(sparkType, isNullable) shouldBe featureType
    FeatureSparkTypes.sparkTypeOf(featureType) shouldBe expectedSparkType
  }

} 
Example 8
Source File: ProtobufRequestRowSerializerTests.scala    From sagemaker-spark   with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.protobuf.ProtobufConverter

class ProtobufRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val labelColumnName = "label"
  val featuresColumnName = "features"
  val schema = StructType(Array(StructField(labelColumnName, DoubleType), StructField(
    featuresColumnName, VectorType)))

  it should "serialize a dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "serialize a sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "fail to set schema on invalid features name" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    intercept[IllegalArgumentException] {
      val rrs = new ProtobufRequestRowSerializer(Some(schema), featuresColumnName = "doesNotExist")
    }
  }


  it should "fail on invalid types" in {
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new ProtobufRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new ProtobufRequestRowSerializer(Some(validSchema))
  }
}