org.apache.spark.sql.types.DataTypes Scala Examples
The following examples show how to use org.apache.spark.sql.types.DataTypes.
Each example is taken from an open-source project; the source file, project, and license are noted in the header above each listing.
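
As a quick orientation, the snippet below is a minimal, self-contained sketch (not taken from any of the projects that follow) of the two ways DataTypes shows up in these examples: the predefined singleton types such as DataTypes.StringType, and the create* factory methods for parameterized types such as decimals, arrays, maps, and structs. The object name DataTypesQuickTour is just a placeholder.

// Standalone sketch of the DataTypes API used throughout the examples below.
import org.apache.spark.sql.types._

object DataTypesQuickTour {
  def main(args: Array[String]): Unit = {
    val schema: StructType = DataTypes.createStructType(Array(
      // Singleton types for simple columns.
      DataTypes.createStructField("id", DataTypes.LongType, false),
      DataTypes.createStructField("name", DataTypes.StringType, true),
      // Factory methods for parameterized types.
      DataTypes.createStructField("price", DataTypes.createDecimalType(20, 14), true),
      DataTypes.createStructField("tags", DataTypes.createArrayType(DataTypes.StringType, true), true),
      DataTypes.createStructField("attrs",
        DataTypes.createMapType(DataTypes.StringType, DataTypes.IntegerType, false), true)
    ))
    schema.printTreeString()
  }
}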
Example 1
Source File: TestIndexing.scala From spark-solr with Apache License 2.0
package com.lucidworks.spark

import java.util.UUID

import com.lucidworks.spark.util.SolrDataFrameImplicits._
import com.lucidworks.spark.util.{ConfigurationConstants, SolrCloudUtil, SolrQuerySupport, SolrSupport}
import org.apache.spark.sql.functions.{concat, lit}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

class TestIndexing extends TestSuiteBuilder {

  test("Load csv file and index to Solr") {
    val collectionName = "testIndexing-" + UUID.randomUUID().toString
    SolrCloudUtil.buildCollection(zkHost, collectionName, null, 2, cloudClient, sc)
    try {
      val csvFileLocation = "src/test/resources/test-data/nyc_yellow_taxi_sample_1k.csv"
      val csvDF = sparkSession.read.format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(csvFileLocation)
      assert(csvDF.count() == 999)

      val solrOpts = Map("zkhost" -> zkHost, "collection" -> collectionName)
      val newDF = csvDF
        .withColumn("pickup_location", concat(csvDF.col("pickup_latitude"), lit(","), csvDF.col("pickup_longitude")))
        .withColumn("dropoff_location", concat(csvDF.col("dropoff_latitude"), lit(","), csvDF.col("dropoff_longitude")))
      newDF.write
        .option("zkhost", zkHost)
        .option(ConfigurationConstants.GENERATE_UNIQUE_KEY, "true")
        .solr(collectionName)

      // Explicit commit to make sure all docs are visible
      val solrCloudClient = SolrSupport.getCachedCloudClient(zkHost)
      solrCloudClient.commit(collectionName, true, true)

      val solrDF = sparkSession.read.format("solr").options(solrOpts).load()
      solrDF.printSchema()
      assert(solrDF.count() == 999)
      solrDF.take(10)
    } finally {
      SolrCloudUtil.deleteCollection(collectionName, cluster)
    }
  }

  test("Solr field types config") {
    val collectionName = "testIndexing-" + UUID.randomUUID().toString
    SolrCloudUtil.buildCollection(zkHost, collectionName, null, 2, cloudClient, sc)
    try {
      val csvFileLocation = "src/test/resources/test-data/simple.csv"
      val csvDF = sparkSession.read.format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(csvFileLocation)

      val solrOpts = Map(
        "zkhost" -> zkHost,
        "collection" -> collectionName,
        ConfigurationConstants.SOLR_FIELD_TYPES -> "ntitle:text_en,nrating:string")
      csvDF.write.options(solrOpts).solr(collectionName)

      // Explicit commit to make sure all docs are visible
      val solrCloudClient = SolrSupport.getCachedCloudClient(zkHost)
      solrCloudClient.commit(collectionName, true, true)

      val solrBaseUrl = SolrSupport.getSolrBaseUrl(zkHost)
      val solrUrl = solrBaseUrl + collectionName + "/"
      val fieldTypes = SolrQuerySupport.getFieldTypes(Set.empty, solrUrl, cloudClient, collectionName)
      assert(fieldTypes("nrating").fieldType === "string")
      assert(fieldTypes("ntitle").fieldType === "text_en")
    } finally {
      SolrCloudUtil.deleteCollection(collectionName, cluster)
    }
  }

  test("Field additions") {
    val insertSchema = StructType(Array(
      StructField("index_only_field", DataTypes.StringType, nullable = true),
      StructField("store_only_field", DataTypes.BooleanType, nullable = true),
      StructField("a_s", DataTypes.StringType, nullable = true),
      StructField("s_b", DataTypes.StringType, nullable = true)
    ))
    val collection = "testFieldAdditions" + UUID.randomUUID().toString.replace("-", "_")
    try {
      SolrCloudUtil.buildCollection(zkHost, collection, null, 2, cloudClient, sc)
      val opts = Map("zkhost" -> zkHost, "collection" -> collection)
      val solrRelation = new SolrRelation(opts, sparkSession)
      val fieldsToAdd = SolrRelation.getFieldsToAdd(insertSchema,
        solrRelation.conf, solrRelation.solrVersion, solrRelation.dynamicSuffixes)
      assert(fieldsToAdd.isEmpty)
    } finally {
      SolrCloudUtil.deleteCollection(collection, cluster)
    }
  }
}
Example 2
Source File: Surrogate.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.tuner.surrogate

import com.tencent.angel.spark.automl.tuner.config.ConfigurationSpace
import org.apache.commons.logging.{Log, LogFactory}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

import scala.collection.mutable.ArrayBuffer

// Excerpt only: the enclosing class declaration and the members these methods rely on
// (the history of evaluated points `preX` and objective values `preY`) were cut from
// this listing. They are sketched below as assumptions so the excerpt compiles; the
// full class in the automl project defines additional members and training methods.
abstract class Surrogate(val cs: ConfigurationSpace, val minimize: Boolean = true) {

  // Previously evaluated configurations and their objective values.
  val preX: ArrayBuffer[Vector] = new ArrayBuffer[Vector]()
  val preY: ArrayBuffer[Double] = new ArrayBuffer[Double]()

  // Predict the surrogate's estimate at a configuration (e.g. mean and variance).
  def predict(X: Vector): (Double, Double)

  def stop(): Unit

  def curBest: (Vector, Double) = {
    if (minimize) curMin else curMax
  }

  def curMin: (Vector, Double) = {
    if (preY.isEmpty) (null, Double.MaxValue)
    else {
      val maxIdx: Int = preY.zipWithIndex.max._2
      (preX(maxIdx), -preY(maxIdx))
    }
  }

  def curMax: (Vector, Double) = {
    if (preY.isEmpty) (null, Double.MinValue)
    else {
      val maxIdx: Int = preY.zipWithIndex.max._2
      (preX(maxIdx), preY(maxIdx))
    }
  }
}
Example 3
Source File: HousePriceDataBusinessLogic.scala From bdd-spark with MIT License
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.json4s._
import org.json4s.jackson.JsonMethods._

object HousePriceDataBusinessLogic {
  import Spark._

  def processHousePrices(housePrices: DataFrame, postcodes: DataFrame): DataFrame = {
    housePrices.join(postcodes, "Postcode")
  }

  def processHousePricesAndSaveToParquet(housePrices: DataFrame, postcodes: DataFrame, parquetWriter: ParquetWriter): Unit = {
    parquetWriter.write(housePrices.join(postcodes, "Postcode"), "results.parquet")
  }

  def processDataFromFilesFilterItThenSaveItToParquet(reader: FileReader, geoFilename: String, priceFilename: String, postcodeFileName: String, writer: ParquetWriter): Unit = {
    val joined = loadAndJoin(reader, priceFilename, postcodeFileName)

    // If this was real code, a geoJSON library would be sensible here. Dirty code follows:
    val json = parse(reader.readText(geoFilename)) \\ "coordinates"

    val coords = json match {
      case JArray(outer) => outer.map { case JArray(inner) => inner }
    }

    val points = coords
      .map(c => (c(0), c(1)))
      .map { case (JDouble(long), JDouble(lat)) => (long, lat) }

    val minLat = Math.min(points(0)._2, points(1)._2)
    val maxLat = Math.max(points(0)._2, points(1)._2)
    val minLong = Math.min(points(0)._1, points(1)._1)
    val maxLong = Math.max(points(0)._1, points(1)._1)

    val filtered = joined
      .filter(s"Latitude >= $minLat and Latitude <= $maxLat")
      .filter(s"Longitude >= $minLong and Longitude <= $maxLong")

    writer.write(filtered, "results.parquet")
  }

  def processDataFromFilesAndSaveToParquet(reader: FileReader, priceFilename: String, postcodeFileName: String, writer: ParquetWriter): Unit = {
    val joined = loadAndJoin(reader, priceFilename, postcodeFileName)
    writer.write(joined, "results.parquet")
  }

  private def loadAndJoin(reader: FileReader, priceFilename: String, postcodeFileName: String): DataFrame = {
    val priceSchema = StructType(Seq(
      StructField("Price", DataTypes.IntegerType),
      StructField("Postcode", DataTypes.StringType),
      StructField("HouseType", DataTypes.StringType)
    ))

    val prices = reader
      .readLinesToRdd(priceFilename)
      .map(_.split(','))
      .map(row => row.map(_.trim()))
      .map(splits => Row(splits(0).toInt, splits(1), splits(2)))

    val priceDf = spark.createDataFrame(prices, priceSchema)

    val postcodeSchema = StructType(Seq(
      StructField("Postcode", DataTypes.StringType),
      StructField("Latitude", DataTypes.DoubleType),
      StructField("Longitude", DataTypes.DoubleType)
    ))

    val postcodes = reader
      .readLinesToRdd(postcodeFileName)
      .map(_.split(','))
      .map(row => row.map(_.trim()))
      .map(splits => Row(splits(0), splits(1).toDouble, splits(2).toDouble))

    val postcodeDf = spark.createDataFrame(postcodes, postcodeSchema)

    val joined = priceDf.join(postcodeDf, "Postcode")
    joined
  }
}
Example 4
Source File: DataTypeMapping.scala From azure-kusto-spark with Apache License 2.0
package com.microsoft.kusto.spark.utils

import org.apache.spark.sql.types.DataTypes._
import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, MapType, StructType}

object DataTypeMapping {

  val kustoTypeToSparkTypeMap: Map[String, DataType] = Map(
    "string" -> StringType,
    "long" -> LongType,
    "datetime" -> TimestampType, // Kusto datetime is equivalent to TimestampType
    "timespan" -> StringType,
    "bool" -> BooleanType,
    "real" -> DoubleType,
    // Can be partitioned differently between precision and scale, total must be 34 to match .Net SqlDecimal
    "decimal" -> DataTypes.createDecimalType(20, 14),
    "guid" -> StringType,
    "int" -> IntegerType,
    "dynamic" -> StringType
  )

  val kustoJavaTypeToSparkTypeMap: Map[String, DataType] = Map(
    "string" -> StringType,
    "int64" -> LongType,
    "datetime" -> TimestampType,
    "timespan" -> StringType,
    "sbyte" -> BooleanType,
    "double" -> DoubleType,
    "sqldecimal" -> DataTypes.createDecimalType(20, 14),
    "guid" -> StringType,
    "int32" -> IntegerType,
    "object" -> StringType
  )

  val sparkTypeToKustoTypeMap: Map[DataType, String] = Map(
    StringType -> "string",
    BooleanType -> "bool",
    DateType -> "datetime",
    TimestampType -> "datetime",
    DataTypes.createDecimalType() -> "decimal",
    DoubleType -> "real",
    FloatType -> "real",
    ByteType -> "int",
    IntegerType -> "int",
    LongType -> "long",
    ShortType -> "int"
  )

  def getSparkTypeToKustoTypeMap(fieldType: DataType): String = {
    if (fieldType.isInstanceOf[DecimalType]) "decimal"
    else if (fieldType.isInstanceOf[ArrayType] || fieldType.isInstanceOf[StructType] || fieldType.isInstanceOf[MapType]) "dynamic"
    else DataTypeMapping.sparkTypeToKustoTypeMap.getOrElse(fieldType, "string")
  }
}
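
For illustration, the mapping above could be exercised as in the following sketch; only DataTypeMapping and its members come from the listing, while the driver object (DataTypeMappingDemo) is hypothetical.

import com.microsoft.kusto.spark.utils.DataTypeMapping
import org.apache.spark.sql.types._

object DataTypeMappingDemo {
  def main(args: Array[String]): Unit = {
    // Scalar types fall back to the lookup table.
    println(DataTypeMapping.getSparkTypeToKustoTypeMap(StringType))          // string
    println(DataTypeMapping.getSparkTypeToKustoTypeMap(IntegerType))         // int
    // Any DecimalType, regardless of precision/scale, maps to Kusto decimal.
    println(DataTypeMapping.getSparkTypeToKustoTypeMap(DecimalType(20, 14))) // decimal
    // Nested types (arrays, structs, maps) map to Kusto dynamic.
    println(DataTypeMapping.getSparkTypeToKustoTypeMap(ArrayType(LongType))) // dynamic
  }
}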
Example 5
Source File: ParallelPersonalizedPageRankSuite.scala From graphframes with Apache License 2.0
package org.graphframes.lib

import com.github.zafarkhaja.semver.Version
import org.apache.spark.ml.linalg.{SQLDataTypes, SparseVector}
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.DataTypes
import org.graphframes.examples.Graphs
import org.graphframes.{GraphFrameTestSparkContext, SparkFunSuite, TestUtils}

class ParallelPersonalizedPageRankSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  val n = 100

  test("Illegal function call argument setting") {
    val g = Graphs.star(n)
    val vertexIds: Array[Any] = Array(1L, 2L, 3L)

    // Not providing number of iterations
    intercept[IllegalArgumentException] {
      g.parallelPersonalizedPageRank.sourceIds(vertexIds).run()
    }
    // Not providing sourceIds
    intercept[IllegalArgumentException] {
      g.parallelPersonalizedPageRank.maxIter(15).run()
    }
    // Provided empty sourceIds
    intercept[IllegalArgumentException] {
      g.parallelPersonalizedPageRank.maxIter(15).sourceIds(Array()).run()
    }
  }

  test("Star example parallel personalized PageRank") {
    val g = Graphs.star(n)
    val resetProb = 0.15
    val maxIter = 10
    val vertexIds: Array[Any] = Array(1L, 2L, 3L)

    lazy val prc = g.parallelPersonalizedPageRank
      .maxIter(maxIter)
      .sourceIds(vertexIds)
      .resetProbability(resetProb)

    val pr = prc.run()
    TestUtils.testSchemaInvariants(g, pr)
    TestUtils.checkColumnType(pr.vertices.schema, "pageranks", SQLDataTypes.VectorType)
    TestUtils.checkColumnType(pr.edges.schema, "weight", DataTypes.DoubleType)
  }

  // In Spark <2.4, sourceIds must be smaller than Int.MaxValue,
  // which might not be the case for LONG_ID in graph.indexedVertices.
  if (Version.valueOf(org.apache.spark.SPARK_VERSION)
      .greaterThanOrEqualTo(Version.valueOf("2.4.0"))) {
    test("friends graph with parallel personalized PageRank") {
      val g = Graphs.friends
      val resetProb = 0.15
      val maxIter = 10
      val vertexIds: Array[Any] = Array("a")

      lazy val prc = g.parallelPersonalizedPageRank
        .maxIter(maxIter)
        .sourceIds(vertexIds)
        .resetProbability(resetProb)

      val pr = prc.run()
      val prInvalid = pr.vertices
        .select("pageranks")
        .collect()
        .filter { row: Row =>
          vertexIds.size != row.getAs[SparseVector](0).size
        }
      assert(prInvalid.size === 0,
        s"found ${prInvalid.size} entries with invalid number of returned personalized pagerank vector")

      val gRank = pr.vertices
        .filter(col("id") === "g")
        .select("pageranks")
        .first().getAs[SparseVector](0)
      assert(gRank.numNonzeros === 0,
        s"User g (Gabby) doesn't connect with a. So its pagerank should be 0 but we got ${gRank.numNonzeros}.")
    }
  }
}
Example 6
Source File: PageRankSuite.scala From graphframes with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.DataTypes
import org.graphframes.examples.Graphs
import org.graphframes.{GraphFrameTestSparkContext, SparkFunSuite, TestUtils}

class PageRankSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  val n = 100

  test("Star example") {
    val g = Graphs.star(n)
    val resetProb = 0.15
    val errorTol = 1.0e-5

    val pr = g.pageRank
      .resetProbability(resetProb)
      .tol(errorTol).run()
    TestUtils.testSchemaInvariants(g, pr)
    TestUtils.checkColumnType(pr.vertices.schema, "pagerank", DataTypes.DoubleType)
    TestUtils.checkColumnType(pr.edges.schema, "weight", DataTypes.DoubleType)
  }

  test("friends graph with personalized PageRank") {
    val results = Graphs.friends.pageRank.resetProbability(0.15).maxIter(10).sourceId("a").run()
    val gRank = results.vertices.filter(col("id") === "g").select("pagerank").first().getDouble(0)
    assert(gRank === 0.0,
      s"User g (Gabby) doesn't connect with a. So its pagerank should be 0 but we got $gRank.")
  }
}
Example 7
Source File: SVDPlusPlusSuite.scala From graphframes with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.DataTypes
import org.graphframes.{GraphFrame, GraphFrameTestSparkContext, SparkFunSuite, TestUtils}
import org.graphframes.examples.Graphs

class SVDPlusPlusSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  test("Test SVD++ with mean square error on training set") {
    val svdppErr = 8.0
    val g = Graphs.ALSSyntheticData()
    val v2 = g.svdPlusPlus.maxIter(2).run()
    TestUtils.testSchemaInvariants(g, v2)
    Seq(SVDPlusPlus.COLUMN1, SVDPlusPlus.COLUMN2).foreach { case c =>
      TestUtils.checkColumnType(v2.schema, c, DataTypes.createArrayType(DataTypes.DoubleType, false))
    }
    Seq(SVDPlusPlus.COLUMN3, SVDPlusPlus.COLUMN4).foreach { case c =>
      TestUtils.checkColumnType(v2.schema, c, DataTypes.DoubleType)
    }
    val err = v2.select(GraphFrame.ID, SVDPlusPlus.COLUMN4).rdd.map {
      case Row(vid: Long, vd: Double) =>
        if (vid % 2 == 1) vd else 0.0
    }.reduce(_ + _) / g.edges.count()
    assert(err <= svdppErr)
  }
}
Example 8
Source File: TriangleCountSuite.scala From graphframes with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.DataTypes
import org.graphframes.{GraphFrameTestSparkContext, GraphFrame, SparkFunSuite, TestUtils}

class TriangleCountSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  test("Count a single triangle") {
    val edges = sqlContext.createDataFrame(Array(0L -> 1L, 1L -> 2L, 2L -> 0L)).toDF("src", "dst")
    val vertices = sqlContext.createDataFrame(Seq((0L, "a"), (1L, "b"), (2L, "c"))).toDF("id", "a")
    val g = GraphFrame(vertices, edges)
    val v2 = g.triangleCount.run()
    TestUtils.testSchemaInvariants(g, v2)
    TestUtils.checkColumnType(v2.schema, "count", DataTypes.LongType)
    v2.select("id", "count", "a")
      .collect().foreach { case Row(vid: Long, count: Long, _) =>
        assert(count === 1)
      }
  }

  test("Count two triangles") {
    val edges = sqlContext.createDataFrame(Array(0L -> 1L, 1L -> 2L, 2L -> 0L) ++
      Array(0L -> -1L, -1L -> -2L, -2L -> 0L)).toDF("src", "dst")
    val g = GraphFrame.fromEdges(edges)
    val v2 = g.triangleCount.run()
    v2.select("id", "count").collect().foreach { case Row(id: Long, count: Long) =>
      if (id == 0) {
        assert(count === 2)
      } else {
        assert(count === 1)
      }
    }
  }

  test("Count one triangles with bi-directed edges") {
    // Note: This is different from GraphX, which double-counts triangles with bidirected edges.
    val triangles = Array(0L -> 1L, 1L -> 2L, 2L -> 0L) ++ Array(0L -> -1L, -1L -> -2L, -2L -> 0L)
    val revTriangles = triangles.map { case (a, b) => (b, a) }
    val edges = sqlContext.createDataFrame(triangles ++ revTriangles).toDF("src", "dst")
    val g = GraphFrame.fromEdges(edges)
    val v2 = g.triangleCount.run()
    v2.select("id", "count").collect().foreach { case Row(id: Long, count: Long) =>
      if (id == 0) {
        assert(count === 2)
      } else {
        assert(count === 1)
      }
    }
  }

  test("Count a single triangle with duplicate edges") {
    val edges = sqlContext.createDataFrame(Array(0L -> 1L, 1L -> 2L, 2L -> 0L) ++
      Array(0L -> 1L, 1L -> 2L, 2L -> 0L)).toDF("src", "dst")
    val g = GraphFrame.fromEdges(edges)
    val v2 = g.triangleCount.run()
    v2.select("id", "count").collect().foreach { case Row(id: Long, count: Long) =>
      assert(count === 1)
    }
  }

  test("no triangle") {
    val edges = sqlContext.createDataFrame(Array(0L -> 1L, 1L -> 2L)).toDF("src", "dst")
    val g = GraphFrame.fromEdges(edges)
    val v2 = g.triangleCount.run()
    v2.select("count").collect().foreach { case Row(count: Long) =>
      assert(count === 0)
    }
  }
}
Example 9
Source File: LabelPropagationSuite.scala From graphframes with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.types.DataTypes
import org.graphframes.{GraphFrameTestSparkContext, SparkFunSuite, TestUtils}
import org.graphframes.examples.Graphs

class LabelPropagationSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  val n = 5

  test("Toy example") {
    val g = Graphs.twoBlobs(n)
    val labels = g.labelPropagation.maxIter(4 * n).run()
    TestUtils.testSchemaInvariants(g, labels)
    TestUtils.checkColumnType(labels.schema, "label", DataTypes.LongType)
    val clique1 = labels.filter(s"id < $n").select("label").collect().toSeq.map(_.getLong(0)).toSet
    assert(clique1.size === 1)
    val clique2 = labels.filter(s"id >= $n").select("label").collect().toSeq.map(_.getLong(0)).toSet
    assert(clique2.size === 1)
    assert(clique1 !== clique2)
  }
}
Example 10
Source File: ShortestPathsSuite.scala From graphframes with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.DataTypes
import org.graphframes._

class ShortestPathsSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  test("Simple test") {
    val edgeSeq = Seq((1, 2), (1, 5), (2, 3), (2, 5), (3, 4), (4, 5), (4, 6))
      .flatMap { case e => Seq(e, e.swap) }
      .map { case (src, dst) => (src.toLong, dst.toLong) }
    val edges = sqlContext.createDataFrame(edgeSeq).toDF("src", "dst")
    val graph = GraphFrame.fromEdges(edges)

    // Ground truth
    val shortestPaths = Set(
      (1, Map(1 -> 0, 4 -> 2)), (2, Map(1 -> 1, 4 -> 2)), (3, Map(1 -> 2, 4 -> 1)),
      (4, Map(1 -> 2, 4 -> 0)), (5, Map(1 -> 1, 4 -> 1)), (6, Map(1 -> 3, 4 -> 1)))
    val landmarks = Seq(1, 4).map(_.toLong)

    val v2 = graph.shortestPaths.landmarks(landmarks).run()
    TestUtils.testSchemaInvariants(graph, v2)
    TestUtils.checkColumnType(v2.schema, "distances",
      DataTypes.createMapType(v2.schema("id").dataType, DataTypes.IntegerType, false))
    val newVs = v2.select("id", "distances").collect().toSeq
    val results = newVs.map { case Row(id: Long, spMap: Map[Long, Int] @unchecked) => (id, spMap) }
    assert(results.toSet === shortestPaths)
  }

  test("friends graph") {
    val friends = examples.Graphs.friends
    val v = friends.shortestPaths.landmarks(Seq("a", "d")).run()

    val expected = Set[(String, Map[String, Int])](
      ("a", Map("a" -> 0, "d" -> 2)), ("b", Map.empty), ("c", Map.empty),
      ("d", Map("a" -> 1, "d" -> 0)), ("e", Map("a" -> 2, "d" -> 1)),
      ("f", Map.empty), ("g", Map.empty))
    val results = v.select("id", "distances").collect().map {
      case Row(id: String, spMap: Map[String, Int] @unchecked) => (id, spMap)
    }.toSet
    assert(results === expected)
  }
}
Example 11
Source File: StronglyConnectedComponentsSuite.scala From graphframes with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.DataTypes
import org.graphframes.{GraphFrameTestSparkContext, GraphFrame, SparkFunSuite, TestUtils}

class StronglyConnectedComponentsSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  test("Island Strongly Connected Components") {
    val vertices = sqlContext.createDataFrame(Seq(
      (1L, "a"), (2L, "b"), (3L, "c"), (4L, "d"), (5L, "e"))).toDF("id", "value")
    val edges = sqlContext.createDataFrame(Seq.empty[(Long, Long)]).toDF("src", "dst")
    val graph = GraphFrame(vertices, edges)
    val c = graph.stronglyConnectedComponents.maxIter(5).run()
    TestUtils.testSchemaInvariants(graph, c)
    TestUtils.checkColumnType(c.schema, "component", DataTypes.LongType)
    for (Row(id: Long, component: Long, _) <- c.select("id", "component", "value").collect()) {
      assert(id === component)
    }
  }
}
Example 12
Source File: UnaryTransformerExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.DoubleParam
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{DataType, DataTypes}
import org.apache.spark.util.Utils
// $example off$

object UnaryTransformerExample {

  // $example on$
  // NOTE: the MyTransformer class definition was dropped from this listing. The class
  // below is a reconstructed sketch of a UnaryTransformer that adds a configurable
  // `shift` to a Double column, matching how main() uses it; details may differ from
  // the original Spark example.
  class MyTransformer(override val uid: String)
    extends UnaryTransformer[Double, Double, MyTransformer] with DefaultParamsWritable {

    final val shift: DoubleParam = new DoubleParam(this, "shift", "value added to the input")

    def setShift(value: Double): this.type = set(shift, value)

    def this() = this(Identifiable.randomUID("myT"))

    override protected def createTransformFunc: Double => Double = (input: Double) => {
      input + $(shift)
    }

    override protected def outputDataType: DataType = DataTypes.DoubleType

    override protected def validateInputType(inputType: DataType): Unit = {
      require(inputType == DataTypes.DoubleType, s"Bad input type: $inputType. Requires Double.")
    }
  }

  object MyTransformer extends DefaultParamsReadable[MyTransformer]
  // $example off$

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder()
      .appName("UnaryTransformerExample")
      .getOrCreate()

    // $example on$
    val myTransformer = new MyTransformer()
      .setShift(0.5)
      .setInputCol("input")
      .setOutputCol("output")

    // Create data, transform, and display it.
    val data = spark.range(0, 5).toDF("input")
      .select(col("input").cast("double").as("input"))
    val result = myTransformer.transform(data)
    println("Transformed by adding constant value")
    result.show()

    // Save and load the Transformer.
    val tmpDir = Utils.createTempDir()
    val dirName = tmpDir.getCanonicalPath
    myTransformer.write.overwrite().save(dirName)
    val sameTransformer = MyTransformer.load(dirName)

    // Transform the data to show the results are identical.
    println("Same transform applied from loaded model")
    val sameResult = sameTransformer.transform(data)
    sameResult.show()

    Utils.deleteRecursively(tmpDir)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 13
Source File: Util.scala From mimir with Apache License 2.0
package mimir.exec.spark.datasource.google.spreadsheet

import com.google.api.services.sheets.v4.model.{ExtendedValue, CellData, RowData}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DataTypes, StructType}

import scala.collection.JavaConverters._

object Util {
  def convert(schema: StructType, row: Row): Map[String, Object] =
    schema.iterator.zipWithIndex.map { case (f, i) => f.name -> row(i).asInstanceOf[AnyRef] }.toMap

  def toRowData(row: Row): RowData =
    new RowData().setValues(
      row.schema.fields.zipWithIndex.map { case (f, i) =>
        new CellData()
          .setUserEnteredValue(
            f.dataType match {
              case DataTypes.StringType => new ExtendedValue().setStringValue(row.getString(i))
              case DataTypes.LongType => new ExtendedValue().setNumberValue(row.getLong(i).toDouble)
              case DataTypes.IntegerType => new ExtendedValue().setNumberValue(row.getInt(i).toDouble)
              case DataTypes.FloatType => new ExtendedValue().setNumberValue(row.getFloat(i).toDouble)
              case DataTypes.BooleanType => new ExtendedValue().setBoolValue(row.getBoolean(i))
              case DataTypes.DateType => new ExtendedValue().setStringValue(row.getDate(i).toString)
              case DataTypes.ShortType => new ExtendedValue().setNumberValue(row.getShort(i).toDouble)
              case DataTypes.TimestampType => new ExtendedValue().setStringValue(row.getTimestamp(i).toString)
              case DataTypes.DoubleType => new ExtendedValue().setNumberValue(row.getDouble(i))
            }
          )
      }.toList.asJava
    )
}
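
A small driver like the one below could exercise Util.toRowData; this is a sketch under assumptions: the SparkSession setup and the ToRowDataDemo object are hypothetical, and only Util comes from the listing above. Rows collected from a DataFrame carry their schema, which toRowData pattern-matches on.

import mimir.exec.spark.datasource.google.spreadsheet.Util
import org.apache.spark.sql.SparkSession

object ToRowDataDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ToRowDataDemo").master("local[*]").getOrCreate()
    import spark.implicits._

    // A one-row DataFrame with String, Long and Boolean columns, all of which
    // toRowData knows how to convert.
    val row = Seq(("SW1A 1AA", 250000L, true)).toDF("postcode", "price", "sold").head()

    val rowData = Util.toRowData(row)
    println(rowData.getValues.size()) // 3 cells, one per column

    spark.stop()
  }
}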
Example 14
Source File: LOFSuite.scala From spark-lof with Apache License 2.0
package org.apache.spark.ml.outlier

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.apache.spark.sql.functions._

object LOFSuite {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("LOFExample")
      .master("local[4]")
      .getOrCreate()

    val schema = new StructType(Array(
      new StructField("col1", DataTypes.DoubleType),
      new StructField("col2", DataTypes.DoubleType)))
    val df = spark.read.schema(schema).csv("data/outlier.csv")

    val assembler = new VectorAssembler()
      .setInputCols(df.columns)
      .setOutputCol("features")
    val data = assembler.transform(df).repartition(4)

    val startTime = System.currentTimeMillis()
    val result = new LOF()
      .setMinPts(5)
      .transform(data)
    val endTime = System.currentTimeMillis()
    result.count()

    // Outliers have much higher LOF value than normal data
    result.sort(desc(LOF.lof)).head(10).foreach { row =>
      println(row.get(0) + " | " + row.get(1) + " | " + row.get(2))
    }
    println("Total time = " + (endTime - startTime) / 1000.0 + "s")
  }
}
Example 15
Source File: Util.scala From spark-google-spreadsheets with Apache License 2.0
package com.github.potix2.spark.google.spreadsheets

import com.google.api.services.sheets.v4.model.{ExtendedValue, CellData, RowData}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DataTypes, StructType}

import scala.collection.JavaConverters._

object Util {
  def convert(schema: StructType, row: Row): Map[String, Object] =
    schema.iterator.zipWithIndex.map { case (f, i) => f.name -> row(i).asInstanceOf[AnyRef] }.toMap

  def toRowData(row: Row): RowData =
    new RowData().setValues(
      row.schema.fields.zipWithIndex.map { case (f, i) =>
        new CellData()
          .setUserEnteredValue(
            f.dataType match {
              case DataTypes.StringType => new ExtendedValue().setStringValue(row.getString(i))
              case DataTypes.LongType => new ExtendedValue().setNumberValue(row.getLong(i).toDouble)
              case DataTypes.IntegerType => new ExtendedValue().setNumberValue(row.getInt(i).toDouble)
              case DataTypes.FloatType => new ExtendedValue().setNumberValue(row.getFloat(i).toDouble)
              case DataTypes.BooleanType => new ExtendedValue().setBoolValue(row.getBoolean(i))
              case DataTypes.DateType => new ExtendedValue().setStringValue(row.getDate(i).toString)
              case DataTypes.ShortType => new ExtendedValue().setNumberValue(row.getShort(i).toDouble)
              case DataTypes.TimestampType => new ExtendedValue().setStringValue(row.getTimestamp(i).toString)
              case DataTypes.DoubleType => new ExtendedValue().setNumberValue(row.getDouble(i))
            }
          )
      }.toList.asJava
    )
}
Example 16
Source File: SparkSpreadsheetServiceWriteSuite.scala From spark-google-spreadsheets with Apache License 2.0
package com.github.potix2.spark.google.spreadsheets

import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService.SparkSpreadsheet
import com.google.api.services.sheets.v4.model.{ExtendedValue, CellData, RowData}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.scalatest.{BeforeAndAfter, FlatSpec}

import scala.collection.JavaConverters._

class SparkSpreadsheetServiceWriteSuite extends FlatSpec with BeforeAndAfter {
  private val serviceAccountId = "53797494708-ds5v22b6cbpchrv2qih1vg8kru098k9i@developer.gserviceaccount.com"
  private val testCredentialPath = "src/test/resources/spark-google-spreadsheets-test-eb7b191d1e1d.p12"
  private val TEST_SPREADSHEET_NAME = "WriteSuite"
  private val TEST_SPREADSHEET_ID = "163Ja2OWUephWjIa-jpwTlvGcg8EJwCFCfxrF7aI117s"

  private val context: SparkSpreadsheetService.SparkSpreadsheetContext =
    SparkSpreadsheetService.SparkSpreadsheetContext(Some(serviceAccountId), new java.io.File(testCredentialPath))

  var spreadsheet: SparkSpreadsheet = null
  var worksheetName: String = ""

  def definedSchema: StructType = {
    new StructType()
      .add(new StructField("col_1", DataTypes.StringType))
      .add(new StructField("col_2", DataTypes.LongType))
      .add(new StructField("col_3", DataTypes.StringType))
  }

  case class Elem(col_1: String, col_2: Long, col_3: String)

  def extractor(e: Elem): RowData =
    new RowData().setValues(
      List(
        new CellData().setUserEnteredValue(
          new ExtendedValue().setStringValue(e.col_1)
        ),
        new CellData().setUserEnteredValue(
          new ExtendedValue().setNumberValue(e.col_2.toDouble)
        ),
        new CellData().setUserEnteredValue(
          new ExtendedValue().setStringValue(e.col_3)
        )
      ).asJava
    )

  before {
    spreadsheet = context.findSpreadsheet(TEST_SPREADSHEET_ID)
    worksheetName = scala.util.Random.alphanumeric.take(16).mkString
    val data = List(
      Elem("a", 1L, "x"),
      Elem("b", 2L, "y"),
      Elem("c", 3L, "z")
    )
    spreadsheet.addWorksheet(worksheetName, definedSchema, data, extractor)
  }

  after {
    spreadsheet.deleteWorksheet(worksheetName)
  }

  behavior of "A Spreadsheet"

  it should "find the new worksheet" in {
    val newWorksheet = spreadsheet.findWorksheet(worksheetName)
    assert(newWorksheet.isDefined)
    assert(newWorksheet.get.name == worksheetName)
    assert(newWorksheet.get.headers == Seq("col_1", "col_2", "col_3"))

    val rows = newWorksheet.get.rows
    assert(rows.head == Map("col_1" -> "a", "col_2" -> "1", "col_3" -> "x"))
  }

  behavior of "SparkWorksheet#updateCells"

  it should "update values in a worksheet" in {
    val newWorksheet = spreadsheet.findWorksheet(worksheetName)
    assert(newWorksheet.isDefined)

    val newData = List(
      Elem("f", 5L, "yy"),
      Elem("e", 4L, "xx"),
      Elem("c", 3L, "z"),
      Elem("b", 2L, "y"),
      Elem("a", 1L, "x")
    )
    newWorksheet.get.updateCells(definedSchema, newData, extractor)

    val rows = newWorksheet.get.rows
    assert(rows.head == Map("col_1" -> "f", "col_2" -> "5", "col_3" -> "yy"))
    assert(rows.last == Map("col_1" -> "a", "col_2" -> "1", "col_3" -> "x"))
  }
}