org.apache.spark.sql.functions.explode Scala Examples
The following examples show how to use org.apache.spark.sql.functions.explode.
Example 1
Source File: JsonUtil.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.util

import org.apache.spark.sql.functions.explode
import org.apache.spark.sql.{Column, DataFrame, SQLContext, SparkSession}

import scala.collection.mutable.ArrayBuffer

object JsonUtil extends Serializable {

  /**
   * Projects `df` onto the fields listed in `tag` and, optionally, explodes one array column.
   *
   * `tag` is a comma-separated list. A plain entry (`name`) selects that column. An entry
   * written as `MasterField_ChildField` (for example `links_name`) marks `MasterField` as an
   * array column to explode and `ChildField` as a field of its elements to keep.
   *
   * Limitations kept from the original implementation:
   *  - every entry containing `_` is read as `Master_Child`, so a plain column whose name
   *    contains an underscore cannot be selected;
   *  - only one array column is opened per call: when several masters are listed, the last
   *    one is exploded while the child fields of all of them are collected.
   *
   * Unlike the previous version, a `tag` without any `Master_Child` entry no longer fails
   * with `StringIndexOutOfBoundsException`; the plain projection is returned instead.
   *
   * @param df  input DataFrame
   * @param tag comma-separated field list as described above; an empty string returns `df`
   * @return the projected (and, if requested, exploded) DataFrame
   */
  def ParserJsonDF(df: DataFrame, tag: String): DataFrame = {
    val (arrayEntries, plainEntries) = tag.split(",").partition(_.indexOf("_") > -1)

    // Master_Child entries: element 0 is the array column, element 1 the child field.
    val arrayParts: Array[Array[String]] = arrayEntries.map(_.split("_"))
    val openArrField: String = arrayParts.lastOption.map(_(0)).getOrElse("")
    // mkString yields "" when there is no array entry, where the old trailing-comma
    // substring(0, length - 1) call threw.
    val childCsv: String = arrayParts.map(_(1)).mkString(",")

    // Plain fields first, the array column last; built as a string and split again so that
    // empty entries are dropped exactly as before.
    val selectedNames: Seq[String] =
      (plainEntries.map(_ + ",").mkString + openArrField).split(",").toSeq

    // Keep only the fields the caller asked for.
    val projected: DataFrame =
      if (tag.length > 0) df.select(selectedNames.map(new Column(_)): _*) else df

    if (openArrField.isEmpty || childCsv.isEmpty) {
      projected
    } else {
      // The caller asked to open an array column and named the child fields to keep.
      val childFields: Seq[String] = childCsv.split(",").toSeq

      // Columns for the explode step: the array column is exploded, the rest pass through.
      val explodedColumns: Seq[Column] = selectedNames.map { name =>
        if (name == openArrField) explode(projected(name)) else projected(name)
      }

      // Final projection: each requested child of the exploded struct plus the plain fields.
      val outputPaths: Seq[String] = selectedNames.flatMap { name =>
        if (name == openArrField) childFields.map(child => openArrField + "." + child)
        else Seq(name)
      }

      println("###################")
      outputPaths.foreach(println(_))

      projected
        .select(explodedColumns: _*)
        .toDF(selectedNames: _*) // restore the original names after explode renames the column
        .select(outputPaths.map(new Column(_)): _*)
    }
  }
}
Example 2
Source File: ReebDiagramTest.scala From spark-tda with Apache License 2.0 | 5 votes |
// NOTE(review): the package clause and the linalg import path were truncated in this copy.
// They are restored on the assumption that the test lives next to VectorAssembler (used below
// without an import) and that Vectors/Vector come from org.apache.spark.ml.linalg - confirm
// against the project sources.
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.{Vectors, EuclideanDistance, Vector}
import org.apache.spark.sql.functions.{col, explode, udf}
import org.scalatest.{PropSpec, Matchers, GivenWhenThen}
import org.scalatest.prop.GeneratorDrivenPropertyChecks

/**
 * Property-based checks for `ReebDiagram`: parameter validation and a smoke run of
 * cover -> reeb diagram on generated DataFrames.
 */
class ReebDiagramTest
    extends FeaturePropSpec
    with GivenWhenThen
    with GeneratorDrivenPropertyChecks
    with Matchers {

  // Assembles the two numeric columns into the feature vector consumed by ReebDiagram.
  val assembler = new VectorAssembler()
    .setInputCols(Array("double", "integer"))
    .setOutputCol("vector")

  // Cover estimator whose output column feeds ReebDiagram's cover column.
  val cover = new Cover()
    .setExploding(true)
    .setInputCols("double", "integer")
    .setOutputCol("cover_id")

  property("argument topTreeSize must be positive") {
    intercept[IllegalArgumentException] {
      val reeb = new ReebDiagram()
//        .setIdCol("id")
//        .setCoverCol("cover_id")
//        .setFeaturesCol("vector")
//        .setOutputCol("cluster_id")
        .setTopTreeSize(0)
    }
  }

  property("placeholder") {
    val reeb = new ReebDiagram()
      .setK(15)
      .setIdCol("id")
      .setCoverCol("cover_id")
      .setFeaturesCol("vector")
      .setOutputCol("cluster_id")

    forAll(dataframeGen.arbitrary) { df =>
      val assembled = assembler.transform(df)
      whenever(
        assembled.count() > 0 && hasDistinctValues(assembled, "double", "integer")) {
        val transformed = cover
          .fit(assembled)
          .transform(assembled)
        // Smoke run only: the result is not asserted on.
        val result = reeb
          .setTopTreeSize(1)
          .fit(transformed)
          .transform(transformed)
      }
    }
  }
}
Example 3
Source File: CoverTest.scala From spark-tda with Apache License 2.0 | 5 votes |
// NOTE(review): the package clause and the first import were truncated in this copy. They are
// restored on the assumption that the test lives next to VectorAssembler (used below without
// an import) and that Vectors comes from org.apache.spark.ml.linalg - confirm against the
// project sources.
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.functions.{col, explode, udf}
import org.scalatest.{PropSpec, Matchers, GivenWhenThen}
import org.scalatest.prop.GeneratorDrivenPropertyChecks

/**
 * Property-based checks for the `Cover` estimator and `CoverModel`: parameter validation,
 * non-destructive transformation, full coverage of the input range, and read/write round trips.
 */
class CoverTest
    extends FeaturePropSpec
    with GivenWhenThen
    with GeneratorDrivenPropertyChecks
    with Matchers {

  // Assembles the two numeric columns into the "vector" column used by several properties.
  val assembler = new VectorAssembler()
    .setInputCols(Array("double", "integer"))
    .setOutputCol("vector")

  property("argument numSplits must be positive") {
    intercept[IllegalArgumentException] {
      val cover = new Cover()
        .setInputCols("double")
        .setOutputCol("cover_ids")
        .setNumSplits(0)
    }
  }

  property("argument overlapRatio must be positive") {
    intercept[IllegalArgumentException] {
      val cover = new Cover()
        .setInputCols("double")
        .setOutputCol("cover_ids")
        .setOverlapRatio(0.0)
    }
  }

  property("cover estimator changes nothing with the original dataframe") {
    val cover = new Cover()
      .setInputCols("double", "integer", "vector")
      .setOutputCol("cover_ids")

    forAll(dataframeGen.arbitrary) { df =>
      val transformed = assembler.transform(df)
      whenever(
        transformed.count() > 0 && hasDistinctValues(transformed, "double", "integer", "vector")) {
        // Dropping the added column must leave exactly the input rows.
        cover
          .fit(transformed)
          .transform(transformed)
          .drop("cover_ids")
          .except(transformed)
          .count() should be(0)
      }
    }
  }

  property("generated cover covers all range of specified columns") {
    val cover = new Cover()
      .setInputCols("double", "integer", "vector")
      .setOutputCol("cover_ids")
    // True for rows that were assigned to no cover element at all.
    val uncovered = udf { xs: Seq[Long] =>
      xs.isEmpty
    }

    forAll(dataframeGen.arbitrary) { df =>
      val transformed = assembler.transform(df)
      whenever(
        transformed.count() > 0 && hasDistinctValues(transformed, "double", "integer", "vector")) {
        cover
          .fit(transformed)
          .transform(transformed)
          .where(uncovered(col("cover_ids")))
          .count() should be(0)
      }
    }
  }

  property("Cover is readable/writable") {
    val cover = new Cover()
      .setInputCols("double", "integer")
      .setOutputCol("cover_ids")
    testDefaultReadWrite(cover)
  }

  property("CoverModel is readable/writable") {
    val model = new CoverModel("myCoverModel", Vectors.dense(-1.0, 0.0), Vectors.dense(1.0, 10.0))
      .setInputCols("double", "integer")
      .setOutputCol("cover_ids")
    val newModel = testDefaultReadWrite(model)
    assert(newModel.min === model.min)
    assert(newModel.max === model.max)
  }
}
Example 4
Source File: TriangleCount.scala From graphframes with Apache License 2.0 | 5 votes |
package org.graphframes.lib

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{array, col, explode, when}

import org.graphframes.GraphFrame
import org.graphframes.GraphFrame.{DST, ID, LONG_DST, LONG_SRC, SRC}

/**
 * Counts, for every vertex of `graph`, the triangles found by the motif search in the
 * companion object.
 *
 * The result of [[run]] contains the vertex columns plus a "count" column.
 */
class TriangleCount private[graphframes] (private val graph: GraphFrame) extends Arguments {

  /** Runs the computation and returns the vertices with their triangle counts. */
  def run(): DataFrame = {
    TriangleCount.run(graph)
  }
}

private object TriangleCount {

  /**
   * @param graph graph to analyse
   * @return the vertex DataFrame with an extra "count" column; vertices without a match in
   *         the motif search get 0
   */
  private def run(graph: GraphFrame): DataFrame = {
    // Dedup edges by flipping them to have LONG_SRC < LONG_DST
    // TODO (when we drop support for Spark 1.4): Use functions greatest, smallest instead of UDFs
    val dedupedE = graph.indexedEdges
      .filter(s"$LONG_SRC != $LONG_DST") // self-loops are dropped here
      .selectExpr(
        s"if($LONG_SRC < $LONG_DST, $SRC, $DST) as $SRC",
        s"if($LONG_SRC < $LONG_DST, $DST, $SRC) as $DST")
      .dropDuplicates(Seq(SRC, DST))
    val g2 = GraphFrame(graph.vertices, dedupedE)

    // Because SRC < DST, there exists only one type of triangles:
    // - Non-cycle with one edge flipped. These are counted 1 time each by motif finding.
    val triangles = g2.find("(a)-[]->(b); (b)-[]->(c); (a)-[]->(c)")

    // One output row per (triangle, corner): explode the three corner ids, then count per id.
    val triangleCounts = triangles
      .select(explode(array(col(s"a.$ID"), col(s"b.$ID"), col(s"c.$ID"))).as(ID))
      .groupBy(ID)
      .count()

    val v = graph.vertices
    // The left outer join leaves a null count for unmatched vertices; report 0 instead.
    val countsCol = when(col("count").isNull, 0L).otherwise(col("count"))
    val newV = v
      .join(triangleCounts, v(ID) === triangleCounts(ID), "left_outer")
      .select(countsCol.as(COUNT_ID) +: v.columns.map(v.apply): _*)
    newV
  }

  private val COUNT_ID = "count"
}
Example 5
Source File: Tutorial_04_ExplodeJson.scala From learn-spark with Apache License 2.0 | 5 votes |
package com.allaboutscala.learn.spark.functions

import com.allaboutscala.learn.spark.utils.Context
import org.apache.spark.sql.functions.explode

/**
 * Reads a multi-line JSON file, explodes its top-level "stackoverflow" array into one row per
 * element and prints a flattened view of the nested fields.
 */
object Tutorial_04_ExplodeJson extends App with Context {

  import sparkSession.sqlContext.implicits._

  val tagsDF = sparkSession
    .read
    .option("multiLine", true)
    .option("inferSchema", true)
    .json("src/main/resources/tags_sample.json")

  // One row per element of the "stackoverflow" array.
  val df = tagsDF.select(explode($"stackoverflow") as "stackoverflow_tags")

  df.printSchema()

  // NOTE(review): the nested column paths were blank in this copy. They are restored on the
  // assumption that each array element is a struct shaped like
  // { tag: { id, author, name, frameworks: { id, name } } } - confirm against tags_sample.json.
  df.select(
    $"stackoverflow_tags.tag.id" as "id",
    $"stackoverflow_tags.tag.author" as "author",
    $"stackoverflow_tags.tag.name" as "tag_name",
    $"stackoverflow_tags.tag.frameworks.id" as "frameworks_id",
    $"stackoverflow_tags.tag.frameworks.name" as "frameworks_name"
  ).show()
}
Example 6
Source File: ClassifierDatasetEncoder.scala From spark-nlp with Apache License 2.0 | 5 votes |
// NOTE(review): the package clause was truncated in this copy; the name below is presumed from
// the project's layout - confirm against the project sources.
package com.johnsnowlabs.ml.tensorflow

import com.johnsnowlabs.nlp.Annotation
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{size, explode, col}

import scala.collection.mutable

/**
 * Converts between string tags and the numeric representation used for classification:
 * one-hot vectors on the way in, (tag, score) pairs on the way out.
 *
 * @param params holds the tag vocabulary; a tag's id is its position in `params.tags`
 *               (for a repeated tag, the last position wins)
 */
class ClassifierDatasetEncoder(val params: ClassifierDatasetEncoderParams) extends Serializable {

  /** Tag -> id, where the id is the tag's index in `params.tags`. */
  val tags2Id: Map[String, Int] = params.tags.zipWithIndex.toMap

  /** The distinct tags ordered by ascending id. */
  val tags: Array[String] = tags2Id.toArray.sortBy(_._2).map(_._1)

  // Inverse of tags2Id, built once so decoding does not scan the map for every score.
  private val id2Tag: Map[Int, String] = tags2Id.map(_.swap)

  /**
   * One-hot encodes each label.
   *
   * @param labels labels to encode; each must be a known tag, otherwise the `tags2Id` lookup
   *               throws `NoSuchElementException`
   * @return one array per label, of length `tags.length`, with a 1 at the label's id
   */
  def encodeTags(labels: Array[String]): Array[Array[Int]] =
    labels.map { label =>
      val oneHot = Array.fill(tags.length)(0)
      oneHot(tags2Id(label)) = 1
      oneHot
    }

  /**
   * Pairs every score with the tag whose id equals the score's position.
   *
   * @param tagIds one score array per input row
   * @return for each row, `(tag, score)` in score order; a position with no tag is reported
   *         as "NA"
   */
  def decodeOutputData(tagIds: Array[Array[Float]]): Array[Array[(String, Float)]] =
    tagIds.map { scores =>
      scores.zipWithIndex.map { case (score, idx) =>
        id2Tag.getOrElse(idx, "NA") -> score
      }
    }
}

/** Parameters for [[ClassifierDatasetEncoder]]: the tag vocabulary in id order. */
case class ClassifierDatasetEncoderParams(tags: Array[String])
Example 7
Source File: functions.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp

import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.{array, col, explode, udf}
import org.apache.spark.sql.types.DataType

import scala.reflect.runtime.universe._

/**
 * DataFrame helpers for annotation columns: filter, map, iterate over and explode them.
 *
 * NOTE(review): the Row -> Annotation conversion inside the UDFs was blank in this copy. It is
 * restored as `Annotation(_)`, which presumes the `Annotation` companion has an `apply(Row)` -
 * confirm against the project sources.
 */
object functions {

  implicit class FilterAnnotations(dataset: DataFrame) {

    /**
     * Keeps the rows for which `function` returns true on the annotations of `column`.
     *
     * @param column   name of the annotation column
     * @param function predicate over the annotations of one row
     */
    def filterByAnnotationsCol(column: String, function: Seq[Annotation] => Boolean): DataFrame = {
      val meta = dataset.schema(column).metadata
      val func = udf { annotatorProperties: Seq[Row] =>
        function(annotatorProperties.map(Annotation(_)))
      }
      dataset.filter(func(col(column)).as(column, meta))
    }
  }

  /**
   * Wraps `function` in a UDF with an explicitly supplied result type.
   *
   * @param function   transformation applied to the annotations of one row
   * @param outputType Spark SQL type of the UDF result
   */
  def mapAnnotations[T](function: Seq[Annotation] => T, outputType: DataType): UserDefinedFunction =
    udf(
      { annotatorProperties: Seq[Row] =>
        function(annotatorProperties.map(Annotation(_)))
      },
      outputType)

  /** Wraps an annotations-to-annotations `function` in a UDF. */
  def mapAnnotationsStrict(function: Seq[Annotation] => Seq[Annotation]): UserDefinedFunction =
    udf { annotatorProperties: Seq[Row] =>
      function(annotatorProperties.map(Annotation(_)))
    }

  implicit class MapAnnotations(dataset: DataFrame) {

    /**
     * Adds (or replaces) `outputCol` with `function` applied to the annotations of `column`;
     * the new column carries the metadata of `column`.
     *
     * @param column    name of the annotation column to read
     * @param outputCol name of the column to write
     * @param function  transformation applied to the annotations of one row
     */
    def mapAnnotationsCol[T: TypeTag](
        column: String,
        outputCol: String,
        function: Seq[Annotation] => T): DataFrame = {
      val meta = dataset.schema(column).metadata
      val func = udf { annotatorProperties: Seq[Row] =>
        function(annotatorProperties.map(Annotation(_)))
      }
      dataset.withColumn(outputCol, func(col(column)).as(outputCol, meta))
    }
  }

  implicit class EachAnnotations(dataset: DataFrame) {

    import dataset.sparkSession.implicits._

    /**
     * Calls `function` once per row with the annotations of `column`, for its side effects.
     *
     * @param column   name of the annotation column
     * @param function side-effecting callback
     */
    def eachAnnotationsCol[T: TypeTag](column: String, function: Seq[Annotation] => Unit): Unit = {
      dataset.select(column).as[Array[Annotation]].foreach(function(_))
    }
  }

  implicit class ExplodeAnnotations(dataset: DataFrame) {

    /**
     * Explodes `column` into one row per annotation and writes each annotation to `outputCol`
     * as a single-element array that carries the metadata of `column`.
     *
     * @param column    name of the annotation column to explode
     * @param outputCol name of the column to write
     */
    def explodeAnnotationsCol[T: TypeTag](column: String, outputCol: String): DataFrame = {
      val meta = dataset.schema(column).metadata
      dataset
        .withColumn(outputCol, explode(col(column)))
        // Re-wrap the single annotation in an array and re-attach the source metadata.
        .withColumn(outputCol, array(col(outputCol)).as(outputCol, meta))
    }
  }
}
Example 8
Source File: ElmoEmbeddingsTestSpec.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.embeddings

import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
import com.johnsnowlabs.nlp.base.DocumentAssembler
// NOTE(review): the next two imports were blank in this copy. The body needs `Pipeline` and
// the session implicits for `toDF` / `$"..."`; the implicits path is presumed - confirm
// against the project sources.
import com.johnsnowlabs.nlp.util.io.ResourceHelper.spark.implicits._
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.functions.{size, explode}
import org.scalatest._

/**
 * Manual smoke test for ElmoEmbeddings. It is marked `ignore`, so it does not run by default.
 */
class ElmoEmbeddingsTestSpec extends FlatSpec {

  "Elmo Embeddings" should "generate annotations" ignore {
    System.out.println("Working Directory = " + System.getProperty("user.dir"))

    val data = Seq(
      "I like pancakes in the summer. I hate ice cream in winter.",
      "If I had asked people what they wanted, they would have said faster horses"
    ).toDF("text")

    val document = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

    val sentence = new SentenceDetector()
      .setInputCols("document")
      .setOutputCol("sentence")

    val tokenizer = new Tokenizer()
      .setInputCols(Array("document"))
      .setOutputCol("token")

    val elmoSavedModel = ElmoEmbeddings.pretrained()
      .setPoolingLayer("word_emb")
      .setInputCols(Array("token", "document"))
      .setOutputCol("embeddings")

    // Save and reload the model so the pipeline below runs on the reloaded copy.
    elmoSavedModel.write.overwrite().save("./tmp_elmo_tf")

    val embeddings = ElmoEmbeddings.load("./tmp_elmo_tf")

    val pipeline = new Pipeline().setStages(Array(
      document,
      sentence,
      tokenizer,
      embeddings
    ))

    val elmoDDD = pipeline.fit(data).transform(data)

    elmoDDD.select("embeddings.result").show(false)
    elmoDDD.select("embeddings.metadata").show(false)

    // One row per element of embeddings.embeddings.
    val explodeEmbds = elmoDDD.select(explode($"embeddings.embeddings").as("embedding"))
    elmoDDD.select(size(elmoDDD("embeddings.embeddings")).as("embeddings_size")).show
    explodeEmbds.select(size($"embedding").as("embeddings_size")).show
  }
}