org.apache.spark.sql.functions.explode Scala Examples
The following examples show how to use org.apache.spark.sql.functions.explode.
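Before the project examples, here is a minimal, self-contained sketch of what explode does (the column names and data are illustrative, not taken from any of the projects below): it turns each element of an array column into its own output row, repeating the other columns.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.explode

object ExplodeSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("ExplodeSketch").getOrCreate()
  import spark.implicits._

  val df = Seq(("a", Seq(1, 2)), ("b", Seq(3))).toDF("key", "values")

  // One output row per array element: ("a", 1), ("a", 2), ("b", 3)
  df.select($"key", explode($"values").as("value")).show()

  spark.stop()
}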
Example 1
Source File: JsonUtil.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.util

import org.apache.spark.sql.functions.explode
import org.apache.spark.sql.{Column, DataFrame, SQLContext, SparkSession}

import scala.collection.mutable.ArrayBuffer

object JsonUtil extends Serializable {

  // The tags you want to parse. To open an array field, write it as MasterField_ChildField
  // (for example: links_name).
  def ParserJsonDF(df: DataFrame, tag: String): DataFrame = {

    var openArrField: String = ""
    var ArrSchame: String = ""

    var tagARR: Array[String] = tag.split(",")
    var tagNew: String = ""

    for (tt <- tagARR) {
      if (tt.indexOf("_") > -1) {
        // the tag contains "_", i.e. it names an array field to open
        val openField: Array[String] = tt.split("_")
        openArrField = openField(0)
        ArrSchame += (openField(1) + ",")
      } else {
        tagNew += (tt + ",")
      }
    }
    tagNew += openArrField
    ArrSchame = ArrSchame.substring(0, ArrSchame.length - 1)

    tagARR = tagNew.split(",")
    var FinalDF: DataFrame = df

    // if the user selected fields to return
    var strings: Seq[Column] = tagNew.split(",").toSeq.map(p => new Column(p))

    if (tag.length > 0) {
      val df00 = FinalDF.select(strings: _*)
      FinalDF = df00
    }

    // if the user chose an array field to open and provided its schema
    if (openArrField.length > 0 && ArrSchame.length > 0) {
      val schames: Array[String] = ArrSchame.split(",")

      var selARR: ArrayBuffer[String] = ArrayBuffer() // the opened fields, collected one by one
      // iterate over the tags and wrap each one in a Column object
      var coARR: ArrayBuffer[Column] = ArrayBuffer()  // used by select when opening the field
      val sss = tagNew.split(",")                     // used by toDF after opening the field
      var co: Column = null
      for (each <- tagARR) {
        if (each == openArrField) {
          co = explode(FinalDF(openArrField))
          for (x <- schames) {
            selARR += (openArrField + "." + x)
          }
        } else {
          selARR += each
          co = FinalDF(each)
        }
        coARR += co
      }
      println("###################")
      selARR.foreach(println(_))

      var selSEQ: Seq[Column] = selARR.toSeq.map(q => new Column(q))

      var df01: DataFrame = FinalDF.select(coARR: _*).toDF(sss: _*)
      FinalDF = df01.select(selSEQ: _*)
    }
    FinalDF
  }
}
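A minimal usage sketch of ParserJsonDF. The input path, field names, and JSON layout below are hypothetical: each record is assumed to have top-level fields "name" and "age" plus an array field "links" whose elements are structs containing "url".

import org.apache.spark.sql.SparkSession

object JsonUtilSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("JsonUtilSketch").getOrCreate()

  // Hypothetical input file with fields "name", "age" and an array field "links".
  val raw = spark.read.json("data/sample.json")

  // "links_url" tells ParserJsonDF to explode "links" and select its "url" child,
  // so the result has one row per link with columns name, age, url.
  val parsed = cn.piflow.bundle.util.JsonUtil.ParserJsonDF(raw, "name,age,links_url")
  parsed.show()

  spark.stop()
}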
Example 2
Source File: ReebDiagramTest.scala From spark-tda with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.{Vectors, EuclideanDistance, Vector}
import org.apache.spark.sql.functions.{col, explode, udf}
import org.scalatest.{PropSpec, Matchers, GivenWhenThen}
import org.scalatest.prop.GeneratorDrivenPropertyChecks

class ReebDiagramTest extends FeaturePropSpec
    with GivenWhenThen
    with GeneratorDrivenPropertyChecks
    with Matchers {
  val assembler = new VectorAssembler()
    .setInputCols(Array("double", "integer"))
    .setOutputCol("vector")
  val cover = new Cover()
    .setExploding(true)
    .setInputCols("double", "integer")
    .setOutputCol("cover_id")

  property("argument topTreeSize must be positive") {
    intercept[IllegalArgumentException] {
      val reeb = new ReebDiagram()
        //        .setIdCol("id")
        //        .setCoverCol("cover_id")
        //        .setFeaturesCol("vector")
        //        .setOutputCol("cluster_id")
        .setTopTreeSize(0)
    }
  }

  property("placeholder") {
    val reeb = new ReebDiagram()
      .setK(15)
      .setIdCol("id")
      .setCoverCol("cover_id")
      .setFeaturesCol("vector")
      .setOutputCol("cluster_id")
    forAll(dataframeGen.arbitrary) { df =>
      val assembled = assembler.transform(df)
      whenever(
        assembled.count() > 0 && hasDistinctValues(assembled, "double", "integer")) {
        val transformed = cover
          .fit(assembled)
          .transform(assembled)
        val result = reeb
          .setTopTreeSize(1)
          .fit(transformed)
          .transform(transformed)
        //        result.show()
      }
    }
  }
}
Example 3
Source File: CoverTest.scala From spark-tda with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.functions.{col, explode, udf}
import org.scalatest.{PropSpec, Matchers, GivenWhenThen}
import org.scalatest.prop.GeneratorDrivenPropertyChecks

class CoverTest extends FeaturePropSpec
    with GivenWhenThen
    with GeneratorDrivenPropertyChecks
    with Matchers {
  val assembler = new VectorAssembler()
    .setInputCols(Array("double", "integer"))
    .setOutputCol("vector")

  property("argument numSplits must be positive") {
    intercept[IllegalArgumentException] {
      val cover = new Cover()
        .setInputCols("double")
        .setOutputCol("cover_ids")
        .setNumSplits(0)
    }
  }

  property("argument overlapRatio must be positive") {
    intercept[IllegalArgumentException] {
      val cover = new Cover()
        .setInputCols("double")
        .setOutputCol("cover_ids")
        .setOverlapRatio(0.0)
    }
  }

  property("cover estimator changes nothing with the original dataframe") {
    val cover = new Cover()
      .setInputCols("double", "integer", "vector")
      .setOutputCol("cover_ids")
    forAll(dataframeGen.arbitrary) { df =>
      val transformed = assembler.transform(df)
      whenever(
        transformed.count() > 0 && hasDistinctValues(transformed, "double", "integer", "vector")) {
        val covered = cover
          .fit(transformed)
          .transform(transformed)
          .drop("cover_ids")
          .except(transformed)
          .count() should be(0)
      }
    }
  }

  property("generated cover covers all range of specified columns") {
    val cover = new Cover()
      .setInputCols("double", "integer", "vector")
      .setOutputCol("cover_ids")
    val uncovered = udf { xs: Seq[Long] =>
      xs.length == 0
    }
    forAll(dataframeGen.arbitrary) { df =>
      val transformed = assembler.transform(df)
      whenever(
        transformed.count() > 0 && hasDistinctValues(transformed, "double", "integer", "vector")) {
        cover
          .fit(transformed)
          .transform(transformed)
          .where(uncovered(col("cover_ids")))
          .count() should be(0)
      }
    }
  }

  property("Cover is readable/writable") {
    val cover = new Cover()
      .setInputCols("double", "integer")
      .setOutputCol("cover_ids")
    testDefaultReadWrite(cover)
  }

  property("CoverModel is readable/writable") {
    val model = new CoverModel("myCoverModel",
                               Vectors.dense(-1.0, 0.0),
                               Vectors.dense(1.0, 10.0))
      .setInputCols("double", "integer")
      .setOutputCol("cover_ids")
    val newModel = testDefaultReadWrite(model)
    assert(newModel.min === model.min)
    assert(newModel.max === model.max)
  }
}
Example 4
Source File: TriangleCount.scala From graphframes with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{array, col, explode, when}
import org.graphframes.GraphFrame
import org.graphframes.GraphFrame.{DST, ID, LONG_DST, LONG_SRC, SRC}

class TriangleCount private[graphframes] (private val graph: GraphFrame) extends Arguments {

  def run(): DataFrame = {
    TriangleCount.run(graph)
  }
}

private object TriangleCount {

  private def run(graph: GraphFrame): DataFrame = {
    // Dedup edges by flipping them to have LONG_SRC < LONG_DST
    // TODO (when we drop support for Spark 1.4): Use functions greatest, smallest instead of UDFs
    val dedupedE = graph.indexedEdges
      .filter(s"$LONG_SRC != $LONG_DST")
      .selectExpr(
        s"if($LONG_SRC < $LONG_DST, $SRC, $DST) as $SRC",
        s"if($LONG_SRC < $LONG_DST, $DST, $SRC) as $DST")
      .dropDuplicates(Seq(SRC, DST))
    val g2 = GraphFrame(graph.vertices, dedupedE)

    // Because SRC < DST, there exists only one type of triangles:
    // - Non-cycle with one edge flipped. These are counted 1 time each by motif finding.
    val triangles = g2.find("(a)-[]->(b); (b)-[]->(c); (a)-[]->(c)")

    val triangleCounts = triangles
      .select(explode(array(col("a.id"), col("b.id"), col("c.id"))).as(ID))
      .groupBy(ID)
      .count()

    val v = graph.vertices
    val countsCol = when(col("count").isNull, 0L).otherwise(col("count"))
    val newV = v.join(triangleCounts, v(ID) === triangleCounts(ID), "left_outer")
      .select(countsCol.as(COUNT_ID) +: v.columns.map(v.apply): _*)
    newV
  }

  private val COUNT_ID = "count"
}
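TriangleCount itself is package-private, so from user code it is normally reached through GraphFrame's triangleCount accessor. A minimal usage sketch (the vertex and edge data below are illustrative):

import org.apache.spark.sql.SparkSession
import org.graphframes.GraphFrame

object TriangleCountSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("TriangleCountSketch").getOrCreate()

  val vertices = spark.createDataFrame(Seq((1L, "a"), (2L, "b"), (3L, "c"))).toDF("id", "name")
  val edges = spark.createDataFrame(Seq((1L, 2L), (2L, 3L), (1L, 3L))).toDF("src", "dst")

  val graph = GraphFrame(vertices, edges)

  // Returns the vertex DataFrame with an extra "count" column holding the number of
  // triangles each vertex participates in (1 for every vertex in this tiny graph).
  graph.triangleCount.run().show()

  spark.stop()
}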
Example 5
Source File: Tutorial_04_ExplodeJson.scala From learn-spark with Apache License 2.0
package com.allaboutscala.learn.spark.functions

import com.allaboutscala.learn.spark.utils.Context
import org.apache.spark.sql.functions.explode

object Tutorial_04_ExplodeJson extends App with Context {

  import sparkSession.sqlContext.implicits._

  val tagsDF = sparkSession
    .read
    .option("multiLine", true)
    .option("inferSchema", true)
    .json("src/main/resources/tags_sample.json")

  val df = tagsDF.select(explode($"stackoverflow") as "stackoverflow_tags")
  df.printSchema()

  df.select(
    $"stackoverflow_tags.tag.id" as "id",
    $"stackoverflow_tags.tag.author" as "author",
    $"stackoverflow_tags.tag.name" as "tag_name",
    $"stackoverflow_tags.tag.frameworks.id" as "frameworks_id",
    $"stackoverflow_tags.tag.frameworks.name" as "frameworks_name"
  ).show()
}
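Because frameworks is itself a nested array, frameworks_id and frameworks_name above come back as array columns. A hypothetical follow-up (column names taken from the schema used in the tutorial) explodes that inner array as well, giving one row per framework:

  // Continuing from the df defined above in Tutorial_04_ExplodeJson:
  val frameworksDF = df.select(
    $"stackoverflow_tags.tag.id" as "id",
    explode($"stackoverflow_tags.tag.frameworks") as "framework"
  ).select(
    $"id",
    $"framework.id" as "framework_id",
    $"framework.name" as "framework_name"
  )
  frameworksDF.show()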
Example 6
Source File: ClassifierDatasetEncoder.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.ml.tensorflow

import com.johnsnowlabs.nlp.Annotation
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{size, explode, col}

import scala.collection.mutable

class ClassifierDatasetEncoder(val params: ClassifierDatasetEncoderParams) extends Serializable {

  val tags2Id: Map[String, Int] = params.tags.zipWithIndex
    .map(p => (p._1, p._2))
    .toMap

  val tags: Array[String] = tags2Id
    .map(p => (p._2, p._1))
    .toArray
    .sortBy(p => p._1)
    .map(p => p._2)

  def encodeTags(labels: Array[String]): Array[Array[Int]] = {
    labels.map { t =>
      val labelIDsArray = Array.fill(tags.length)(0)
      labelIDsArray(tags2Id(t)) = 1
      labelIDsArray
    }
  }

  def decodeOutputData(tagIds: Array[Array[Float]]): Array[Array[(String, Float)]] = {
    val scoresMetadata = tagIds.map { scores =>
      scores.zipWithIndex.flatMap {
        case (score, idx) =>
          val tag = tags2Id.find(_._2 == idx).map(_._1).getOrElse("NA")
          Map(tag -> score)
      }
    }
    scoresMetadata
  }
}

case class ClassifierDatasetEncoderParams(tags: Array[String])
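A small usage sketch of the encoder above (the tag set and scores are made up): encodeTags one-hot encodes each label against the tag list, and decodeOutputData pairs each model score with its tag name.

object ClassifierDatasetEncoderSketch extends App {
  import com.johnsnowlabs.ml.tensorflow.{ClassifierDatasetEncoder, ClassifierDatasetEncoderParams}

  val encoder = new ClassifierDatasetEncoder(ClassifierDatasetEncoderParams(Array("positive", "negative")))

  // One-hot vectors: Array(Array(1, 0), Array(0, 1), Array(1, 0))
  val oneHot = encoder.encodeTags(Array("positive", "negative", "positive"))
  oneHot.foreach(v => println(v.mkString("[", ", ", "]")))

  // Scores paired with tag names: Array(Array(("positive", 0.9), ("negative", 0.1)))
  val decoded = encoder.decodeOutputData(Array(Array(0.9f, 0.1f)))
  decoded.foreach(row => println(row.mkString(", ")))
}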
Example 7
Source File: functions.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp

import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.{array, col, explode, udf}
import org.apache.spark.sql.types.DataType

import scala.reflect.runtime.universe._

object functions {

  implicit class FilterAnnotations(dataset: DataFrame) {
    def filterByAnnotationsCol(column: String, function: Seq[Annotation] => Boolean): DataFrame = {
      val meta = dataset.schema(column).metadata
      val func = udf { annotatorProperties: Seq[Row] =>
        function(annotatorProperties.map(Annotation(_)))
      }
      dataset.filter(func(col(column)).as(column, meta))
    }
  }

  def mapAnnotations[T](function: Seq[Annotation] => T, outputType: DataType): UserDefinedFunction = udf (
    { annotatorProperties: Seq[Row] =>
      function(annotatorProperties.map(Annotation(_)))
    }, outputType)

  def mapAnnotationsStrict(function: Seq[Annotation] => Seq[Annotation]): UserDefinedFunction = udf {
    annotatorProperties: Seq[Row] =>
      function(annotatorProperties.map(Annotation(_)))
  }

  implicit class MapAnnotations(dataset: DataFrame) {
    def mapAnnotationsCol[T: TypeTag](column: String, outputCol: String, function: Seq[Annotation] => T): DataFrame = {
      val meta = dataset.schema(column).metadata
      val func = udf { annotatorProperties: Seq[Row] =>
        function(annotatorProperties.map(Annotation(_)))
      }
      dataset.withColumn(outputCol, func(col(column)).as(outputCol, meta))
    }
  }

  implicit class EachAnnotations(dataset: DataFrame) {

    import dataset.sparkSession.implicits._

    def eachAnnotationsCol[T: TypeTag](column: String, function: Seq[Annotation] => Unit): Unit = {
      dataset.select(column).as[Array[Annotation]].foreach(function(_))
    }
  }

  implicit class ExplodeAnnotations(dataset: DataFrame) {
    def explodeAnnotationsCol[T: TypeTag](column: String, outputCol: String): DataFrame = {
      val meta = dataset.schema(column).metadata
      dataset.
        withColumn(outputCol, explode(col(column))).
        withColumn(outputCol, array(col(outputCol)).as(outputCol, meta))
    }
  }
}
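A usage sketch of the implicit ExplodeAnnotations helper defined above. The annotated DataFrame and its "token" column are assumptions here; they would come from a spark-nlp pipeline that outputs annotation structs.

  // Assuming `annotated` is a DataFrame produced by a spark-nlp pipeline with a "token" column:
  import com.johnsnowlabs.nlp.functions._

  // One row per annotation; each exploded value is re-wrapped in a single-element
  // array so downstream annotators still see an array-of-annotations column.
  val exploded = annotated.explodeAnnotationsCol("token", "token_exploded")
  exploded.select("token_exploded").show(false)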
Example 8
Source File: ElmoEmbeddingsTestSpec.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp.embeddings

import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.util.io.ResourceHelper.spark.implicits._
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.functions.{size, explode}
import org.scalatest._

class ElmoEmbeddingsTestSpec extends FlatSpec {

  "Elmo Embeddings" should "generate annotations" ignore {
    System.out.println("Working Directory = " + System.getProperty("user.dir"))
    val data = Seq(
      "I like pancakes in the summer. I hate ice cream in winter.",
      "If I had asked people what they wanted, they would have said faster horses"
    ).toDF("text")

    val document = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

    val sentence = new SentenceDetector()
      .setInputCols("document")
      .setOutputCol("sentence")

    val tokenizer = new Tokenizer()
      .setInputCols(Array("document"))
      .setOutputCol("token")

    val elmoSavedModel = ElmoEmbeddings.pretrained()
      .setPoolingLayer("word_emb")
      .setInputCols(Array("token", "document"))
      .setOutputCol("embeddings")

    elmoSavedModel.write.overwrite().save("./tmp_elmo_tf")

    val embeddings = ElmoEmbeddings.load("./tmp_elmo_tf")

    val pipeline = new Pipeline().setStages(Array(
      document,
      sentence,
      tokenizer,
      embeddings
    ))

    val elmoDDD = pipeline.fit(data).transform(data)

    elmoDDD.select("embeddings.result").show(false)
    elmoDDD.select("embeddings.metadata").show(false)

    val explodeEmbds = elmoDDD.select(explode($"embeddings.embeddings").as("embedding"))

    elmoDDD.select(size(elmoDDD("embeddings.embeddings")).as("embeddings_size")).show
    explodeEmbds.select(size($"embedding").as("embeddings_size")).show
  }
}