org.apache.spark.sql.functions.explode Scala Examples

The following examples show how to use org.apache.spark.sql.functions.explode. Each example is taken from an open-source project; the source file, project, and license are noted above each listing.
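Before the project examples, here is a minimal, self-contained sketch of what explode does: it produces one output row per element of an array (or map) column. The session, column, and data names below are illustrative only.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.explode

// Minimal sketch: explode turns each element of an array column into its own row.
val spark = SparkSession.builder().appName("explode-demo").master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq(("a", Seq(1, 2, 3)), ("b", Seq(4))).toDF("key", "values")
df.select($"key", explode($"values").as("value")).show()
// +---+-----+
// |key|value|
// +---+-----+
// |  a|    1|
// |  a|    2|
// |  a|    3|
// |  b|    4|
// +---+-----+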
Example 1
Source File: JsonUtil.scala (from piflow, BSD 2-Clause "Simplified" License)
package cn.piflow.bundle.util

import org.apache.spark.sql.functions.explode
import org.apache.spark.sql.{Column, DataFrame, SQLContext, SparkSession}

import scala.collection.mutable.ArrayBuffer

object JsonUtil extends Serializable{


  // Comma-separated list of tags to parse. To explode an array field, write the tag as MasterField_ChildField
  // (for example links_name, where "links" is the array field and "name" is the child field to keep).
  def ParserJsonDF(df: DataFrame, tag: String): DataFrame = {

    var openArrField:String=""
    var ArrSchame:String=""

    var tagARR: Array[String] = tag.split(",")
    var tagNew:String=""


    for (tt <- tagARR) {

      if (tt.indexOf("_") > -1) {
        // the tag contains "_": the part before it is the array field, the part after it is a child field
        val openField: Array[String] = tt.split("_")
        openArrField = openField(0)

        ArrSchame += (openField(1) + ",")
      } else {
        tagNew += (tt + ",")
      }
    }
    tagNew += openArrField
    // drop the trailing comma (guard against the case where no array field was requested)
    if (ArrSchame.nonEmpty) ArrSchame = ArrSchame.substring(0, ArrSchame.length - 1)

    tagARR = tagNew.split(",")
    var FinalDF:DataFrame=df

    // If the user specified which fields to return
    var strings: Seq[Column] = tagNew.split(",").toSeq.map(p => new Column(p))

    if(tag.length>0){
      val df00 = FinalDF.select(strings : _*)
      FinalDF=df00
    }

    // If the user chose an array field to explode and supplied its child schema
    if (openArrField.length > 0 && ArrSchame.length > 0) {

      val schames: Array[String] = ArrSchame.split(",")

      var selARR: ArrayBuffer[String] = ArrayBuffer() // fields to select once the array has been exploded
      // Iterate over the tags and wrap each one in a Column object
      var coARR: ArrayBuffer[Column] = ArrayBuffer()  // columns passed to select() when exploding
      val sss = tagNew.split(",")                     // column names for toDF() after exploding
      var co: Column = null
      for(each<-tagARR){
        if(each==openArrField){
          co = explode(FinalDF(openArrField))
          for(x<-schames){

            selARR+=(openArrField+"."+x)
          }
        }else{
          selARR+=each
          co=FinalDF(each)
        }
        coARR+=co
      }
      println("###################")
      selARR.foreach(println(_))
      var selSEQ: Seq[Column] = selARR.toSeq.map(q => new Column(q))

      var df01: DataFrame = FinalDF.select(coARR : _*).toDF(sss:_*)
      FinalDF = df01.select(selSEQ : _*)

    }

    FinalDF

  }
} 
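A hypothetical call to JsonUtil.ParserJsonDF, assuming a SparkSession named spark and a JSON source with a plain field name and an array field links whose elements have url and title children; the input path, field names, and JSON layout are made up for illustration.

// Hypothetical usage; the input path, field names, and JSON layout are assumptions.
// Input records shaped like: {"name":"n1","links":[{"url":"u1","title":"t1"},{"url":"u2","title":"t2"}]}
val jsonDF = spark.read.json("/path/to/sample.json")

// Keep the plain field "name", explode the array field "links", and pull out its "url" and "title" children.
val parsed = JsonUtil.ParserJsonDF(jsonDF, "name,links_url,links_title")
parsed.show()
// Expected columns: name, url, title (one row per element of "links").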
Example 2
Source File: ReebDiagramTest.scala (from spark-tda, Apache License 2.0)
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.{Vectors, EuclideanDistance, Vector}
import org.apache.spark.sql.functions.{col, explode, udf}
import org.scalatest.{PropSpec, Matchers, GivenWhenThen}
import org.scalatest.prop.GeneratorDrivenPropertyChecks


class ReebDiagramTest
    extends FeaturePropSpec
    with GivenWhenThen
    with GeneratorDrivenPropertyChecks
    with Matchers {
  val assembler = new VectorAssembler()
    .setInputCols(Array("double", "integer"))
    .setOutputCol("vector")
  val cover = new Cover()
    .setExploding(true)
    .setInputCols("double", "integer")
    .setOutputCol("cover_id")

  property("argument topTreeSize must be positive") {
    intercept[IllegalArgumentException] {
      val reeb = new ReebDiagram()
//        .setIdCol("id")
//        .setCoverCol("cover_id")
//        .setFeaturesCol("vector")
//        .setOutputCol("cluster_id")
        .setTopTreeSize(0)
    }
  }

  property("placeholder") {
    val reeb = new ReebDiagram()
      .setK(15)
      .setIdCol("id")
      .setCoverCol("cover_id")
      .setFeaturesCol("vector")
      .setOutputCol("cluster_id")
    forAll(dataframeGen.arbitrary) { df =>
      val assembled = assembler.transform(df)
      whenever(
        assembled.count() > 0 && hasDistinctValues(assembled,
                                                   "double",
                                                   "integer")) {
        val transformed = cover
          .fit(assembled)
          .transform(assembled)
        val result = reeb
          .setTopTreeSize(1)
          .fit(transformed)
          .transform(transformed)
//        result.show()
      }
    }
  }
} 
Example 3
Source File: CoverTest.scala (from spark-tda, Apache License 2.0)
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.functions.{col, explode, udf}
import org.scalatest.{PropSpec, Matchers, GivenWhenThen}
import org.scalatest.prop.GeneratorDrivenPropertyChecks


class CoverTest
    extends FeaturePropSpec
    with GivenWhenThen
    with GeneratorDrivenPropertyChecks
    with Matchers {
  val assembler = new VectorAssembler()
    .setInputCols(Array("double", "integer"))
    .setOutputCol("vector")

  property("argument numSplits must be positive") {
    intercept[IllegalArgumentException] {
      val cover = new Cover()
        .setInputCols("double")
        .setOutputCol("cover_ids")
        .setNumSplits(0)
    }
  }

  property("argument overlapRatio must be positive") {
    intercept[IllegalArgumentException] {
      val cover = new Cover()
        .setInputCols("double")
        .setOutputCol("cover_ids")
        .setOverlapRatio(0.0)
    }
  }

  property("cover estimator changes nothing with the original dataframe") {
    val cover = new Cover()
      .setInputCols("double", "integer", "vector")
      .setOutputCol("cover_ids")

    forAll(dataframeGen.arbitrary) { df =>
      val transformed = assembler.transform(df)
      whenever(
        transformed.count() > 0 && hasDistinctValues(transformed,
                                                     "double",
                                                     "integer",
                                                     "vector")) {
        val covered = cover
          .fit(transformed)
          .transform(transformed)
          .drop("cover_ids")
          .except(transformed)
          .count() should be(0)
      }
    }
  }

  property("generated cover covers all range of specified columns") {
    val cover = new Cover()
      .setInputCols("double", "integer", "vector")
      .setOutputCol("cover_ids")
    val uncovered = udf { xs: Seq[Long] =>
      xs.length == 0
    }

    forAll(dataframeGen.arbitrary) { df =>
      val transformed = assembler.transform(df)
      whenever(
        transformed.count() > 0 && hasDistinctValues(transformed,
                                                     "double",
                                                     "integer",
                                                     "vector")) {
        cover
          .fit(transformed)
          .transform(transformed)
          .where(uncovered(col("cover_ids")))
          .count() should be(0)
      }
    }
  }

  property("Cover is readable/writable") {
    val cover = new Cover()
      .setInputCols("double", "integer")
      .setOutputCol("cover_ids")
    testDefaultReadWrite(cover)
  }

  property("CoverModel is readable/writable") {
    val model = new CoverModel("myCoverModel",
                               Vectors.dense(-1.0, 0.0),
                               Vectors.dense(1.0, 10.0))
      .setInputCols("double", "integer")
      .setOutputCol("cover_ids")
    val newModel = testDefaultReadWrite(model)
    assert(newModel.min === model.min)
    assert(newModel.max === model.max)
  }
} 
Example 4
Source File: TriangleCount.scala (from graphframes, Apache License 2.0)
package org.graphframes.lib

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{array, col, explode, when}

import org.graphframes.GraphFrame
import org.graphframes.GraphFrame.{DST, ID, LONG_DST, LONG_SRC, SRC}


class TriangleCount private[graphframes] (private val graph: GraphFrame) extends Arguments {

  def run(): DataFrame = {
    TriangleCount.run(graph)
  }
}

private object TriangleCount {

  private def run(graph: GraphFrame): DataFrame = {
    // Dedup edges by flipping them to have LONG_SRC < LONG_DST
    // TODO (when we drop support for Spark 1.4): use the functions greatest and least instead of the if() expressions below
    val dedupedE = graph.indexedEdges
      .filter(s"$LONG_SRC != $LONG_DST")
      .selectExpr(
        s"if($LONG_SRC < $LONG_DST, $SRC, $DST) as $SRC",
        s"if($LONG_SRC < $LONG_DST, $DST, $SRC) as $DST")
      .dropDuplicates(Seq(SRC, DST))
    val g2 = GraphFrame(graph.vertices, dedupedE)

    // Because SRC < DST, there is only one type of triangle:
    // a non-cycle with one edge flipped, counted exactly once by motif finding.
    val triangles = g2.find("(a)-[]->(b); (b)-[]->(c); (a)-[]->(c)")

    val triangleCounts = triangles
      .select(explode(array(col("a.id"), col("b.id"), col("c.id"))).as(ID))
      .groupBy(ID)
      .count()

    val v = graph.vertices
    val countsCol = when(col("count").isNull, 0L).otherwise(col("count"))
    val newV = v.join(triangleCounts, v(ID) === triangleCounts(ID), "left_outer")
      .select(countsCol.as(COUNT_ID) +: v.columns.map(v.apply): _*)
    newV
  }

  private val COUNT_ID = "count"
} 
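For completeness, TriangleCount is normally invoked through the GraphFrame API rather than constructed directly; a minimal sketch with made-up vertices and edges, assuming a SparkSession named spark:

import org.graphframes.GraphFrame

// Minimal sketch of running triangle counting through the public GraphFrame API.
// The vertices and edges below are made-up example data.
import spark.implicits._

val vertices = Seq("a", "b", "c", "d").toDF("id")
val edges = Seq(("a", "b"), ("b", "c"), ("a", "c"), ("c", "d")).toDF("src", "dst")

val g = GraphFrame(vertices, edges)
val counts = g.triangleCount.run() // adds a "count" column with the number of triangles per vertex
counts.show()
// Vertices a, b, and c participate in one triangle each; d participates in none.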
Example 5
Source File: Tutorial_04_ExplodeJson.scala (from learn-spark, Apache License 2.0)
package com.allaboutscala.learn.spark.functions

import com.allaboutscala.learn.spark.utils.Context
import org.apache.spark.sql.functions.explode


object Tutorial_04_ExplodeJson extends App with Context {

  import sparkSession.sqlContext.implicits._

  val tagsDF = sparkSession
    .read
    .option("multiLine", true)
    .option("inferSchema", true)
    .json("src/main/resources/tags_sample.json")

  val df = tagsDF.select(explode($"stackoverflow") as "stackoverflow_tags")

  df.printSchema()

  df.select(
    $"stackoverflow_tags.tag.id" as "id",
    $"stackoverflow_tags.tag.author" as "author",
    $"stackoverflow_tags.tag.name" as "tag_name",
    $"stackoverflow_tags.tag.frameworks.id" as "frameworks_id",
    $"stackoverflow_tags.tag.frameworks.name" as "frameworks_name"
  ).show()
} 
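tags_sample.json is not reproduced here, but the select above implies a nested layout roughly like the sketch below. The concrete values are placeholders inferred from the column paths, not the actual file contents, and reading JSON from a Dataset[String] assumes Spark 2.2 or later.

// Inferred sketch of the structure tags_sample.json presumably has (placeholder values only).
val inferredSample = Seq(
  """{"stackoverflow":[{"tag":{"id":1,"author":"someone","name":"scala","frameworks":[{"id":1,"name":"Play Framework"}]}}]}"""
).toDS()

val inferredDF = sparkSession.read.json(inferredSample)
inferredDF.select(explode($"stackoverflow") as "stackoverflow_tags").printSchema()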
Example 6
Source File: ClassifierDatasetEncoder.scala (from spark-nlp, Apache License 2.0)
package com.johnsnowlabs.ml.tensorflow

import com.johnsnowlabs.nlp.Annotation
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{size, explode, col}

import scala.collection.mutable

class ClassifierDatasetEncoder(val params: ClassifierDatasetEncoderParams) extends Serializable {

  val tags2Id: Map[String, Int] = params.tags.zipWithIndex
    .map(p => (p._1, p._2))
    .toMap

  val tags: Array[String] = tags2Id
    .map(p => (p._2, p._1))
    .toArray
    .sortBy(p => p._1)
    .map(p => p._2)

  def encodeTags(labels: Array[String]): Array[Array[Int]] = {
    labels.map { t =>
      val labelIDsArray = Array.fill(tags.length)(0)
      labelIDsArray(tags2Id(t)) = 1
      labelIDsArray
    }
  }

  
  def decodeOutputData(tagIds: Array[Array[Float]]): Array[Array[(String, Float)]] = {
    val scoresMetadata = tagIds.map { scores =>
      scores.zipWithIndex.flatMap {
        case (score, idx) =>
          val tag = tags2Id.find(_._2 == idx).map(_._1).getOrElse("NA")
          Map(tag -> score)
      }
    }

    scoresMetadata
  }
}

case class ClassifierDatasetEncoderParams(tags: Array[String]) 
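As a quick illustration of the encoder's behaviour, here is a minimal sketch with two made-up tags; the comments show the values the methods above would return.

// Minimal sketch; the tag names "neg" and "pos" are made up for illustration.
val encoder = new ClassifierDatasetEncoder(ClassifierDatasetEncoderParams(Array("neg", "pos")))

encoder.encodeTags(Array("pos", "neg"))
// => Array(Array(0, 1), Array(1, 0))              (one-hot row per label)

encoder.decodeOutputData(Array(Array(0.1f, 0.9f)))
// => Array(Array(("neg", 0.1f), ("pos", 0.9f)))   (each score paired with its tag name)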
Example 7
Source File: functions.scala (from spark-nlp, Apache License 2.0)
package com.johnsnowlabs.nlp

import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.{array, col, explode, udf}
import org.apache.spark.sql.types.DataType

import scala.reflect.runtime.universe._

object functions {

  implicit class FilterAnnotations(dataset: DataFrame) {
    def filterByAnnotationsCol(column: String, function: Seq[Annotation] => Boolean): DataFrame = {
      val meta = dataset.schema(column).metadata
      val func = udf {
        annotatorProperties: Seq[Row] =>
          function(annotatorProperties.map(Annotation(_)))
      }
      dataset.filter(func(col(column)).as(column, meta))
    }
  }

  def mapAnnotations[T](function: Seq[Annotation] => T, outputType: DataType): UserDefinedFunction = udf ( {
    annotatorProperties: Seq[Row] =>
      function(annotatorProperties.map(Annotation(_)))
  }, outputType)

  def mapAnnotationsStrict(function: Seq[Annotation] => Seq[Annotation]): UserDefinedFunction = udf {
    annotatorProperties: Seq[Row] =>
      function(annotatorProperties.map(Annotation(_)))
  }

  implicit class MapAnnotations(dataset: DataFrame) {
    def mapAnnotationsCol[T: TypeTag](column: String, outputCol: String, function: Seq[Annotation] => T): DataFrame = {
      val meta = dataset.schema(column).metadata
      val func = udf {
        annotatorProperties: Seq[Row] =>
          function(annotatorProperties.map(Annotation(_)))
      }
      dataset.withColumn(outputCol, func(col(column)).as(outputCol, meta))
    }
  }

  implicit class EachAnnotations(dataset: DataFrame) {

    import dataset.sparkSession.implicits._

    def eachAnnotationsCol[T: TypeTag](column: String, function: Seq[Annotation] => Unit): Unit = {
      dataset.select(column).as[Array[Annotation]].foreach(function(_))
    }
  }

  implicit class ExplodeAnnotations(dataset: DataFrame) {
    def explodeAnnotationsCol[T: TypeTag](column: String, outputCol: String): DataFrame = {
      val meta = dataset.schema(column).metadata
      dataset.
        withColumn(outputCol, explode(col(column))).
        withColumn(outputCol, array(col(outputCol)).as(outputCol, meta))
    }
  }

} 
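A hypothetical use of these implicit classes; the DataFrame annotated and the column names are assumptions, standing in for the output of a spark-nlp pipeline with an annotation column called "token".

import com.johnsnowlabs.nlp.Annotation
import com.johnsnowlabs.nlp.functions._

// Hypothetical usage; `annotated` is assumed to be a DataFrame with a spark-nlp annotation column "token".
val nonEmptyRows   = annotated.filterByAnnotationsCol("token", _.nonEmpty)                 // keep rows with at least one token
val oneTokenPerRow = annotated.explodeAnnotationsCol[Annotation]("token", "token_single")  // one row per token annotation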
Example 8
Source File: ElmoEmbeddingsTestSpec.scala (from spark-nlp, Apache License 2.0)
package com.johnsnowlabs.nlp.embeddings

import com.johnsnowlabs.nlp.annotators.Tokenizer
import com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector
import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.util.io.ResourceHelper.spark.implicits._
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.functions.{size, explode}
import org.scalatest._

class ElmoEmbeddingsTestSpec extends FlatSpec {
  "Elmo Embeddings" should "generate annotations" ignore {
    System.out.println("Working Directory = " + System.getProperty("user.dir"))
    val data = Seq(
      "I like pancakes in the summer. I hate ice cream in winter.",
      "If I had asked people what they wanted, they would have said faster horses"
    ).toDF("text")

    val document = new DocumentAssembler()
      .setInputCol("text")
      .setOutputCol("document")

    val sentence = new SentenceDetector()
      .setInputCols("document")
      .setOutputCol("sentence")

    val tokenizer = new Tokenizer()
      .setInputCols(Array("document"))
      .setOutputCol("token")

    val elmoSavedModel = ElmoEmbeddings.pretrained()
      .setPoolingLayer("word_emb")
      .setInputCols(Array("token", "document"))
      .setOutputCol("embeddings")

    elmoSavedModel.write.overwrite().save("./tmp_elmo_tf")

    val embeddings = ElmoEmbeddings.load("./tmp_elmo_tf")

    val pipeline = new Pipeline().setStages(Array(
      document,
      sentence,
      tokenizer,
      embeddings
    ))

    val elmoDDD = pipeline.fit(data).transform(data)

    elmoDDD.select("embeddings.result").show(false)
    elmoDDD.select("embeddings.metadata").show(false)
    val explodeEmbds = elmoDDD.select(explode($"embeddings.embeddings").as("embedding"))
    elmoDDD.select(size(elmoDDD("embeddings.embeddings")).as("embeddings_size")).show
    explodeEmbds.select(size($"embedding").as("embeddings_size")).show


  }


}