org.apache.spark.sql.api.java.UDF2 Java Examples
The following examples show how to use
org.apache.spark.sql.api.java.UDF2.
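Before the project examples, here is a minimal, self-contained sketch of the pattern (the session, view, and column names are illustrative, not drawn from any project below): a UDF2 receives two column values per row, returns a single result, and must be registered together with its return type before it can be called from Spark SQL.

import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF2;
import org.apache.spark.sql.types.DataTypes;

SparkSession spark = SparkSession.builder().master("local").appName("udf2-sketch").getOrCreate();
// Register a two-argument UDF under the name "join_names"
spark.udf().register("join_names",
        (UDF2<String, String, String>) (first, last) -> first + " " + last,
        DataTypes.StringType);
// Assumes a temp view "people" with first_name and last_name columns already exists
spark.sql("SELECT join_names(first_name, last_name) FROM people").show();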
Example #1
Source File: SparkRunner.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
private void registerUDFs() {
    // register our own aggregation functions
    sparkSession.udf().register("AllButEmptyString", new AllButEmptyStringAggregationFunction());
    sparkSession.udf().register("ProcessState", new ProcessStatesAggregationFunction());

    sparkSession.udf().register("isALong", (UDF1<Object, Boolean>) o -> {
        if (o instanceof Long)
            return true;
        if (o instanceof String && Longs.tryParse((String) o) != null)
            return true;
        return false;
    }, DataTypes.BooleanType);

    sparkSession.udf().register("timestampStringToLong", (UDF1<Object, Long>) o -> {
        if (o instanceof String && Longs.tryParse((String) o) != null) {
            return Longs.tryParse((String) o) / 1000;
        }
        return null;
    }, DataTypes.LongType);

    sparkSession.udf().register("activityBeforeTimestamp", (UDF2<String, String, String>) (s, s2) -> {
        // get broadcast
        Map<String, String> activities = (Map<String, String>) SparkBroadcastHelper.getInstance()
                .getBroadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_INSTANCE_TIMESTAMP_MAP);
        // is pid contained in broadcast?
        if (activities == null || activities.isEmpty()) {
            return "Error: Broadcast not found";
        } else {
            if (activities.containsKey(s)) {
                Timestamp tsAct = new Timestamp(Long.parseLong(activities.get(s)));
                if (s2 == null || s2.isEmpty()) {
                    return "FALSE";
                }
                Timestamp tsObject = new Timestamp(Long.parseLong(s2));
                if (tsObject.after(tsAct)) {
                    return "FALSE";
                } else {
                    return "TRUE";
                }
            }
        }
        return "FALSE";
    }, DataTypes.StringType);
}
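Once registered, these functions are callable by name from Spark SQL. A hypothetical invocation of the UDF2 above (the view and column names are assumptions for illustration, not part of bpmn.ai):

// Hypothetical: an "events" view with a process-instance id and an event timestamp column
sparkSession.sql("SELECT activityBeforeTimestamp(processInstanceId, eventTimestamp) FROM events").show();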
Example #2
Source File: SparkReader.java From GeoTriples with Apache License 2.0
/**
 * Reads the input GeoJSON files into a Spark Dataset.
 * GeoJSON attributes are located in the "properties" column and the geometry in the "geometry" column,
 * so this method expands them into top-level columns and then converts the GeoJSON geometry into WKT using a UDF.
 *
 * @return a Spark Dataset containing the data.
 */
private Dataset<Row> readGeoJSON() {
    Dataset<Row> dataset = spark.read()
            .option("multiLine", true)
            .format("json")
            .json(filenames);

    // Expand the nested fields
    dataset = dataset.drop("_corrupt_record").filter(dataset.col("geometry").isNotNull());
    StructType schema = dataset.schema();
    StructField[] gj_fields = schema.fields();
    for (StructField sf : gj_fields) {
        DataType dt = sf.dataType();
        if (dt instanceof StructType) {
            StructType st = (StructType) dt;
            if (st.fields().length > 0) {
                String column_name = sf.name();
                for (String field : st.fieldNames())
                    dataset = dataset.withColumn(field,
                            functions.explode(functions.array(column_name + "." + field)));
                dataset = dataset.drop(column_name);
            }
        }
    }

    // Convert the GeoJSON geometry into WKT
    UDF2<String, WrappedArray, String> coords2WKT =
            (String type, WrappedArray coords) -> Coordinates2WKT.convert.apply(type, coords);
    spark.udf().register("coords2WKT", coords2WKT, DataTypes.StringType);
    dataset = dataset.withColumn("geometry",
            functions.callUDF("coords2WKT", dataset.col("type"), dataset.col("coordinates")));
    dataset = dataset.drop(dataset.col("type")).drop(dataset.col("coordinates"));

    return dataset;
}
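The Coordinates2WKT helper referenced above is GeoTriples-specific and not shown here. As a rough, hypothetical sketch of what such a UDF2 might do for the simplest geometry type only ("Point"), assuming the coordinates arrive as a WrappedArray of doubles:

import org.apache.spark.sql.api.java.UDF2;
import scala.collection.mutable.WrappedArray;

// Hypothetical simplification: emit WKT for GeoJSON "Point" geometries only
UDF2<String, WrappedArray<Double>, String> pointToWKT = (type, coords) -> {
    if ("Point".equals(type) && coords != null && coords.length() >= 2) {
        return String.format("POINT (%s %s)", coords.apply(0), coords.apply(1));
    }
    return null; // lines, polygons, etc. would each need their own WKT template
};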
Example #3
Source File: UDFExample.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
    // Windows-specific property if Hadoop is not installed or HADOOP_HOME is not set
    System.setProperty("hadoop.home.dir", "E:\\hadoop");

    // Build a Spark Session
    SparkSession sparkSession = SparkSession
            .builder()
            .master("local")
            .config("spark.sql.warehouse.dir", "file:///E:/hadoop/warehouse")
            .appName("EdgeBuilder")
            .getOrCreate();
    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    // Read the CSV data
    Dataset<Row> emp_ds = sparkSession.read()
            .format("com.databricks.spark.csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load("src/main/resources/employee.txt");

    UDF2 calcDays = new CalcDaysUDF();
    // Register the UDF in the Spark Session created above
    sparkSession.udf().register("calcDays", calcDays, DataTypes.LongType);

    emp_ds.createOrReplaceTempView("emp_ds");
    emp_ds.printSchema();
    emp_ds.show();
    sparkSession.sql("select calcDays(hiredate,'dd-MM-yyyy') from emp_ds").show();

    // Instantiate the UDAF
    AverageUDAF calcAvg = new AverageUDAF();
    // Register the UDAF with the SparkSession
    sparkSession.udf().register("calAvg", calcAvg);
    // Use the UDAF
    sparkSession.sql("select deptno, calAvg(salary) from emp_ds group by deptno").show();

    // Type-safe UDAF
    TypeSafeUDAF typeSafeUDAF = new TypeSafeUDAF();
    Dataset<Employee> emf = emp_ds.as(Encoders.bean(Employee.class));
    emf.printSchema();
    emf.show();
    TypedColumn<Employee, Double> averageSalary = typeSafeUDAF.toColumn().name("averageTypeSafe");
    Dataset<Double> result = emf.select(averageSalary);
    result.show();
}
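CalcDaysUDF itself is not shown in this example. A plausible sketch of such a class, reconstructed from the query above (an assumption, not the book's actual implementation), would implement UDF2<String, String, Long>: parse the date string with the supplied pattern and return the number of days elapsed since that date.

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.time.temporal.ChronoUnit;
import org.apache.spark.sql.api.java.UDF2;

// Hypothetical reconstruction of CalcDaysUDF: days elapsed since the given date
public class CalcDaysUDF implements UDF2<String, String, Long> {
    @Override
    public Long call(String date, String format) throws Exception {
        LocalDate parsed = LocalDate.parse(date, DateTimeFormatter.ofPattern(format));
        return ChronoUnit.DAYS.between(parsed, LocalDate.now());
    }
}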