Java Code Examples for org.apache.spark.sql.Dataset#cache()
The following examples show how to use org.apache.spark.sql.Dataset#cache().
Each example is taken from an open-source project; the project, source file, and license are noted above the code.
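Before going through the examples, here is a minimal sketch of the pattern most of them follow: cache a Dataset before running several actions on it, then unpersist it once the results are no longer needed. The SparkSession setup, the people.json input path, and the age column are illustrative assumptions for this sketch and do not come from any of the projects listed below.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CacheSketch {
    public static void main(String[] args) {
        // Illustrative setup; the master URL and input path are assumptions for this sketch.
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("DatasetCacheSketch")
                .getOrCreate();

        Dataset<Row> people = spark.read().json("people.json");

        // cache() marks the Dataset for storage (MEMORY_AND_DISK by default) so that
        // repeated actions reuse the materialized data instead of recomputing the plan.
        people.cache();

        long total = people.count();                       // first action materializes the cache
        long adults = people.filter("age >= 18").count();  // subsequent actions reuse it
        System.out.println(total + " rows, " + adults + " adults");

        // Release the cached blocks once the Dataset is no longer needed.
        people.unpersist();

        spark.stop();
    }
}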
Example 1
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0
private <T, U> U withReusableDS(Dataset<T> ds, Function<Dataset<T>, U> func) {
    Dataset<T> reusableDS;

    if (useCaching) {
        reusableDS = ds.cache();
    } else {
        int parallelism = SQLConf.get().numShufflePartitions();
        reusableDS = ds.repartition(parallelism).map((MapFunction<T, T>) value -> value, ds.exprEnc());
    }

    try {
        return func.apply(reusableDS);
    } finally {
        if (useCaching) {
            reusableDS.unpersist(false);
        }
    }
}
Example 2
Source File: DataFilterStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
    if (parameters == null || parameters.size() == 0) {
        BpmnaiLogger.getInstance().writeWarn("No parameters found for the DataFilterStep");
        return dataset;
    }

    String query = (String) parameters.get("query");
    BpmnaiLogger.getInstance().writeInfo("Filtering data with filter query: " + query + ".");

    dataset = dataset.filter(query);
    dataset.cache();

    if (dataset.count() == 0) {
        BpmnaiLogger.getInstance().writeInfo("Filtering resulted in zero lines of data. Aborting. Please check your filter query.");
        System.exit(1);
    }

    return dataset;
}
Example 3
Source File: WriteToDiscStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
    // remove spaces from column names as parquet does not support them
    for (String columnName : dataset.columns()) {
        if (columnName.contains(" ")) {
            String newColumnName = columnName.replace(' ', '_');
            dataset = dataset.withColumnRenamed(columnName, newColumnName);
        }
    }

    dataset.cache();
    BpmnaiUtils.getInstance().writeDatasetToParquet(dataset, "result", config);

    if (config.isGenerateResultPreview()) {
        dataset.limit(config.getResultPreviewLineCount()).write().mode(SaveMode.Overwrite).saveAsTable(BpmnaiVariables.RESULT_PREVIEW_TEMP_TABLE);
    }

    return dataset;
}
Example 4
Source File: ValueSetsTest.java From bunsen with Apache License 2.0
@Test
public void testGetLatest() {
    String database = "test_get_latest";
    spark.sql("CREATE DATABASE " + database);

    ValueSets.getEmpty(spark)
        .withValueSets(
            valueSet("urn:cerner:valueset:newvalueset", "1"),
            valueSet("urn:cerner:valueset:newvalueset", "2"),
            valueSet("urn:cerner:valueset:othervalueset", "1"))
        .writeToDatabase(database);

    Dataset<Value> latest = ValueSets.getFromDatabase(spark, database)
        .getLatestValues(ImmutableSet.of("urn:cerner:valueset:newvalueset",
            "urn:cerner:valueset:othervalueset"), true);

    latest.cache();

    Assert.assertEquals(2, latest.count());

    Assert.assertEquals(0, latest.where(
        "valueSetUri == 'urn:cerner:valueset:newvalueset' AND valueSetVersion == '1'")
        .count());

    Assert.assertEquals(1, latest.where(
        "valueSetUri == 'urn:cerner:valueset:newvalueset' AND valueSetVersion == '2'")
        .count());

    Assert.assertEquals(1, latest.where(
        "valueSetUri == 'urn:cerner:valueset:othervalueset' AND valueSetVersion == '1'")
        .count());
}
Example 5
Source File: ConceptMapsTest.java From bunsen with Apache License 2.0
@Test
public void testGetLatest() {
    String database = "test_get_latest";
    spark.sql("create database " + database);

    ConceptMaps.getEmpty(spark)
        .withConceptMaps(
            conceptMap("urn:cerner:map:newmap", "1"),
            conceptMap("urn:cerner:map:newmap", "2"),
            conceptMap("urn:cerner:map:othermap", "1"))
        .writeToDatabase(database);

    Dataset<Mapping> latest = ConceptMaps.getFromDatabase(spark, database)
        .getLatestMappings(
            ImmutableSet.of("urn:cerner:map:newmap", "urn:cerner:map:othermap"),
            true);

    latest.cache();

    Assert.assertEquals(2, latest.count());

    Assert.assertEquals(0,
        latest.where("conceptMapUri == 'urn:cerner:map:newmap' and conceptMapVersion == '1'")
            .count());

    Assert.assertEquals(1,
        latest.where("conceptMapUri == 'urn:cerner:map:newmap' and conceptMapVersion == '2'")
            .count());

    Assert.assertEquals(1,
        latest.where("conceptMapUri == 'urn:cerner:map:othermap' and conceptMapVersion == '1'")
            .count());
}
Example 6
Source File: TypeCastStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
    // get variables
    Map<String, String> varMap = (Map<String, String>) SparkBroadcastHelper.getInstance().getBroadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_VARIABLES_ESCALATED);

    List<StructField> datasetFields = Arrays.asList(dataset.schema().fields());

    List<ColumnConfiguration> columnConfigurations = null;
    List<VariableConfiguration> variableConfigurations = null;

    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
    if (configuration != null) {
        PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
        columnConfigurations = preprocessingConfiguration.getColumnConfiguration();
        variableConfigurations = preprocessingConfiguration.getVariableConfiguration();
    }

    Map<String, ColumnConfiguration> columnTypeConfigMap = new HashMap<>();
    Map<String, VariableConfiguration> variableTypeConfigMap = new HashMap<>();

    if (columnConfigurations != null) {
        for (ColumnConfiguration cc : columnConfigurations) {
            columnTypeConfigMap.put(cc.getColumnName(), cc);
        }
    }

    if (variableConfigurations != null) {
        for (VariableConfiguration vc : variableConfigurations) {
            variableTypeConfigMap.put(vc.getVariableName(), vc);
        }
    }

    for (String column : dataset.columns()) {
        // skip revision columns as they are handled for each variable column
        if (column.endsWith("_rev")) {
            continue;
        }

        DataType newDataType = null;
        boolean isVariableColumn = false;
        String configurationDataType = null;
        String configurationParseFormat = null;

        if (variableTypeConfigMap.keySet().contains(column)) {
            // was initially a variable
            configurationDataType = variableTypeConfigMap.get(column).getVariableType();
            configurationParseFormat = variableTypeConfigMap.get(column).getParseFormat();
            if (config.getPipelineMode().equals(BpmnaiVariables.PIPELINE_MODE_LEARN)) {
                isVariableColumn = varMap.keySet().contains(column);
            } else {
                isVariableColumn = true;
            }
        } else if (columnTypeConfigMap.keySet().contains(column)) {
            // was initially a column
            configurationDataType = columnTypeConfigMap.get(column).getColumnType();
            configurationParseFormat = columnTypeConfigMap.get(column).getParseFormat();
        }

        newDataType = mapDataType(datasetFields, column, configurationDataType);

        // only check for cast errors if dev feature is enabled and if a change in the datatype has been done
        if (config.isDevTypeCastCheckEnabled() && !newDataType.equals(getCurrentDataType(datasetFields, column))) {
            // add a column with casted value to be able to check the cast results
            dataset = castColumn(dataset, column, column + "_casted", newDataType, configurationParseFormat);

            // add a column for cast results and write CAST_ERROR? in it if there might be a cast error
            dataset = dataset.withColumn(column + "_castresult",
                    when(dataset.col(column).isNotNull().and(dataset.col(column).notEqual(lit(""))),
                            when(dataset.col(column + "_casted").isNull(), lit("CAST_ERROR?"))
                                    .otherwise(lit(""))
                    ).otherwise(lit(""))
            );
            dataset.cache();

            // check for cast errors and write warning to application log
            if (dataset.filter(column + "_castresult == 'CAST_ERROR?'").count() > 0) {
                BpmnaiLogger.getInstance().writeWarn("Column '" + column + "' seems to have cast errors. Please check the data type (is defined as '" + configurationDataType + "')");
            } else {
                // drop help columns as there are no cast errors for this column and rename casted column to actual column name
                dataset = dataset.drop(column, column + "_castresult").withColumnRenamed(column + "_casted", column);
            }
        } else {
            // cast without checking the cast result, entries are null if spark can't cast it
            dataset = castColumn(dataset, column, column, newDataType, configurationParseFormat);
        }

        // cast revision columns for former variables, revision columns only exist on process level
        if (config.getDataLevel().equals(BpmnaiVariables.DATA_LEVEL_PROCESS) && config.isRevCountEnabled() && isVariableColumn) {
            dataset = dataset.withColumn(column + "_rev", dataset.col(column + "_rev").cast("integer"));
        }
    }

    if (config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "type_cast_columns", config);
    }

    // return preprocessed data
    return dataset;
}
Example 7
Source File: AggregateActivityInstancesStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
    // apply first and processState aggregator
    Map<String, String> aggregationMap = new HashMap<>();
    for (String column : dataset.columns()) {
        if (column.equals(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID)) {
            continue;
        } else if (column.equals(BpmnaiVariables.VAR_DURATION) || column.endsWith("_rev")) {
            aggregationMap.put(column, "max");
        } else if (column.equals(BpmnaiVariables.VAR_STATE)) {
            aggregationMap.put(column, "ProcessState");
        } else if (column.equals(BpmnaiVariables.VAR_ACT_INST_ID)) {
            // ignore it, as we aggregate by it
            continue;
        } else {
            aggregationMap.put(column, "AllButEmptyString");
        }
    }

    // first aggregation
    // activity level, take only processInstance and activityInstance rows
    dataset = dataset
            .filter(dataset.col(BpmnaiVariables.VAR_DATA_SOURCE).notEqual(BpmnaiVariables.EVENT_PROCESS_INSTANCE))
            .groupBy(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID, BpmnaiVariables.VAR_ACT_INST_ID)
            .agg(aggregationMap);

    // rename back columns after aggregation
    String pattern = "(max|allbutemptystring|processstate)\\((.+)\\)";
    Pattern r = Pattern.compile(pattern);

    for (String columnName : dataset.columns()) {
        Matcher m = r.matcher(columnName);
        if (m.find()) {
            String newColumnName = m.group(2);
            dataset = dataset.withColumnRenamed(columnName, newColumnName);
        }
    }

    // in case we add the CSV we have a name column in the first dataset of the join, so we call drop again to make sure it is gone
    dataset = dataset.drop(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);

    dataset = dataset.drop(BpmnaiVariables.VAR_DATA_SOURCE);

    dataset = dataset.sort(BpmnaiVariables.VAR_START_TIME);

    dataset.cache();
    BpmnaiLogger.getInstance().writeInfo("Found " + dataset.count() + " activity instances.");

    if (config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "agg_of_activity_instances", config);
    }

    // return preprocessed data
    return dataset;
}
Example 8
Source File: DataFilterOnActivityStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
/**
 * @param dataSet the incoming dataset for this processing step
 * @param parameters
 * @return the filtered DataSet
 */
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataSet, Map<String, Object> parameters, SparkRunnerConfig config) {
    // any parameters set?
    if (parameters == null || parameters.size() == 0) {
        BpmnaiLogger.getInstance().writeWarn("No parameters found for the DataFilterOnActivityStep");
        return dataSet;
    }

    // get query parameter
    String query = (String) parameters.get("query");
    BpmnaiLogger.getInstance().writeInfo("Filtering data with activity instance filter query: " + query + ".");

    // save size of initial dataset for log
    dataSet.cache();
    Long initialDSCount = dataSet.count();

    // repartition by process instance and order by start_time for this operation
    dataSet = dataSet.repartition(dataSet.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID)).sortWithinPartitions(BpmnaiVariables.VAR_START_TIME);

    // we temporarily store variable updates (rows with a var type set) separately
    Dataset<Row> variables = dataSet.filter(col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).isNotNull());

    // find first occurrence of activity instance
    final Dataset<Row> dsTmp = dataSet.filter(dataSet.col(BpmnaiVariables.VAR_ACT_ID).equalTo(query)).filter(dataSet.col(BpmnaiVariables.VAR_END_TIME).isNull()); // TODO: ENSURING THAT THIS ISN'T A VARIABLE ROW

    // now we look for the first occurrence of the activity id contained in "query". The result comprises a dataset of corresponding activity instances.
    final Dataset<Row> dsActivityInstances = dataSet.filter(dataSet.col(BpmnaiVariables.VAR_ACT_ID).like(query)).filter(dataSet.col(BpmnaiVariables.VAR_END_TIME).isNull()); // TODO: ENSURING THAT THIS ISN'T A VARIABLE ROW

    // we slim the resulting dataset down: only the activity instance's process id and the instance's start time are relevant
    List<Row> activityRows = dsActivityInstances.select(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID, BpmnaiVariables.VAR_START_TIME).collectAsList();
    Map<String, String> activities = activityRows.stream().collect(Collectors.toMap(
            r -> r.getAs(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID),
            r -> r.getAs(BpmnaiVariables.VAR_START_TIME)));

    // broadcasting the PID - start time map to use it in a user-defined function
    SparkBroadcastHelper.getInstance().broadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_INSTANCE_TIMESTAMP_MAP, activities);

    // now we have to select, for each process instance in our initial dataset, all events that happened before the first occurrence of our selected activity.
    // We first narrow it down to the process instances in question
    Dataset<Row> selectedProcesses = dataSet.filter(col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID).isin(activities.keySet().toArray()));
    // Then, we mark all events that should be removed
    Dataset<Row> activityDataSet = selectedProcesses.withColumn("data_filter_on_activity",
            callUDF("activityBeforeTimestamp",
                    selectedProcesses.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID),
                    selectedProcesses.col(BpmnaiVariables.VAR_START_TIME)));
    // And we keep the rest
    activityDataSet = activityDataSet.filter(col("data_filter_on_activity").like("TRUE"));
    // Clean up
    activityDataSet = activityDataSet.drop("data_filter_on_activity");

    // However, we lost all variable updates in this approach, so now we add the variables in question to the dataset.
    // First, we narrow it down to keep only variables that have a corresponding activity instance
    activityDataSet = activityDataSet.withColumnRenamed(BpmnaiVariables.VAR_ACT_INST_ID, BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT");

    variables = variables.join(activityDataSet.select(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT").distinct(),
            variables.col(BpmnaiVariables.VAR_ACT_INST_ID).equalTo(activityDataSet.col(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT")),
            "inner");

    activityDataSet = activityDataSet.withColumnRenamed(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT", BpmnaiVariables.VAR_ACT_INST_ID);
    variables = variables.drop(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT");

    dataSet = activityDataSet.union(variables);

    dataSet.cache();
    BpmnaiLogger.getInstance().writeInfo("DataFilterOnActivityStep: The filtered DataSet contains " + dataSet.count() + " rows (before: " + initialDSCount + " rows).");

    if (config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataSet, "data_filter_on_activity_step", config);
    }

    return dataSet;
}
Example 9
Source File: BikeRentalPrediction.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");

    SparkSession sparkSession = SparkSession
            .builder()
            .master("local")
            .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
            .appName("BikeRentalPrediction").getOrCreate();

    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    // We use the sqlContext.read method to read the data and set a few options:
    //  'format': specifies the Spark CSV data source
    //  'header': set to true to indicate that the first line of the CSV data file is a header
    // The file is called 'hour.csv'.
    Dataset<Row> ds = sparkSession.read()
            .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat")
            .option("header", "true")
            .load("E:\\sumitK\\Hadoop\\Bike-Sharing-Dataset\\hour.csv");

    ds.cache();

    ds.select("season").show();
    ds.show();

    System.out.println("Our dataset has rows :: " + ds.count());

    Dataset<Row> df = ds.drop("instant").drop("dteday").drop("casual").drop("registered");
    df.printSchema();

    // col("...") is preferable to df.col("...")
    Dataset<Row> dformatted = df.select(
            col("season").cast(DataTypes.IntegerType),
            col("yr").cast(DataTypes.IntegerType),
            col("mnth").cast(DataTypes.IntegerType),
            col("hr").cast(DataTypes.IntegerType),
            col("holiday").cast(DataTypes.IntegerType),
            col("weekday").cast(DataTypes.IntegerType),
            col("workingday").cast(DataTypes.IntegerType),
            col("weathersit").cast(DataTypes.IntegerType),
            col("temp").cast(DataTypes.IntegerType),
            col("atemp").cast(DataTypes.IntegerType),
            col("hum").cast(DataTypes.IntegerType),
            col("windspeed").cast(DataTypes.IntegerType),
            col("cnt").cast(DataTypes.IntegerType));
    dformatted.printSchema();

    Dataset<Row>[] data = dformatted.randomSplit(new double[]{0.7, 0.3});
    System.out.println("We have training examples count :: " + data[0].count() + " and test examples count :: " + data[1].count());

    // removing 'cnt' column and then forming the feature-name array
    String[] featuresCols = dformatted.drop("cnt").columns();
    for (String str : featuresCols) {
        System.out.println(str + " :: ");
    }

    // This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
    VectorAssembler vectorAssembler = new VectorAssembler().setInputCols(featuresCols).setOutputCol("rawFeatures");

    // This identifies categorical features and indexes them.
    VectorIndexer vectorIndexer = new VectorIndexer().setInputCol("rawFeatures").setOutputCol("features").setMaxCategories(4);

    // Takes the "features" column and learns to predict "cnt"
    GBTRegressor gbt = new GBTRegressor().setLabelCol("cnt");

    // Define a grid of hyperparameters to test:
    //  - maxDepth: max depth of each decision tree in the GBT ensemble
    //  - maxIter: iterations, i.e., number of trees in each GBT ensemble
    // In this example notebook, we keep these values small. In practice, to get the highest accuracy,
    // you would likely want to try deeper trees (10 or higher) and more trees in the ensemble (>100).
    ParamMap[] paramGrid = new ParamGridBuilder()
            .addGrid(gbt.maxDepth(), new int[]{2, 5})
            .addGrid(gbt.maxIter(), new int[]{10, 100})
            .build();

    // We define an evaluation metric. This tells CrossValidator how well we are doing by comparing the true labels with predictions.
    RegressionEvaluator evaluator = new RegressionEvaluator()
            .setMetricName("rmse")
            .setLabelCol(gbt.getLabelCol())
            .setPredictionCol(gbt.getPredictionCol());

    // Declare the CrossValidator, which runs model tuning for us.
    CrossValidator cv = new CrossValidator().setEstimator(gbt).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid);

    Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{vectorAssembler, vectorIndexer, cv});

    PipelineModel pipelineModel = pipeline.fit(data[0]);

    Dataset<Row> predictions = pipelineModel.transform(data[1]);

    predictions.show();
    //predictions.select("cnt", "prediction", *featuresCols);
}