Java Code Examples for org.apache.spark.sql.Dataset#cache()
The following examples show how to use org.apache.spark.sql.Dataset#cache().
Each example is taken from an open-source project; the project, source file, and license are noted above the code.
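Before going through the examples, here is a minimal sketch of the pattern most of them follow: cache a Dataset before running several actions on it, then unpersist it once the results are no longer needed. The SparkSession setup, the people.json input path, and the age column are illustrative assumptions for this sketch and do not come from any of the projects listed below.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class CacheSketch {
    public static void main(String[] args) {
        // Illustrative setup; the master URL and input path are assumptions for this sketch.
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("DatasetCacheSketch")
                .getOrCreate();

        Dataset<Row> people = spark.read().json("people.json");

        // cache() marks the Dataset for storage (MEMORY_AND_DISK by default) so that
        // repeated actions reuse the materialized data instead of recomputing the plan.
        people.cache();

        long total = people.count();                       // first action materializes the cache
        long adults = people.filter("age >= 18").count();  // subsequent actions reuse it
        System.out.println(total + " rows, " + adults + " adults");

        // Release the cached blocks once the Dataset is no longer needed.
        people.unpersist();

        spark.stop();
    }
}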
Example 1
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0
private <T, U> U withReusableDS(Dataset<T> ds, Function<Dataset<T>, U> func) {
    Dataset<T> reusableDS;

    if (useCaching) {
        reusableDS = ds.cache();
    } else {
        int parallelism = SQLConf.get().numShufflePartitions();
        reusableDS = ds.repartition(parallelism).map((MapFunction<T, T>) value -> value, ds.exprEnc());
    }

    try {
        return func.apply(reusableDS);
    } finally {
        if (useCaching) {
            reusableDS.unpersist(false);
        }
    }
}
Example 2
Source File: DataFilterStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
    if (parameters == null || parameters.size() == 0) {
        BpmnaiLogger.getInstance().writeWarn("No parameters found for the DataFilterStep");
        return dataset;
    }

    String query = (String) parameters.get("query");
    BpmnaiLogger.getInstance().writeInfo("Filtering data with filter query: " + query + ".");

    dataset = dataset.filter(query);
    dataset.cache();

    if (dataset.count() == 0) {
        BpmnaiLogger.getInstance().writeInfo("Filtering resulted in zero lines of data. Aborting. Please check your filter query.");
        System.exit(1);
    }

    return dataset;
}
Example 3
Source File: WriteToDiscStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
    // remove spaces from column names as parquet does not support them
    for (String columnName : dataset.columns()) {
        if (columnName.contains(" ")) {
            String newColumnName = columnName.replace(' ', '_');
            dataset = dataset.withColumnRenamed(columnName, newColumnName);
        }
    }

    dataset.cache();
    BpmnaiUtils.getInstance().writeDatasetToParquet(dataset, "result", config);

    if (config.isGenerateResultPreview()) {
        dataset.limit(config.getResultPreviewLineCount()).write().mode(SaveMode.Overwrite).saveAsTable(BpmnaiVariables.RESULT_PREVIEW_TEMP_TABLE);
    }

    return dataset;
}
Example 4
Source File: ValueSetsTest.java From bunsen with Apache License 2.0
@Test
public void testGetLatest() {
    String database = "test_get_latest";
    spark.sql("CREATE DATABASE " + database);

    ValueSets.getEmpty(spark)
        .withValueSets(
            valueSet("urn:cerner:valueset:newvalueset", "1"),
            valueSet("urn:cerner:valueset:newvalueset", "2"),
            valueSet("urn:cerner:valueset:othervalueset", "1"))
        .writeToDatabase(database);

    Dataset<Value> latest = ValueSets.getFromDatabase(spark, database)
        .getLatestValues(ImmutableSet.of("urn:cerner:valueset:newvalueset",
            "urn:cerner:valueset:othervalueset"), true);

    latest.cache();

    Assert.assertEquals(2, latest.count());

    Assert.assertEquals(0, latest.where(
        "valueSetUri == 'urn:cerner:valueset:newvalueset' AND valueSetVersion == '1'")
        .count());

    Assert.assertEquals(1, latest.where(
        "valueSetUri == 'urn:cerner:valueset:newvalueset' AND valueSetVersion == '2'")
        .count());

    Assert.assertEquals(1, latest.where(
        "valueSetUri == 'urn:cerner:valueset:othervalueset' AND valueSetVersion == '1'")
        .count());
}
Example 5
Source File: ConceptMapsTest.java From bunsen with Apache License 2.0
@Test
public void testGetLatest() {
    String database = "test_get_latest";
    spark.sql("create database " + database);

    ConceptMaps.getEmpty(spark)
        .withConceptMaps(
            conceptMap("urn:cerner:map:newmap", "1"),
            conceptMap("urn:cerner:map:newmap", "2"),
            conceptMap("urn:cerner:map:othermap", "1"))
        .writeToDatabase(database);

    Dataset<Mapping> latest = ConceptMaps.getFromDatabase(spark, database)
        .getLatestMappings(
            ImmutableSet.of("urn:cerner:map:newmap", "urn:cerner:map:othermap"),
            true);

    latest.cache();

    Assert.assertEquals(2, latest.count());

    Assert.assertEquals(0,
        latest.where("conceptMapUri == 'urn:cerner:map:newmap' and conceptMapVersion == '1'")
            .count());

    Assert.assertEquals(1,
        latest.where("conceptMapUri == 'urn:cerner:map:newmap' and conceptMapVersion == '2'")
            .count());

    Assert.assertEquals(1,
        latest.where("conceptMapUri == 'urn:cerner:map:othermap' and conceptMapVersion == '1'")
            .count());
}
Example 6
Source File: TypeCastStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
    // get variables
    Map<String, String> varMap = (Map<String, String>) SparkBroadcastHelper.getInstance().getBroadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_VARIABLES_ESCALATED);

    List<StructField> datasetFields = Arrays.asList(dataset.schema().fields());

    List<ColumnConfiguration> columnConfigurations = null;
    List<VariableConfiguration> variableConfigurations = null;

    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
    if (configuration != null) {
        PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
        columnConfigurations = preprocessingConfiguration.getColumnConfiguration();
        variableConfigurations = preprocessingConfiguration.getVariableConfiguration();
    }

    Map<String, ColumnConfiguration> columnTypeConfigMap = new HashMap<>();
    Map<String, VariableConfiguration> variableTypeConfigMap = new HashMap<>();

    if (columnConfigurations != null) {
        for (ColumnConfiguration cc : columnConfigurations) {
            columnTypeConfigMap.put(cc.getColumnName(), cc);
        }
    }

    if (variableConfigurations != null) {
        for (VariableConfiguration vc : variableConfigurations) {
            variableTypeConfigMap.put(vc.getVariableName(), vc);
        }
    }

    for (String column : dataset.columns()) {
        // skip revision columns as they are handled for each variable column
        if (column.endsWith("_rev")) {
            continue;
        }

        DataType newDataType = null;
        boolean isVariableColumn = false;
        String configurationDataType = null;
        String configurationParseFormat = null;

        if (variableTypeConfigMap.keySet().contains(column)) {
            // was initially a variable
            configurationDataType = variableTypeConfigMap.get(column).getVariableType();
            configurationParseFormat = variableTypeConfigMap.get(column).getParseFormat();
            if (config.getPipelineMode().equals(BpmnaiVariables.PIPELINE_MODE_LEARN)) {
                isVariableColumn = varMap.keySet().contains(column);
            } else {
                isVariableColumn = true;
            }
        } else if (columnTypeConfigMap.keySet().contains(column)) {
            // was initially a column
            configurationDataType = columnTypeConfigMap.get(column).getColumnType();
            configurationParseFormat = columnTypeConfigMap.get(column).getParseFormat();
        }

        newDataType = mapDataType(datasetFields, column, configurationDataType);

        // only check for cast errors if dev feature is enabled and if a change in the datatype has been done
        if (config.isDevTypeCastCheckEnabled() && !newDataType.equals(getCurrentDataType(datasetFields, column))) {
            // add a column with casted value to be able to check the cast results
            dataset = castColumn(dataset, column, column + "_casted", newDataType, configurationParseFormat);

            // add a column for cast results and write CAST_ERROR? in it if there might be a cast error
            dataset = dataset.withColumn(column + "_castresult",
                    when(dataset.col(column).isNotNull().and(dataset.col(column).notEqual(lit(""))),
                            when(dataset.col(column + "_casted").isNull(), lit("CAST_ERROR?"))
                                    .otherwise(lit(""))
                    ).otherwise(lit(""))
            );
            dataset.cache();

            // check for cast errors and write warning to application log
            if (dataset.filter(column + "_castresult == 'CAST_ERROR?'").count() > 0) {
                BpmnaiLogger.getInstance().writeWarn("Column '" + column + "' seems to have cast errors. Please check the data type (is defined as '" + configurationDataType + "')");
            } else {
                // drop help columns as there are no cast errors for this column and rename casted column to actual column name
                dataset = dataset.drop(column, column + "_castresult").withColumnRenamed(column + "_casted", column);
            }
        } else {
            // cast without checking the cast result, entries are null if spark can't cast it
            dataset = castColumn(dataset, column, column, newDataType, configurationParseFormat);
        }

        // cast revision columns for former variables, revision columns only exist on process level
        if (config.getDataLevel().equals(BpmnaiVariables.DATA_LEVEL_PROCESS) && config.isRevCountEnabled() && isVariableColumn) {
            dataset = dataset.withColumn(column + "_rev", dataset.col(column + "_rev").cast("integer"));
        }
    }

    if (config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "type_cast_columns", config);
    }

    // return preprocessed data
    return dataset;
}
Example 7
Source File: AggregateActivityInstancesStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
    // apply first and processState aggregator
    Map<String, String> aggregationMap = new HashMap<>();
    for (String column : dataset.columns()) {
        if (column.equals(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID)) {
            continue;
        } else if (column.equals(BpmnaiVariables.VAR_DURATION) || column.endsWith("_rev")) {
            aggregationMap.put(column, "max");
        } else if (column.equals(BpmnaiVariables.VAR_STATE)) {
            aggregationMap.put(column, "ProcessState");
        } else if (column.equals(BpmnaiVariables.VAR_ACT_INST_ID)) {
            // ignore it, as we aggregate by it
            continue;
        } else {
            aggregationMap.put(column, "AllButEmptyString");
        }
    }

    // first aggregation
    // activity level, take only processInstance and activityInstance rows
    dataset = dataset
            .filter(dataset.col(BpmnaiVariables.VAR_DATA_SOURCE).notEqual(BpmnaiVariables.EVENT_PROCESS_INSTANCE))
            .groupBy(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID, BpmnaiVariables.VAR_ACT_INST_ID)
            .agg(aggregationMap);

    // rename back columns after aggregation
    String pattern = "(max|allbutemptystring|processstate)\\((.+)\\)";
    Pattern r = Pattern.compile(pattern);

    for (String columnName : dataset.columns()) {
        Matcher m = r.matcher(columnName);
        if (m.find()) {
            String newColumnName = m.group(2);
            dataset = dataset.withColumnRenamed(columnName, newColumnName);
        }
    }

    // in case we add the CSV we have a name column in the first dataset of the join, so we call drop again to make sure it is gone
    dataset = dataset.drop(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);

    dataset = dataset.drop(BpmnaiVariables.VAR_DATA_SOURCE);

    dataset = dataset.sort(BpmnaiVariables.VAR_START_TIME);

    dataset.cache();
    BpmnaiLogger.getInstance().writeInfo("Found " + dataset.count() + " activity instances.");

    if (config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "agg_of_activity_instances", config);
    }

    // return preprocessed data
    return dataset;
}
Example 8
Source File: DataFilterOnActivityStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
/**
 * @param dataSet the incoming dataset for this processing step
 * @param parameters
 * @return the filtered DataSet
 */
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataSet, Map<String, Object> parameters, SparkRunnerConfig config) {
    // any parameters set?
    if (parameters == null || parameters.size() == 0) {
        BpmnaiLogger.getInstance().writeWarn("No parameters found for the DataFilterOnActivityStep");
        return dataSet;
    }

    // get query parameter
    String query = (String) parameters.get("query");
    BpmnaiLogger.getInstance().writeInfo("Filtering data with activity instance filter query: " + query + ".");

    // save size of initial dataset for log
    dataSet.cache();
    Long initialDSCount = dataSet.count();

    // repartition by process instance and order by start_time for this operation
    dataSet = dataSet.repartition(dataSet.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID)).sortWithinPartitions(BpmnaiVariables.VAR_START_TIME);

    // we temporarily store variable updates (rows with a var type set) separately
    Dataset<Row> variables = dataSet.filter(col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).isNotNull());

    // find first occurrence of activity instance
    final Dataset<Row> dsTmp = dataSet.filter(dataSet.col(BpmnaiVariables.VAR_ACT_ID).equalTo(query)).filter(dataSet.col(BpmnaiVariables.VAR_END_TIME).isNull()); // TODO: ENSURING THAT THIS ISN'T A VARIABLE ROW

    // now we look for the first occurrence of the activity id contained in "query". The result comprises a dataset of corresponding activity instances.
    final Dataset<Row> dsActivityInstances = dataSet.filter(dataSet.col(BpmnaiVariables.VAR_ACT_ID).like(query)).filter(dataSet.col(BpmnaiVariables.VAR_END_TIME).isNull()); // TODO: ENSURING THAT THIS ISN'T A VARIABLE ROW

    // we slim the resulting dataset down: only the activity instance's process id and the instance's start time are relevant
    List<Row> activityRows = dsActivityInstances.select(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID, BpmnaiVariables.VAR_START_TIME).collectAsList();
    Map<String, String> activities = activityRows.stream().collect(Collectors.toMap(
            r -> r.getAs(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID),
            r -> r.getAs(BpmnaiVariables.VAR_START_TIME)));

    // broadcasting the PID - start time map to use it in a user-defined function
    SparkBroadcastHelper.getInstance().broadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_INSTANCE_TIMESTAMP_MAP, activities);

    // now we have to select, for each process instance in our initial dataset, all events that happened before the first occurrence of our selected activity.
    // We first narrow it down to the process instances in question
    Dataset<Row> selectedProcesses = dataSet.filter(col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID).isin(activities.keySet().toArray()));
    // Then, we mark all events that should be removed
    Dataset<Row> activityDataSet = selectedProcesses.withColumn("data_filter_on_activity",
            callUDF("activityBeforeTimestamp",
                    selectedProcesses.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID),
                    selectedProcesses.col(BpmnaiVariables.VAR_START_TIME)));
    // And we keep the rest
    activityDataSet = activityDataSet.filter(col("data_filter_on_activity").like("TRUE"));
    // Clean up
    activityDataSet = activityDataSet.drop("data_filter_on_activity");

    // However, we lost all variable updates in this approach, so now we add the variables in question to the dataset.
    // First, we narrow it down to keep only variables that have a corresponding activity instance
    activityDataSet = activityDataSet.withColumnRenamed(BpmnaiVariables.VAR_ACT_INST_ID, BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT");

    variables = variables.join(activityDataSet.select(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT").distinct(),
            variables.col(BpmnaiVariables.VAR_ACT_INST_ID).equalTo(activityDataSet.col(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT")),
            "inner");

    activityDataSet = activityDataSet.withColumnRenamed(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT", BpmnaiVariables.VAR_ACT_INST_ID);
    variables = variables.drop(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT");

    dataSet = activityDataSet.union(variables);

    dataSet.cache();
    BpmnaiLogger.getInstance().writeInfo("DataFilterOnActivityStep: The filtered DataSet contains " + dataSet.count() + " rows (before: " + initialDSCount + " rows).");

    if (config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataSet, "data_filter_on_activity_step", config);
    }

    return dataSet;
}
Example 9
Source File: BikeRentalPrediction.java From Apache-Spark-2x-for-Java-Developers with MIT License
public static void main(String[] args) {
    System.setProperty("hadoop.home.dir", "E:\\sumitK\\Hadoop");

    SparkSession sparkSession = SparkSession
            .builder()
            .master("local")
            .config("spark.sql.warehouse.dir", "file:///E:/sumitK/Hadoop/warehouse")
            .appName("BikeRentalPrediction").getOrCreate();

    Logger rootLogger = LogManager.getRootLogger();
    rootLogger.setLevel(Level.WARN);

    // We use the sqlContext.read method to read the data and set a few options:
    //  'format': specifies the Spark CSV data source
    //  'header': set to true to indicate that the first line of the CSV data file is a header
    // The file is called 'hour.csv'.
    Dataset<Row> ds = sparkSession.read()
            .format("org.apache.spark.sql.execution.datasources.csv.CSVFileFormat")
            .option("header", "true")
            .load("E:\\sumitK\\Hadoop\\Bike-Sharing-Dataset\\hour.csv");

    ds.cache();

    ds.select("season").show();
    ds.show();

    System.out.println("Our dataset has rows :: " + ds.count());

    Dataset<Row> df = ds.drop("instant").drop("dteday").drop("casual").drop("registered");
    df.printSchema();

    // col("...") is preferable to df.col("...")
    Dataset<Row> dformatted = df.select(
            col("season").cast(DataTypes.IntegerType),
            col("yr").cast(DataTypes.IntegerType),
            col("mnth").cast(DataTypes.IntegerType),
            col("hr").cast(DataTypes.IntegerType),
            col("holiday").cast(DataTypes.IntegerType),
            col("weekday").cast(DataTypes.IntegerType),
            col("workingday").cast(DataTypes.IntegerType),
            col("weathersit").cast(DataTypes.IntegerType),
            col("temp").cast(DataTypes.IntegerType),
            col("atemp").cast(DataTypes.IntegerType),
            col("hum").cast(DataTypes.IntegerType),
            col("windspeed").cast(DataTypes.IntegerType),
            col("cnt").cast(DataTypes.IntegerType));
    dformatted.printSchema();

    Dataset<Row>[] data = dformatted.randomSplit(new double[]{0.7, 0.3});
    System.out.println("We have training examples count :: " + data[0].count() + " and test examples count :: " + data[1].count());

    // removing 'cnt' column and then forming the feature-name array
    String[] featuresCols = dformatted.drop("cnt").columns();
    for (String str : featuresCols) {
        System.out.println(str + " :: ");
    }

    // This concatenates all feature columns into a single feature vector in a new column "rawFeatures".
    VectorAssembler vectorAssembler = new VectorAssembler().setInputCols(featuresCols).setOutputCol("rawFeatures");

    // This identifies categorical features and indexes them.
    VectorIndexer vectorIndexer = new VectorIndexer().setInputCol("rawFeatures").setOutputCol("features").setMaxCategories(4);

    // Takes the "features" column and learns to predict "cnt"
    GBTRegressor gbt = new GBTRegressor().setLabelCol("cnt");

    // Define a grid of hyperparameters to test:
    //  - maxDepth: max depth of each decision tree in the GBT ensemble
    //  - maxIter: iterations, i.e., number of trees in each GBT ensemble
    // In this example notebook, we keep these values small. In practice, to get the highest accuracy,
    // you would likely want to try deeper trees (10 or higher) and more trees in the ensemble (>100).
    ParamMap[] paramGrid = new ParamGridBuilder()
            .addGrid(gbt.maxDepth(), new int[]{2, 5})
            .addGrid(gbt.maxIter(), new int[]{10, 100})
            .build();

    // We define an evaluation metric. This tells CrossValidator how well we are doing by comparing the true labels with predictions.
    RegressionEvaluator evaluator = new RegressionEvaluator()
            .setMetricName("rmse")
            .setLabelCol(gbt.getLabelCol())
            .setPredictionCol(gbt.getPredictionCol());

    // Declare the CrossValidator, which runs model tuning for us.
    CrossValidator cv = new CrossValidator().setEstimator(gbt).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid);

    Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{vectorAssembler, vectorIndexer, cv});

    PipelineModel pipelineModel = pipeline.fit(data[0]);

    Dataset<Row> predictions = pipelineModel.transform(data[1]);

    predictions.show();
    //predictions.select("cnt", "prediction", *featuresCols);
}