org.apache.spark.api.java.function.FilterFunction Java Examples
The following examples show how to use
org.apache.spark.api.java.function.FilterFunction.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: Hierarchies.java From bunsen with Apache License 2.0 | 6 votes |
/**
 * Reads the ancestors table of the given database and builds a Hierarchies
 * instance from it.
 *
 * <p>The hierarchy members are derived from the ancestors themselves: every
 * ancestor whose URI starts with {@code HIERARCHY_URI_PREFIX} contributes its
 * distinct (uri, version) pair, with the uri column exposed as "url".
 *
 * @param spark the spark session
 * @param database name of the database containing the ancestors table
 * @return a Hierarchies instance.
 */
public static Hierarchies getFromDatabase(SparkSession spark, String database) {
  String ancestorsQuery = "SELECT * FROM " + database + "." + ANCESTORS_TABLE;
  Dataset<Ancestor> ancestors = spark.sql(ancestorsQuery).as(ANCESTOR_ENCODER);

  // Declaring the predicate with its FilterFunction type selects the typed
  // Dataset.filter overload without needing a cast at the call site.
  FilterFunction<Ancestor> isHierarchyAncestor =
      ancestor -> ancestor.getUri().startsWith(HIERARCHY_URI_PREFIX);

  Dataset<UrlAndVersion> members = ancestors
      .filter(isHierarchyAncestor)
      .select(col("uri").alias("url"), col("version"))
      .distinct()
      .as(URI_AND_VERSION_ENCODER);

  return new Hierarchies(spark, members, ancestors);
}
Example #2
Source File: DetermineProcessVariablesStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License | 4 votes |
private Dataset<Row> doFilterVariables(Dataset<Row> dataset, boolean writeStepResultIntoFile, SparkRunnerConfig config) { List<String> variablesToFilter = new ArrayList<>(); Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config); if(configuration != null) { PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration(); if(preprocessingConfiguration != null) { for(VariableConfiguration vc : preprocessingConfiguration.getVariableConfiguration()) { if(!vc.isUseVariable()) { variablesToFilter.add(vc.getVariableName()); BpmnaiLogger.getInstance().writeInfo("The variable '" + vc.getVariableName() + "' will be filtered out. Comment: " + vc.getComment()); } } } } //check if all variables that should be filtered actually exist, otherwise log a warning List<Row> existingVariablesRows = dataset.select(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME).distinct().collectAsList(); List<String> existingVariables = existingVariablesRows .stream() .map(r -> r.getString(0)).collect(Collectors.toList()); variablesToFilter .stream() .forEach(new Consumer<String>() { @Override public void accept(String s) { if(!existingVariables.contains(s)) { // log the fact that a variable that should be filtered does not exist BpmnaiLogger.getInstance().writeWarn("The variable '" + s + "' is configured to be filtered, but does not exist in the data."); } } }); dataset = dataset.filter((FilterFunction<Row>) row -> { // keep the row if the variable name column does not contain a value that should be filtered String variable = row.getAs(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME); //TODO: cleanup boolean keep = !variablesToFilter.contains(variable); if(variable != null && variable.startsWith("_CORRELATION_ID_")) { keep = false; } return keep; }); if(writeStepResultIntoFile) { BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "variable_filter", config); } return dataset; }
Example #3
Source File: AbstractConceptMaps.java From bunsen with Apache License 2.0 | 3 votes |
/**
 * Selects the mappings that belong to the requested concept map versions.
 *
 * <p>A mapping is kept when its concept map URI is a key of the given map and
 * the associated value equals the mapping's concept map version. The map is
 * broadcast so the filter reads it through the broadcast variable.
 *
 * @param uriToVersion a map of concept map URI to the version to load
 * @return a dataset of mappings for the given URIs and versions.
 */
public Dataset<Mapping> getMappings(Map<String,String> uriToVersion) {
  Broadcast<Map<String,String>> requestedVersions =
      new JavaSparkContext(this.spark.sparkContext()).broadcast(uriToVersion);

  FilterFunction<Mapping> hasRequestedVersion = mapping -> {
    String requested = requestedVersions.getValue().get(mapping.getConceptMapUri());
    if (requested == null) {
      // the URI was not requested (or maps to null), so the mapping is dropped
      return false;
    }
    return requested.equals(mapping.getConceptMapVersion());
  };

  return this.mappings.filter(hasRequestedVersion);
}
Example #4
Source File: AbstractValueSets.java From bunsen with Apache License 2.0 | 3 votes |
/**
 * Selects the values that belong to the requested value set versions.
 *
 * <p>A value is kept when its value set URI is a key of the given map and the
 * associated value equals the value's value set version. The map is broadcast
 * so the filter reads it through the broadcast variable.
 *
 * @param uriToVersion a map of value set URI to the version to load
 * @return a dataset of values for the given URIs and versions.
 */
public Dataset<Value> getValues(Map<String,String> uriToVersion) {
  Broadcast<Map<String,String>> requestedVersions =
      new JavaSparkContext(this.spark.sparkContext()).broadcast(uriToVersion);

  FilterFunction<Value> hasRequestedVersion = value -> {
    String requested = requestedVersions.getValue().get(value.getValueSetUri());
    if (requested == null) {
      // the URI was not requested (or maps to null), so the value is dropped
      return false;
    }
    return requested.equals(value.getValueSetVersion());
  };

  return this.values.filter(hasRequestedVersion);
}