Java Code Examples for org.apache.spark.sql.Dataset#filter()
The following examples show how to use org.apache.spark.sql.Dataset#filter().
Each example is taken from an open-source project; the source file, project, and license are noted above the code.
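Before the project examples, here is a minimal, self-contained sketch of the three filter() variants that recur on this page: a SQL expression string, a Column expression, and a typed FilterFunction. The class name, the people.json input path, and the age column are placeholders chosen for illustration, not taken from any of the projects below.

import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.col;

public class FilterOverviewDemo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("FilterOverviewDemo")
                .getOrCreate();

        // placeholder input; any Dataset<Row> with a numeric "age" column works the same way
        Dataset<Row> people = spark.read().json("people.json");

        // 1. filter with a SQL expression string
        Dataset<Row> adults = people.filter("age >= 18");

        // 2. filter with a Column expression
        Dataset<Row> adults2 = people.filter(col("age").geq(18));

        // 3. filter with a typed FilterFunction (assumes "age" is non-null in the input)
        Dataset<Row> adults3 = people.filter((FilterFunction<Row>) row -> row.<Long>getAs("age") >= 18);

        adults.show();
        adults2.show();
        adults3.show();

        spark.stop();
    }
}

The explicit FilterFunction cast in the last variant avoids overload ambiguity in Java, since filter() also accepts a Scala Function1.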
Example 1
Source File: DataFilterStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
    if (parameters == null || parameters.size() == 0) {
        BpmnaiLogger.getInstance().writeWarn("No parameters found for the DataFilterStep");
        return dataset;
    }

    String query = (String) parameters.get("query");
    BpmnaiLogger.getInstance().writeInfo("Filtering data with filter query: " + query + ".");

    dataset = dataset.filter(query);
    dataset.cache();
    if (dataset.count() == 0) {
        BpmnaiLogger.getInstance().writeInfo("Filtering resulted in zero lines of data. Aborting. Please check your filter query.");
        System.exit(1);
    }

    return dataset;
}
Example 2
Source File: WaterInteractions.java From mmtf-spark with Apache License 2.0
/**
 * Remove rows where the water interaction does not include at least one organic ligand (LGO)
 * and one protein residue (PRO).
 *
 * TODO need to handle cases of maxInteractions > 4
 * @param data
 * @param maxInteractions
 * @return
 */
private static Dataset<Row> filterBridgingWaterInteractions(Dataset<Row> data, String maxInteractions) {
    if (maxInteractions.compareTo("4") > 0) {
        throw new IllegalArgumentException("maxInteractions > 4 are not supported, yet");
    }
    if (maxInteractions.equals("2")) {
        data = data.filter(col("type1").equalTo("LGO").or(col("type2").equalTo("LGO")));
        data = data.filter(col("type1").equalTo("PRO").or(col("type2").equalTo("PRO")));
    } else if (maxInteractions.equals("3")) {
        data = data.filter(col("type1").equalTo("LGO").or(col("type2").equalTo("LGO"))
                .or(col("type3").equalTo("LGO")));
        data = data.filter(col("type1").equalTo("PRO").or(col("type2").equalTo("PRO"))
                .or(col("type3").equalTo("PRO")));
    } else if (maxInteractions.equals("4")) {
        data = data.filter(col("type1").equalTo("LGO").or(col("type2").equalTo("LGO"))
                .or(col("type3").equalTo("LGO")).or(col("type4").equalTo("LGO")));
        data = data.filter(col("type1").equalTo("PRO").or(col("type2").equalTo("PRO"))
                .or(col("type3").equalTo("PRO")).or(col("type4").equalTo("PRO")));
    }
    return data;
}
Example 3
Source File: PdbDrugBankMapping.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException { SparkSession spark = SparkSession.builder().master("local[*]").appName(PdbDrugBankMapping.class.getSimpleName()) .getOrCreate(); // download open DrugBank dataset Dataset<Row> drugBank = DrugBankDataset.getOpenDrugLinks(); // find some tryrosine kinase inhibitors with generic name stem: "tinib" drugBank = drugBank.filter("Commonname LIKE '%tinib'"); // get PDB ligand annotations Dataset<Row> ligands = CustomReportService.getDataset("ligandId","ligandMolecularWeight","ligandFormula","ligandSmiles","InChIKey"); // join ligand dataset with DrugBank info by InChIKey ligands = ligands.join(drugBank, ligands.col("InChIKey").equalTo(drugBank.col("StandardInChIKey"))); // show one example per drug molecule ligands = ligands.dropDuplicates("Commonname"); ligands.select("structureChainId", "ligandId", "DrugBankID", "Commonname", "ligandMolecularWeight","ligandFormula", "InChIKey", "ligandSmiles") .sort("Commonname").show(50); spark.close(); }
Example 4
Source File: DrugBankDemo.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException { SparkSession spark = SparkSession.builder().master("local[*]").appName(DrugBankDemo.class.getSimpleName()) .getOrCreate(); // download open DrugBank dataset Dataset<Row> openDrugLinks = DrugBankDataset.getOpenDrugLinks(); // find all drugs with an InChIKey openDrugLinks = openDrugLinks.filter("StandardInChIKey IS NOT NULL"); // show some sample data openDrugLinks.select("DrugBankID", "Commonname", "CAS", "StandardInChIKey").show(); // The DrugBank password protected datasets contain more information. // You need to create a DrugBank account and supply username/password // to access these datasets. // Download DrugBank dataset for approved drugs // String username = args[0]; // String password = args[1]; // Dataset<Row> drugLinks = // DrugBankDataset.getDrugLinks(DrugGroup.APPROVED, username, password); // drugLinks.show(); spark.close(); }
Example 5
Source File: HoodieClientTestUtils.java From hudi with Apache License 2.0
/**
 * Obtain all new data written into the Hoodie table since the given timestamp.
 */
public static Dataset<Row> readSince(String basePath, SQLContext sqlContext,
                                     HoodieTimeline commitTimeline, String lastCommitTime) {
    List<HoodieInstant> commitsToReturn =
        commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE).getInstants().collect(Collectors.toList());
    try {
        // Go over the commit metadata, and obtain the new files that need to be read.
        HashMap<String, String> fileIdToFullPath =
            getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
        String[] paths = fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]);
        Dataset<Row> rows = null;
        if (paths[0].endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
            rows = sqlContext.read().parquet(paths);
        }
        return rows.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
    } catch (IOException e) {
        throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
    }
}
Example 6
Source File: ParDoTranslatorBatch.java From beam with Apache License 2.0
private void pruneOutputFilteredByTag(
        TranslationContext context,
        Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> allOutputs,
        Map.Entry<TupleTag<?>, PValue> output,
        Coder<? extends BoundedWindow> windowCoder) {
    Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> filteredDataset =
        allOutputs.filter(new DoFnFilterFunction(output.getKey()));
    Coder<WindowedValue<?>> windowedValueCoder =
        (Coder<WindowedValue<?>>)
            (Coder<?>)
                WindowedValue.getFullCoder(
                    ((PCollection<OutputT>) output.getValue()).getCoder(), windowCoder);
    Dataset<WindowedValue<?>> outputDataset =
        filteredDataset.map(
            (MapFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, WindowedValue<?>>) value -> value._2,
            EncoderHelpers.fromBeamCoder(windowedValueCoder));
    context.putDatasetWildcard(output.getValue(), outputDataset);
}
Example 7
Source File: CustomReportDemo.java From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException if custom report web service fails
 */
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(CustomReportDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // retrieve PDB annotation: Binding affinities (Ki, Kd),
    // group name of the ligand (hetId), and the
    // Enzyme Classification number (ecNo)
    Dataset<Row> ds = CustomReportService.getDataset("Ki", "Kd", "hetId", "ecNo");

    // show the schema of this dataset
    ds.printSchema();

    // select structures that either have a Ki or Kd value(s) and
    // are protein-serine/threonine kinases (EC 2.7.11.*):

    // A. by using dataset operations
    ds = ds.filter("(Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'");
    ds.show(10);

    // B. by creating a temporary view and running SQL
    ds.createOrReplaceTempView("table");
    ds = ds.sparkSession().sql("SELECT * from table WHERE (Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'");
    ds.show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + "sec.");

    sc.close();
}
Example 8
Source File: PdbMetadataDemo.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException { SparkSession spark = SparkSession.builder().master("local[*]").appName(PdbMetadataDemo.class.getSimpleName()) .getOrCreate(); // query the following fields from the _citation category using PDBj's Mine2 web service: // journal_abbrev, pdbx_database_id_PubMed, year. // Note, mixed case column names must be quoted and escaped with \". String sqlQuery = "SELECT pdbid, journal_abbrev, \"pdbx_database_id_PubMed\", year from citation WHERE id = 'primary'"; Dataset<Row>ds = PdbjMineDataset.getDataset(sqlQuery); System.out.println("First 10 results from query: " + sqlQuery); ds.show(10, false); // filter out unpublished entries (they contain the word "published" in various upper/lower case combinations) ds = ds.filter("UPPER(journal_abbrev) NOT LIKE '%PUBLISHED%'"); // print the top 10 journals System.out.println("Top 10 journals that publish PDB structures:"); ds.groupBy("journal_abbrev").count().sort(col("count").desc()).show(10, false); // filter out entries without a PubMed Id (is -1 if PubMed Id is not available) ds = ds.filter("pdbx_database_id_PubMed > 0"); System.out.println("Entries with PubMed Ids: " + ds.count()); // show growth of papers in PubMed System.out.println("PubMed Ids per year: "); ds.groupBy("year").count().sort(col("year").desc()).show(10, false); spark.close(); }
Example 9
Source File: SparkMLScoringOnline.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException, StreamingQueryException {
    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    // * the schema can be written on disk, and read from disk
    // * the schema is not mandatory to be complete, it can contain only the needed fields
    StructType RSVP_SCHEMA = new StructType()
            .add("event", new StructType()
                    .add("event_id", StringType, true)
                    .add("event_name", StringType, true)
                    .add("event_url", StringType, true)
                    .add("time", LongType, true))
            .add("group", new StructType()
                    .add("group_city", StringType, true)
                    .add("group_country", StringType, true)
                    .add("group_id", LongType, true)
                    .add("group_lat", DoubleType, true)
                    .add("group_lon", DoubleType, true)
                    .add("group_name", StringType, true)
                    .add("group_state", StringType, true)
                    .add("group_topics", DataTypes.createArrayType(
                            new StructType()
                                    .add("topicName", StringType, true)
                                    .add("urlkey", StringType, true)), true)
                    .add("group_urlname", StringType, true))
            .add("guests", LongType, true)
            .add("member", new StructType()
                    .add("member_id", LongType, true)
                    .add("member_name", StringType, true)
                    .add("photo", StringType, true))
            .add("mtime", LongType, true)
            .add("response", StringType, true)
            .add("rsvp_id", LongType, true)
            .add("venue", new StructType()
                    .add("lat", DoubleType, true)
                    .add("lon", DoubleType, true)
                    .add("venue_id", LongType, true)
                    .add("venue_name", StringType, true))
            .add("visibility", StringType, true);

    final SparkConf conf = new SparkConf()
            .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
            .setAppName(APPLICATION_NAME)
            .set("spark.sql.caseSensitive", CASE_SENSITIVE);

    SparkSession spark = SparkSession
            .builder()
            .config(conf)
            .getOrCreate();

    PipelineModel pipelineModel = PipelineModel.load(MODEL_FOLDER_PATH);

    Dataset<Row> meetupStream = spark.readStream()
            .format(KAFKA_FORMAT)
            .option("kafka.bootstrap.servers", KAFKA_BROKERS)
            .option("subscribe", KAFKA_TOPIC)
            .load();

    Dataset<Row> gatheredDF = meetupStream.select(
            (from_json(col("value").cast("string"), RSVP_SCHEMA))
                    .alias("rsvp"))
            .alias("meetup")
            .select("meetup.*");

    Dataset<Row> filteredDF = gatheredDF.filter(e -> !e.anyNull());

    Dataset<Row> preparedDF = filteredDF.select(
            col("rsvp.group.group_city"),
            col("rsvp.group.group_lat"),
            col("rsvp.group.group_lon"),
            col("rsvp.response")
    );

    preparedDF.printSchema();

    Dataset<Row> predictionDF = pipelineModel.transform(preparedDF);

    StreamingQuery query = predictionDF.writeStream()
            .format(JSON_FORMAT)
            .option("path", RESULT_FOLDER_PATH)
            .option("checkpointLocation", CHECKPOINT_LOCATION)
            .trigger(Trigger.ProcessingTime(QUERY_INTERVAL_SECONDS))
            .option("truncate", false)
            .start();

    query.awaitTermination();
}
Example 10
Source File: DetermineProcessVariablesStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
private Dataset<Row> doFilterVariables(Dataset<Row> dataset, boolean writeStepResultIntoFile, SparkRunnerConfig config) {
    List<String> variablesToFilter = new ArrayList<>();

    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
    if (configuration != null) {
        PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
        if (preprocessingConfiguration != null) {
            for (VariableConfiguration vc : preprocessingConfiguration.getVariableConfiguration()) {
                if (!vc.isUseVariable()) {
                    variablesToFilter.add(vc.getVariableName());
                    BpmnaiLogger.getInstance().writeInfo("The variable '" + vc.getVariableName()
                            + "' will be filtered out. Comment: " + vc.getComment());
                }
            }
        }
    }

    // check if all variables that should be filtered actually exist, otherwise log a warning
    List<Row> existingVariablesRows = dataset.select(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME).distinct().collectAsList();
    List<String> existingVariables = existingVariablesRows
            .stream()
            .map(r -> r.getString(0)).collect(Collectors.toList());

    variablesToFilter
            .stream()
            .forEach(new Consumer<String>() {
                @Override
                public void accept(String s) {
                    if (!existingVariables.contains(s)) {
                        // log the fact that a variable that should be filtered does not exist
                        BpmnaiLogger.getInstance().writeWarn("The variable '" + s
                                + "' is configured to be filtered, but does not exist in the data.");
                    }
                }
            });

    dataset = dataset.filter((FilterFunction<Row>) row -> {
        // keep the row if the variable name column does not contain a value that should be filtered
        String variable = row.getAs(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);
        // TODO: cleanup
        boolean keep = !variablesToFilter.contains(variable);
        if (variable != null && variable.startsWith("_CORRELATION_ID_")) {
            keep = false;
        }
        return keep;
    });

    if (writeStepResultIntoFile) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "variable_filter", config);
    }

    return dataset;
}
Example 11
Source File: DataFilterOnActivityStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
/**
 * @param dataSet the incoming dataset for this processing step
 * @param parameters
 * @return the filtered DataSet
 */
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataSet, Map<String, Object> parameters, SparkRunnerConfig config) {
    // any parameters set?
    if (parameters == null || parameters.size() == 0) {
        BpmnaiLogger.getInstance().writeWarn("No parameters found for the DataFilterOnActivityStep");
        return dataSet;
    }

    // get query parameter
    String query = (String) parameters.get("query");
    BpmnaiLogger.getInstance().writeInfo("Filtering data with activity instance filter query: " + query + ".");

    // save size of initial dataset for log
    dataSet.cache();
    Long initialDSCount = dataSet.count();

    // repartition by process instance and order by start_time for this operation
    dataSet = dataSet.repartition(dataSet.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID)).sortWithinPartitions(BpmnaiVariables.VAR_START_TIME);

    // we temporarily store variable updates (rows with a var type set) separately.
    Dataset<Row> variables = dataSet.filter(col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).isNotNull());

    // find first occurrence of activity instance
    final Dataset<Row> dsTmp = dataSet.filter(dataSet.col(BpmnaiVariables.VAR_ACT_ID).equalTo(query)).filter(dataSet.col(BpmnaiVariables.VAR_END_TIME).isNull()); // TODO: ENSURING THAT THIS ISN'T A VARIABLE ROW

    // now we look for the first occurrence of the activity id contained in "query". The result is a dataset of corresponding activity instances.
    final Dataset<Row> dsActivityInstances = dataSet.filter(dataSet.col(BpmnaiVariables.VAR_ACT_ID).like(query)).filter(dataSet.col(BpmnaiVariables.VAR_END_TIME).isNull()); // TODO: ENSURING THAT THIS ISN'T A VARIABLE ROW

    // we slim the resulting dataset down: only the activity instance's process id and start time are relevant.
    List<Row> activityRows = dsActivityInstances.select(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID, BpmnaiVariables.VAR_START_TIME).collectAsList();
    Map<String, String> activities = activityRows.stream().collect(Collectors.toMap(
            r -> r.getAs(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID),
            r -> r.getAs(BpmnaiVariables.VAR_START_TIME)));

    // broadcasting the PID - start time map to use it in a user-defined function
    SparkBroadcastHelper.getInstance().broadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_INSTANCE_TIMESTAMP_MAP, activities);

    // now we have to select, for each process instance in our initial dataset, all events that happened before the first occurrence of our selected activity.

    // We first narrow it down to the process instances in question
    Dataset<Row> selectedProcesses = dataSet.filter(col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID).isin(activities.keySet().toArray()));
    // Then, we mark all events that should be removed
    Dataset<Row> activityDataSet = selectedProcesses.withColumn("data_filter_on_activity",
            callUDF("activityBeforeTimestamp",
                    selectedProcesses.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID),
                    selectedProcesses.col(BpmnaiVariables.VAR_START_TIME)));
    // And we keep the rest
    activityDataSet = activityDataSet.filter(col("data_filter_on_activity").like("TRUE"));
    // Clean up
    activityDataSet = activityDataSet.drop("data_filter_on_activity");

    // However, we lost all variable updates in this approach, so now we add the variables in question to the dataset
    // first, we narrow it down to keep only variables that have a corresponding activity instance
    activityDataSet = activityDataSet.withColumnRenamed(BpmnaiVariables.VAR_ACT_INST_ID, BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT");

    variables = variables.join(activityDataSet.select(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT").distinct(),
            variables.col(BpmnaiVariables.VAR_ACT_INST_ID).equalTo(activityDataSet.col(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT")), "inner");

    activityDataSet = activityDataSet.withColumnRenamed(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT", BpmnaiVariables.VAR_ACT_INST_ID);
    variables = variables.drop(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT");

    dataSet = activityDataSet.union(variables);

    dataSet.cache();
    BpmnaiLogger.getInstance().writeInfo("DataFilterOnActivityStep: The filtered DataSet contains " + dataSet.count() + " rows, (before: " + initialDSCount + " rows)");

    if (config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataSet, "data_filter_on_activity_step", config);
    }

    return dataSet;
}
Example 12
Source File: AtpInteractionAnalysis.java From mmtf-spark with Apache License 2.0
/**
 * @param args input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(AtpInteractionAnalysis.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // filter by sequence identity subset
    int sequenceIdentity = 20;
    double resolution = 2.0;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));

    // find ATP interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ATP", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // TODO add a line to only analyze interactions
    // with the oxygens in the terminal phosphate group of ATP
    // (O1G, O2G, O3G)
    // Tip: Google SQL LIKE
    interactions = interactions.filter("atom1 LIKE('O%G')");

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
            .groupBy("residue2")
            .count();

    topGroups
            .sort(col("count").desc()) // sort descending by count
            .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
            .groupBy("residue2", "atom2")
            .count();

    topGroupsAndAtoms
            .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + "sec.");

    sc.close();
}
Example 13
Source File: Basic.java From learning-spark-with-java with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
            .builder()
            .appName("Dataset-Basic")
            .master("local[4]")
            .getOrCreate();

    List<Integer> data = Arrays.asList(10, 11, 12, 13, 14, 15);
    Dataset<Integer> ds = spark.createDataset(data, Encoders.INT());

    System.out.println("*** only one column, and it always has the same name");
    ds.printSchema();

    ds.show();

    System.out.println("*** values > 12");

    // the harder way to filter
    Dataset<Integer> ds2 = ds.filter((Integer value) -> value > 12);
    ds2.show();

    List<Tuple3<Integer, String, String>> tuples =
            Arrays.asList(
                    new Tuple3<>(1, "one", "un"),
                    new Tuple3<>(2, "two", "deux"),
                    new Tuple3<>(3, "three", "trois"));
    Encoder<Tuple3<Integer, String, String>> encoder =
            Encoders.tuple(Encoders.INT(), Encoders.STRING(), Encoders.STRING());
    Dataset<Tuple3<Integer, String, String>> tupleDS = spark.createDataset(tuples, encoder);

    System.out.println("*** Tuple Dataset types");
    tupleDS.printSchema();

    // the tuple columns have unfriendly names, but you can use them to query
    System.out.println("*** filter by one column and fetch another");
    tupleDS.where(col("_1").gt(2)).select(col("_2"), col("_3")).show();

    spark.stop();
}
Example 14
Source File: InListDeriver.java From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
    Dataset<Row> target = getStepDataFrame(dependencies);
    if (target.columns().length < 1) {
        throw new RuntimeException("Targeted step, '" + stepName + ",' has no columns");
    }

    try {
        String targetField = fieldName == null ? target.columns()[0] : fieldName;
        Column targetColumn = target.col(targetField);
        LOGGER.debug("Targeting '{}[{}]'", stepName, targetField);

        // If the IN list is inline, there is no batch
        if (inList != null) {
            LOGGER.debug("IN list is inline");
            return target.filter(targetColumn.isin(inList.toArray()));
        }

        // Otherwise, collect the values from the reference, executed within the batch
        else {
            LOGGER.trace("IN list is a reference");
            Dataset<Row> reference = dependencies.get(refStepName);
            String referenceField = refFieldName == null ? reference.columns()[0] : refFieldName;
            LOGGER.debug("Referencing using {}[{}]", refStepName, referenceField);

            Column referenceColumn = reference.col(referenceField);
            Iterator<Row> referenceIterator = reference.select(referenceColumn).distinct().toLocalIterator();

            this.inList = new ArrayList<>();
            long counter = 0;

            // Set up the batch collector
            JavaRDD<Row> unionRDD = new JavaSparkContext(Contexts.getSparkSession().sparkContext()).emptyRDD();
            Dataset<Row> union = Contexts.getSparkSession().createDataFrame(unionRDD, target.schema());

            while (referenceIterator.hasNext()) {
                // Flush the batch
                if (counter == batchSize) {
                    LOGGER.trace("Flushing batch");
                    union = union.union(target.filter(targetColumn.isin(inList.toArray())));

                    inList.clear();
                    counter = 0L;
                }

                // Gather the elements of the IN list from the reference
                inList.add(referenceIterator.next().get(0));
                counter++;
            }

            // If the selection is under the batch threshold
            if (union.rdd().isEmpty()) {
                return target.filter(targetColumn.isin(inList.toArray()));
            }

            // Flush any remaining IN list values
            else {
                return union.union(target.filter(targetColumn.isin(inList.toArray())));
            }
        }
    } catch (Throwable ae) {
        throw new RuntimeException("Error executing IN list filtering", ae);
    }
}