Java Code Examples for org.apache.spark.sql.Dataset#filter()
The following examples show how to use org.apache.spark.sql.Dataset#filter().
Each example is taken from an open-source project; the source file, project, and license are noted above the code.
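Before the project examples, here is a minimal, self-contained sketch of the three filter() variants that recur on this page: a SQL expression string, a Column expression, and a typed FilterFunction. The class name, the people.json input path, and the age column are placeholders chosen for illustration, not taken from any of the projects below.

import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.col;

public class FilterOverviewDemo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("FilterOverviewDemo")
                .getOrCreate();

        // placeholder input; any Dataset<Row> with a numeric "age" column works the same way
        Dataset<Row> people = spark.read().json("people.json");

        // 1. filter with a SQL expression string
        Dataset<Row> adults = people.filter("age >= 18");

        // 2. filter with a Column expression
        Dataset<Row> adults2 = people.filter(col("age").geq(18));

        // 3. filter with a typed FilterFunction (assumes "age" is non-null in the input)
        Dataset<Row> adults3 = people.filter((FilterFunction<Row>) row -> row.<Long>getAs("age") >= 18);

        adults.show();
        adults2.show();
        adults3.show();

        spark.stop();
    }
}

The explicit FilterFunction cast in the last variant avoids overload ambiguity in Java, since filter() also accepts a Scala Function1.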
Example 1
Source File: DataFilterStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
    if (parameters == null || parameters.size() == 0) {
        BpmnaiLogger.getInstance().writeWarn("No parameters found for the DataFilterStep");
        return dataset;
    }

    String query = (String) parameters.get("query");
    BpmnaiLogger.getInstance().writeInfo("Filtering data with filter query: " + query + ".");

    dataset = dataset.filter(query);
    dataset.cache();
    if (dataset.count() == 0) {
        BpmnaiLogger.getInstance().writeInfo("Filtering resulted in zero lines of data. Aborting. Please check your filter query.");
        System.exit(1);
    }

    return dataset;
}
Example 2
Source File: WaterInteractions.java From mmtf-spark with Apache License 2.0
/**
 * Remove rows where the water interaction does not include at least one organic ligand (LGO)
 * and one protein residue (PRO).
 *
 * TODO need to handle cases of maxInteractions > 4
 * @param data
 * @param maxInteractions
 * @return
 */
private static Dataset<Row> filterBridgingWaterInteractions(Dataset<Row> data, String maxInteractions) {
    if (maxInteractions.compareTo("4") > 0) {
        throw new IllegalArgumentException("maxInteractions > 4 are not supported, yet");
    }
    if (maxInteractions.equals("2")) {
        data = data.filter(col("type1").equalTo("LGO").or(col("type2").equalTo("LGO")));
        data = data.filter(col("type1").equalTo("PRO").or(col("type2").equalTo("PRO")));
    } else if (maxInteractions.equals("3")) {
        data = data.filter(col("type1").equalTo("LGO").or(col("type2").equalTo("LGO"))
                .or(col("type3").equalTo("LGO")));
        data = data.filter(col("type1").equalTo("PRO").or(col("type2").equalTo("PRO"))
                .or(col("type3").equalTo("PRO")));
    } else if (maxInteractions.equals("4")) {
        data = data.filter(col("type1").equalTo("LGO").or(col("type2").equalTo("LGO"))
                .or(col("type3").equalTo("LGO")).or(col("type4").equalTo("LGO")));
        data = data.filter(col("type1").equalTo("PRO").or(col("type2").equalTo("PRO"))
                .or(col("type3").equalTo("PRO")).or(col("type4").equalTo("PRO")));
    }
    return data;
}
Example 3
Source File: PdbDrugBankMapping.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException { SparkSession spark = SparkSession.builder().master("local[*]").appName(PdbDrugBankMapping.class.getSimpleName()) .getOrCreate(); // download open DrugBank dataset Dataset<Row> drugBank = DrugBankDataset.getOpenDrugLinks(); // find some tryrosine kinase inhibitors with generic name stem: "tinib" drugBank = drugBank.filter("Commonname LIKE '%tinib'"); // get PDB ligand annotations Dataset<Row> ligands = CustomReportService.getDataset("ligandId","ligandMolecularWeight","ligandFormula","ligandSmiles","InChIKey"); // join ligand dataset with DrugBank info by InChIKey ligands = ligands.join(drugBank, ligands.col("InChIKey").equalTo(drugBank.col("StandardInChIKey"))); // show one example per drug molecule ligands = ligands.dropDuplicates("Commonname"); ligands.select("structureChainId", "ligandId", "DrugBankID", "Commonname", "ligandMolecularWeight","ligandFormula", "InChIKey", "ligandSmiles") .sort("Commonname").show(50); spark.close(); }
Example 4
Source File: DrugBankDemo.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException { SparkSession spark = SparkSession.builder().master("local[*]").appName(DrugBankDemo.class.getSimpleName()) .getOrCreate(); // download open DrugBank dataset Dataset<Row> openDrugLinks = DrugBankDataset.getOpenDrugLinks(); // find all drugs with an InChIKey openDrugLinks = openDrugLinks.filter("StandardInChIKey IS NOT NULL"); // show some sample data openDrugLinks.select("DrugBankID", "Commonname", "CAS", "StandardInChIKey").show(); // The DrugBank password protected datasets contain more information. // You need to create a DrugBank account and supply username/password // to access these datasets. // Download DrugBank dataset for approved drugs // String username = args[0]; // String password = args[1]; // Dataset<Row> drugLinks = // DrugBankDataset.getDrugLinks(DrugGroup.APPROVED, username, password); // drugLinks.show(); spark.close(); }
Example 5
Source File: HoodieClientTestUtils.java From hudi with Apache License 2.0
/**
 * Obtain all new data written into the Hoodie table since the given timestamp.
 */
public static Dataset<Row> readSince(String basePath, SQLContext sqlContext,
                                     HoodieTimeline commitTimeline, String lastCommitTime) {
    List<HoodieInstant> commitsToReturn =
        commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE).getInstants().collect(Collectors.toList());
    try {
        // Go over the commit metadata, and obtain the new files that need to be read.
        HashMap<String, String> fileIdToFullPath =
            getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
        String[] paths = fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]);
        Dataset<Row> rows = null;
        if (paths[0].endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
            rows = sqlContext.read().parquet(paths);
        }
        return rows.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
    } catch (IOException e) {
        throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
    }
}
Example 6
Source File: ParDoTranslatorBatch.java From beam with Apache License 2.0
private void pruneOutputFilteredByTag(
        TranslationContext context,
        Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> allOutputs,
        Map.Entry<TupleTag<?>, PValue> output,
        Coder<? extends BoundedWindow> windowCoder) {
    Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> filteredDataset =
        allOutputs.filter(new DoFnFilterFunction(output.getKey()));
    Coder<WindowedValue<?>> windowedValueCoder =
        (Coder<WindowedValue<?>>)
            (Coder<?>)
                WindowedValue.getFullCoder(
                    ((PCollection<OutputT>) output.getValue()).getCoder(), windowCoder);
    Dataset<WindowedValue<?>> outputDataset =
        filteredDataset.map(
            (MapFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, WindowedValue<?>>) value -> value._2,
            EncoderHelpers.fromBeamCoder(windowedValueCoder));
    context.putDatasetWildcard(output.getValue(), outputDataset);
}
Example 7
Source File: CustomReportDemo.java From mmtf-spark with Apache License 2.0
/**
 * @param args no input arguments
 * @throws IOException if custom report web service fails
 */
public static void main(String[] args) throws IOException {
    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[1]").setAppName(CustomReportDemo.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // retrieve PDB annotation: Binding affinities (Ki, Kd),
    // group name of the ligand (hetId), and the
    // Enzyme Classification number (ecNo)
    Dataset<Row> ds = CustomReportService.getDataset("Ki", "Kd", "hetId", "ecNo");

    // show the schema of this dataset
    ds.printSchema();

    // select structures that either have a Ki or Kd value(s) and
    // are protein-serine/threonine kinases (EC 2.7.11.*):

    // A. by using dataset operations
    ds = ds.filter("(Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'");
    ds.show(10);

    // B. by creating a temporary view and running SQL
    ds.createOrReplaceTempView("table");
    ds = ds.sparkSession().sql("SELECT * from table WHERE (Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'");
    ds.show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + "sec.");

    sc.close();
}
Example 8
Source File: PdbMetadataDemo.java From mmtf-spark with Apache License 2.0
public static void main(String[] args) throws IOException { SparkSession spark = SparkSession.builder().master("local[*]").appName(PdbMetadataDemo.class.getSimpleName()) .getOrCreate(); // query the following fields from the _citation category using PDBj's Mine2 web service: // journal_abbrev, pdbx_database_id_PubMed, year. // Note, mixed case column names must be quoted and escaped with \". String sqlQuery = "SELECT pdbid, journal_abbrev, \"pdbx_database_id_PubMed\", year from citation WHERE id = 'primary'"; Dataset<Row>ds = PdbjMineDataset.getDataset(sqlQuery); System.out.println("First 10 results from query: " + sqlQuery); ds.show(10, false); // filter out unpublished entries (they contain the word "published" in various upper/lower case combinations) ds = ds.filter("UPPER(journal_abbrev) NOT LIKE '%PUBLISHED%'"); // print the top 10 journals System.out.println("Top 10 journals that publish PDB structures:"); ds.groupBy("journal_abbrev").count().sort(col("count").desc()).show(10, false); // filter out entries without a PubMed Id (is -1 if PubMed Id is not available) ds = ds.filter("pdbx_database_id_PubMed > 0"); System.out.println("Entries with PubMed Ids: " + ds.count()); // show growth of papers in PubMed System.out.println("PubMed Ids per year: "); ds.groupBy("year").count().sort(col("year").desc()).show(10, false); spark.close(); }
Example 9
Source File: SparkMLScoringOnline.java From -Data-Stream-Development-with-Apache-Spark-Kafka-and-Spring-Boot with MIT License
public static void main(String[] args) throws InterruptedException, StreamingQueryException {
    System.setProperty("hadoop.home.dir", HADOOP_HOME_DIR_VALUE);

    // * the schema can be written on disk, and read from disk
    // * the schema is not mandatory to be complete, it can contain only the needed fields
    StructType RSVP_SCHEMA = new StructType()
            .add("event", new StructType()
                    .add("event_id", StringType, true)
                    .add("event_name", StringType, true)
                    .add("event_url", StringType, true)
                    .add("time", LongType, true))
            .add("group", new StructType()
                    .add("group_city", StringType, true)
                    .add("group_country", StringType, true)
                    .add("group_id", LongType, true)
                    .add("group_lat", DoubleType, true)
                    .add("group_lon", DoubleType, true)
                    .add("group_name", StringType, true)
                    .add("group_state", StringType, true)
                    .add("group_topics", DataTypes.createArrayType(
                            new StructType()
                                    .add("topicName", StringType, true)
                                    .add("urlkey", StringType, true)), true)
                    .add("group_urlname", StringType, true))
            .add("guests", LongType, true)
            .add("member", new StructType()
                    .add("member_id", LongType, true)
                    .add("member_name", StringType, true)
                    .add("photo", StringType, true))
            .add("mtime", LongType, true)
            .add("response", StringType, true)
            .add("rsvp_id", LongType, true)
            .add("venue", new StructType()
                    .add("lat", DoubleType, true)
                    .add("lon", DoubleType, true)
                    .add("venue_id", LongType, true)
                    .add("venue_name", StringType, true))
            .add("visibility", StringType, true);

    final SparkConf conf = new SparkConf()
            .setMaster(RUN_LOCAL_WITH_AVAILABLE_CORES)
            .setAppName(APPLICATION_NAME)
            .set("spark.sql.caseSensitive", CASE_SENSITIVE);

    SparkSession spark = SparkSession
            .builder()
            .config(conf)
            .getOrCreate();

    PipelineModel pipelineModel = PipelineModel.load(MODEL_FOLDER_PATH);

    Dataset<Row> meetupStream = spark.readStream()
            .format(KAFKA_FORMAT)
            .option("kafka.bootstrap.servers", KAFKA_BROKERS)
            .option("subscribe", KAFKA_TOPIC)
            .load();

    Dataset<Row> gatheredDF = meetupStream.select(
            (from_json(col("value").cast("string"), RSVP_SCHEMA))
                    .alias("rsvp"))
            .alias("meetup")
            .select("meetup.*");

    Dataset<Row> filteredDF = gatheredDF.filter(e -> !e.anyNull());

    Dataset<Row> preparedDF = filteredDF.select(
            col("rsvp.group.group_city"),
            col("rsvp.group.group_lat"),
            col("rsvp.group.group_lon"),
            col("rsvp.response")
    );

    preparedDF.printSchema();

    Dataset<Row> predictionDF = pipelineModel.transform(preparedDF);

    StreamingQuery query = predictionDF.writeStream()
            .format(JSON_FORMAT)
            .option("path", RESULT_FOLDER_PATH)
            .option("checkpointLocation", CHECKPOINT_LOCATION)
            .trigger(Trigger.ProcessingTime(QUERY_INTERVAL_SECONDS))
            .option("truncate", false)
            .start();

    query.awaitTermination();
}
Example 10
Source File: DetermineProcessVariablesStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
private Dataset<Row> doFilterVariables(Dataset<Row> dataset, boolean writeStepResultIntoFile, SparkRunnerConfig config) {
    List<String> variablesToFilter = new ArrayList<>();

    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
    if (configuration != null) {
        PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
        if (preprocessingConfiguration != null) {
            for (VariableConfiguration vc : preprocessingConfiguration.getVariableConfiguration()) {
                if (!vc.isUseVariable()) {
                    variablesToFilter.add(vc.getVariableName());
                    BpmnaiLogger.getInstance().writeInfo("The variable '" + vc.getVariableName()
                            + "' will be filtered out. Comment: " + vc.getComment());
                }
            }
        }
    }

    // check if all variables that should be filtered actually exist, otherwise log a warning
    List<Row> existingVariablesRows = dataset.select(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME).distinct().collectAsList();
    List<String> existingVariables = existingVariablesRows
            .stream()
            .map(r -> r.getString(0)).collect(Collectors.toList());

    variablesToFilter
            .stream()
            .forEach(new Consumer<String>() {
                @Override
                public void accept(String s) {
                    if (!existingVariables.contains(s)) {
                        // log the fact that a variable that should be filtered does not exist
                        BpmnaiLogger.getInstance().writeWarn("The variable '" + s
                                + "' is configured to be filtered, but does not exist in the data.");
                    }
                }
            });

    dataset = dataset.filter((FilterFunction<Row>) row -> {
        // keep the row if the variable name column does not contain a value that should be filtered
        String variable = row.getAs(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);
        // TODO: cleanup
        boolean keep = !variablesToFilter.contains(variable);
        if (variable != null && variable.startsWith("_CORRELATION_ID_")) {
            keep = false;
        }
        return keep;
    });

    if (writeStepResultIntoFile) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "variable_filter", config);
    }

    return dataset;
}
Example 11
Source File: DataFilterOnActivityStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
/**
 * @param dataSet the incoming dataset for this processing step
 * @param parameters
 * @return the filtered DataSet
 */
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataSet, Map<String, Object> parameters, SparkRunnerConfig config) {
    // any parameters set?
    if (parameters == null || parameters.size() == 0) {
        BpmnaiLogger.getInstance().writeWarn("No parameters found for the DataFilterOnActivityStep");
        return dataSet;
    }

    // get query parameter
    String query = (String) parameters.get("query");
    BpmnaiLogger.getInstance().writeInfo("Filtering data with activity instance filter query: " + query + ".");

    // save size of initial dataset for log
    dataSet.cache();
    Long initialDSCount = dataSet.count();

    // repartition by process instance and order by start_time for this operation
    dataSet = dataSet.repartition(dataSet.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID)).sortWithinPartitions(BpmnaiVariables.VAR_START_TIME);

    // we temporarily store variable updates (rows with a var type set) separately.
    Dataset<Row> variables = dataSet.filter(col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).isNotNull());

    // find first occurrence of activity instance
    final Dataset<Row> dsTmp = dataSet.filter(dataSet.col(BpmnaiVariables.VAR_ACT_ID).equalTo(query)).filter(dataSet.col(BpmnaiVariables.VAR_END_TIME).isNull()); // TODO: ENSURING THAT THIS ISN'T A VARIABLE ROW

    // now we look for the first occurrence of the activity id contained in "query". The result is a dataset of corresponding activity instances.
    final Dataset<Row> dsActivityInstances = dataSet.filter(dataSet.col(BpmnaiVariables.VAR_ACT_ID).like(query)).filter(dataSet.col(BpmnaiVariables.VAR_END_TIME).isNull()); // TODO: ENSURING THAT THIS ISN'T A VARIABLE ROW

    // we slim the resulting dataset down: only the activity instance's process id and start time are relevant.
    List<Row> activityRows = dsActivityInstances.select(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID, BpmnaiVariables.VAR_START_TIME).collectAsList();
    Map<String, String> activities = activityRows.stream().collect(Collectors.toMap(
            r -> r.getAs(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID),
            r -> r.getAs(BpmnaiVariables.VAR_START_TIME)));

    // broadcasting the PID - start time map to use it in a user-defined function
    SparkBroadcastHelper.getInstance().broadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_INSTANCE_TIMESTAMP_MAP, activities);

    // now we have to select, for each process instance in our initial dataset, all events that happened before the first occurrence of our selected activity.

    // We first narrow it down to the process instances in question
    Dataset<Row> selectedProcesses = dataSet.filter(col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID).isin(activities.keySet().toArray()));
    // Then, we mark all events that should be removed
    Dataset<Row> activityDataSet = selectedProcesses.withColumn("data_filter_on_activity",
            callUDF("activityBeforeTimestamp",
                    selectedProcesses.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID),
                    selectedProcesses.col(BpmnaiVariables.VAR_START_TIME)));
    // And we keep the rest
    activityDataSet = activityDataSet.filter(col("data_filter_on_activity").like("TRUE"));
    // Clean up
    activityDataSet = activityDataSet.drop("data_filter_on_activity");

    // However, we lost all variable updates in this approach, so now we add the variables in question to the dataset
    // first, we narrow it down to keep only variables that have a corresponding activity instance
    activityDataSet = activityDataSet.withColumnRenamed(BpmnaiVariables.VAR_ACT_INST_ID, BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT");

    variables = variables.join(activityDataSet.select(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT").distinct(),
            variables.col(BpmnaiVariables.VAR_ACT_INST_ID).equalTo(activityDataSet.col(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT")), "inner");

    activityDataSet = activityDataSet.withColumnRenamed(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT", BpmnaiVariables.VAR_ACT_INST_ID);
    variables = variables.drop(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT");

    dataSet = activityDataSet.union(variables);

    dataSet.cache();
    BpmnaiLogger.getInstance().writeInfo("DataFilterOnActivityStep: The filtered DataSet contains " + dataSet.count() + " rows, (before: " + initialDSCount + " rows)");

    if (config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataSet, "data_filter_on_activity_step", config);
    }

    return dataSet;
}
Example 12
Source File: AtpInteractionAnalysis.java From mmtf-spark with Apache License 2.0
/**
 * @param args input arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String path = MmtfReader.getMmtfFullPath();

    long start = System.nanoTime();

    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(AtpInteractionAnalysis.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);

    // read PDB in MMTF format
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.readSequenceFile(path, sc);

    // filter by sequence identity subset
    int sequenceIdentity = 20;
    double resolution = 2.0;
    pdb = pdb.filter(new Pisces(sequenceIdentity, resolution));

    // find ATP interactions within 3 Angstroms
    GroupInteractionExtractor finder = new GroupInteractionExtractor("ATP", 3);
    Dataset<Row> interactions = finder.getDataset(pdb).cache();

    // TODO add a line to only analyze interactions
    // with the oxygens in the terminal phosphate group of ATP
    // (O1G, O2G, O3G)
    // Tip: Google SQL LIKE
    interactions = interactions.filter("atom1 LIKE('O%G')");

    // show the data schema of the dataset and some data
    interactions.printSchema();
    interactions.show(20);

    long n = interactions.count();
    System.out.println("# interactions: " + n);

    System.out.println("Top interacting groups");

    Dataset<Row> topGroups = interactions
            .groupBy("residue2")
            .count();

    topGroups
            .sort(col("count").desc()) // sort descending by count
            .show(10);

    System.out.println("Top interacting group/atoms types");

    Dataset<Row> topGroupsAndAtoms = interactions
            .groupBy("residue2", "atom2")
            .count();

    topGroupsAndAtoms
            .withColumn("frequency", col("count").divide(n)) // add column with frequency of occurrence
            .sort(col("frequency").desc()) // sort descending
            .show(10);

    long end = System.nanoTime();
    System.out.println("Time: " + (end - start) / 1E9 + "sec.");

    sc.close();
}
Example 13
Source File: Basic.java From learning-spark-with-java with MIT License
public static void main(String[] args) {
    SparkSession spark = SparkSession
            .builder()
            .appName("Dataset-Basic")
            .master("local[4]")
            .getOrCreate();

    List<Integer> data = Arrays.asList(10, 11, 12, 13, 14, 15);
    Dataset<Integer> ds = spark.createDataset(data, Encoders.INT());

    System.out.println("*** only one column, and it always has the same name");
    ds.printSchema();

    ds.show();

    System.out.println("*** values > 12");

    // the harder way to filter
    Dataset<Integer> ds2 = ds.filter((Integer value) -> value > 12);
    ds2.show();

    List<Tuple3<Integer, String, String>> tuples =
            Arrays.asList(
                    new Tuple3<>(1, "one", "un"),
                    new Tuple3<>(2, "two", "deux"),
                    new Tuple3<>(3, "three", "trois"));
    Encoder<Tuple3<Integer, String, String>> encoder =
            Encoders.tuple(Encoders.INT(), Encoders.STRING(), Encoders.STRING());
    Dataset<Tuple3<Integer, String, String>> tupleDS = spark.createDataset(tuples, encoder);

    System.out.println("*** Tuple Dataset types");
    tupleDS.printSchema();

    // the tuple columns have unfriendly names, but you can use them to query
    System.out.println("*** filter by one column and fetch another");
    tupleDS.where(col("_1").gt(2)).select(col("_2"), col("_3")).show();

    spark.stop();
}
Example 14
Source File: InListDeriver.java From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
    Dataset<Row> target = getStepDataFrame(dependencies);
    if (target.columns().length < 1) {
        throw new RuntimeException("Targeted step, '" + stepName + ",' has no columns");
    }

    try {
        String targetField = fieldName == null ? target.columns()[0] : fieldName;
        Column targetColumn = target.col(targetField);
        LOGGER.debug("Targeting '{}[{}]'", stepName, targetField);

        // If the IN list is inline, there is no batch
        if (inList != null) {
            LOGGER.debug("IN list is inline");
            return target.filter(targetColumn.isin(inList.toArray()));
        }

        // Otherwise, collect the values from the reference, executed within the batch
        else {
            LOGGER.trace("IN list is a reference");
            Dataset<Row> reference = dependencies.get(refStepName);
            String referenceField = refFieldName == null ? reference.columns()[0] : refFieldName;
            LOGGER.debug("Referencing using {}[{}]", refStepName, referenceField);

            Column referenceColumn = reference.col(referenceField);
            Iterator<Row> referenceIterator = reference.select(referenceColumn).distinct().toLocalIterator();

            this.inList = new ArrayList<>();
            long counter = 0;

            // Set up the batch collector
            JavaRDD<Row> unionRDD = new JavaSparkContext(Contexts.getSparkSession().sparkContext()).emptyRDD();
            Dataset<Row> union = Contexts.getSparkSession().createDataFrame(unionRDD, target.schema());

            while (referenceIterator.hasNext()) {
                // Flush the batch
                if (counter == batchSize) {
                    LOGGER.trace("Flushing batch");
                    union = union.union(target.filter(targetColumn.isin(inList.toArray())));

                    inList.clear();
                    counter = 0L;
                }

                // Gather the elements of the IN list from the reference
                inList.add(referenceIterator.next().get(0));
                counter++;
            }

            // If the selection is under the batch threshold
            if (union.rdd().isEmpty()) {
                return target.filter(targetColumn.isin(inList.toArray()));
            }

            // Flush any remaining IN list values
            else {
                return union.union(target.filter(targetColumn.isin(inList.toArray())));
            }
        }
    } catch (Throwable ae) {
        throw new RuntimeException("Error executing IN list filtering", ae);
    }
}