Java Code Examples for org.apache.spark.sql.Dataset#drop()
The following examples show how to use org.apache.spark.sql.Dataset#drop().
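Before the project examples, here is a minimal, self-contained sketch of the common drop() overloads: a single column name, several names via the String varargs overload, and a Column reference. The schema, column names, and values below are invented purely for illustration.

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class DatasetDropSketch {

  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("DatasetDropSketch")
        .master("local[*]")
        .getOrCreate();

    // build a tiny DataFrame with three columns (invented example data)
    StructType schema = new StructType()
        .add("id", DataTypes.IntegerType)
        .add("name", DataTypes.StringType)
        .add("tmp", DataTypes.StringType);
    Dataset<Row> df = spark.createDataFrame(
        Arrays.asList(RowFactory.create(1, "a", "x"), RowFactory.create(2, "b", "y")),
        schema);

    // drop a single column by name
    Dataset<Row> withoutTmp = df.drop("tmp");

    // drop several columns at once via the String varargs overload
    Dataset<Row> idOnly = df.drop("name", "tmp");

    // drop via a Column reference; a name that does not exist is silently ignored
    Dataset<Row> viaColumn = df.drop(df.col("tmp")).drop("does_not_exist");

    withoutTmp.show();
    idOnly.show();
    viaColumn.show();

    spark.stop();
  }
}

Note that drop() never mutates its input; it returns a new Dataset, which is why every example below reassigns the result.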
Example 1
Source File: ParseJSONDeriver.java From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
  String parsedStructTemporaryFieldName = "__parsed_json";

  Dataset<Row> dependency = dependencies.get(stepName);
  Dataset<Row> parsed = dependency.select(
      functions.from_json(new Column(fieldName), schema, options).as(parsedStructTemporaryFieldName));

  if (asStruct) {
    return parsed.withColumnRenamed(parsedStructTemporaryFieldName, structFieldName);
  } else {
    for (StructField parsedField : schema.fields()) {
      parsed = parsed.withColumn(
          parsedField.name(),
          new Column(parsedStructTemporaryFieldName + "." + parsedField.name()));
    }
    return parsed.drop(parsedStructTemporaryFieldName);
  }
}
Example 2
Source File: CreateColumnsFromJsonStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
private Dataset<Row> doFilterJsonVariables(Dataset<Row> dataset, SparkRunnerConfig config) {
  // read all variables to filter again. They also contain variables that resulted from JSON parsing
  // and are not columns, so they can just be dropped
  List<String> variablesToFilter = new ArrayList<>();

  Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
  if (configuration != null) {
    PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
    if (preprocessingConfiguration != null) {
      for (VariableConfiguration vc : preprocessingConfiguration.getVariableConfiguration()) {
        if (!vc.isUseVariable()) {
          variablesToFilter.add(vc.getVariableName());
          if (Arrays.asList(dataset.columns()).contains(vc.getVariableName())) {
            BpmnaiLogger.getInstance().writeInfo("The variable '" + vc.getVariableName()
                + "' will be filtered out after json processing. Comment: " + vc.getComment());
          }
        }
      }
    }
  }

  dataset = dataset.drop(BpmnaiUtils.getInstance().asSeq(variablesToFilter));

  return dataset;
}
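The drop call above goes through the project helper BpmnaiUtils.getInstance().asSeq(...), whose implementation is not shown on this page; presumably it converts the Java List into a Scala Seq so the Scala varargs overload of drop() can be called, which is the same conversion Example 3 below performs inline with JavaConverters. The following sketch (class and method names invented) shows both ways of dropping a whole list of columns from plain Java, assuming the Scala 2.11/2.12 builds of Spark these projects use.

import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

import scala.collection.JavaConverters;
import scala.collection.Seq;

public final class DropListSketch {

  private DropListSketch() {
  }

  // Option 1: the Java-friendly varargs overload accepts a String array
  public static Dataset<Row> dropAll(Dataset<Row> dataset, List<String> columnNames) {
    return dataset.drop(columnNames.toArray(new String[0]));
  }

  // Option 2: convert the Java List to a Scala Seq and call the Scala varargs overload,
  // as Example 3 does with JavaConverters
  public static Dataset<Row> dropAllViaSeq(Dataset<Row> dataset, List<String> columnNames) {
    Seq<String> seq = JavaConverters.collectionAsScalaIterableConverter(columnNames).asScala().toSeq();
    return dataset.drop(seq);
  }
}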
Example 3
Source File: SelectDeriver.java From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  dependencyCheck(dependencies);
  Dataset<Row> sourceStep = dependencies.get(stepName);
  if (useIncludeFields) {
    if (!Arrays.asList(sourceStep.columns()).containsAll(includeFields)) {
      throw new RuntimeException("Columns specified in " + INCLUDE_FIELDS
          + " are not found in input dependency schema \n"
          + "Available columns: " + Arrays.toString(sourceStep.columns()));
    }
    String firstCol = includeFields.get(0);
    includeFields.remove(0);
    return sourceStep.select(firstCol, includeFields.toArray(new String[0]));
  } else {
    if (!Arrays.asList(sourceStep.columns()).containsAll(excludeFields)) {
      throw new RuntimeException("Columns specified in " + EXCLUDE_FIELDS
          + " are not found in input dependency schema \n"
          + "Available columns: " + Arrays.toString(sourceStep.columns()));
    }
    return sourceStep.drop(JavaConverters.collectionAsScalaIterableConverter(excludeFields).asScala().toSeq());
  }
}
Example 4
Source File: PdbjMineDataset.java From mmtf-spark with Apache License 2.0
/**
 * Fetches data using the PDBj Mine 2 SQL service
 *
 * @param sqlQuery
 *            query in SQL format
 * @throws IOException
 */
public static Dataset<Row> getDataset(String sqlQuery) throws IOException {
  String encodedSQL = URLEncoder.encode(sqlQuery, "UTF-8");

  URL u = new URL(SERVICELOCATION + "?format=csv&q=" + encodedSQL);
  InputStream in = u.openStream();

  // save as a temporary CSV file
  Path tempFile = Files.createTempFile(null, ".csv");
  Files.copy(in, tempFile, StandardCopyOption.REPLACE_EXISTING);
  in.close();

  SparkSession spark = SparkSession.builder().getOrCreate();

  // load temporary CSV file into Spark dataset
  Dataset<Row> ds = spark.read().format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
      // .option("parserLib", "UNIVOCITY")
      .load(tempFile.toString());

  // rename/concatenate columns to assign consistent primary keys to datasets
  List<String> columns = Arrays.asList(ds.columns());
  if (columns.contains("pdbid")) {
    // this project uses upper case pdbids
    ds = ds.withColumn("pdbid", upper(col("pdbid")));
    if (columns.contains("chain")) {
      ds = ds.withColumn("structureChainId", concat(col("pdbid"), lit("."), col("chain")));
      ds = ds.drop("pdbid", "chain");
    } else {
      ds = ds.withColumnRenamed("pdbid", "structureId");
    }
  }

  return ds;
}
Example 5
Source File: AggregateActivityInstancesStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {

  // apply first and processState aggregator
  Map<String, String> aggregationMap = new HashMap<>();
  for (String column : dataset.columns()) {
    if (column.equals(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID)) {
      continue;
    } else if (column.equals(BpmnaiVariables.VAR_DURATION) || column.endsWith("_rev")) {
      aggregationMap.put(column, "max");
    } else if (column.equals(BpmnaiVariables.VAR_STATE)) {
      aggregationMap.put(column, "ProcessState");
    } else if (column.equals(BpmnaiVariables.VAR_ACT_INST_ID)) {
      // ignore it, as we aggregate by it
      continue;
    } else {
      aggregationMap.put(column, "AllButEmptyString");
    }
  }

  // first aggregation
  // activity level, take only processInstance and activityInstance rows
  dataset = dataset
      .filter(dataset.col(BpmnaiVariables.VAR_DATA_SOURCE).notEqual(BpmnaiVariables.EVENT_PROCESS_INSTANCE))
      .groupBy(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID, BpmnaiVariables.VAR_ACT_INST_ID)
      .agg(aggregationMap);

  // rename back columns after aggregation
  String pattern = "(max|allbutemptystring|processstate)\\((.+)\\)";
  Pattern r = Pattern.compile(pattern);

  for (String columnName : dataset.columns()) {
    Matcher m = r.matcher(columnName);
    if (m.find()) {
      String newColumnName = m.group(2);
      dataset = dataset.withColumnRenamed(columnName, newColumnName);
    }
  }

  // in case we add the CSV we have a name column in the first dataset of the join,
  // so we call drop again to make sure it is gone
  dataset = dataset.drop(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);
  dataset = dataset.drop(BpmnaiVariables.VAR_DATA_SOURCE);

  dataset = dataset.sort(BpmnaiVariables.VAR_START_TIME);

  dataset.cache();
  BpmnaiLogger.getInstance().writeInfo("Found " + dataset.count() + " activity instances.");

  if (config.isWriteStepResultsIntoFile()) {
    BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "agg_of_activity_instances", config);
  }

  // return preprocessed data
  return dataset;
}
Example 6
Source File: ColumnRemoveStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataSet, Map<String, Object> parameters, SparkRunnerConfig config) {
  // these columns have to stay in, in order to do the processing
  List<String> columnsToKeep = new ArrayList<>();
  columnsToKeep.add(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID);
  columnsToKeep.add(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);
  columnsToKeep.add(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE);
  columnsToKeep.add(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_REVISION);
  columnsToKeep.add(BpmnaiVariables.VAR_STATE);
  columnsToKeep.add(BpmnaiVariables.VAR_LONG);
  columnsToKeep.add(BpmnaiVariables.VAR_DOUBLE);
  columnsToKeep.add(BpmnaiVariables.VAR_TEXT);
  columnsToKeep.add(BpmnaiVariables.VAR_TEXT2);
  columnsToKeep.add(BpmnaiVariables.VAR_DATA_SOURCE);

  List<String> columnsToRemove = new ArrayList<>();

  Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
  if (configuration != null) {
    PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
    if (preprocessingConfiguration != null) {
      for (ColumnConfiguration cc : preprocessingConfiguration.getColumnConfiguration()) {
        if (!cc.isUseColumn()) {
          if (columnsToKeep.contains(cc.getColumnName())) {
            BpmnaiLogger.getInstance().writeWarn("The column '" + cc.getColumnName()
                + "' has to stay in in order to do the processing. It will not be removed. Comment: " + cc.getComment());
          } else {
            columnsToRemove.add(cc.getColumnName());
            BpmnaiLogger.getInstance().writeInfo("The column '" + cc.getColumnName()
                + "' will be removed. Comment: " + cc.getComment());
          }
        }
      }
    }
  }

  // check if all variables that should be filtered actually exist, otherwise log a warning
  List<String> existingColumns = new ArrayList<>(Arrays.asList(dataSet.columns()));
  columnsToRemove
      .stream()
      .forEach(new Consumer<String>() {
        @Override
        public void accept(String s) {
          if (!existingColumns.contains(s)) {
            // log the fact that a variable that should be filtered does not exist
            BpmnaiLogger.getInstance().writeWarn("The column '" + s
                + "' is configured to be filtered, but does not exist in the data.");
          }
        }
      });

  dataSet = dataSet.drop(BpmnaiUtils.getInstance().asSeq(columnsToRemove));

  return dataSet;
}
Example 7
Source File: AddVariableColumnsStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
private Dataset<Row> doAddVariableColumns(Dataset<Row> dataset, boolean writeStepResultIntoFile, String dataLevel, SparkRunnerConfig config) {
  Map<String, String> varMap = (Map<String, String>) SparkBroadcastHelper.getInstance()
      .getBroadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_VARIABLES_ESCALATED);
  Set<String> variables = varMap.keySet();

  for (String v : variables) {
    dataset = dataset.withColumn(v, when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME).equalTo(v),
        when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("string"), dataset.col(BpmnaiVariables.VAR_TEXT))
            .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("null"), dataset.col(BpmnaiVariables.VAR_TEXT))
            .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("boolean"), dataset.col(BpmnaiVariables.VAR_LONG))
            .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("integer"), dataset.col(BpmnaiVariables.VAR_LONG))
            .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("long"), dataset.col(BpmnaiVariables.VAR_LONG))
            .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("double"), dataset.col(BpmnaiVariables.VAR_DOUBLE))
            .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("date"), dataset.col(BpmnaiVariables.VAR_LONG))
            .otherwise(dataset.col(BpmnaiVariables.VAR_TEXT2)))
        .otherwise(null));

    // rev count is only relevant on process level
    if (dataLevel.equals(BpmnaiVariables.DATA_LEVEL_PROCESS) && config.isRevCountEnabled()) {
      dataset = dataset.withColumn(v + "_rev",
          when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME).equalTo(v),
              dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_REVISION))
              .otherwise("0"));
    }
  }

  // drop unnecessary columns
  dataset = dataset.drop(
      BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE,
      BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_REVISION,
      BpmnaiVariables.VAR_DOUBLE,
      BpmnaiVariables.VAR_LONG,
      BpmnaiVariables.VAR_TEXT,
      BpmnaiVariables.VAR_TEXT2);

  if (!config.isDevProcessStateColumnWorkaroundEnabled()) {
    dataset = dataset.drop(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);
  }

  if (writeStepResultIntoFile) {
    BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "add_var_columns", config);
  }

  // return preprocessed data
  return dataset;
}
Example 8
Source File: DataFilterOnActivityStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
/**
 * @param dataSet the incoming dataset for this processing step
 * @param parameters
 * @return the filtered DataSet
 */
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataSet, Map<String, Object> parameters, SparkRunnerConfig config) {
  // any parameters set?
  if (parameters == null || parameters.size() == 0) {
    BpmnaiLogger.getInstance().writeWarn("No parameters found for the DataFilterOnActivityStep");
    return dataSet;
  }

  // get query parameter
  String query = (String) parameters.get("query");
  BpmnaiLogger.getInstance().writeInfo("Filtering data with activity instance filter query: " + query + ".");

  // save size of initial dataset for log
  dataSet.cache();
  Long initialDSCount = dataSet.count();

  // repartition by process instance and order by start_time for this operation
  dataSet = dataSet.repartition(dataSet.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID)).sortWithinPartitions(BpmnaiVariables.VAR_START_TIME);

  // we temporarily store variable updates (rows with a var type set) separately.
  Dataset<Row> variables = dataSet.filter(col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).isNotNull());

  // find first occurrence of activity instance
  final Dataset<Row> dsTmp = dataSet.filter(dataSet.col(BpmnaiVariables.VAR_ACT_ID).equalTo(query))
      .filter(dataSet.col(BpmnaiVariables.VAR_END_TIME).isNull()); // TODO: ENSURING THAT THIS ISN'T A VARIABLE ROW

  // now we look for the first occurrence of the activity id contained in "query".
  // The result comprises a dataset of the corresponding activity instances.
  final Dataset<Row> dsActivityInstances = dataSet.filter(dataSet.col(BpmnaiVariables.VAR_ACT_ID).like(query))
      .filter(dataSet.col(BpmnaiVariables.VAR_END_TIME).isNull()); // TODO: ENSURING THAT THIS ISN'T A VARIABLE ROW

  // we slim the resulting dataset down: only the activity instance's process id and the instance's start time are relevant
  List<Row> activityRows = dsActivityInstances.select(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID, BpmnaiVariables.VAR_START_TIME).collectAsList();
  Map<String, String> activities = activityRows.stream().collect(Collectors.toMap(
      r -> r.getAs(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID),
      r -> r.getAs(BpmnaiVariables.VAR_START_TIME)));

  // broadcasting the PID - start time map to use it in a user-defined function
  SparkBroadcastHelper.getInstance().broadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_INSTANCE_TIMESTAMP_MAP, activities);

  // now we have to select, for each process instance in our initial dataset, all events that happened before the first occurrence of our selected activity.
  // We first narrow it down to the process instances in question
  Dataset<Row> selectedProcesses = dataSet.filter(col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID).isin(activities.keySet().toArray()));
  // Then, we mark all events that should be removed
  Dataset<Row> activityDataSet = selectedProcesses.withColumn("data_filter_on_activity",
      callUDF("activityBeforeTimestamp",
          selectedProcesses.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID),
          selectedProcesses.col(BpmnaiVariables.VAR_START_TIME)));
  // And we keep the rest
  activityDataSet = activityDataSet.filter(col("data_filter_on_activity").like("TRUE"));
  // Clean up
  activityDataSet = activityDataSet.drop("data_filter_on_activity");

  // However, we lost all variable updates in this approach, so now we add the variables in question to the dataset
  // first, we narrow it down to keep only variables that have a corresponding activity instance
  activityDataSet = activityDataSet.withColumnRenamed(BpmnaiVariables.VAR_ACT_INST_ID, BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT");

  variables = variables.join(activityDataSet.select(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT").distinct(),
      variables.col(BpmnaiVariables.VAR_ACT_INST_ID).equalTo(activityDataSet.col(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT")), "inner");

  activityDataSet = activityDataSet.withColumnRenamed(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT", BpmnaiVariables.VAR_ACT_INST_ID);
  variables = variables.drop(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT");

  dataSet = activityDataSet.union(variables);

  dataSet.cache();
  BpmnaiLogger.getInstance().writeInfo("DataFilterOnActivityStep: The filtered DataSet contains " + dataSet.count() + " rows, (before: " + initialDSCount + " rows)");

  if (config.isWriteStepResultsIntoFile()) {
    BpmnaiUtils.getInstance().writeDatasetToCSV(dataSet, "data_filter_on_activity_step", config);
  }

  return dataSet;
}
Example 9
Source File: PdbToUniProt.java From mmtf-spark with Apache License 2.0
/**
 * Returns an up-to-date dataset of PDB to UniProt
 * residue-level mappings for a list of ids.
 * Valid ids are either a list of pdbIds (e.g. 1XYZ) or pdbId.chainId (e.g., 1XYZ.A).
 * This method reads a cached file and downloads updates.
 *
 * @param ids list of pdbIds or pdbId.chainIds
 * @return dataset of PDB to UniProt residue-level mappings
 * @throws IOException
 */
public static Dataset<Row> getResidueMappings(List<String> ids) throws IOException {
  SparkSession spark = SparkSession.builder().getOrCreate();

  boolean withChainId = ids.size() > 0 && ids.get(0).length() > 4;

  // create dataset of ids
  Dataset<Row> df = spark.createDataset(ids, Encoders.STRING()).toDF("id");
  // get cached mappings
  Dataset<Row> mapping = getCachedResidueMappings();

  // dataset for non-cached mappings
  Dataset<Row> notCached = null;
  // dataset with PDB Ids to be downloaded
  Dataset<Row> toDownload = null;

  if (withChainId) {
    // get subset of requested ids from cached dataset
    mapping = mapping.join(df, mapping.col("structureChainId").equalTo(df.col("id"))).drop("id");
    // get ids that are not in the cached dataset
    notCached = df.join(mapping, df.col("id").equalTo(mapping.col("structureChainId")), "left_anti").cache();
    // create dataset of PDB Ids to be downloaded
    toDownload = notCached.withColumn("id", col("id").substr(0, 4)).distinct().cache();
  } else {
    // get subset of requested ids from cached dataset
    mapping = mapping.withColumn("pdbId", col("structureChainId").substr(0, 4));
    mapping = mapping.join(df, mapping.col("pdbId").equalTo(df.col("id"))).drop("id");
    // create dataset of PDB Ids to be downloaded
    toDownload = df.join(mapping, df.col("id").equalTo(mapping.col("pdbId")), "left_anti").distinct().cache();
    mapping = mapping.drop("pdbId");
  }

  toDownload = toDownload.distinct().cache();

  // download data that are not in the cache
  if (toDownload.count() > 0) {
    Dataset<Row> unpData = getChainMappings().select("structureId").distinct();
    toDownload = toDownload.join(unpData, toDownload.col("id").equalTo(unpData.col("structureId"))).drop("structureId").cache();
    System.out.println("Downloading mapping for " + toDownload.count() + " PDB structures.");
    Dataset<Row> downloadedData = downloadData(toDownload);

    // since data are downloaded for all chains in structure, make sure to only include the requested chains.
    if (withChainId) {
      downloadedData = downloadedData.join(notCached, downloadedData.col("structureChainId").equalTo(notCached.col("id"))).drop("id");
    }

    mapping = mapping.union(downloadedData);
  }

  return mapping;
}
Example 10
Source File: HoodieIncrSource.java From hudi with Apache License 2.0
@Override
public Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) {

  DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.HOODIE_SRC_BASE_PATH));

  /*
   * DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH,
   *     Config.HOODIE_SRC_PARTITION_FIELDS));
   * List<String> partitionFields = props.getStringList(Config.HOODIE_SRC_PARTITION_FIELDS, ",", new ArrayList<>());
   * PartitionValueExtractor extractor = DataSourceUtils.createPartitionExtractor(props.getString(
   *     Config.HOODIE_SRC_PARTITION_EXTRACTORCLASS, Config.DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS));
   */
  String srcPath = props.getString(Config.HOODIE_SRC_BASE_PATH);
  int numInstantsPerFetch = props.getInteger(Config.NUM_INSTANTS_PER_FETCH, Config.DEFAULT_NUM_INSTANTS_PER_FETCH);
  boolean readLatestOnMissingCkpt = props.getBoolean(Config.READ_LATEST_INSTANT_ON_MISSING_CKPT,
      Config.DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT);

  // Use begin Instant if set and non-empty
  Option<String> beginInstant =
      lastCkptStr.isPresent() ? lastCkptStr.get().isEmpty() ? Option.empty() : lastCkptStr : Option.empty();

  Pair<String, String> instantEndpts = IncrSourceHelper.calculateBeginAndEndInstants(sparkContext, srcPath,
      numInstantsPerFetch, beginInstant, readLatestOnMissingCkpt);

  if (instantEndpts.getKey().equals(instantEndpts.getValue())) {
    LOG.warn("Already caught up. Begin Checkpoint was :" + instantEndpts.getKey());
    return Pair.of(Option.empty(), instantEndpts.getKey());
  }

  // Do Incr pull. Set end instant if available
  DataFrameReader reader = sparkSession.read().format("org.apache.hudi")
      .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
      .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), instantEndpts.getLeft())
      .option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY(), instantEndpts.getRight());

  Dataset<Row> source = reader.load(srcPath);

  /*
   * log.info("Partition Fields are : (" + partitionFields + "). Initial Source Schema :" + source.schema());
   *
   * StructType newSchema = new StructType(source.schema().fields());
   * for (String field : partitionFields) {
   *   newSchema = newSchema.add(field, DataTypes.StringType, true);
   * }
   *
   * /**
   * Validates if the commit time is sane and also generates Partition fields from _hoodie_partition_path if
   * configured
   *
   * Dataset<Row> validated = source.map((MapFunction<Row, Row>) (Row row) -> {
   *   // _hoodie_instant_time
   *   String instantTime = row.getString(0);
   *   IncrSourceHelper.validateInstantTime(row, instantTime, instantEndpts.getKey(), instantEndpts.getValue());
   *   if (!partitionFields.isEmpty()) {
   *     // _hoodie_partition_path
   *     String hoodiePartitionPath = row.getString(3);
   *     List<Object> partitionVals = extractor.extractPartitionValuesInPath(hoodiePartitionPath).stream()
   *         .map(o -> (Object) o).collect(Collectors.toList());
   *     ValidationUtils.checkArgument(partitionVals.size() == partitionFields.size(),
   *         "#partition-fields != #partition-values-extracted");
   *     List<Object> rowObjs = new ArrayList<>(scala.collection.JavaConversions.seqAsJavaList(row.toSeq()));
   *     rowObjs.addAll(partitionVals);
   *     return RowFactory.create(rowObjs.toArray());
   *   }
   *   return row;
   * }, RowEncoder.apply(newSchema));
   *
   * log.info("Validated Source Schema :" + validated.schema());
   */

  // Remove Hoodie meta columns except partition path from input source
  final Dataset<Row> src = source.drop(HoodieRecord.HOODIE_META_COLUMNS.stream()
      .filter(x -> !x.equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toArray(String[]::new));
  // log.info("Final Schema from Source is :" + src.schema());
  return Pair.of(Option.of(src), instantEndpts.getRight());
}
Example 11
Source File: KuduOutput.java From envelope with Apache License 2.0
@Override
public void applyBulkMutations(List<Tuple2<MutationType, Dataset<Row>>> planned) {
  KuduContext kc = new KuduContext(
      config.getString(CONNECTION_CONFIG_NAME), Contexts.getSparkSession().sparkContext());

  String tableName = config.getString(TABLE_CONFIG_NAME);

  Set<String> kuduColumns = null;
  if (KuduUtils.ignoreMissingColumns(config)) {
    try {
      KuduTable table = getConnection().getTable(tableName);
      kuduColumns = Sets.newHashSetWithExpectedSize(table.getSchema().getColumns().size());
      for (int i = 0; i < table.getSchema().getColumns().size(); i++) {
        ColumnSchema columnSchema = table.getSchema().getColumns().get(i);
        kuduColumns.add(columnSchema.getName());
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  for (Tuple2<MutationType, Dataset<Row>> plan : planned) {
    MutationType mutationType = plan._1();
    Dataset<Row> mutation = plan._2();

    if (KuduUtils.ignoreMissingColumns(config) && kuduColumns != null) {
      Set<String> mutationFields = Sets.newHashSet(mutation.schema().fieldNames());
      for (String col : Sets.difference(mutationFields, kuduColumns)) {
        mutation = mutation.drop(col);
      }
    }

    KuduWriteOptions kuduWriteOptions = new KuduWriteOptions(
        KuduUtils.doesInsertIgnoreDuplicates(config), false);

    switch (mutationType) {
      case DELETE:
        kc.deleteRows(mutation, tableName, kuduWriteOptions);
        break;
      case INSERT:
        kc.insertRows(mutation, tableName, kuduWriteOptions);
        break;
      case UPDATE:
        kc.updateRows(mutation, tableName, kuduWriteOptions);
        break;
      case UPSERT:
        kc.upsertRows(mutation, tableName, kuduWriteOptions);
        break;
      default:
        throw new RuntimeException("Kudu bulk output does not support mutation type: " + mutationType);
    }
  }
}
Example 12
Source File: MLResults.java From systemds with Apache License 2.0
/**
 * Obtain an output as a {@code DataFrame} of vectors with no ID column.
 * <p>
 * The following matrix in DML:
 * </p>
 * <code>M = full('1 2 3 4', rows=2, cols=2);
 * </code>
 * <p>
 * is equivalent to the following {@code DataFrame} of vectors:
 * </p>
 * <code>[[1.0,2.0]]
 * <br>[[3.0,4.0]]
 * </code>
 *
 * @param outputName
 *            the name of the output
 * @return the output as a {@code DataFrame} of vectors with no ID column
 */
public Dataset<Row> getDataFrameVectorNoIDColumn(String outputName) {
  if (isFrameObject(outputName)) {
    throw new MLContextException("This method currently supports only matrices");
  }
  MatrixObject mo = getMatrixObject(outputName);
  Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext, true);
  return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}
Example 13
Source File: MLResults.java From systemds with Apache License 2.0
/**
 * Obtain an output as a {@code DataFrame} of doubles with no ID column.
 * <p>
 * The following matrix in DML:
 * </p>
 * <code>M = full('1 2 3 4', rows=2, cols=2);
 * </code>
 * <p>
 * is equivalent to the following {@code DataFrame} of doubles:
 * </p>
 * <code>[1.0,2.0]
 * <br>[3.0,4.0]
 * </code>
 *
 * @param outputName
 *            the name of the output
 * @return the output as a {@code DataFrame} of doubles with no ID column
 */
public Dataset<Row> getDataFrameDoubleNoIDColumn(String outputName) {
  if (isFrameObject(outputName)) {
    throw new MLContextException("This method currently supports only matrices");
  }
  MatrixObject mo = getMatrixObject(outputName);
  Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext, false);
  return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}
Example 14
Source File: Matrix.java From systemds with Apache License 2.0
/**
 * Obtain the matrix as a {@code DataFrame} of doubles with no ID column
 *
 * @return the matrix as a {@code DataFrame} of doubles with no ID column
 */
public Dataset<Row> toDFDoubleNoIDColumn() {
  Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(matrixObject, sparkExecutionContext, false);
  return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}
Example 15
Source File: Matrix.java From systemds with Apache License 2.0
/**
 * Obtain the matrix as a {@code DataFrame} of vectors with no ID column
 *
 * @return the matrix as a {@code DataFrame} of vectors with no ID column
 */
public Dataset<Row> toDFVectorNoIDColumn() {
  Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(matrixObject, sparkExecutionContext, true);
  return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}