Java Code Examples for org.apache.spark.sql.Dataset#drop()
The following examples show how to use org.apache.spark.sql.Dataset#drop().
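Before the project examples, here is a minimal, self-contained sketch of the common drop() overloads: a single column name, several names via the String varargs overload, and a Column reference. The schema, column names, and values below are invented purely for illustration.

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class DatasetDropSketch {

  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("DatasetDropSketch")
        .master("local[*]")
        .getOrCreate();

    // build a tiny DataFrame with three columns (invented example data)
    StructType schema = new StructType()
        .add("id", DataTypes.IntegerType)
        .add("name", DataTypes.StringType)
        .add("tmp", DataTypes.StringType);
    Dataset<Row> df = spark.createDataFrame(
        Arrays.asList(RowFactory.create(1, "a", "x"), RowFactory.create(2, "b", "y")),
        schema);

    // drop a single column by name
    Dataset<Row> withoutTmp = df.drop("tmp");

    // drop several columns at once via the String varargs overload
    Dataset<Row> idOnly = df.drop("name", "tmp");

    // drop via a Column reference; a name that does not exist is silently ignored
    Dataset<Row> viaColumn = df.drop(df.col("tmp")).drop("does_not_exist");

    withoutTmp.show();
    idOnly.show();
    viaColumn.show();

    spark.stop();
  }
}

Note that drop() never mutates its input; it returns a new Dataset, which is why every example below reassigns the result.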
Example 1
Source File: ParseJSONDeriver.java From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
  String parsedStructTemporaryFieldName = "__parsed_json";

  Dataset<Row> dependency = dependencies.get(stepName);
  Dataset<Row> parsed = dependency.select(
      functions.from_json(new Column(fieldName), schema, options).as(parsedStructTemporaryFieldName));

  if (asStruct) {
    return parsed.withColumnRenamed(parsedStructTemporaryFieldName, structFieldName);
  } else {
    for (StructField parsedField : schema.fields()) {
      parsed = parsed.withColumn(
          parsedField.name(),
          new Column(parsedStructTemporaryFieldName + "." + parsedField.name()));
    }
    return parsed.drop(parsedStructTemporaryFieldName);
  }
}
Example 2
Source File: CreateColumnsFromJsonStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
private Dataset<Row> doFilterJsonVariables(Dataset<Row> dataset, SparkRunnerConfig config) {
  // read all variables to filter again. They also contain variables that resulted from JSON parsing
  // and are not columns, so they can just be dropped
  List<String> variablesToFilter = new ArrayList<>();

  Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
  if (configuration != null) {
    PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
    if (preprocessingConfiguration != null) {
      for (VariableConfiguration vc : preprocessingConfiguration.getVariableConfiguration()) {
        if (!vc.isUseVariable()) {
          variablesToFilter.add(vc.getVariableName());
          if (Arrays.asList(dataset.columns()).contains(vc.getVariableName())) {
            BpmnaiLogger.getInstance().writeInfo("The variable '" + vc.getVariableName()
                + "' will be filtered out after json processing. Comment: " + vc.getComment());
          }
        }
      }
    }
  }

  dataset = dataset.drop(BpmnaiUtils.getInstance().asSeq(variablesToFilter));

  return dataset;
}
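The drop call above goes through the project helper BpmnaiUtils.getInstance().asSeq(...), whose implementation is not shown on this page; presumably it converts the Java List into a Scala Seq so the Scala varargs overload of drop() can be called, which is the same conversion Example 3 below performs inline with JavaConverters. The following sketch (class and method names invented) shows both ways of dropping a whole list of columns from plain Java, assuming the Scala 2.11/2.12 builds of Spark these projects use.

import java.util.List;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

import scala.collection.JavaConverters;
import scala.collection.Seq;

public final class DropListSketch {

  private DropListSketch() {
  }

  // Option 1: the Java-friendly varargs overload accepts a String array
  public static Dataset<Row> dropAll(Dataset<Row> dataset, List<String> columnNames) {
    return dataset.drop(columnNames.toArray(new String[0]));
  }

  // Option 2: convert the Java List to a Scala Seq and call the Scala varargs overload,
  // as Example 3 does with JavaConverters
  public static Dataset<Row> dropAllViaSeq(Dataset<Row> dataset, List<String> columnNames) {
    Seq<String> seq = JavaConverters.collectionAsScalaIterableConverter(columnNames).asScala().toSeq();
    return dataset.drop(seq);
  }
}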
Example 3
Source File: SelectDeriver.java From envelope with Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  dependencyCheck(dependencies);
  Dataset<Row> sourceStep = dependencies.get(stepName);
  if (useIncludeFields) {
    if (!Arrays.asList(sourceStep.columns()).containsAll(includeFields)) {
      throw new RuntimeException("Columns specified in " + INCLUDE_FIELDS
          + " are not found in input dependency schema \n"
          + "Available columns: " + Arrays.toString(sourceStep.columns()));
    }
    String firstCol = includeFields.get(0);
    includeFields.remove(0);
    return sourceStep.select(firstCol, includeFields.toArray(new String[0]));
  } else {
    if (!Arrays.asList(sourceStep.columns()).containsAll(excludeFields)) {
      throw new RuntimeException("Columns specified in " + EXCLUDE_FIELDS
          + " are not found in input dependency schema \n"
          + "Available columns: " + Arrays.toString(sourceStep.columns()));
    }
    return sourceStep.drop(JavaConverters.collectionAsScalaIterableConverter(excludeFields).asScala().toSeq());
  }
}
Example 4
Source File: PdbjMineDataset.java From mmtf-spark with Apache License 2.0
/**
 * Fetches data using the PDBj Mine 2 SQL service
 *
 * @param sqlQuery
 *            query in SQL format
 * @throws IOException
 */
public static Dataset<Row> getDataset(String sqlQuery) throws IOException {
  String encodedSQL = URLEncoder.encode(sqlQuery, "UTF-8");

  URL u = new URL(SERVICELOCATION + "?format=csv&q=" + encodedSQL);
  InputStream in = u.openStream();

  // save as a temporary CSV file
  Path tempFile = Files.createTempFile(null, ".csv");
  Files.copy(in, tempFile, StandardCopyOption.REPLACE_EXISTING);
  in.close();

  SparkSession spark = SparkSession.builder().getOrCreate();

  // load temporary CSV file into Spark dataset
  Dataset<Row> ds = spark.read().format("csv")
      .option("header", "true")
      .option("inferSchema", "true")
      // .option("parserLib", "UNIVOCITY")
      .load(tempFile.toString());

  // rename/concatenate columns to assign consistent primary keys to datasets
  List<String> columns = Arrays.asList(ds.columns());
  if (columns.contains("pdbid")) {
    // this project uses upper case pdbids
    ds = ds.withColumn("pdbid", upper(col("pdbid")));
    if (columns.contains("chain")) {
      ds = ds.withColumn("structureChainId", concat(col("pdbid"), lit("."), col("chain")));
      ds = ds.drop("pdbid", "chain");
    } else {
      ds = ds.withColumnRenamed("pdbid", "structureId");
    }
  }

  return ds;
}
Example 5
Source File: AggregateActivityInstancesStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {

  // apply first and processState aggregator
  Map<String, String> aggregationMap = new HashMap<>();
  for (String column : dataset.columns()) {
    if (column.equals(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID)) {
      continue;
    } else if (column.equals(BpmnaiVariables.VAR_DURATION) || column.endsWith("_rev")) {
      aggregationMap.put(column, "max");
    } else if (column.equals(BpmnaiVariables.VAR_STATE)) {
      aggregationMap.put(column, "ProcessState");
    } else if (column.equals(BpmnaiVariables.VAR_ACT_INST_ID)) {
      // ignore it, as we aggregate by it
      continue;
    } else {
      aggregationMap.put(column, "AllButEmptyString");
    }
  }

  // first aggregation
  // activity level, take only processInstance and activityInstance rows
  dataset = dataset
      .filter(dataset.col(BpmnaiVariables.VAR_DATA_SOURCE).notEqual(BpmnaiVariables.EVENT_PROCESS_INSTANCE))
      .groupBy(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID, BpmnaiVariables.VAR_ACT_INST_ID)
      .agg(aggregationMap);

  // rename back columns after aggregation
  String pattern = "(max|allbutemptystring|processstate)\\((.+)\\)";
  Pattern r = Pattern.compile(pattern);

  for (String columnName : dataset.columns()) {
    Matcher m = r.matcher(columnName);
    if (m.find()) {
      String newColumnName = m.group(2);
      dataset = dataset.withColumnRenamed(columnName, newColumnName);
    }
  }

  // in case we add the CSV we have a name column in the first dataset of the join,
  // so we call drop again to make sure it is gone
  dataset = dataset.drop(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);
  dataset = dataset.drop(BpmnaiVariables.VAR_DATA_SOURCE);

  dataset = dataset.sort(BpmnaiVariables.VAR_START_TIME);

  dataset.cache();
  BpmnaiLogger.getInstance().writeInfo("Found " + dataset.count() + " activity instances.");

  if (config.isWriteStepResultsIntoFile()) {
    BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "agg_of_activity_instances", config);
  }

  // return preprocessed data
  return dataset;
}
Example 6
Source File: ColumnRemoveStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataSet, Map<String, Object> parameters, SparkRunnerConfig config) {
  // these columns have to stay in, in order to do the processing
  List<String> columnsToKeep = new ArrayList<>();
  columnsToKeep.add(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID);
  columnsToKeep.add(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);
  columnsToKeep.add(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE);
  columnsToKeep.add(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_REVISION);
  columnsToKeep.add(BpmnaiVariables.VAR_STATE);
  columnsToKeep.add(BpmnaiVariables.VAR_LONG);
  columnsToKeep.add(BpmnaiVariables.VAR_DOUBLE);
  columnsToKeep.add(BpmnaiVariables.VAR_TEXT);
  columnsToKeep.add(BpmnaiVariables.VAR_TEXT2);
  columnsToKeep.add(BpmnaiVariables.VAR_DATA_SOURCE);

  List<String> columnsToRemove = new ArrayList<>();

  Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
  if (configuration != null) {
    PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
    if (preprocessingConfiguration != null) {
      for (ColumnConfiguration cc : preprocessingConfiguration.getColumnConfiguration()) {
        if (!cc.isUseColumn()) {
          if (columnsToKeep.contains(cc.getColumnName())) {
            BpmnaiLogger.getInstance().writeWarn("The column '" + cc.getColumnName()
                + "' has to stay in in order to do the processing. It will not be removed. Comment: " + cc.getComment());
          } else {
            columnsToRemove.add(cc.getColumnName());
            BpmnaiLogger.getInstance().writeInfo("The column '" + cc.getColumnName()
                + "' will be removed. Comment: " + cc.getComment());
          }
        }
      }
    }
  }

  // check if all variables that should be filtered actually exist, otherwise log a warning
  List<String> existingColumns = new ArrayList<>(Arrays.asList(dataSet.columns()));
  columnsToRemove
      .stream()
      .forEach(new Consumer<String>() {
        @Override
        public void accept(String s) {
          if (!existingColumns.contains(s)) {
            // log the fact that a variable that should be filtered does not exist
            BpmnaiLogger.getInstance().writeWarn("The column '" + s
                + "' is configured to be filtered, but does not exist in the data.");
          }
        }
      });

  dataSet = dataSet.drop(BpmnaiUtils.getInstance().asSeq(columnsToRemove));

  return dataSet;
}
Example 7
Source File: AddVariableColumnsStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
private Dataset<Row> doAddVariableColumns(Dataset<Row> dataset, boolean writeStepResultIntoFile, String dataLevel, SparkRunnerConfig config) {
  Map<String, String> varMap = (Map<String, String>) SparkBroadcastHelper.getInstance()
      .getBroadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_VARIABLES_ESCALATED);
  Set<String> variables = varMap.keySet();

  for (String v : variables) {
    dataset = dataset.withColumn(v, when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME).equalTo(v),
        when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("string"), dataset.col(BpmnaiVariables.VAR_TEXT))
            .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("null"), dataset.col(BpmnaiVariables.VAR_TEXT))
            .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("boolean"), dataset.col(BpmnaiVariables.VAR_LONG))
            .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("integer"), dataset.col(BpmnaiVariables.VAR_LONG))
            .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("long"), dataset.col(BpmnaiVariables.VAR_LONG))
            .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("double"), dataset.col(BpmnaiVariables.VAR_DOUBLE))
            .when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).equalTo("date"), dataset.col(BpmnaiVariables.VAR_LONG))
            .otherwise(dataset.col(BpmnaiVariables.VAR_TEXT2)))
        .otherwise(null));

    // rev count is only relevant on process level
    if (dataLevel.equals(BpmnaiVariables.DATA_LEVEL_PROCESS) && config.isRevCountEnabled()) {
      dataset = dataset.withColumn(v + "_rev",
          when(dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME).equalTo(v),
              dataset.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_REVISION))
              .otherwise("0"));
    }
  }

  // drop unnecessary columns
  dataset = dataset.drop(
      BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE,
      BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_REVISION,
      BpmnaiVariables.VAR_DOUBLE,
      BpmnaiVariables.VAR_LONG,
      BpmnaiVariables.VAR_TEXT,
      BpmnaiVariables.VAR_TEXT2);

  if (!config.isDevProcessStateColumnWorkaroundEnabled()) {
    dataset = dataset.drop(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);
  }

  if (writeStepResultIntoFile) {
    BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "add_var_columns", config);
  }

  // return preprocessed data
  return dataset;
}
Example 8
Source File: DataFilterOnActivityStep.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License
/**
 * @param dataSet the incoming dataset for this processing step
 * @param parameters
 * @return the filtered DataSet
 */
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataSet, Map<String, Object> parameters, SparkRunnerConfig config) {
  // any parameters set?
  if (parameters == null || parameters.size() == 0) {
    BpmnaiLogger.getInstance().writeWarn("No parameters found for the DataFilterOnActivityStep");
    return dataSet;
  }

  // get query parameter
  String query = (String) parameters.get("query");
  BpmnaiLogger.getInstance().writeInfo("Filtering data with activity instance filter query: " + query + ".");

  // save size of initial dataset for log
  dataSet.cache();
  Long initialDSCount = dataSet.count();

  // repartition by process instance and order by start_time for this operation
  dataSet = dataSet.repartition(dataSet.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID)).sortWithinPartitions(BpmnaiVariables.VAR_START_TIME);

  // we temporarily store variable updates (rows with a var type set) separately.
  Dataset<Row> variables = dataSet.filter(col(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_TYPE).isNotNull());

  // find first occurrence of activity instance
  final Dataset<Row> dsTmp = dataSet.filter(dataSet.col(BpmnaiVariables.VAR_ACT_ID).equalTo(query))
      .filter(dataSet.col(BpmnaiVariables.VAR_END_TIME).isNull()); // TODO: ENSURING THAT THIS ISN'T A VARIABLE ROW

  // now we look for the first occurrence of the activity id contained in "query".
  // The result comprises a dataset of the corresponding activity instances.
  final Dataset<Row> dsActivityInstances = dataSet.filter(dataSet.col(BpmnaiVariables.VAR_ACT_ID).like(query))
      .filter(dataSet.col(BpmnaiVariables.VAR_END_TIME).isNull()); // TODO: ENSURING THAT THIS ISN'T A VARIABLE ROW

  // we slim the resulting dataset down: only the activity instance's process id and the instance's start time are relevant
  List<Row> activityRows = dsActivityInstances.select(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID, BpmnaiVariables.VAR_START_TIME).collectAsList();
  Map<String, String> activities = activityRows.stream().collect(Collectors.toMap(
      r -> r.getAs(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID),
      r -> r.getAs(BpmnaiVariables.VAR_START_TIME)));

  // broadcasting the PID - start time map to use it in a user-defined function
  SparkBroadcastHelper.getInstance().broadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_INSTANCE_TIMESTAMP_MAP, activities);

  // now we have to select, for each process instance in our initial dataset, all events that happened before the first occurrence of our selected activity.
  // We first narrow it down to the process instances in question
  Dataset<Row> selectedProcesses = dataSet.filter(col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID).isin(activities.keySet().toArray()));
  // Then, we mark all events that should be removed
  Dataset<Row> activityDataSet = selectedProcesses.withColumn("data_filter_on_activity",
      callUDF("activityBeforeTimestamp",
          selectedProcesses.col(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID),
          selectedProcesses.col(BpmnaiVariables.VAR_START_TIME)));
  // And we keep the rest
  activityDataSet = activityDataSet.filter(col("data_filter_on_activity").like("TRUE"));
  // Clean up
  activityDataSet = activityDataSet.drop("data_filter_on_activity");

  // However, we lost all variable updates in this approach, so now we add the variables in question to the dataset
  // first, we narrow it down to keep only variables that have a corresponding activity instance
  activityDataSet = activityDataSet.withColumnRenamed(BpmnaiVariables.VAR_ACT_INST_ID, BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT");

  variables = variables.join(activityDataSet.select(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT").distinct(),
      variables.col(BpmnaiVariables.VAR_ACT_INST_ID).equalTo(activityDataSet.col(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT")), "inner");

  activityDataSet = activityDataSet.withColumnRenamed(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT", BpmnaiVariables.VAR_ACT_INST_ID);
  variables = variables.drop(BpmnaiVariables.VAR_ACT_INST_ID + "_RIGHT");

  dataSet = activityDataSet.union(variables);

  dataSet.cache();
  BpmnaiLogger.getInstance().writeInfo("DataFilterOnActivityStep: The filtered DataSet contains " + dataSet.count() + " rows, (before: " + initialDSCount + " rows)");

  if (config.isWriteStepResultsIntoFile()) {
    BpmnaiUtils.getInstance().writeDatasetToCSV(dataSet, "data_filter_on_activity_step", config);
  }

  return dataSet;
}
Example 9
Source File: PdbToUniProt.java From mmtf-spark with Apache License 2.0
/**
 * Returns an up-to-date dataset of PDB to UniProt
 * residue-level mappings for a list of ids.
 * Valid ids are either a list of pdbIds (e.g. 1XYZ) or pdbId.chainId (e.g., 1XYZ.A).
 * This method reads a cached file and downloads updates.
 *
 * @param ids list of pdbIds or pdbId.chainIds
 * @return dataset of PDB to UniProt residue-level mappings
 * @throws IOException
 */
public static Dataset<Row> getResidueMappings(List<String> ids) throws IOException {
  SparkSession spark = SparkSession.builder().getOrCreate();

  boolean withChainId = ids.size() > 0 && ids.get(0).length() > 4;

  // create dataset of ids
  Dataset<Row> df = spark.createDataset(ids, Encoders.STRING()).toDF("id");
  // get cached mappings
  Dataset<Row> mapping = getCachedResidueMappings();

  // dataset for non-cached mappings
  Dataset<Row> notCached = null;
  // dataset with PDB Ids to be downloaded
  Dataset<Row> toDownload = null;

  if (withChainId) {
    // get subset of requested ids from cached dataset
    mapping = mapping.join(df, mapping.col("structureChainId").equalTo(df.col("id"))).drop("id");
    // get ids that are not in the cached dataset
    notCached = df.join(mapping, df.col("id").equalTo(mapping.col("structureChainId")), "left_anti").cache();
    // create dataset of PDB Ids to be downloaded
    toDownload = notCached.withColumn("id", col("id").substr(0, 4)).distinct().cache();
  } else {
    // get subset of requested ids from cached dataset
    mapping = mapping.withColumn("pdbId", col("structureChainId").substr(0, 4));
    mapping = mapping.join(df, mapping.col("pdbId").equalTo(df.col("id"))).drop("id");
    // create dataset of PDB Ids to be downloaded
    toDownload = df.join(mapping, df.col("id").equalTo(mapping.col("pdbId")), "left_anti").distinct().cache();
    mapping = mapping.drop("pdbId");
  }

  toDownload = toDownload.distinct().cache();

  // download data that are not in the cache
  if (toDownload.count() > 0) {
    Dataset<Row> unpData = getChainMappings().select("structureId").distinct();
    toDownload = toDownload.join(unpData, toDownload.col("id").equalTo(unpData.col("structureId"))).drop("structureId").cache();
    System.out.println("Downloading mapping for " + toDownload.count() + " PDB structures.");
    Dataset<Row> downloadedData = downloadData(toDownload);

    // since data are downloaded for all chains in structure, make sure to only include the requested chains.
    if (withChainId) {
      downloadedData = downloadedData.join(notCached, downloadedData.col("structureChainId").equalTo(notCached.col("id"))).drop("id");
    }

    mapping = mapping.union(downloadedData);
  }

  return mapping;
}
Example 10
Source File: HoodieIncrSource.java From hudi with Apache License 2.0
@Override
public Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) {

  DataSourceUtils.checkRequiredProperties(props, Collections.singletonList(Config.HOODIE_SRC_BASE_PATH));

  /*
   * DataSourceUtils.checkRequiredProperties(props, Arrays.asList(Config.HOODIE_SRC_BASE_PATH,
   *     Config.HOODIE_SRC_PARTITION_FIELDS));
   * List<String> partitionFields = props.getStringList(Config.HOODIE_SRC_PARTITION_FIELDS, ",", new ArrayList<>());
   * PartitionValueExtractor extractor = DataSourceUtils.createPartitionExtractor(props.getString(
   *     Config.HOODIE_SRC_PARTITION_EXTRACTORCLASS, Config.DEFAULT_HOODIE_SRC_PARTITION_EXTRACTORCLASS));
   */
  String srcPath = props.getString(Config.HOODIE_SRC_BASE_PATH);
  int numInstantsPerFetch = props.getInteger(Config.NUM_INSTANTS_PER_FETCH, Config.DEFAULT_NUM_INSTANTS_PER_FETCH);
  boolean readLatestOnMissingCkpt = props.getBoolean(Config.READ_LATEST_INSTANT_ON_MISSING_CKPT,
      Config.DEFAULT_READ_LATEST_INSTANT_ON_MISSING_CKPT);

  // Use begin Instant if set and non-empty
  Option<String> beginInstant =
      lastCkptStr.isPresent() ? lastCkptStr.get().isEmpty() ? Option.empty() : lastCkptStr : Option.empty();

  Pair<String, String> instantEndpts = IncrSourceHelper.calculateBeginAndEndInstants(sparkContext, srcPath,
      numInstantsPerFetch, beginInstant, readLatestOnMissingCkpt);

  if (instantEndpts.getKey().equals(instantEndpts.getValue())) {
    LOG.warn("Already caught up. Begin Checkpoint was :" + instantEndpts.getKey());
    return Pair.of(Option.empty(), instantEndpts.getKey());
  }

  // Do Incr pull. Set end instant if available
  DataFrameReader reader = sparkSession.read().format("org.apache.hudi")
      .option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
      .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), instantEndpts.getLeft())
      .option(DataSourceReadOptions.END_INSTANTTIME_OPT_KEY(), instantEndpts.getRight());

  Dataset<Row> source = reader.load(srcPath);

  /*
   * log.info("Partition Fields are : (" + partitionFields + "). Initial Source Schema :" + source.schema());
   *
   * StructType newSchema = new StructType(source.schema().fields());
   * for (String field : partitionFields) {
   *   newSchema = newSchema.add(field, DataTypes.StringType, true);
   * }
   *
   * /**
   * Validates if the commit time is sane and also generates Partition fields from _hoodie_partition_path if
   * configured
   *
   * Dataset<Row> validated = source.map((MapFunction<Row, Row>) (Row row) -> {
   *   // _hoodie_instant_time
   *   String instantTime = row.getString(0);
   *   IncrSourceHelper.validateInstantTime(row, instantTime, instantEndpts.getKey(), instantEndpts.getValue());
   *   if (!partitionFields.isEmpty()) {
   *     // _hoodie_partition_path
   *     String hoodiePartitionPath = row.getString(3);
   *     List<Object> partitionVals = extractor.extractPartitionValuesInPath(hoodiePartitionPath).stream()
   *         .map(o -> (Object) o).collect(Collectors.toList());
   *     ValidationUtils.checkArgument(partitionVals.size() == partitionFields.size(),
   *         "#partition-fields != #partition-values-extracted");
   *     List<Object> rowObjs = new ArrayList<>(scala.collection.JavaConversions.seqAsJavaList(row.toSeq()));
   *     rowObjs.addAll(partitionVals);
   *     return RowFactory.create(rowObjs.toArray());
   *   }
   *   return row;
   * }, RowEncoder.apply(newSchema));
   *
   * log.info("Validated Source Schema :" + validated.schema());
   */

  // Remove Hoodie meta columns except partition path from input source
  final Dataset<Row> src = source.drop(HoodieRecord.HOODIE_META_COLUMNS.stream()
      .filter(x -> !x.equals(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).toArray(String[]::new));
  // log.info("Final Schema from Source is :" + src.schema());
  return Pair.of(Option.of(src), instantEndpts.getRight());
}
Example 11
Source File: KuduOutput.java From envelope with Apache License 2.0
@Override
public void applyBulkMutations(List<Tuple2<MutationType, Dataset<Row>>> planned) {
  KuduContext kc = new KuduContext(
      config.getString(CONNECTION_CONFIG_NAME), Contexts.getSparkSession().sparkContext());

  String tableName = config.getString(TABLE_CONFIG_NAME);

  Set<String> kuduColumns = null;
  if (KuduUtils.ignoreMissingColumns(config)) {
    try {
      KuduTable table = getConnection().getTable(tableName);
      kuduColumns = Sets.newHashSetWithExpectedSize(table.getSchema().getColumns().size());
      for (int i = 0; i < table.getSchema().getColumns().size(); i++) {
        ColumnSchema columnSchema = table.getSchema().getColumns().get(i);
        kuduColumns.add(columnSchema.getName());
      }
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  for (Tuple2<MutationType, Dataset<Row>> plan : planned) {
    MutationType mutationType = plan._1();
    Dataset<Row> mutation = plan._2();

    if (KuduUtils.ignoreMissingColumns(config) && kuduColumns != null) {
      Set<String> mutationFields = Sets.newHashSet(mutation.schema().fieldNames());
      for (String col : Sets.difference(mutationFields, kuduColumns)) {
        mutation = mutation.drop(col);
      }
    }

    KuduWriteOptions kuduWriteOptions = new KuduWriteOptions(
        KuduUtils.doesInsertIgnoreDuplicates(config), false);

    switch (mutationType) {
      case DELETE:
        kc.deleteRows(mutation, tableName, kuduWriteOptions);
        break;
      case INSERT:
        kc.insertRows(mutation, tableName, kuduWriteOptions);
        break;
      case UPDATE:
        kc.updateRows(mutation, tableName, kuduWriteOptions);
        break;
      case UPSERT:
        kc.upsertRows(mutation, tableName, kuduWriteOptions);
        break;
      default:
        throw new RuntimeException("Kudu bulk output does not support mutation type: " + mutationType);
    }
  }
}
Example 12
Source File: MLResults.java From systemds with Apache License 2.0
/**
 * Obtain an output as a {@code DataFrame} of vectors with no ID column.
 * <p>
 * The following matrix in DML:
 * </p>
 * <code>M = full('1 2 3 4', rows=2, cols=2);
 * </code>
 * <p>
 * is equivalent to the following {@code DataFrame} of vectors:
 * </p>
 * <code>[[1.0,2.0]]
 * <br>[[3.0,4.0]]
 * </code>
 *
 * @param outputName
 *            the name of the output
 * @return the output as a {@code DataFrame} of vectors with no ID column
 */
public Dataset<Row> getDataFrameVectorNoIDColumn(String outputName) {
  if (isFrameObject(outputName)) {
    throw new MLContextException("This method currently supports only matrices");
  }
  MatrixObject mo = getMatrixObject(outputName);
  Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext, true);
  return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}
Example 13
Source File: MLResults.java From systemds with Apache License 2.0
/**
 * Obtain an output as a {@code DataFrame} of doubles with no ID column.
 * <p>
 * The following matrix in DML:
 * </p>
 * <code>M = full('1 2 3 4', rows=2, cols=2);
 * </code>
 * <p>
 * is equivalent to the following {@code DataFrame} of doubles:
 * </p>
 * <code>[1.0,2.0]
 * <br>[3.0,4.0]
 * </code>
 *
 * @param outputName
 *            the name of the output
 * @return the output as a {@code DataFrame} of doubles with no ID column
 */
public Dataset<Row> getDataFrameDoubleNoIDColumn(String outputName) {
  if (isFrameObject(outputName)) {
    throw new MLContextException("This method currently supports only matrices");
  }
  MatrixObject mo = getMatrixObject(outputName);
  Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext, false);
  return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}
Example 14
Source File: Matrix.java From systemds with Apache License 2.0
/**
 * Obtain the matrix as a {@code DataFrame} of doubles with no ID column
 *
 * @return the matrix as a {@code DataFrame} of doubles with no ID column
 */
public Dataset<Row> toDFDoubleNoIDColumn() {
  Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(matrixObject, sparkExecutionContext, false);
  return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}
Example 15
Source File: Matrix.java From systemds with Apache License 2.0
/**
 * Obtain the matrix as a {@code DataFrame} of vectors with no ID column
 *
 * @return the matrix as a {@code DataFrame} of vectors with no ID column
 */
public Dataset<Row> toDFVectorNoIDColumn() {
  Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(matrixObject, sparkExecutionContext, true);
  return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}