org.apache.spark.sql.Column Java Examples
The following examples show how to use
org.apache.spark.sql.Column.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HoodieSnapshotExporter.java From hudi with Apache License 2.0 | 7 votes |
/**
 * Reads the latest base files of the given partitions and writes them to
 * {@code cfg.targetOutputPath} in {@code cfg.outputFormat}, overwriting existing output.
 *
 * @param jsc Spark context used to collect the base file paths and to read them as parquet
 * @param cfg exporter configuration (output format, target path, partition field, partitioner class)
 * @param partitions partitions whose base files are exported
 * @param latestCommitTimestamp upper bound passed to {@code getLatestBaseFilesBeforeOrOn}
 */
private void exportAsNonHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) {
    // Fallback partitioner: drop the Hudi meta columns and, if an output partition field
    // is configured, repartition by it and partition the written output by it.
    Partitioner defaultPartitioner = dataset -> {
        Dataset<Row> withoutMetaColumns = dataset.drop(
            JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
        if (StringUtils.isNullOrEmpty(cfg.outputPartitionField)) {
            return withoutMetaColumns.write();
        }
        return withoutMetaColumns
            .repartition(new Column(cfg.outputPartitionField))
            .write()
            .partitionBy(cfg.outputPartitionField);
    };

    // A configured partitioner class takes precedence over the fallback.
    Partitioner partitioner = StringUtils.isNullOrEmpty(cfg.outputPartitioner)
        ? defaultPartitioner
        : ReflectionUtils.loadClass(cfg.outputPartitioner);

    final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
    Iterator<String> baseFilePaths = jsc
        .parallelize(partitions, partitions.size())
        .flatMap(partition -> fsView
            .getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)
            .map(HoodieBaseFile::getPath)
            .iterator())
        .toLocalIterator();

    Dataset<Row> source = new SQLContext(jsc)
        .read()
        .parquet(JavaConversions.asScalaIterator(baseFilePaths).toSeq());

    partitioner.partition(source)
        .format(cfg.outputFormat)
        .mode(SaveMode.Overwrite)
        .save(cfg.targetOutputPath);
}
Example #2
Source File: SparkRelationalOperator.java From spliceengine with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Builds the Spark column expression for this relational operator.
 *
 * <p>Both child expressions are built first; the comparison selected by {@code relOpKind}
 * is then applied. The null checks use the left expression only.
 *
 * @param leftDF left dataframe, forwarded to the children
 * @param rightDF right dataframe, forwarded to the children
 * @param convertStringToDataTypeFunction type lookup, forwarded to the children
 * @return the resulting column expression
 * @throws UnsupportedOperationException if {@code relOpKind} is none of the handled kinds
 */
@Override
public Column getColumnExpression(Dataset<Row> leftDF, Dataset<Row> rightDF, Function<String, DataType> convertStringToDataTypeFunction) throws UnsupportedOperationException {
    Column lhs = getLeftChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);
    Column rhs = getRightChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);

    if (relOpKind == EQUALS_RELOP) {
        return lhs.equalTo(rhs);
    }
    if (relOpKind == NOT_EQUALS_RELOP) {
        return lhs.notEqual(rhs);
    }
    if (relOpKind == GREATER_THAN_RELOP) {
        return lhs.gt(rhs);
    }
    if (relOpKind == GREATER_EQUALS_RELOP) {
        return lhs.geq(rhs);
    }
    if (relOpKind == LESS_THAN_RELOP) {
        return lhs.lt(rhs);
    }
    if (relOpKind == LESS_EQUALS_RELOP) {
        return lhs.leq(rhs);
    }
    if (relOpKind == IS_NULL_RELOP) {
        return lhs.isNull();
    }
    if (relOpKind == IS_NOT_NULL_RELOP) {
        return lhs.isNotNull();
    }
    throw new UnsupportedOperationException();
}
Example #3
Source File: SparkArithmeticOperator.java From spliceengine with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Builds the Spark column expression for this arithmetic operator.
 *
 * <p>Both child expressions are built first; the operation selected by {@code opKind}
 * is then applied to them.
 *
 * @param leftDF left dataframe, forwarded to the children
 * @param rightDF right dataframe, forwarded to the children
 * @param convertStringToDataTypeFunction type lookup, forwarded to the children
 * @return the resulting column expression
 * @throws UnsupportedOperationException if {@code opKind} is none of PLUS, MINUS, TIMES, DIVIDE
 */
@Override
public Column getColumnExpression(Dataset<Row> leftDF, Dataset<Row> rightDF, Function<String, DataType> convertStringToDataTypeFunction) throws UnsupportedOperationException {
    Column lhs = getLeftChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);
    Column rhs = getRightChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);

    if (opKind == PLUS) {
        return lhs.plus(rhs);
    }
    if (opKind == MINUS) {
        return lhs.minus(rhs);
    }
    if (opKind == TIMES) {
        return lhs.multiply(rhs);
    }
    if (opKind == DIVIDE) {
        return lhs.divide(rhs);
    }
    throw new UnsupportedOperationException();
}
Example #4
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0 | 6 votes |
private Dataset<Row> buildManifestEntryDF(List<ManifestFile> manifests) { Dataset<Row> manifestDF = spark .createDataset(Lists.transform(manifests, ManifestFile::path), Encoders.STRING()) .toDF("manifest"); String entriesMetadataTable = metadataTableName(MetadataTableType.ENTRIES); Dataset<Row> manifestEntryDF = spark.read().format("iceberg") .load(entriesMetadataTable) .filter("status < 2") // select only live entries .selectExpr("input_file_name() as manifest", "snapshot_id", "sequence_number", "data_file"); Column joinCond = manifestDF.col("manifest").equalTo(manifestEntryDF.col("manifest")); return manifestEntryDF .join(manifestDF, joinCond, "left_semi") .select("snapshot_id", "sequence_number", "data_file"); }
Example #5
Source File: ParseJSONDeriver.java From envelope with Apache License 2.0 | 6 votes |
/**
 * Parses the JSON held in {@code fieldName} of the {@code stepName} dependency using
 * {@code schema} and {@code options}.
 *
 * <p>When {@code asStruct} is set the parsed value is returned as one struct column named
 * {@code structFieldName}; otherwise every field of the schema becomes its own column and
 * the temporary struct column is dropped.
 *
 * @param dependencies dataframes of the dependency steps, keyed by step name
 * @return the derived dataframe
 */
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
    String tempStructName = "__parsed_json";

    Dataset<Row> input = dependencies.get(stepName);
    Column parsedJson = functions.from_json(new Column(fieldName), schema, options);
    Dataset<Row> parsed = input.select(parsedJson.as(tempStructName));

    if (asStruct) {
        return parsed.withColumnRenamed(tempStructName, structFieldName);
    }

    // Promote each field of the parsed struct to a top-level column.
    for (StructField field : schema.fields()) {
        String fieldPath = tempStructName + "." + field.name();
        parsed = parsed.withColumn(field.name(), new Column(fieldPath));
    }
    return parsed.drop(tempStructName);
}
Example #6
Source File: SparkPredictionServiceRunner.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License | 6 votes |
public Dataset<Row> run(Dataset dataset) { //only use configured variables for pipeline Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(this.sparkRunnerConfig); List<String> predictionVars = configuration.getModelPredictionConfiguration().getPredictionVariables(); List<Column> usedColumns = new ArrayList<>(); for(String var : predictionVars) { usedColumns.add(new Column(var)); } dataset = dataset.select(BpmnaiUtils.getInstance().asSeq(usedColumns)); //go through pipe elements // Define processing steps to run final PreprocessingRunner preprocessingRunner = new PreprocessingRunner(); for(PipelineStep ps : pipelineManager.getOrderedPipeline()) { preprocessingRunner.addPreprocessorStep(ps); } // Run processing runner Dataset<Row> resultDataset = preprocessingRunner.run(dataset, this.sparkRunnerConfig); writeConfig(); return resultDataset; }
Example #7
Source File: Normalization.java From DataVec with Apache License 2.0 | 6 votes |
/** * Scale based on min,max * * @param dataFrame the dataframe to scale * @param min the minimum value * @param max the maximum value * @return the normalized dataframe per column */ public static DataRowsFacade normalize(DataRowsFacade dataFrame, double min, double max, List<String> skipColumns) { List<String> columnsList = DataFrames.toList(dataFrame.get().columns()); columnsList.removeAll(skipColumns); String[] columnNames = DataFrames.toArray(columnsList); //first row is min second row is max, each column in a row is for a particular column List<Row> minMax = minMaxColumns(dataFrame, columnNames); for (int i = 0; i < columnNames.length; i++) { String columnName = columnNames[i]; double dMin = ((Number) minMax.get(0).get(i)).doubleValue(); double dMax = ((Number) minMax.get(1).get(i)).doubleValue(); double maxSubMin = (dMax - dMin); if (maxSubMin == 0) maxSubMin = 1; Column newCol = dataFrame.get().col(columnName).minus(dMin).divide(maxSubMin).multiply(max - min).plus(min); dataFrame = dataRows(dataFrame.get().withColumn(columnName, newCol)); } return dataFrame; }
Example #8
Source File: Loader.java From AWS-MIMIC-IIItoOMOP with Apache License 2.0 | 6 votes |
public void write(String destination, List<String> overflowColumns) throws IOException { String prefix = configuration.getFileSystem() + "://" + configuration.getDestinationBucket() + "/stage"; String suffix = destination + ".csv"; List<Column> columns = new ArrayList<Column>(); List<String> strings = new ArrayList<String>(); addMetadata(destination); materializeUUID(prefix, suffix); strings.add("file_location"); columns.add(col("overlflow_column_uuid_lookup")); columns.add(col("file_location")); for(String column: overflowColumns) { columns.add(col(column)); strings.add(column); } df.select(JavaConverters.asScalaBufferConverter(columns).asScala()).write().format("com.databricks.spark.csv").option("header", "true").option("codec", "org.apache.hadoop.io.compress.GzipCodec").mode("overwrite").save(prefix + "/column_overflow/" + suffix); df.drop(JavaConverters.asScalaBufferConverter(strings).asScala()).write().format("com.databricks.spark.csv").option("header", "true").option("codec", "org.apache.hadoop.io.compress.GzipCodec").mode("overwrite").save(prefix + "/" + suffix); //clean up temp file new File(prefix + "/temp/" + suffix).delete(); }
Example #9
Source File: Normalization.java From deeplearning4j with Apache License 2.0 | 6 votes |
/** * Scale based on min,max * * @param dataFrame the dataframe to scale * @param min the minimum value * @param max the maximum value * @return the normalized dataframe per column */ public static Dataset<Row> normalize(Dataset<Row> dataFrame, double min, double max, List<String> skipColumns) { List<String> columnsList = DataFrames.toList(dataFrame.columns()); columnsList.removeAll(skipColumns); String[] columnNames = DataFrames.toArray(columnsList); //first row is min second row is max, each column in a row is for a particular column List<Row> minMax = minMaxColumns(dataFrame, columnNames); for (int i = 0; i < columnNames.length; i++) { String columnName = columnNames[i]; double dMin = ((Number) minMax.get(0).get(i)).doubleValue(); double dMax = ((Number) minMax.get(1).get(i)).doubleValue(); double maxSubMin = (dMax - dMin); if (maxSubMin == 0) maxSubMin = 1; Column newCol = dataFrame.col(columnName).minus(dMin).divide(maxSubMin).multiply(max - min).plus(min); dataFrame = dataFrame.withColumn(columnName, newCol); } return dataFrame; }
Example #10
Source File: ProjectRestrictMapFunction.java From spliceengine with GNU Affero General Public License v3.0 | 6 votes |
@Override public Pair<Dataset<Row>, OperationContext> nativeTransformation(Dataset<Row> input, OperationContext context) { ProjectRestrictOperation op = (ProjectRestrictOperation) operationContext.getOperation(); Dataset<Row> df = null; // TODO: Enable the commented try-catch block after regression testing. // This would be a safeguard against unanticipated exceptions: // org.apache.spark.sql.catalyst.parser.ParseException // org.apache.spark.sql.AnalysisException // ... which may occur if the Splice parser fails to detect a // SQL expression which SparkSQL does not support. if (op.hasExpressions()) { // try { df = input.selectExpr(op.getExpressions()); return Pair.newPair(df, context); // } // catch (Exception e) { // } } int[] mapping = op.projectMapping; Column[] columns = new Column[mapping.length]; for (int i = 0; i < mapping.length; ++i) { columns[i] = input.col("c" + (mapping[i] - 1)); } df = input.select(columns); return Pair.newPair(df, context); }
Example #11
Source File: Dataset.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Selects the given columns through the parent implementation and wraps the result
 * with {@code from}.
 *
 * <p>{@code initializeFunction} is called before delegating, and the flag it returns is
 * handed to {@code setIsUserTriggered} afterwards.
 *
 * @param cols the columns to select
 * @return the wrapped dataset
 */
@Override
public Dataset<Row> select(final Column... cols) {
    final boolean triggeredByUser = initializeFunction(cols);
    final Dataset<Row> wrapped = from(super.select(cols));
    this.setIsUserTriggered(triggeredByUser);
    return wrapped;
}
Example #12
Source File: Dataset.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Filters by the given condition through the parent implementation and wraps the result
 * with {@code from}.
 *
 * <p>{@code initializeFunction} is called before delegating, and the flag it returns is
 * handed to {@code setIsUserTriggered} afterwards.
 *
 * @param condition the filter condition
 * @return the wrapped dataset
 */
@Override
public Dataset<T> where(final Column condition) {
    final boolean triggeredByUser = initializeFunction(condition);
    final Dataset<T> wrapped = from(super.where(condition));
    this.setIsUserTriggered(triggeredByUser);
    return wrapped;
}
Example #13
Source File: Dataset.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Adds or replaces a column through the parent implementation and wraps the result
 * with {@code from}.
 *
 * <p>{@code initializeFunction} is called before delegating, and the flag it returns is
 * handed to {@code setIsUserTriggered} afterwards.
 *
 * @param colName name of the column
 * @param col expression of the column
 * @return the wrapped dataset
 */
@Override
public Dataset<Row> withColumn(final String colName, final Column col) {
    final boolean triggeredByUser = initializeFunction(colName, col);
    final Dataset<Row> wrapped = from(super.withColumn(colName, col));
    this.setIsUserTriggered(triggeredByUser);
    return wrapped;
}
Example #14
Source File: DataFrames.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Converts a list of column names to a list of columns, in the same order.
 *
 * @param columns the column names to convert
 * @return a new list with one column per name
 */
public static List<Column> toColumn(List<String> columns) {
    List<Column> converted = new ArrayList<>();
    for (String columnName : columns) {
        converted.add(col(columnName));
    }
    return converted;
}
Example #15
Source File: Dataset.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Selects the given sequence of columns through the parent implementation and wraps the
 * result with {@code from}.
 *
 * <p>{@code initializeFunction} is called before delegating, and the flag it returns is
 * handed to {@code setIsUserTriggered} afterwards.
 *
 * @param cols the columns to select
 * @return the wrapped dataset
 */
@Override
public Dataset<Row> select(final scala.collection.Seq<Column> cols) {
    final boolean triggeredByUser = initializeFunction(cols);
    final Dataset<Row> wrapped = from(super.select(cols));
    this.setIsUserTriggered(triggeredByUser);
    return wrapped;
}
Example #16
Source File: TestHoodieSnapshotExporter.java From hudi with Apache License 2.0 | 5 votes |
/**
 * Renames the Hudi partition-path metadata field to {@code PARTITION_NAME}, repartitions
 * by that column and returns a writer partitioned by it.
 *
 * @param source the dataset to write
 * @return a writer for the repartitioned dataset
 */
@Override
public DataFrameWriter<Row> partition(Dataset<Row> source) {
    Dataset<Row> renamed =
        source.withColumnRenamed(HoodieRecord.PARTITION_PATH_METADATA_FIELD, PARTITION_NAME);
    Dataset<Row> repartitioned = renamed.repartition(new Column(PARTITION_NAME));
    return repartitioned.write().partitionBy(PARTITION_NAME);
}
Example #17
Source File: RowUtils.java From envelope with Apache License 2.0 | 5 votes |
/**
 * Converts a list of column names to an array of columns, in the same order.
 *
 * @param columnList the column names to convert
 * @return an array with one column per name
 */
public static Column[] toColumnArray(List<String> columnList) {
    Column[] columns = new Column[columnList.size()];
    int position = 0;
    for (String columnName : columnList) {
        columns[position++] = new Column(columnName);
    }
    return columns;
}
Example #18
Source File: DataframeUtils.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
/**
 * Adds a string metadata entry to a column, keeping the metadata that
 * {@code ColumnUtils.getMetadata} returns for that column.
 *
 * @param df the dataframe holding the column
 * @param colName name of the column to annotate
 * @param key metadata key
 * @param value metadata value
 * @return a dataframe in which {@code colName} carries the extended metadata
 */
public static Dataset<Row> addMetadata(Dataset<Row> df, String colName, String key, String value) {
    Metadata metadata = new MetadataBuilder()
        .withMetadata(ColumnUtils.getMetadata(df, colName))
        .putString(key, value)
        .build();
    // Attach the metadata through the public Column.as(alias, metadata) API. The
    // three-argument Dataset.withColumn overload is private[spark] on the Scala side
    // and is only reachable from Java by accident of bytecode visibility.
    Column annotated = col(colName).as(colName, metadata);
    return df.withColumn(colName, annotated);
}
Example #19
Source File: ColumnUtils.java From net.jgp.labs.spark with Apache License 2.0 | 5 votes |
/**
 * Describes a column as a single labelled line containing its string form.
 *
 * @param col the column to describe
 * @return the label {@code "Name ....... "} followed by {@code col.toString()}
 */
public static String explain(Column col) {
    return "Name ....... " + col.toString();
}
Example #20
Source File: InnerJoinNullFilterFunction.java From spliceengine with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Keeps only the rows in which every hash key column is non-null.
 *
 * <p>Hash key {@code i} refers to the input column named {@code "c" + i}. If there are
 * no hash keys the input is returned unfiltered, since there is nothing to test.
 *
 * @param input the dataframe to filter
 * @param context not used; the returned pair carries {@code null} as its context
 * @return the filtered dataframe paired with a {@code null} context
 */
@Override
public Pair<Dataset<Row>, OperationContext> nativeTransformation(Dataset<Row> input, OperationContext context) {
    Column allKeysNotNull = null;
    for (int i : hashKeys) {
        Column keyNotNull = input.col("c" + i).isNotNull();
        allKeysNotNull = (allKeysNotNull == null) ? keyNotNull : allKeysNotNull.and(keyNotNull);
    }

    // Without any hash key the condition stays null; passing a null Column to
    // filter() would fail, so the input is passed through unchanged instead.
    Dataset<Row> filtered = (allKeysNotNull == null) ? input : input.filter(allKeysNotNull);

    // NOTE(review): the context is returned as null rather than the received one;
    // kept as is because callers may rely on it. Confirm this is intended.
    return Pair.newPair(filtered, null);
}
Example #21
Source File: Dataset.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Sorts within partitions through the parent implementation and wraps the result
 * with {@code from}.
 *
 * <p>{@code initializeFunction} is called before delegating, and the flag it returns is
 * handed to {@code setIsUserTriggered} afterwards.
 *
 * @param sortExprs the sort expressions
 * @return the wrapped dataset
 */
@Override
public Dataset<T> sortWithinPartitions(final scala.collection.Seq<Column> sortExprs) {
    final boolean triggeredByUser = initializeFunction(sortExprs);
    final Dataset<T> wrapped = from(super.sortWithinPartitions(sortExprs));
    this.setIsUserTriggered(triggeredByUser);
    return wrapped;
}
Example #22
Source File: Dataset.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Repartitions by the given expressions through the parent implementation and wraps the
 * result with {@code from}.
 *
 * <p>{@code initializeFunction} is called before delegating, and the flag it returns is
 * handed to {@code setIsUserTriggered} afterwards.
 *
 * @param partitionExprs the partitioning expressions
 * @return the wrapped dataset
 */
@Override
public Dataset<T> repartition(final Column... partitionExprs) {
    final boolean triggeredByUser = initializeFunction(partitionExprs);
    final Dataset<T> wrapped = from(super.repartition(partitionExprs));
    this.setIsUserTriggered(triggeredByUser);
    return wrapped;
}
Example #23
Source File: Dataset.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Orders by the given sequence of expressions through the parent implementation and
 * wraps the result with {@code from}.
 *
 * <p>{@code initializeFunction} is called before delegating, and the flag it returns is
 * handed to {@code setIsUserTriggered} afterwards.
 *
 * @param sortExprs the sort expressions
 * @return the wrapped dataset
 */
@Override
public Dataset<T> orderBy(final scala.collection.Seq<Column> sortExprs) {
    final boolean triggeredByUser = initializeFunction(sortExprs);
    final Dataset<T> wrapped = from(super.orderBy(sortExprs));
    this.setIsUserTriggered(triggeredByUser);
    return wrapped;
}
Example #24
Source File: Dataset.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Orders by the given expressions through the parent implementation and wraps the
 * result with {@code from}.
 *
 * <p>{@code initializeFunction} is called before delegating, and the flag it returns is
 * handed to {@code setIsUserTriggered} afterwards.
 *
 * @param sortExprs the sort expressions
 * @return the wrapped dataset
 */
@Override
public Dataset<T> orderBy(final Column... sortExprs) {
    final boolean triggeredByUser = initializeFunction(sortExprs);
    final Dataset<T> wrapped = from(super.orderBy(sortExprs));
    this.setIsUserTriggered(triggeredByUser);
    return wrapped;
}
Example #25
Source File: Dataset.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Joins with another dataset using the given expression and join type through the
 * parent implementation and wraps the result with {@code from}.
 *
 * <p>{@code initializeFunction} is called before delegating, and the flag it returns is
 * handed to {@code setIsUserTriggered} afterwards.
 *
 * @param right the dataset to join with
 * @param joinExprs the join expression
 * @param joinType the join type
 * @return the wrapped dataset
 */
@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right, final Column joinExprs, final String joinType) {
    final boolean triggeredByUser = initializeFunction(right, joinExprs, joinType);
    final Dataset<Row> wrapped = from(super.join(right, joinExprs, joinType));
    this.setIsUserTriggered(triggeredByUser);
    return wrapped;
}
Example #26
Source File: Dataset.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Joins with another dataset using the given expression through the parent
 * implementation and wraps the result with {@code from}.
 *
 * <p>{@code initializeFunction} is called before delegating, and the flag it returns is
 * handed to {@code setIsUserTriggered} afterwards.
 *
 * @param right the dataset to join with
 * @param joinExprs the join expression
 * @return the wrapped dataset
 */
@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right, final Column joinExprs) {
    final boolean triggeredByUser = initializeFunction(right, joinExprs);
    final Dataset<Row> wrapped = from(super.join(right, joinExprs));
    this.setIsUserTriggered(triggeredByUser);
    return wrapped;
}
Example #27
Source File: Dataset.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Drops the given column through the parent implementation and wraps the result
 * with {@code from}.
 *
 * <p>{@code initializeFunction} is called before delegating, and the flag it returns is
 * handed to {@code setIsUserTriggered} afterwards.
 *
 * @param col the column to drop
 * @return the wrapped dataset
 */
@Override
public Dataset<Row> drop(final Column col) {
    final boolean triggeredByUser = initializeFunction(col);
    final Dataset<Row> wrapped = from(super.drop(col));
    this.setIsUserTriggered(triggeredByUser);
    return wrapped;
}
Example #28
Source File: Dataset.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Aggregates with the given expressions through the parent implementation and wraps the
 * result with {@code from}.
 *
 * <p>{@code initializeFunction} is called before delegating, and the flag it returns is
 * handed to {@code setIsUserTriggered} afterwards.
 *
 * @param expr the first aggregate expression
 * @param exprs the remaining aggregate expressions
 * @return the wrapped dataset
 */
@Override
public Dataset<Row> agg(final Column expr, final scala.collection.Seq<Column> exprs) {
    final boolean triggeredByUser = initializeFunction(expr, exprs);
    final Dataset<Row> wrapped = from(super.agg(expr, exprs));
    this.setIsUserTriggered(triggeredByUser);
    return wrapped;
}
Example #29
Source File: Dataset.java From nemo with Apache License 2.0 | 5 votes |
/**
 * Aggregates with the given expressions through the parent implementation and wraps the
 * result with {@code from}.
 *
 * <p>{@code initializeFunction} is called before delegating, and the flag it returns is
 * handed to {@code setIsUserTriggered} afterwards.
 *
 * @param expr the first aggregate expression
 * @param exprs the remaining aggregate expressions
 * @return the wrapped dataset
 */
@Override
public Dataset<Row> agg(final Column expr, final Column... exprs) {
    final boolean triggeredByUser = initializeFunction(expr, exprs);
    final Dataset<Row> wrapped = from(super.agg(expr, exprs));
    this.setIsUserTriggered(triggeredByUser);
    return wrapped;
}
Example #30
Source File: DataFrames.java From DataVec with Apache License 2.0 | 5 votes |
/**
 * Converts column names to an array of columns, in the same order.
 *
 * @param columns the column names to convert
 * @return an array with one column per name
 */
public static Column[] toColumns(String... columns) {
    Column[] converted = new Column[columns.length];
    int position = 0;
    for (String columnName : columns) {
        converted[position++] = col(columnName);
    }
    return converted;
}