org.apache.spark.sql.Column Java Examples

Source File: HoodieSnapshotExporter.java From hudi with Apache License 2.0

7 votes

/**
 * Reads the latest base files of the given partitions as parquet and writes them out
 * in {@code cfg.outputFormat} to {@code cfg.targetOutputPath}, overwriting existing output.
 *
 * @param jsc                   Spark context used to list the base files and to read them
 * @param cfg                   exporter configuration (output format/path, partition field, partitioner class)
 * @param partitions            partition paths to look up base files for
 * @param latestCommitTimestamp timestamp passed to {@code getLatestBaseFilesBeforeOrOn}
 */
private void exportAsNonHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) {
  // Fallback partitioner: drops the Hudi meta columns and, when an output partition field
  // is configured, repartitions by that field and partitions the written output by it.
  Partitioner defaultPartitioner = dataset -> {
    Dataset<Row> withoutMetaColumns = dataset.drop(
        JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
    if (StringUtils.isNullOrEmpty(cfg.outputPartitionField)) {
      return withoutMetaColumns.write();
    }
    return withoutMetaColumns
        .repartition(new Column(cfg.outputPartitionField))
        .write()
        .partitionBy(cfg.outputPartitionField);
  };

  // A configured partitioner class takes precedence over the fallback.
  Partitioner partitioner;
  if (StringUtils.isNullOrEmpty(cfg.outputPartitioner)) {
    partitioner = defaultPartitioner;
  } else {
    partitioner = ReflectionUtils.loadClass(cfg.outputPartitioner);
  }

  final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
  // One slice per partition; each slice resolves the paths of its partition's base files.
  Iterator<String> baseFilePaths = jsc
      .parallelize(partitions, partitions.size())
      .flatMap(partition -> fsView
          .getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)
          .map(HoodieBaseFile::getPath).iterator())
      .toLocalIterator();

  SQLContext sqlContext = new SQLContext(jsc);
  Dataset<Row> sourceDataset = sqlContext.read().parquet(JavaConversions.asScalaIterator(baseFilePaths).toSeq());

  partitioner.partition(sourceDataset)
      .format(cfg.outputFormat)
      .mode(SaveMode.Overwrite)
      .save(cfg.targetOutputPath);
}

Source File: SparkRelationalOperator.java From spliceengine with GNU Affero General Public License v3.0

6 votes

/**
 * Builds the Spark {@link Column} expression for this relational operator.
 *
 * <p>Both child expressions are built first; the comparison operators combine them,
 * while the null checks ({@code IS_NULL_RELOP}, {@code IS_NOT_NULL_RELOP}) apply to the
 * left expression only.
 *
 * @param leftDF  left data frame, forwarded to the children
 * @param rightDF right data frame, forwarded to the children
 * @param convertStringToDataTypeFunction type-conversion function, forwarded to the children
 * @return the column expression matching {@code relOpKind}
 * @throws UnsupportedOperationException if {@code relOpKind} is none of the handled kinds
 */
@Override
public Column getColumnExpression(Dataset<Row> leftDF,
                                  Dataset<Row> rightDF,
                                  Function<String, DataType> convertStringToDataTypeFunction) throws UnsupportedOperationException {
    Column lhs = getLeftChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);
    Column rhs = getRightChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);

    if (relOpKind == EQUALS_RELOP) {
        return lhs.equalTo(rhs);
    }
    if (relOpKind == NOT_EQUALS_RELOP) {
        return lhs.notEqual(rhs);
    }
    if (relOpKind == GREATER_THAN_RELOP) {
        return lhs.gt(rhs);
    }
    if (relOpKind == GREATER_EQUALS_RELOP) {
        return lhs.geq(rhs);
    }
    if (relOpKind == LESS_THAN_RELOP) {
        return lhs.lt(rhs);
    }
    if (relOpKind == LESS_EQUALS_RELOP) {
        return lhs.leq(rhs);
    }
    if (relOpKind == IS_NULL_RELOP) {
        return lhs.isNull();
    }
    if (relOpKind == IS_NOT_NULL_RELOP) {
        return lhs.isNotNull();
    }
    throw new UnsupportedOperationException();
}

Source File: SparkArithmeticOperator.java From spliceengine with GNU Affero General Public License v3.0

6 votes

/**
 * Builds the Spark {@link Column} expression for this arithmetic operator by combining
 * the expressions of its left and right children.
 *
 * @param leftDF  left data frame, forwarded to the children
 * @param rightDF right data frame, forwarded to the children
 * @param convertStringToDataTypeFunction type-conversion function, forwarded to the children
 * @return the column expression matching {@code opKind}
 * @throws UnsupportedOperationException if {@code opKind} is not PLUS, MINUS, TIMES or DIVIDE
 */
@Override
public Column getColumnExpression(Dataset<Row> leftDF,
                                  Dataset<Row> rightDF,
                                  Function<String, DataType> convertStringToDataTypeFunction) throws UnsupportedOperationException {
    Column lhs = getLeftChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);
    Column rhs = getRightChild().getColumnExpression(leftDF, rightDF, convertStringToDataTypeFunction);

    if (opKind == PLUS) {
        return lhs.plus(rhs);
    }
    if (opKind == MINUS) {
        return lhs.minus(rhs);
    }
    if (opKind == TIMES) {
        return lhs.multiply(rhs);
    }
    if (opKind == DIVIDE) {
        return lhs.divide(rhs);
    }
    throw new UnsupportedOperationException();
}

Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0

6 votes

/**
 * Builds a data frame of the live entries that belong to the given manifests.
 *
 * <p>The entries metadata table is filtered to {@code status < 2}, tagged with the
 * originating file name as {@code manifest}, and left-semi joined against the paths of
 * {@code manifests}.
 *
 * @param manifests manifests whose entries should be returned
 * @return rows with columns {@code snapshot_id}, {@code sequence_number} and {@code data_file}
 */
private Dataset<Row> buildManifestEntryDF(List<ManifestFile> manifests) {
  List<String> manifestPaths = Lists.transform(manifests, ManifestFile::path);
  Dataset<Row> manifestDF = spark.createDataset(manifestPaths, Encoders.STRING()).toDF("manifest");

  String entriesTable = metadataTableName(MetadataTableType.ENTRIES);
  Dataset<Row> liveEntries = spark.read().format("iceberg")
      .load(entriesTable)
      .filter("status < 2"); // select only live entries
  Dataset<Row> manifestEntryDF =
      liveEntries.selectExpr("input_file_name() as manifest", "snapshot_id", "sequence_number", "data_file");

  // Left-semi join keeps only entries whose manifest path is in the requested set.
  Column sameManifest = manifestDF.col("manifest").equalTo(manifestEntryDF.col("manifest"));
  Dataset<Row> matchingEntries = manifestEntryDF.join(manifestDF, sameManifest, "left_semi");
  return matchingEntries.select("snapshot_id", "sequence_number", "data_file");
}

Source File: ParseJSONDeriver.java From envelope with Apache License 2.0

6 votes

/**
 * Parses the JSON in column {@code fieldName} of the dependency named {@code stepName}.
 *
 * <p>When {@code asStruct} is set, the result holds a single struct column named
 * {@code structFieldName}. Otherwise each field of {@code schema} becomes its own
 * column and the temporary struct column is dropped.
 *
 * @param dependencies datasets of the dependency steps, keyed by step name
 * @return the derived dataset
 */
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
  final String tempStructName = "__parsed_json";

  Dataset<Row> input = dependencies.get(stepName);
  Dataset<Row> result = input.select(
      functions.from_json(new Column(fieldName), schema, options).as(tempStructName));

  if (asStruct) {
    return result.withColumnRenamed(tempStructName, structFieldName);
  }

  // Flatten: promote every parsed field to a column, then remove the temporary struct.
  for (StructField field : schema.fields()) {
    String name = field.name();
    result = result.withColumn(name, new Column(tempStructName + "." + name));
  }
  return result.drop(tempStructName);
}

Source File: SparkPredictionServiceRunner.java From bpmn.ai with BSD 3-Clause "New" or "Revised" License

6 votes

/**
 * Restricts the dataset to the configured prediction variables, runs every step of the
 * ordered pipeline through a {@code PreprocessingRunner}, calls {@code writeConfig()} and
 * returns the runner's output.
 *
 * @param dataset input dataset
 * @return the dataset produced by the preprocessing runner
 */
public Dataset<Row> run(Dataset dataset) {
    // Keep only the variables configured for prediction.
    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(this.sparkRunnerConfig);
    List<String> variableNames = configuration.getModelPredictionConfiguration().getPredictionVariables();
    List<Column> selectedColumns = new ArrayList<>();
    for (String variableName : variableNames) {
        selectedColumns.add(new Column(variableName));
    }
    Dataset reduced = dataset.select(BpmnaiUtils.getInstance().asSeq(selectedColumns));

    // Register the pipeline steps in their configured order.
    final PreprocessingRunner runner = new PreprocessingRunner();
    for (PipelineStep step : pipelineManager.getOrderedPipeline()) {
        runner.addPreprocessorStep(step);
    }

    // Execute the steps.
    Dataset<Row> output = runner.run(reduced, this.sparkRunnerConfig);

    writeConfig();

    return output;
}

Source File: Normalization.java From DataVec with Apache License 2.0

6 votes

/**
 * Min-max scales the columns of a data frame into the range defined by {@code min} and {@code max}.
 *
 * <p>Each value becomes {@code (value - colMin) / (colMax - colMin) * (max - min) + min},
 * where {@code colMin}/{@code colMax} come from {@code minMaxColumns}. When a column's
 * max equals its min, the divisor is replaced by 1.
 *
 * @param dataFrame   the dataframe to scale
 * @param min         the lower bound of the target range
 * @param max         the upper bound of the target range
 * @param skipColumns names of columns that are left untouched
 * @return the dataframe with every non-skipped column rescaled
 */
public static DataRowsFacade normalize(DataRowsFacade dataFrame, double min, double max, List<String> skipColumns) {
    List<String> toScale = DataFrames.toList(dataFrame.get().columns());
    toScale.removeAll(skipColumns);
    String[] names = DataFrames.toArray(toScale);

    // Row 0 holds the per-column minimum, row 1 the per-column maximum (same order as names).
    List<Row> bounds = minMaxColumns(dataFrame, names);
    Row minima = bounds.get(0);
    Row maxima = bounds.get(1);
    double targetSpan = max - min;

    DataRowsFacade scaledFrame = dataFrame;
    for (int idx = 0; idx < names.length; idx++) {
        String name = names[idx];
        double colMin = ((Number) minima.get(idx)).doubleValue();
        double colMax = ((Number) maxima.get(idx)).doubleValue();
        double span = colMax - colMin;
        // A constant column has zero span; divide by 1 instead.
        double divisor = (span == 0) ? 1 : span;

        Column rescaled = scaledFrame.get().col(name)
                .minus(colMin)
                .divide(divisor)
                .multiply(targetSpan)
                .plus(min);
        scaledFrame = dataRows(scaledFrame.get().withColumn(name, rescaled));
    }

    return scaledFrame;
}

Source File: Loader.java From AWS-MIMIC-IIItoOMOP with Apache License 2.0

6 votes

/**
 * Writes {@code df} as two gzip-compressed CSV outputs under the stage prefix: one with
 * the lookup column, {@code file_location} and the overflow columns, and one with
 * {@code file_location} and the overflow columns dropped. Afterwards a delete is
 * attempted on the temp path for this destination.
 *
 * @param destination     base name of the output; {@code ".csv"} is appended
 * @param overflowColumns names of the columns routed to the overflow output
 * @throws IOException declared by the signature
 */
public void write(String destination, List<String> overflowColumns) throws IOException
{
    final String stageRoot = configuration.getFileSystem() + "://" + configuration.getDestinationBucket() + "/stage";
    final String fileName = destination + ".csv";

    List<Column> overflowSelection = new ArrayList<Column>();
    List<String> droppedNames = new ArrayList<String>();

    addMetadata(destination);
    materializeUUID(stageRoot, fileName);

    overflowSelection.add(col("overlflow_column_uuid_lookup"));
    overflowSelection.add(col("file_location"));
    droppedNames.add("file_location");

    for (String overflowColumn : overflowColumns)
    {
        overflowSelection.add(col(overflowColumn));
        droppedNames.add(overflowColumn);
    }

    // Overflow output: lookup key, file location and the overflow columns.
    df.select(JavaConverters.asScalaBufferConverter(overflowSelection).asScala())
        .write()
        .format("com.databricks.spark.csv")
        .option("header", "true")
        .option("codec", "org.apache.hadoop.io.compress.GzipCodec")
        .mode("overwrite")
        .save(stageRoot + "/column_overflow/" + fileName);

    // Main output: everything except file location and the overflow columns.
    df.drop(JavaConverters.asScalaBufferConverter(droppedNames).asScala())
        .write()
        .format("com.databricks.spark.csv")
        .option("header", "true")
        .option("codec", "org.apache.hadoop.io.compress.GzipCodec")
        .mode("overwrite")
        .save(stageRoot + "/" + fileName);

    //clean up temp file
    new File(stageRoot + "/temp/" + fileName).delete();
}

Source File: Normalization.java From deeplearning4j with Apache License 2.0

6 votes

/**
 * Min-max scales the columns of a data frame into the range defined by {@code min} and {@code max}.
 *
 * <p>Each value becomes {@code (value - colMin) / (colMax - colMin) * (max - min) + min},
 * where {@code colMin}/{@code colMax} come from {@code minMaxColumns}. When a column's
 * max equals its min, the divisor is replaced by 1.
 *
 * @param dataFrame   the dataframe to scale
 * @param min         the lower bound of the target range
 * @param max         the upper bound of the target range
 * @param skipColumns names of columns that are left untouched
 * @return the dataframe with every non-skipped column rescaled
 */
public static Dataset<Row> normalize(Dataset<Row> dataFrame, double min, double max, List<String> skipColumns) {
    List<String> toScale = DataFrames.toList(dataFrame.columns());
    toScale.removeAll(skipColumns);
    String[] names = DataFrames.toArray(toScale);

    // Row 0 holds the per-column minimum, row 1 the per-column maximum (same order as names).
    List<Row> bounds = minMaxColumns(dataFrame, names);
    Row minima = bounds.get(0);
    Row maxima = bounds.get(1);
    double targetSpan = max - min;

    Dataset<Row> scaledFrame = dataFrame;
    for (int idx = 0; idx < names.length; idx++) {
        String name = names[idx];
        double colMin = ((Number) minima.get(idx)).doubleValue();
        double colMax = ((Number) maxima.get(idx)).doubleValue();
        double span = colMax - colMin;
        // A constant column has zero span; divide by 1 instead.
        double divisor = (span == 0) ? 1 : span;

        Column rescaled = scaledFrame.col(name)
                .minus(colMin)
                .divide(divisor)
                .multiply(targetSpan)
                .plus(min);
        scaledFrame = scaledFrame.withColumn(name, rescaled);
    }

    return scaledFrame;
}

Source File: ProjectRestrictMapFunction.java From spliceengine with GNU Affero General Public License v3.0

6 votes

/**
 * Applies the projection of the current {@code ProjectRestrictOperation} to a data frame.
 *
 * <p>If the operation has expressions they are applied with {@code selectExpr};
 * otherwise columns are selected by name as {@code "c" + (mapping[i] - 1)} for each
 * entry of the operation's {@code projectMapping}.
 *
 * @param input   data frame to project
 * @param context operation context returned alongside the result
 * @return the projected data frame paired with {@code context}
 */
@Override
public Pair<Dataset<Row>, OperationContext> nativeTransformation(Dataset<Row> input, OperationContext context) {
    ProjectRestrictOperation op = (ProjectRestrictOperation) operationContext.getOperation();

    // TODO:  After regression testing, wrap the selectExpr call below in a try-catch.
    //        This would be a safeguard against unanticipated exceptions:
    //             org.apache.spark.sql.catalyst.parser.ParseException
    //             org.apache.spark.sql.AnalysisException
    //    ... which may occur if the Splice parser fails to detect a
    //        SQL expression which SparkSQL does not support.
    if (op.hasExpressions()) {
        return Pair.newPair(input.selectExpr(op.getExpressions()), context);
    }

    int[] mapping = op.projectMapping;
    Column[] projected = new Column[mapping.length];
    int slot = 0;
    for (int sourcePosition : mapping) {
        // The column name uses the mapping entry minus one.
        projected[slot++] = input.col("c" + (sourcePosition - 1));
    }
    return Pair.newPair(input.select(projected), context);
}

Source File: Dataset.java From nemo with Apache License 2.0

5 votes

/**
 * {@code select} override that wraps the parent's result with {@code from}.
 *
 * <p>{@code initializeFunction} is called with the arguments before delegating, and the
 * boolean it returns is passed to {@code setIsUserTriggered} once the result is wrapped.
 *
 * @param cols columns forwarded unchanged to the parent implementation
 * @return the wrapped result of the parent {@code select}
 */
@Override
public Dataset<Row> select(final Column... cols) {
  final boolean triggered = initializeFunction(cols);
  final org.apache.spark.sql.Dataset<Row> selected = super.select(cols);
  final Dataset<Row> wrapped = from(selected);
  this.setIsUserTriggered(triggered);
  return wrapped;
}

Source File: Dataset.java From nemo with Apache License 2.0

5 votes

/**
 * {@code where} override that wraps the parent's result with {@code from}.
 *
 * <p>{@code initializeFunction} is called with the argument before delegating, and the
 * boolean it returns is passed to {@code setIsUserTriggered} once the result is wrapped.
 *
 * @param condition filter condition forwarded unchanged to the parent implementation
 * @return the wrapped result of the parent {@code where}
 */
@Override
public Dataset<T> where(final Column condition) {
  final boolean triggered = initializeFunction(condition);
  final org.apache.spark.sql.Dataset<T> filtered = super.where(condition);
  final Dataset<T> wrapped = from(filtered);
  this.setIsUserTriggered(triggered);
  return wrapped;
}

Source File: Dataset.java From nemo with Apache License 2.0

5 votes

/**
 * {@code withColumn} override that wraps the parent's result with {@code from}.
 *
 * <p>{@code initializeFunction} is called with the arguments before delegating, and the
 * boolean it returns is passed to {@code setIsUserTriggered} once the result is wrapped.
 *
 * @param colName column name forwarded unchanged to the parent implementation
 * @param col     column expression forwarded unchanged to the parent implementation
 * @return the wrapped result of the parent {@code withColumn}
 */
@Override
public Dataset<Row> withColumn(final String colName, final Column col) {
  final boolean triggered = initializeFunction(colName, col);
  final org.apache.spark.sql.Dataset<Row> extended = super.withColumn(colName, col);
  final Dataset<Row> wrapped = from(extended);
  this.setIsUserTriggered(triggered);
  return wrapped;
}

Source File: DataFrames.java From DataVec with Apache License 2.0

5 votes

/**
 * Builds one {@link Column} per name via {@code col(name)}, keeping the input order.
 *
 * @param columns the column names to convert
 * @return a new list holding the column for each name
 */
public static List<Column> toColumn(List<String> columns) {
    List<Column> converted = new ArrayList<>(columns.size());
    for (String name : columns) {
        converted.add(col(name));
    }
    return converted;
}

Source File: Dataset.java From nemo with Apache License 2.0

5 votes

/**
 * {@code select} override (Scala {@code Seq} variant) that wraps the parent's result with {@code from}.
 *
 * <p>{@code initializeFunction} is called with the argument before delegating, and the
 * boolean it returns is passed to {@code setIsUserTriggered} once the result is wrapped.
 *
 * @param cols columns forwarded unchanged to the parent implementation
 * @return the wrapped result of the parent {@code select}
 */
@Override
public Dataset<Row> select(final scala.collection.Seq<Column> cols) {
  final boolean triggered = initializeFunction(cols);
  final org.apache.spark.sql.Dataset<Row> selected = super.select(cols);
  final Dataset<Row> wrapped = from(selected);
  this.setIsUserTriggered(triggered);
  return wrapped;
}

Source File: TestHoodieSnapshotExporter.java From hudi with Apache License 2.0

5 votes

/**
 * Renames the Hudi partition-path metadata column to {@code PARTITION_NAME}, repartitions
 * by that column and returns a writer partitioned by it.
 *
 * @param source dataset to partition
 * @return writer for the repartitioned dataset, partitioned by {@code PARTITION_NAME}
 */
@Override
public DataFrameWriter<Row> partition(Dataset<Row> source) {
  Dataset<Row> renamed =
      source.withColumnRenamed(HoodieRecord.PARTITION_PATH_METADATA_FIELD, PARTITION_NAME);
  Dataset<Row> regrouped = renamed.repartition(new Column(PARTITION_NAME));
  return regrouped.write().partitionBy(PARTITION_NAME);
}

Source File: RowUtils.java From envelope with Apache License 2.0

5 votes

/**
 * Converts a list of column names into an array of {@link Column}s in the same order.
 *
 * @param columnList the column names
 * @return an array with one {@code Column} per name
 */
public static Column[] toColumnArray(List<String> columnList) {
  return columnList.stream()
      .map(name -> new Column(name))
      .toArray(Column[]::new);
}

Source File: DataframeUtils.java From net.jgp.labs.spark with Apache License 2.0

5 votes

/**
 * Returns the data frame with one string entry added to a column's metadata.
 *
 * <p>The new metadata is built from the column's current metadata (as returned by
 * {@code ColumnUtils.getMetadata}) plus the given key/value pair.
 *
 * @param df      the data frame holding the column
 * @param colName the name of the column to update
 * @param key     the metadata key
 * @param value   the metadata value
 * @return the result of re-adding the column with the updated metadata
 */
public static Dataset<Row> addMetadata(Dataset<Row> df, String colName,
    String key, String value) {
  Metadata existing = ColumnUtils.getMetadata(df, colName);
  Metadata updated = new MetadataBuilder()
      .withMetadata(existing)
      .putString(key, value)
      .build();
  return df.withColumn(colName, col(colName), updated);
}

Source File: ColumnUtils.java From net.jgp.labs.spark with Apache License 2.0

5 votes

/**
 * Returns a short description of a column: the label {@code "Name ....... "} followed
 * by the column's {@code toString()} value.
 *
 * @param col the column to describe
 * @return the description string
 */
public static String explain(Column col) {
  return "Name ....... " + col.toString();
}

Source File: InnerJoinNullFilterFunction.java From spliceengine with GNU Affero General Public License v3.0

5 votes

/**
 * Filters out rows in which any hash-key column is null.
 *
 * <p>For every index {@code i} in {@code hashKeys} the column {@code "c" + i} must be
 * non-null; the individual checks are AND-ed together. When {@code hashKeys} is empty
 * there is nothing to check and the input is returned unfiltered.
 *
 * @param input   data frame to filter
 * @param context operation context (not used; the returned pair carries {@code null})
 * @return the filtered data frame paired with a {@code null} context
 */
@Override
public Pair<Dataset<Row>, OperationContext> nativeTransformation(Dataset<Row> input, OperationContext context) {
    Column allKeysNotNull = null;
    for (int i : hashKeys) {
        Column keyNotNull = input.col("c" + i).isNotNull();
        allKeysNotNull = (allKeysNotNull == null) ? keyNotNull : allKeysNotNull.and(keyNotNull);
    }
    if (allKeysNotNull == null) {
        // No hash keys: skip the filter instead of handing a null condition to Dataset.filter.
        return Pair.newPair(input, null);
    }
    return Pair.newPair(input.filter(allKeysNotNull), null);
}

Source File: Dataset.java From nemo with Apache License 2.0

5 votes

/**
 * {@code sortWithinPartitions} override that wraps the parent's result with {@code from}.
 *
 * <p>{@code initializeFunction} is called with the argument before delegating, and the
 * boolean it returns is passed to {@code setIsUserTriggered} once the result is wrapped.
 *
 * @param sortExprs sort expressions forwarded unchanged to the parent implementation
 * @return the wrapped result of the parent {@code sortWithinPartitions}
 */
@Override
public Dataset<T> sortWithinPartitions(final scala.collection.Seq<Column> sortExprs) {
  final boolean triggered = initializeFunction(sortExprs);
  final org.apache.spark.sql.Dataset<T> sorted = super.sortWithinPartitions(sortExprs);
  final Dataset<T> wrapped = from(sorted);
  this.setIsUserTriggered(triggered);
  return wrapped;
}

Source File: Dataset.java From nemo with Apache License 2.0

5 votes

/**
 * {@code repartition} override that wraps the parent's result with {@code from}.
 *
 * <p>{@code initializeFunction} is called with the arguments before delegating, and the
 * boolean it returns is passed to {@code setIsUserTriggered} once the result is wrapped.
 *
 * @param partitionExprs partitioning expressions forwarded unchanged to the parent implementation
 * @return the wrapped result of the parent {@code repartition}
 */
@Override
public Dataset<T> repartition(final Column... partitionExprs) {
  final boolean triggered = initializeFunction(partitionExprs);
  final org.apache.spark.sql.Dataset<T> repartitioned = super.repartition(partitionExprs);
  final Dataset<T> wrapped = from(repartitioned);
  this.setIsUserTriggered(triggered);
  return wrapped;
}

Source File: Dataset.java From nemo with Apache License 2.0

5 votes

/**
 * {@code orderBy} override (Scala {@code Seq} variant) that wraps the parent's result with {@code from}.
 *
 * <p>{@code initializeFunction} is called with the argument before delegating, and the
 * boolean it returns is passed to {@code setIsUserTriggered} once the result is wrapped.
 *
 * @param sortExprs sort expressions forwarded unchanged to the parent implementation
 * @return the wrapped result of the parent {@code orderBy}
 */
@Override
public Dataset<T> orderBy(final scala.collection.Seq<Column> sortExprs) {
  final boolean triggered = initializeFunction(sortExprs);
  final org.apache.spark.sql.Dataset<T> ordered = super.orderBy(sortExprs);
  final Dataset<T> wrapped = from(ordered);
  this.setIsUserTriggered(triggered);
  return wrapped;
}

Source File: Dataset.java From nemo with Apache License 2.0

5 votes

/**
 * {@code orderBy} override (varargs variant) that wraps the parent's result with {@code from}.
 *
 * <p>{@code initializeFunction} is called with the arguments before delegating, and the
 * boolean it returns is passed to {@code setIsUserTriggered} once the result is wrapped.
 *
 * @param sortExprs sort expressions forwarded unchanged to the parent implementation
 * @return the wrapped result of the parent {@code orderBy}
 */
@Override
public Dataset<T> orderBy(final Column... sortExprs) {
  final boolean triggered = initializeFunction(sortExprs);
  final org.apache.spark.sql.Dataset<T> ordered = super.orderBy(sortExprs);
  final Dataset<T> wrapped = from(ordered);
  this.setIsUserTriggered(triggered);
  return wrapped;
}

Source File: Dataset.java From nemo with Apache License 2.0

5 votes

/**
 * {@code join} override (with join type) that wraps the parent's result with {@code from}.
 *
 * <p>{@code initializeFunction} is called with the arguments before delegating, and the
 * boolean it returns is passed to {@code setIsUserTriggered} once the result is wrapped.
 *
 * @param right     right-hand dataset forwarded unchanged to the parent implementation
 * @param joinExprs join expression forwarded unchanged to the parent implementation
 * @param joinType  join type forwarded unchanged to the parent implementation
 * @return the wrapped result of the parent {@code join}
 */
@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right, final Column joinExprs, final String joinType) {
  final boolean triggered = initializeFunction(right, joinExprs, joinType);
  final org.apache.spark.sql.Dataset<Row> joined = super.join(right, joinExprs, joinType);
  final Dataset<Row> wrapped = from(joined);
  this.setIsUserTriggered(triggered);
  return wrapped;
}

Source File: Dataset.java From nemo with Apache License 2.0

5 votes

/**
 * {@code join} override (without join type) that wraps the parent's result with {@code from}.
 *
 * <p>{@code initializeFunction} is called with the arguments before delegating, and the
 * boolean it returns is passed to {@code setIsUserTriggered} once the result is wrapped.
 *
 * @param right     right-hand dataset forwarded unchanged to the parent implementation
 * @param joinExprs join expression forwarded unchanged to the parent implementation
 * @return the wrapped result of the parent {@code join}
 */
@Override
public Dataset<Row> join(final org.apache.spark.sql.Dataset<?> right, final Column joinExprs) {
  final boolean triggered = initializeFunction(right, joinExprs);
  final org.apache.spark.sql.Dataset<Row> joined = super.join(right, joinExprs);
  final Dataset<Row> wrapped = from(joined);
  this.setIsUserTriggered(triggered);
  return wrapped;
}

Source File: Dataset.java From nemo with Apache License 2.0

5 votes

/**
 * {@code drop} override that wraps the parent's result with {@code from}.
 *
 * <p>{@code initializeFunction} is called with the argument before delegating, and the
 * boolean it returns is passed to {@code setIsUserTriggered} once the result is wrapped.
 *
 * @param col column forwarded unchanged to the parent implementation
 * @return the wrapped result of the parent {@code drop}
 */
@Override
public Dataset<Row> drop(final Column col) {
  final boolean triggered = initializeFunction(col);
  final org.apache.spark.sql.Dataset<Row> remaining = super.drop(col);
  final Dataset<Row> wrapped = from(remaining);
  this.setIsUserTriggered(triggered);
  return wrapped;
}

Source File: Dataset.java From nemo with Apache License 2.0

5 votes

/**
 * {@code agg} override (Scala {@code Seq} variant) that wraps the parent's result with {@code from}.
 *
 * <p>{@code initializeFunction} is called with the arguments before delegating, and the
 * boolean it returns is passed to {@code setIsUserTriggered} once the result is wrapped.
 *
 * @param expr  first aggregate expression, forwarded unchanged to the parent implementation
 * @param exprs remaining aggregate expressions, forwarded unchanged to the parent implementation
 * @return the wrapped result of the parent {@code agg}
 */
@Override
public Dataset<Row> agg(final Column expr, final scala.collection.Seq<Column> exprs) {
  final boolean triggered = initializeFunction(expr, exprs);
  final org.apache.spark.sql.Dataset<Row> aggregated = super.agg(expr, exprs);
  final Dataset<Row> wrapped = from(aggregated);
  this.setIsUserTriggered(triggered);
  return wrapped;
}

Source File: Dataset.java From nemo with Apache License 2.0

5 votes

/**
 * {@code agg} override (varargs variant) that wraps the parent's result with {@code from}.
 *
 * <p>{@code initializeFunction} is called with the arguments before delegating, and the
 * boolean it returns is passed to {@code setIsUserTriggered} once the result is wrapped.
 *
 * @param expr  first aggregate expression, forwarded unchanged to the parent implementation
 * @param exprs remaining aggregate expressions, forwarded unchanged to the parent implementation
 * @return the wrapped result of the parent {@code agg}
 */
@Override
public Dataset<Row> agg(final Column expr, final Column... exprs) {
  final boolean triggered = initializeFunction(expr, exprs);
  final org.apache.spark.sql.Dataset<Row> aggregated = super.agg(expr, exprs);
  final Dataset<Row> wrapped = from(aggregated);
  this.setIsUserTriggered(triggered);
  return wrapped;
}

Source File: DataFrames.java From DataVec with Apache License 2.0

5 votes

/**
 * Builds one {@link Column} per name via {@code col(name)}, keeping the input order.
 *
 * @param columns the column names to convert
 * @return an array holding the column for each name
 */
public static Column[] toColumns(String... columns) {
    Column[] converted = new Column[columns.length];
    int slot = 0;
    for (String name : columns) {
        converted[slot++] = col(name);
    }
    return converted;
}

org.apache.spark.sql.Column Java Examples