Java Code Examples for org.apache.spark.sql.Dataset#columns()
The following examples show how to use org.apache.spark.sql.Dataset#columns().
You can go to the original project or source file by following the link above each example.
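Before the project examples, here is a minimal, hedged sketch of the API itself: Dataset#columns() returns the column names as a String[] in schema order, which is what every example below iterates over or indexes into. The class name, the local master setting, and the data/people.json path are illustrative placeholders, not taken from any of the projects listed here.

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ColumnsExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("Dataset#columns() demo")
                .master("local[*]")
                .getOrCreate();

        // Any DataFrame works the same way; the JSON path here is a placeholder.
        Dataset<Row> df = spark.read().json("data/people.json");

        // columns() returns a String[] of column names in schema order.
        String[] columns = df.columns();
        System.out.println("Column count: " + columns.length);
        System.out.println("Columns: " + Arrays.toString(columns));

        spark.stop();
    }
}

Typical uses, as in the examples below, are checking columns().length to detect an empty schema, iterating over the array to rename columns with withColumnRenamed, or picking a default column via columns()[0].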
Example 1
Source File: SparkCubingJobTest.java, from kylin-on-parquet-v2, Apache License 2.0
private Integer convertOutSchema(Dataset<Row> layoutDs, String fieldName,
        org.apache.spark.sql.types.DataType dataType) {
    StructField[] structFieldList = layoutDs.schema().fields();
    String[] columns = layoutDs.columns();
    int index = 0;
    StructField[] outStructFieldList = new StructField[structFieldList.length];
    for (int i = 0; i < structFieldList.length; i++) {
        if (columns[i].equalsIgnoreCase(fieldName)) {
            index = i;
            StructField structField = structFieldList[i];
            outStructFieldList[i] = new StructField(structField.name(), dataType, false, structField.metadata());
        } else {
            outStructFieldList[i] = structFieldList[i];
        }
    }
    OUT_SCHEMA = new StructType(outStructFieldList);
    return index;
}
Example 2
Source File: NManualBuildAndQueryCuboidTest.java, from kylin-on-parquet-v2, Apache License 2.0
private Integer convertOutSchema(Dataset<Row> layoutDs, String fieldName,
        org.apache.spark.sql.types.DataType dataType) {
    StructField[] structFieldList = layoutDs.schema().fields();
    String[] columns = layoutDs.columns();
    int index = 0;
    StructField[] outStructFieldList = new StructField[structFieldList.length];
    for (int i = 0; i < structFieldList.length; i++) {
        if (columns[i].equalsIgnoreCase(fieldName)) {
            index = i;
            StructField structField = structFieldList[i];
            outStructFieldList[i] = new StructField(structField.name(), dataType, false, structField.metadata());
        } else {
            outStructFieldList[i] = structFieldList[i];
        }
    }
    OUT_SCHEMA = new StructType(outStructFieldList);
    return index;
}
Example 3
Source File: CsvToDatasetCompatibleWithSparkv1x.java, from net.jgp.labs.spark, Apache License 2.0
private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("CSV to Dataset")
        .master("local")
        .getOrCreate();

    String filename = "data/tuple-data-file.csv";
    Dataset<Row> df = spark.read().format("csv")
        .option("inferSchema", "true")
        .option("header", "false")
        .load(filename);
    df.show();

    // To ensure compatibility between Spark 2.0.0 and Spark 1.6.x
    int count = df.columns().length;
    for (int i = 0; i < count; i++) {
        String oldColName = "_c" + i;
        String newColName = "C" + i;
        df = df.withColumn(newColName, df.col(oldColName)).drop(oldColName);
    }
    df.show();
}
Example 4
Source File: WriteToDiscStep.java, from bpmn.ai, BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {
    // remove spaces from column names as parquet does not support them
    for (String columnName : dataset.columns()) {
        if (columnName.contains(" ")) {
            String newColumnName = columnName.replace(' ', '_');
            dataset = dataset.withColumnRenamed(columnName, newColumnName);
        }
    }

    dataset.cache();
    BpmnaiUtils.getInstance().writeDatasetToParquet(dataset, "result", config);

    if (config.isGenerateResultPreview()) {
        dataset.limit(config.getResultPreviewLineCount()).write().mode(SaveMode.Overwrite)
                .saveAsTable(BpmnaiVariables.RESULT_PREVIEW_TEMP_TABLE);
    }

    return dataset;
}
Example 5
Source File: RDDConverterUtils.java, from systemds, Apache License 2.0
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
        Dataset<Row> df, DataCharacteristics mc, boolean containsID, boolean isVector) {
    //determine unknown dimensions and sparsity if required
    if( !mc.dimsKnown(true) ) {
        LongAccumulator aNnz = sc.sc().longAccumulator("nnz");
        JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, containsID, isVector));
        long rlen = tmp.count();
        long clen = !isVector ? df.columns().length - (containsID?1:0) :
            ((Vector) tmp.first().get(containsID?1:0)).size();
        long nnz = UtilFunctions.toLong(aNnz.value());
        mc.set(rlen, clen, mc.getBlocksize(), nnz);
    }

    //ensure valid blocksizes
    if( mc.getBlocksize()<=1 )
        mc.setBlocksize(ConfigurationManager.getBlocksize());

    //construct or reuse row ids
    JavaPairRDD<Row, Long> prepinput = containsID ?
        df.javaRDD().mapToPair(new DataFrameExtractIDFunction(
            df.schema().fieldIndex(DF_ID_COLUMN))) :
        df.javaRDD().zipWithIndex(); //zip row index

    //convert csv rdd to binary block rdd (w/ partial blocks)
    boolean sparse = requiresSparseAllocation(prepinput, mc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(
        new DataFrameToBinaryBlockFunction(mc, sparse, containsID, isVector));

    //aggregate partial matrix blocks (w/ preferred number of output
    //partitions as the data is likely smaller in binary block format,
    //but also to bound the size of partitions for compressed inputs)
    int parts = SparkUtils.getNumPreferredPartitions(mc, out);
    return RDDAggregateUtils.mergeByKey(out, parts, false);
}
Example 6
Source File: SparkCubingJobTest.java, from kylin-on-parquet-v2, Apache License 2.0
private Dataset<Row> dsConvertToOriginal(Dataset<Row> layoutDs, LayoutEntity entity) {
    Map<Integer, FunctionDesc> orderedMeasures = entity.getOrderedMeasures();

    for (final Map.Entry<Integer, FunctionDesc> entry : orderedMeasures.entrySet()) {
        FunctionDesc functionDesc = entry.getValue();
        if (functionDesc != null) {
            final String[] columns = layoutDs.columns();
            String functionName = functionDesc.returnType().dataType();
            if ("bitmap".equals(functionName)) {
                final int finalIndex = convertOutSchema(layoutDs, entry.getKey().toString(), DataTypes.LongType);
                PreciseCountDistinct preciseCountDistinct = new PreciseCountDistinct(null);
                layoutDs = layoutDs.map((MapFunction<Row, Row>) value -> {
                    Object[] ret = new Object[value.size()];
                    for (int i = 0; i < columns.length; i++) {
                        if (i == finalIndex) {
                            byte[] bytes = (byte[]) value.get(i);
                            Roaring64NavigableMap bitmapCounter = preciseCountDistinct.deserialize(bytes);
                            ret[i] = bitmapCounter.getLongCardinality();
                        } else {
                            ret[i] = value.get(i);
                        }
                    }
                    return RowFactory.create(ret);
                }, RowEncoder.apply(OUT_SCHEMA));
            }
        }
    }
    return layoutDs;
}
Example 7
Source File: DataframeUtils.java, from net.jgp.labs.spark, Apache License 2.0
public static Dataset<Row> addMetadata(Dataset<Row> df, String key, String value) {
    for (String colName : df.columns()) {
        df = addMetadata(df, colName, key, value);
    }
    return df;
}
Example 8
Source File: DrugBankDataset.java, from mmtf-spark, Apache License 2.0
/**
 * Removes spaces from column names to ensure compatibility with parquet
 * files.
 *
 * @param original dataset
 * @return dataset with columns renamed
 */
private static Dataset<Row> removeSpacesFromColumnNames(Dataset<Row> original) {
    for (String existingName : original.columns()) {
        String newName = existingName.replaceAll(" ", "");
        original = original.withColumnRenamed(existingName, newName);
    }
    return original;
}
Example 9
Source File: G2SDataset.java, from mmtf-spark, Apache License 2.0
/**
 * Downloads PDB residue mappings for a list of genomic variations.
 *
 * @param variationIds genomic variation ids (e.g. chr7:g.140449103A>C)
 * @param structureId specific PDB structure used for mapping
 * @param chainId specific chain used for mapping
 * @return dataset with PDB mapping information
 * @throws IOException
 */
private static Dataset<Row> getDataset(List<String> variationIds, String structureId, String chainId) throws IOException {
    // get a spark context
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") // sc will be closed elsewhere
    JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

    // download data in parallel
    JavaRDD<String> data = sc.parallelize(variationIds).flatMap(m -> getData(m, structureId, chainId));

    // convert from JavaRDD to Dataset
    Dataset<String> jsonData = spark.createDataset(JavaRDD.toRDD(data), Encoders.STRING());

    // parse json strings and return as a dataset
    Dataset<Row> dataset = spark.read().json(jsonData);
    dataset.show();

    // return null if dataset is empty
    if (dataset.columns().length == 0) {
        System.out.println("G2SDataset: no matches found");
        return null;
    }

    dataset = standardizeData(dataset);
    return flattenDataset(dataset);
}
Example 10
Source File: NManualBuildAndQueryCuboidTest.java, from kylin-on-parquet-v2, Apache License 2.0
private Dataset<Row> dsConvertToOriginal(Dataset<Row> layoutDs, LayoutEntity entity) {
    Map<Integer, FunctionDesc> orderedMeasures = entity.getOrderedMeasures();

    for (final Map.Entry<Integer, FunctionDesc> entry : orderedMeasures.entrySet()) {
        FunctionDesc functionDesc = entry.getValue();
        if (functionDesc != null) {
            final String[] columns = layoutDs.columns();
            String functionName = functionDesc.returnType().dataType();
            if ("bitmap".equals(functionName)) {
                final int finalIndex = convertOutSchema(layoutDs, entry.getKey().toString(), DataTypes.LongType);
                PreciseCountDistinct preciseCountDistinct = new PreciseCountDistinct(null);
                layoutDs = layoutDs.map((MapFunction<Row, Row>) value -> {
                    Object[] ret = new Object[value.size()];
                    for (int i = 0; i < columns.length; i++) {
                        if (i == finalIndex) {
                            byte[] bytes = (byte[]) value.get(i);
                            Roaring64NavigableMap bitmapCounter = preciseCountDistinct.deserialize(bytes);
                            ret[i] = bitmapCounter.getLongCardinality();
                        } else {
                            ret[i] = value.get(i);
                        }
                    }
                    return RowFactory.create(ret);
                }, RowEncoder.apply(OUT_SCHEMA));
            }
        }
    }
    return layoutDs;
}
Example 11
Source File: SparkRegressor.java, from mmtf-spark, Apache License 2.0
/**
 * Dataset must at least contain the following two columns:
 *   label: the class labels
 *   features: feature vector
 * @param data
 * @return map with metrics
 */
public Map<String, String> fit(Dataset<Row> data) {

    // Split the data into training and test sets (30% held out for testing)
    Dataset<Row>[] splits = data.randomSplit(new double[] {1.0 - testFraction, testFraction}, seed);
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];

    // Train a RandomForest model.
    predictor
        .setLabelCol(label)
        .setFeaturesCol("features");

    // Chain indexer and forest in a Pipeline
    Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[] {predictor});

    // Train model. This also runs the indexer.
    PipelineModel model = pipeline.fit(trainingData);

    // Make predictions.
    Dataset<Row> predictions = model.transform(testData);

    // Display some sample predictions
    System.out.println("Sample predictions: " + predictor.getClass().getSimpleName());
    String primaryKey = predictions.columns()[0];
    predictions.select(primaryKey, label, "prediction").sample(false, 0.1, seed).show(50);

    Map<String, String> metrics = new LinkedHashMap<>();
    metrics.put("Method", predictor.getClass().getSimpleName());

    // Select (prediction, true label) and compute test error
    RegressionEvaluator evaluator = new RegressionEvaluator()
        .setLabelCol(label)
        .setPredictionCol("prediction")
        .setMetricName("rmse");

    metrics.put("rmse", Double.toString(evaluator.evaluate(predictions)));

    return metrics;
}
Example 12
Source File: AggregateActivityInstancesStep.java, from bpmn.ai, BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {

    //apply first and processState aggregator
    Map<String, String> aggregationMap = new HashMap<>();
    for (String column : dataset.columns()) {
        if (column.equals(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID)) {
            continue;
        } else if (column.equals(BpmnaiVariables.VAR_DURATION) || column.endsWith("_rev")) {
            aggregationMap.put(column, "max");
        } else if (column.equals(BpmnaiVariables.VAR_STATE)) {
            aggregationMap.put(column, "ProcessState");
        } else if (column.equals(BpmnaiVariables.VAR_ACT_INST_ID)) {
            //ignore it, as we aggregate by it
            continue;
        } else {
            aggregationMap.put(column, "AllButEmptyString");
        }
    }

    //first aggregation
    //activity level, take only processInstance and activityInstance rows
    dataset = dataset
            .filter(dataset.col(BpmnaiVariables.VAR_DATA_SOURCE).notEqual(BpmnaiVariables.EVENT_PROCESS_INSTANCE))
            .groupBy(BpmnaiVariables.VAR_PROCESS_INSTANCE_ID, BpmnaiVariables.VAR_ACT_INST_ID)
            .agg(aggregationMap);

    //rename back columns after aggregation
    String pattern = "(max|allbutemptystring|processstate)\\((.+)\\)";
    Pattern r = Pattern.compile(pattern);

    for (String columnName : dataset.columns()) {
        Matcher m = r.matcher(columnName);
        if (m.find()) {
            String newColumnName = m.group(2);
            dataset = dataset.withColumnRenamed(columnName, newColumnName);
        }
    }

    //in case we add the CSV we have a name column in the first dataset of the join so we call drop again to make sure it is gone
    dataset = dataset.drop(BpmnaiVariables.VAR_PROCESS_INSTANCE_VARIABLE_NAME);
    dataset = dataset.drop(BpmnaiVariables.VAR_DATA_SOURCE);

    dataset = dataset.sort(BpmnaiVariables.VAR_START_TIME);

    dataset.cache();
    BpmnaiLogger.getInstance().writeInfo("Found " + dataset.count() + " activity instances.");

    if (config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "agg_of_activity_instances", config);
    }

    //return preprocessed data
    return dataset;
}
Example 13
Source File: InListDeriver.java, from envelope, Apache License 2.0
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) {
    Dataset<Row> target = getStepDataFrame(dependencies);
    if (target.columns().length < 1) {
        throw new RuntimeException("Targeted step, '" + stepName + ",' has no columns");
    }

    try {
        String targetField = fieldName == null ? target.columns()[0] : fieldName;
        Column targetColumn = target.col(targetField);
        LOGGER.debug("Targeting '{}[{}]'", stepName, targetField);

        // If the IN list is inline, there is no batch
        if (inList != null) {
            LOGGER.debug("IN list is inline");
            return target.filter(targetColumn.isin(inList.toArray()));
        }

        // Otherwise, collect the values from the reference, executed within the batch
        else {
            LOGGER.trace("IN list is a reference");
            Dataset<Row> reference = dependencies.get(refStepName);
            String referenceField = refFieldName == null ? reference.columns()[0] : refFieldName;
            LOGGER.debug("Referencing using {}[{}]", refStepName, referenceField);

            Column referenceColumn = reference.col(referenceField);
            Iterator<Row> referenceIterator = reference.select(referenceColumn).distinct().toLocalIterator();

            this.inList = new ArrayList<>();
            long counter = 0;

            // Set up the batch collector
            JavaRDD<Row> unionRDD = new JavaSparkContext(Contexts.getSparkSession().sparkContext()).emptyRDD();
            Dataset<Row> union = Contexts.getSparkSession().createDataFrame(unionRDD, target.schema());

            while (referenceIterator.hasNext()) {
                // Flush the batch
                if (counter == batchSize) {
                    LOGGER.trace("Flushing batch");
                    union = union.union(target.filter(targetColumn.isin(inList.toArray())));
                    inList.clear();
                    counter = 0L;
                }

                // Gather the elements of the IN list from the reference
                inList.add(referenceIterator.next().get(0));
                counter++;
            }

            // If the selection is under the batch threshold
            if (union.rdd().isEmpty()) {
                return target.filter(targetColumn.isin(inList.toArray()));
            }

            // Flush any remaining IN list values
            else {
                return union.union(target.filter(targetColumn.isin(inList.toArray())));
            }
        }
    } catch (Throwable ae) {
        throw new RuntimeException("Error executing IN list filtering", ae);
    }
}
Example 14
Source File: TypeCastStep.java, from bpmn.ai, BSD 3-Clause "New" or "Revised" License
@Override
public Dataset<Row> runPreprocessingStep(Dataset<Row> dataset, Map<String, Object> parameters, SparkRunnerConfig config) {

    // get variables
    Map<String, String> varMap = (Map<String, String>) SparkBroadcastHelper.getInstance()
            .getBroadcastVariable(SparkBroadcastHelper.BROADCAST_VARIABLE.PROCESS_VARIABLES_ESCALATED);

    List<StructField> datasetFields = Arrays.asList(dataset.schema().fields());

    List<ColumnConfiguration> columnConfigurations = null;
    List<VariableConfiguration> variableConfigurations = null;

    Configuration configuration = ConfigurationUtils.getInstance().getConfiguration(config);
    if (configuration != null) {
        PreprocessingConfiguration preprocessingConfiguration = configuration.getPreprocessingConfiguration();
        columnConfigurations = preprocessingConfiguration.getColumnConfiguration();
        variableConfigurations = preprocessingConfiguration.getVariableConfiguration();
    }

    Map<String, ColumnConfiguration> columnTypeConfigMap = new HashMap<>();
    Map<String, VariableConfiguration> variableTypeConfigMap = new HashMap<>();

    if (columnConfigurations != null) {
        for (ColumnConfiguration cc : columnConfigurations) {
            columnTypeConfigMap.put(cc.getColumnName(), cc);
        }
    }

    if (variableConfigurations != null) {
        for (VariableConfiguration vc : variableConfigurations) {
            variableTypeConfigMap.put(vc.getVariableName(), vc);
        }
    }

    for (String column : dataset.columns()) {
        // skip revision columns as they are handled for each variable column
        if (column.endsWith("_rev")) {
            continue;
        }

        DataType newDataType = null;
        boolean isVariableColumn = false;
        String configurationDataType = null;
        String configurationParseFormat = null;

        if (variableTypeConfigMap.keySet().contains(column)) {
            // was initially a variable
            configurationDataType = variableTypeConfigMap.get(column).getVariableType();
            configurationParseFormat = variableTypeConfigMap.get(column).getParseFormat();
            if (config.getPipelineMode().equals(BpmnaiVariables.PIPELINE_MODE_LEARN)) {
                isVariableColumn = varMap.keySet().contains(column);
            } else {
                isVariableColumn = true;
            }
        } else if (columnTypeConfigMap.keySet().contains(column)) {
            // was initially a column
            configurationDataType = columnTypeConfigMap.get(column).getColumnType();
            configurationParseFormat = columnTypeConfigMap.get(column).getParseFormat();
        }

        newDataType = mapDataType(datasetFields, column, configurationDataType);

        // only check for cast errors if dev feature is enabled and if a change in the datatype has been done
        if (config.isDevTypeCastCheckEnabled() && !newDataType.equals(getCurrentDataType(datasetFields, column))) {

            // add a column with casted value to be able to check the cast results
            dataset = castColumn(dataset, column, column + "_casted", newDataType, configurationParseFormat);

            // add a column for cast results and write CAST_ERROR? in it if there might be a cast error
            dataset = dataset.withColumn(column + "_castresult",
                    when(dataset.col(column).isNotNull().and(dataset.col(column).notEqual(lit(""))),
                            when(dataset.col(column + "_casted").isNull(), lit("CAST_ERROR?"))
                                    .otherwise(lit(""))
                    ).otherwise(lit(""))
            );
            dataset.cache();

            // check for cast errors and write warning to application log
            if (dataset.filter(column + "_castresult == 'CAST_ERROR?'").count() > 0) {
                BpmnaiLogger.getInstance().writeWarn("Column '" + column
                        + "' seems to have cast errors. Please check the data type (is defined as '"
                        + configurationDataType + "')");
            } else {
                // drop help columns as there are no cast errors for this column and rename casted column to actual column name
                dataset = dataset.drop(column, column + "_castresult").withColumnRenamed(column + "_casted", column);
            }
        } else {
            // cast without checking the cast result, entries are null if spark can't cast it
            dataset = castColumn(dataset, column, column, newDataType, configurationParseFormat);
        }

        // cast revision columns for former variables, revision columns only exist on process level
        if (config.getDataLevel().equals(BpmnaiVariables.DATA_LEVEL_PROCESS) && config.isRevCountEnabled() && isVariableColumn) {
            dataset = dataset.withColumn(column + "_rev", dataset.col(column + "_rev").cast("integer"));
        }
    }

    if (config.isWriteStepResultsIntoFile()) {
        BpmnaiUtils.getInstance().writeDatasetToCSV(dataset, "type_cast_columns", config);
    }

    //return preprocessed data
    return dataset;
}
Example 15
Source File: Spark3Shims.java, from zeppelin, Apache License 2.0
@Override
public String showDataFrame(Object obj, int maxResult, InterpreterContext context) {
    if (obj instanceof Dataset) {
        Dataset<Row> df = ((Dataset) obj).toDF();
        String[] columns = df.columns();
        // DDL will empty DataFrame
        if (columns.length == 0) {
            return "";
        }
        // fetch maxResult+1 rows so that we can check whether it is larger than zeppelin.spark.maxResult
        List<Row> rows = df.takeAsList(maxResult + 1);
        String template = context.getLocalProperties().get("template");
        if (!StringUtils.isBlank(template)) {
            if (rows.size() >= 1) {
                return new SingleRowInterpreterResult(sparkRowToList(rows.get(0)), template, context).toHtml();
            } else {
                return "";
            }
        }

        StringBuilder msg = new StringBuilder();
        msg.append("%table ");
        msg.append(StringUtils.join(TableDataUtils.normalizeColumns(columns), "\t"));
        msg.append("\n");
        boolean isLargerThanMaxResult = rows.size() > maxResult;
        if (isLargerThanMaxResult) {
            rows = rows.subList(0, maxResult);
        }
        for (Row row : rows) {
            for (int i = 0; i < row.size(); ++i) {
                msg.append(TableDataUtils.normalizeColumn(row.get(i)));
                if (i != row.size() - 1) {
                    msg.append("\t");
                }
            }
            msg.append("\n");
        }

        if (isLargerThanMaxResult) {
            msg.append("\n");
            msg.append(ResultMessages.getExceedsLimitRowsMessage(maxResult, "zeppelin.spark.maxResult"));
        }
        // append %text at the end, otherwise the following output will be put in table as well.
        msg.append("\n%text ");
        return msg.toString();
    } else {
        return obj.toString();
    }
}
Example 16
Source File: Spark2Shims.java, from zeppelin, Apache License 2.0
@Override
public String showDataFrame(Object obj, int maxResult, InterpreterContext context) {
    if (obj instanceof Dataset) {
        Dataset<Row> df = ((Dataset) obj).toDF();
        String[] columns = df.columns();
        // DDL will empty DataFrame
        if (columns.length == 0) {
            return "";
        }
        // fetch maxResult+1 rows so that we can check whether it is larger than zeppelin.spark.maxResult
        List<Row> rows = df.takeAsList(maxResult + 1);
        String template = context.getLocalProperties().get("template");
        if (!StringUtils.isBlank(template)) {
            if (rows.size() >= 1) {
                return new SingleRowInterpreterResult(sparkRowToList(rows.get(0)), template, context).toHtml();
            } else {
                return "";
            }
        }

        StringBuilder msg = new StringBuilder();
        msg.append("\n%table ");
        msg.append(StringUtils.join(TableDataUtils.normalizeColumns(columns), "\t"));
        msg.append("\n");
        boolean isLargerThanMaxResult = rows.size() > maxResult;
        if (isLargerThanMaxResult) {
            rows = rows.subList(0, maxResult);
        }
        for (Row row : rows) {
            for (int i = 0; i < row.size(); ++i) {
                msg.append(TableDataUtils.normalizeColumn(row.get(i)));
                if (i != row.size() - 1) {
                    msg.append("\t");
                }
            }
            msg.append("\n");
        }

        if (isLargerThanMaxResult) {
            msg.append("\n");
            msg.append(ResultMessages.getExceedsLimitRowsMessage(maxResult, "zeppelin.spark.maxResult"));
        }
        // append %text at the end, otherwise the following output will be put in table as well.
        msg.append("\n%text ");
        return msg.toString();
    } else {
        return obj.toString();
    }
}