Java Code Examples for org.apache.spark.sql.Dataset#repartition()
The following examples show how to use org.apache.spark.sql.Dataset#repartition(). Each example is taken from an open-source project; the originating source file, project, and license are listed above the code.
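Before the project examples, here is a minimal standalone sketch of the repartition() overloads the examples below rely on. The class name, the local master setting, the generated single-column DataFrame, and the chosen partition counts are all illustrative assumptions, not taken from any of the projects.

import static org.apache.spark.sql.functions.col;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class RepartitionSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("RepartitionSketch")
        .master("local[*]")   // local master only for a quick standalone run
        .getOrCreate();

    // Small illustrative DataFrame with a single "id" column
    Dataset<Row> df = spark.range(0, 100).toDF("id");

    Dataset<Row> byCount  = df.repartition(4);            // fixed partition count, full shuffle
    Dataset<Row> byColumn = df.repartition(col("id"));    // hash-partitioned by an expression
    Dataset<Row> byBoth   = df.repartition(2, col("id")); // explicit count plus expression

    System.out.println(byCount.javaRDD().getNumPartitions());   // 4
    System.out.println(byColumn.javaRDD().getNumPartitions());  // typically spark.sql.shuffle.partitions
    System.out.println(byBoth.javaRDD().getNumPartitions());    // 2
    spark.stop();
  }
}

Note that repartition() returns a new Dataset rather than modifying the receiver, so its result must be assigned.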
Example 1
Source File: StreamingStep.java, from the envelope project (Apache License 2.0)
@SuppressWarnings({"unchecked","rawtypes"}) public Dataset<Row> translate(JavaRDD raw) { StreamInput streamInput = (StreamInput)getInput(true); TranslateFunction translateFunction = getTranslateFunction(config, true); // Encode the raw messages as rows (i.e. the raw value plus associated metadata fields) JavaRDD<Row> encoded = raw.map(streamInput.getMessageEncoderFunction()); // Translate raw message rows to structured rows TranslationResults translationResults = new TranslationResults( encoded.flatMap(translateFunction), translateFunction.getProvidingSchema(), streamInput.getProvidingSchema()); BatchStep errored = createErrorStep(getName() + DEFAULT_ERROR_DATAFRAME_SUFFIX, translationResults.getErrored()); addNewBatchStep(errored); // Provide translated rows and errors Dataset<Row> translated = translationResults.getTranslated(); if (doesRepartition()) { translated = translated.repartition(config.getInt(REPARTITION_NUM_PARTITIONS_PROPERTY)); } return translated; }
Example 2
Source File: JavaQuantileDiscretizerExample.java, from the SparkDemo project (MIT License)
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaQuantileDiscretizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, 18.0),
    RowFactory.create(1, 19.0),
    RowFactory.create(2, 8.0),
    RowFactory.create(3, 5.0),
    RowFactory.create(4, 2.2)
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("hour", DataTypes.DoubleType, false, Metadata.empty())
  });

  Dataset<Row> df = spark.createDataFrame(data, schema);
  // $example off$
  // Output of QuantileDiscretizer for such small datasets can depend on the number of
  // partitions. Here we force a single partition to ensure consistent results.
  // Note this is not necessary for normal use cases
  df = df.repartition(1);
  // $example on$
  QuantileDiscretizer discretizer = new QuantileDiscretizer()
    .setInputCol("hour")
    .setOutputCol("result")
    .setNumBuckets(3);

  Dataset<Row> result = discretizer.fit(df).transform(df);
  result.show();
  // $example off$
  spark.stop();
}
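The comment in the example explains that QuantileDiscretizer output on very small datasets can vary with partitioning, hence the repartition(1). A small hypothetical helper for checking the partition count before and after such a call; the class and method names are illustrative, not part of the example project.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

final class PartitionCount {
  private PartitionCount() {}

  // Number of physical partitions backing a DataFrame; after df.repartition(1)
  // in the example above, this should report 1.
  static int of(Dataset<Row> df) {
    return df.javaRDD().getNumPartitions();
  }
}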
Example 3
Source File: BatchStep.java, from the envelope project (Apache License 2.0)
private Dataset<Row> repartition(Dataset<Row> data) {
  int numPartitions = 0;
  List<String> colPartitions = null;

  if (config.hasPath(REPARTITION_NUM_PARTITIONS_PROPERTY)) {
    numPartitions = config.getInt(REPARTITION_NUM_PARTITIONS_PROPERTY);
  }
  if (config.hasPath(REPARTITION_COLUMNS_PROPERTY)) {
    colPartitions = config.getStringList(REPARTITION_COLUMNS_PROPERTY);
  }

  if (numPartitions > 0 && null != colPartitions) {
    data = data.repartition(numPartitions, RowUtils.toColumnArray(colPartitions));
  }
  else if (numPartitions > 0) {
    data = data.repartition(numPartitions);
  }
  else if (null != colPartitions) {
    data = data.repartition(RowUtils.toColumnArray(colPartitions));
  }

  if (config.hasPath(COALESCE_NUM_PARTITIONS_PROPERTY)) {
    numPartitions = config.getInt(COALESCE_NUM_PARTITIONS_PROPERTY);
    data = data.coalesce(numPartitions);
  }

  return data;
}
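The method above finishes with coalesce(), which differs from repartition() in that it only merges existing partitions and avoids a full shuffle. A minimal sketch of that trade-off; the helper class and its parameters are illustrative assumptions.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

final class ShrinkPartitions {
  private ShrinkPartitions() {}

  // coalesce() narrows to fewer partitions without shuffling, while repartition()
  // always shuffles and can also increase the partition count.
  static Dataset<Row> shrink(Dataset<Row> data, int target, boolean allowShuffle) {
    return allowShuffle ? data.repartition(target) : data.coalesce(target);
  }
}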
Example 4
Source File: DataFrameMatrixConversionTest.java, from the systemds project (Apache License 2.0)
private void testDataFrameConversion(boolean vector, int cols, boolean dense, boolean unknownDims) {
  boolean oldConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
  ExecMode oldPlatform = DMLScript.getGlobalExecMode();

  try {
    DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    DMLScript.setGlobalExecMode(ExecMode.HYBRID);

    //generate input data and setup metadata
    int rows = (cols == cols3) ? rows3 : rows1;
    double sparsity = dense ? sparsity1 : sparsity2;
    double[][] A = getRandomMatrix(rows, cols, -10, 10, sparsity, 2373);
    MatrixBlock mbA = DataConverter.convertToMatrixBlock(A);
    int blksz = ConfigurationManager.getBlocksize();
    MatrixCharacteristics mc1 = new MatrixCharacteristics(rows, cols, blksz, mbA.getNonZeros());
    MatrixCharacteristics mc2 = unknownDims ? new MatrixCharacteristics() : new MatrixCharacteristics(mc1);

    //get binary block input rdd
    JavaPairRDD<MatrixIndexes,MatrixBlock> in = SparkExecutionContext.toMatrixJavaPairRDD(sc, mbA, blksz);

    //matrix - dataframe - matrix conversion
    Dataset<Row> df = RDDConverterUtils.binaryBlockToDataFrame(spark, in, mc1, vector);
    df = ( rows==rows3 ) ? df.repartition(rows) : df;
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = RDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc2, true, vector);

    //get output matrix block
    MatrixBlock mbB = SparkExecutionContext.toMatrixBlock(out, rows, cols, blksz, -1);

    //compare matrix blocks
    double[][] B = DataConverter.convertToDoubleMatrix(mbB);
    TestUtils.compareMatrices(A, B, rows, cols, eps);
  }
  catch( Exception ex ) {
    throw new RuntimeException(ex);
  }
  finally {
    DMLScript.USE_LOCAL_SPARK_CONFIG = oldConfig;
    DMLScript.setGlobalExecMode(oldPlatform);
  }
}
Example 5
Source File: Runner.java, from the stocator project (Apache License 2.0)
private static void executeTestSuite(NameGenerator nameGenerator, SparkSession spark) throws Exception {
  TestSuite testSuite = new TestSuite(dataCreate, flatListing);
  System.out.println("*********************************");
  System.out.println("*** Create dataframe from the local CSV file ***");
  Dataset<Row> schemaFlights = testSuite.getFlights(spark, csvLocalPath);
  nameGenerator.generateObjectNames();
  if (dataCreate) {
    System.out.println("Data cleanup (start) for " + nameGenerator.getContainerPath() + "*");
    System.out.println("*********************************");
    testSuite.deleteData(nameGenerator.getContainerPath(), spark.sparkContext().hadoopConfiguration(), false);
    System.out.println("*********************************");
  }
  testSuite.test1(spark, schemaFlights, nameGenerator.getCsvPath2());
  testSuite.test2(spark, schemaFlights, nameGenerator.getParquetPath(), Constants.PARQUET_TYPE);
  testSuite.test2(spark, schemaFlights, nameGenerator.getJsonPath(), Constants.JSON_TYPE);
  testSuite.test3(spark, schemaFlights, nameGenerator.getCsvPath1());
  testSuite.test4(spark, nameGenerator.getTxtPath());
  testSuite.test8(spark, nameGenerator.getTxtPath(), isTimeOutTest);
  if (isSwift) {
    nameGenerator.generateNewContainer("list");
    System.out.println("Data cleanup for " + nameGenerator.getContainerPath() + "*");
    System.out.println("*********************************");
    testSuite.deleteData(nameGenerator.getContainerPath(), spark.sparkContext().hadoopConfiguration(), dataCreate);
    System.out.println("*********************************");
  }
  testSuite.test6(spark, schemaFlights, nameGenerator.getContainerPath(), nameGenerator.getCsvName());
  if (isSwift) {
    nameGenerator.generateNewContainer(false);
    System.out.println("Data cleanup for " + nameGenerator.getContainerPath() + "*");
    System.out.println("*********************************");
    testSuite.deleteData(nameGenerator.getContainerPath(), spark.sparkContext().hadoopConfiguration(), dataCreate);
    System.out.println("*********************************");
  }
  testSuite.test7(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.TEXT_TYPE);
  testSuite.test7(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test7(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test71(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.TEXT_TYPE);
  testSuite.test71(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test71(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test10(spark, nameGenerator.getDataResPath() + "/dfp");
  testSuite.test11(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test12(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test9(spark, nameGenerator.getDataResPath());
  testSuite.test13(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.CSV_TYPE);
  testSuite.test14(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test14(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test15(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test15(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test16(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test16(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  if (csvLocalLargePath != null) {
    System.out.println("*********************************");
    System.out.println("Large file test!");
    Dataset<Row> largeSchemaFlights = testSuite.getFlights(spark, csvLocalLargePath);
    if (isSwift) {
      nameGenerator.generateNewContainer(true);
    }
    testSuite.test1(spark, largeSchemaFlights, nameGenerator.getCsvPath2());
    testSuite.test2(spark, largeSchemaFlights, nameGenerator.getParquetPath(), Constants.PARQUET_TYPE);
    testSuite.test2(spark, largeSchemaFlights, nameGenerator.getJsonPath(), Constants.JSON_TYPE);
    System.out.println("***** Repartition to 1");
    // repartition() returns a new Dataset; assign the result, otherwise the call has no effect
    largeSchemaFlights = largeSchemaFlights.repartition(1);
    if (isSwift) {
      nameGenerator.generateNewContainer(true);
    }
    testSuite.test2(spark, largeSchemaFlights, nameGenerator.getParquetPath(), Constants.PARQUET_TYPE);
    testSuite.test2(spark, largeSchemaFlights, nameGenerator.getJsonPath(), Constants.JSON_TYPE);
  } else {
    System.out.println("*********************************");
    System.out.println("No large file test to be executed");
  }
}
Example 6
Source File: SparkDataSet.java, from the spliceengine project (GNU Affero General Public License v3.0)
@SuppressWarnings({ "unchecked", "rawtypes" }) public DataSet<ExecRow> writeAvroFile(DataSetProcessor dsp, int[] partitionBy, String location, String compression, OperationContext context) throws StandardException { compression = SparkDataSet.getAvroCompression(compression); StructType dataSchema = null; StructType tableSchema = generateTableSchema(context); // what is this? why is this so different from parquet/orc ? // actually very close to NativeSparkDataSet.writeFile dataSchema = ExternalTableUtils.getDataSchema(dsp, tableSchema, partitionBy, location, "a"); if (dataSchema == null) dataSchema = tableSchema; Dataset<Row> insertDF = SpliceSpark.getSession().createDataFrame( rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context))).map(new LocatedRowToRowAvroFunction()), dataSchema); // We duplicate the code in NativeSparkDataset.writeAvroFile here to avoid calling ExternalTableUtils.getDataSchema() twice List<String> partitionByCols = new ArrayList(); for (int i = 0; i < partitionBy.length; i++) { partitionByCols.add(dataSchema.fields()[partitionBy[i]].name()); } if (partitionBy.length > 0) { List<Column> repartitionCols = new ArrayList(); for (int i = 0; i < partitionBy.length; i++) { repartitionCols.add(new Column(dataSchema.fields()[partitionBy[i]].name())); } insertDF = insertDF.repartition(scala.collection.JavaConversions.asScalaBuffer(repartitionCols).toList()); } if (compression.equals("none")) { compression = "uncompressed"; } insertDF.write().option(SPARK_COMPRESSION_OPTION,compression).partitionBy(partitionByCols.toArray(new String[partitionByCols.size()])) .mode(SaveMode.Append).format("com.databricks.spark.avro").save(location); ValueRow valueRow=new ValueRow(1); valueRow.setColumn(1,new SQLLongint(context.getRecordsWritten())); return new SparkDataSet<>(SpliceSpark.getContext().parallelize(Collections.singletonList(valueRow), 1)); }