Java Code Examples for org.apache.spark.sql.Dataset#repartition()
The following examples show how to use org.apache.spark.sql.Dataset#repartition(). Each example is taken from an open-source project; the originating source file, project, and license are listed above the code.
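Before the project examples, here is a minimal standalone sketch of the repartition() overloads the examples below rely on. The class name, the local master setting, the generated single-column DataFrame, and the chosen partition counts are all illustrative assumptions, not taken from any of the projects.

import static org.apache.spark.sql.functions.col;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class RepartitionSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("RepartitionSketch")
        .master("local[*]")   // local master only for a quick standalone run
        .getOrCreate();

    // Small illustrative DataFrame with a single "id" column
    Dataset<Row> df = spark.range(0, 100).toDF("id");

    Dataset<Row> byCount  = df.repartition(4);            // fixed partition count, full shuffle
    Dataset<Row> byColumn = df.repartition(col("id"));    // hash-partitioned by an expression
    Dataset<Row> byBoth   = df.repartition(2, col("id")); // explicit count plus expression

    System.out.println(byCount.javaRDD().getNumPartitions());   // 4
    System.out.println(byColumn.javaRDD().getNumPartitions());  // typically spark.sql.shuffle.partitions
    System.out.println(byBoth.javaRDD().getNumPartitions());    // 2
    spark.stop();
  }
}

Note that repartition() returns a new Dataset rather than modifying the receiver, so its result must be assigned.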
Example 1
Source File: StreamingStep.java, from the envelope project (Apache License 2.0)
@SuppressWarnings({"unchecked","rawtypes"}) public Dataset<Row> translate(JavaRDD raw) { StreamInput streamInput = (StreamInput)getInput(true); TranslateFunction translateFunction = getTranslateFunction(config, true); // Encode the raw messages as rows (i.e. the raw value plus associated metadata fields) JavaRDD<Row> encoded = raw.map(streamInput.getMessageEncoderFunction()); // Translate raw message rows to structured rows TranslationResults translationResults = new TranslationResults( encoded.flatMap(translateFunction), translateFunction.getProvidingSchema(), streamInput.getProvidingSchema()); BatchStep errored = createErrorStep(getName() + DEFAULT_ERROR_DATAFRAME_SUFFIX, translationResults.getErrored()); addNewBatchStep(errored); // Provide translated rows and errors Dataset<Row> translated = translationResults.getTranslated(); if (doesRepartition()) { translated = translated.repartition(config.getInt(REPARTITION_NUM_PARTITIONS_PROPERTY)); } return translated; }
Example 2
Source File: JavaQuantileDiscretizerExample.java, from the SparkDemo project (MIT License)
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("JavaQuantileDiscretizerExample")
    .getOrCreate();

  // $example on$
  List<Row> data = Arrays.asList(
    RowFactory.create(0, 18.0),
    RowFactory.create(1, 19.0),
    RowFactory.create(2, 8.0),
    RowFactory.create(3, 5.0),
    RowFactory.create(4, 2.2)
  );

  StructType schema = new StructType(new StructField[]{
    new StructField("id", DataTypes.IntegerType, false, Metadata.empty()),
    new StructField("hour", DataTypes.DoubleType, false, Metadata.empty())
  });

  Dataset<Row> df = spark.createDataFrame(data, schema);
  // $example off$
  // Output of QuantileDiscretizer for such small datasets can depend on the number of
  // partitions. Here we force a single partition to ensure consistent results.
  // Note this is not necessary for normal use cases
  df = df.repartition(1);
  // $example on$
  QuantileDiscretizer discretizer = new QuantileDiscretizer()
    .setInputCol("hour")
    .setOutputCol("result")
    .setNumBuckets(3);

  Dataset<Row> result = discretizer.fit(df).transform(df);
  result.show();
  // $example off$
  spark.stop();
}
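The comment in the example explains that QuantileDiscretizer output on very small datasets can vary with partitioning, hence the repartition(1). A small hypothetical helper for checking the partition count before and after such a call; the class and method names are illustrative, not part of the example project.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

final class PartitionCount {
  private PartitionCount() {}

  // Number of physical partitions backing a DataFrame; after df.repartition(1)
  // in the example above, this should report 1.
  static int of(Dataset<Row> df) {
    return df.javaRDD().getNumPartitions();
  }
}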
Example 3
Source File: BatchStep.java, from the envelope project (Apache License 2.0)
private Dataset<Row> repartition(Dataset<Row> data) {
  int numPartitions = 0;
  List<String> colPartitions = null;

  if (config.hasPath(REPARTITION_NUM_PARTITIONS_PROPERTY)) {
    numPartitions = config.getInt(REPARTITION_NUM_PARTITIONS_PROPERTY);
  }
  if (config.hasPath(REPARTITION_COLUMNS_PROPERTY)) {
    colPartitions = config.getStringList(REPARTITION_COLUMNS_PROPERTY);
  }

  if (numPartitions > 0 && null != colPartitions) {
    data = data.repartition(numPartitions, RowUtils.toColumnArray(colPartitions));
  }
  else if (numPartitions > 0) {
    data = data.repartition(numPartitions);
  }
  else if (null != colPartitions) {
    data = data.repartition(RowUtils.toColumnArray(colPartitions));
  }

  if (config.hasPath(COALESCE_NUM_PARTITIONS_PROPERTY)) {
    numPartitions = config.getInt(COALESCE_NUM_PARTITIONS_PROPERTY);
    data = data.coalesce(numPartitions);
  }

  return data;
}
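The method above finishes with coalesce(), which differs from repartition() in that it only merges existing partitions and avoids a full shuffle. A minimal sketch of that trade-off; the helper class and its parameters are illustrative assumptions.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

final class ShrinkPartitions {
  private ShrinkPartitions() {}

  // coalesce() narrows to fewer partitions without shuffling, while repartition()
  // always shuffles and can also increase the partition count.
  static Dataset<Row> shrink(Dataset<Row> data, int target, boolean allowShuffle) {
    return allowShuffle ? data.repartition(target) : data.coalesce(target);
  }
}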
Example 4
Source File: DataFrameMatrixConversionTest.java, from the systemds project (Apache License 2.0)
private void testDataFrameConversion(boolean vector, int cols, boolean dense, boolean unknownDims) {
  boolean oldConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
  ExecMode oldPlatform = DMLScript.getGlobalExecMode();

  try {
    DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    DMLScript.setGlobalExecMode(ExecMode.HYBRID);

    //generate input data and setup metadata
    int rows = (cols == cols3) ? rows3 : rows1;
    double sparsity = dense ? sparsity1 : sparsity2;
    double[][] A = getRandomMatrix(rows, cols, -10, 10, sparsity, 2373);
    MatrixBlock mbA = DataConverter.convertToMatrixBlock(A);
    int blksz = ConfigurationManager.getBlocksize();
    MatrixCharacteristics mc1 = new MatrixCharacteristics(rows, cols, blksz, mbA.getNonZeros());
    MatrixCharacteristics mc2 = unknownDims ? new MatrixCharacteristics() : new MatrixCharacteristics(mc1);

    //get binary block input rdd
    JavaPairRDD<MatrixIndexes,MatrixBlock> in = SparkExecutionContext.toMatrixJavaPairRDD(sc, mbA, blksz);

    //matrix - dataframe - matrix conversion
    Dataset<Row> df = RDDConverterUtils.binaryBlockToDataFrame(spark, in, mc1, vector);
    df = ( rows==rows3 ) ? df.repartition(rows) : df;
    JavaPairRDD<MatrixIndexes,MatrixBlock> out = RDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc2, true, vector);

    //get output matrix block
    MatrixBlock mbB = SparkExecutionContext.toMatrixBlock(out, rows, cols, blksz, -1);

    //compare matrix blocks
    double[][] B = DataConverter.convertToDoubleMatrix(mbB);
    TestUtils.compareMatrices(A, B, rows, cols, eps);
  }
  catch( Exception ex ) {
    throw new RuntimeException(ex);
  }
  finally {
    DMLScript.USE_LOCAL_SPARK_CONFIG = oldConfig;
    DMLScript.setGlobalExecMode(oldPlatform);
  }
}
Example 5
Source File: Runner.java, from the stocator project (Apache License 2.0)
private static void executeTestSuite(NameGenerator nameGenerator, SparkSession spark) throws Exception {
  TestSuite testSuite = new TestSuite(dataCreate, flatListing);
  System.out.println("*********************************");
  System.out.println("*** Create dataframe from the local CSV file ***");
  Dataset<Row> schemaFlights = testSuite.getFlights(spark, csvLocalPath);
  nameGenerator.generateObjectNames();
  if (dataCreate) {
    System.out.println("Data cleanup (start) for " + nameGenerator.getContainerPath() + "*");
    System.out.println("*********************************");
    testSuite.deleteData(nameGenerator.getContainerPath(), spark.sparkContext().hadoopConfiguration(), false);
    System.out.println("*********************************");
  }
  testSuite.test1(spark, schemaFlights, nameGenerator.getCsvPath2());
  testSuite.test2(spark, schemaFlights, nameGenerator.getParquetPath(), Constants.PARQUET_TYPE);
  testSuite.test2(spark, schemaFlights, nameGenerator.getJsonPath(), Constants.JSON_TYPE);
  testSuite.test3(spark, schemaFlights, nameGenerator.getCsvPath1());
  testSuite.test4(spark, nameGenerator.getTxtPath());
  testSuite.test8(spark, nameGenerator.getTxtPath(), isTimeOutTest);
  if (isSwift) {
    nameGenerator.generateNewContainer("list");
    System.out.println("Data cleanup for " + nameGenerator.getContainerPath() + "*");
    System.out.println("*********************************");
    testSuite.deleteData(nameGenerator.getContainerPath(), spark.sparkContext().hadoopConfiguration(), dataCreate);
    System.out.println("*********************************");
  }
  testSuite.test6(spark, schemaFlights, nameGenerator.getContainerPath(), nameGenerator.getCsvName());
  if (isSwift) {
    nameGenerator.generateNewContainer(false);
    System.out.println("Data cleanup for " + nameGenerator.getContainerPath() + "*");
    System.out.println("*********************************");
    testSuite.deleteData(nameGenerator.getContainerPath(), spark.sparkContext().hadoopConfiguration(), dataCreate);
    System.out.println("*********************************");
  }
  testSuite.test7(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.TEXT_TYPE);
  testSuite.test7(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test7(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test71(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.TEXT_TYPE);
  testSuite.test71(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test71(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test10(spark, nameGenerator.getDataResPath() + "/dfp");
  testSuite.test11(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test12(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test9(spark, nameGenerator.getDataResPath());
  testSuite.test13(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.CSV_TYPE);
  testSuite.test14(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test14(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test15(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test15(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  testSuite.test16(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.JSON_TYPE);
  testSuite.test16(spark, schemaFlights, nameGenerator.getContainerPath(), Constants.PARQUET_TYPE);
  if (csvLocalLargePath != null) {
    System.out.println("*********************************");
    System.out.println("Large file test!");
    Dataset<Row> largeSchemaFlights = testSuite.getFlights(spark, csvLocalLargePath);
    if (isSwift) {
      nameGenerator.generateNewContainer(true);
    }
    testSuite.test1(spark, largeSchemaFlights, nameGenerator.getCsvPath2());
    testSuite.test2(spark, largeSchemaFlights, nameGenerator.getParquetPath(), Constants.PARQUET_TYPE);
    testSuite.test2(spark, largeSchemaFlights, nameGenerator.getJsonPath(), Constants.JSON_TYPE);
    System.out.println("***** Repartition to 1");
    // repartition() returns a new Dataset; assign the result, otherwise the call has no effect
    largeSchemaFlights = largeSchemaFlights.repartition(1);
    if (isSwift) {
      nameGenerator.generateNewContainer(true);
    }
    testSuite.test2(spark, largeSchemaFlights, nameGenerator.getParquetPath(), Constants.PARQUET_TYPE);
    testSuite.test2(spark, largeSchemaFlights, nameGenerator.getJsonPath(), Constants.JSON_TYPE);
  } else {
    System.out.println("*********************************");
    System.out.println("No large file test to be executed");
  }
}
Example 6
Source File: SparkDataSet.java, from the spliceengine project (GNU Affero General Public License v3.0)
@SuppressWarnings({ "unchecked", "rawtypes" }) public DataSet<ExecRow> writeAvroFile(DataSetProcessor dsp, int[] partitionBy, String location, String compression, OperationContext context) throws StandardException { compression = SparkDataSet.getAvroCompression(compression); StructType dataSchema = null; StructType tableSchema = generateTableSchema(context); // what is this? why is this so different from parquet/orc ? // actually very close to NativeSparkDataSet.writeFile dataSchema = ExternalTableUtils.getDataSchema(dsp, tableSchema, partitionBy, location, "a"); if (dataSchema == null) dataSchema = tableSchema; Dataset<Row> insertDF = SpliceSpark.getSession().createDataFrame( rdd.map(new SparkSpliceFunctionWrapper<>(new CountWriteFunction(context))).map(new LocatedRowToRowAvroFunction()), dataSchema); // We duplicate the code in NativeSparkDataset.writeAvroFile here to avoid calling ExternalTableUtils.getDataSchema() twice List<String> partitionByCols = new ArrayList(); for (int i = 0; i < partitionBy.length; i++) { partitionByCols.add(dataSchema.fields()[partitionBy[i]].name()); } if (partitionBy.length > 0) { List<Column> repartitionCols = new ArrayList(); for (int i = 0; i < partitionBy.length; i++) { repartitionCols.add(new Column(dataSchema.fields()[partitionBy[i]].name())); } insertDF = insertDF.repartition(scala.collection.JavaConversions.asScalaBuffer(repartitionCols).toList()); } if (compression.equals("none")) { compression = "uncompressed"; } insertDF.write().option(SPARK_COMPRESSION_OPTION,compression).partitionBy(partitionByCols.toArray(new String[partitionByCols.size()])) .mode(SaveMode.Append).format("com.databricks.spark.avro").save(location); ValueRow valueRow=new ValueRow(1); valueRow.setColumn(1,new SQLLongint(context.getRecordsWritten())); return new SparkDataSet<>(SpliceSpark.getContext().parallelize(Collections.singletonList(valueRow), 1)); }