org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport Java Examples

The following examples show how to use org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: SparkParquetWritersNestedDataBenchmark.java From iceberg with Apache License 2.0

6 votes

/**
 * Benchmark that writes {@code rows} to {@code dataFile} as Parquet, using Spark's
 * {@code ParquetWriteSupport} plugged into the Parquet write builder.
 *
 * @throws IOException if writing the file fails
 */
@Benchmark
@Threads(1)
public void writeUsingSparkWriter() throws IOException {
  // The Spark view of SCHEMA is handed to the write support as JSON via the
  // "row.attributes" configuration key below.
  String sparkSchemaJson = SparkSchemaUtil.convert(SCHEMA).json();

  try (FileAppender<InternalRow> appender =
      Parquet.write(Files.localOutput(dataFile))
          .writeSupport(new ParquetWriteSupport())
          .set("org.apache.spark.sql.parquet.row.attributes", sparkSchemaJson)
          .set("spark.sql.parquet.writeLegacyFormat", "false")
          .set("spark.sql.parquet.binaryAsString", "false")
          .set("spark.sql.parquet.int96AsTimestamp", "false")
          .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
          .schema(SCHEMA)
          .build()) {
    // try-with-resources closes the appender once every row has been added.
    appender.addAll(rows);
  }
}

Example #2

Source File: SparkParquetWritersFlatDataBenchmark.java From iceberg with Apache License 2.0

6 votes

/**
 * Benchmark that writes {@code rows} to {@code dataFile} as Parquet, using Spark's
 * {@code ParquetWriteSupport} plugged into the Parquet write builder.
 *
 * @throws IOException if writing the file fails
 */
@Benchmark
@Threads(1)
public void writeUsingSparkWriter() throws IOException {
  // The Spark view of SCHEMA is handed to the write support as JSON via the
  // "row.attributes" configuration key below.
  String sparkSchemaJson = SparkSchemaUtil.convert(SCHEMA).json();

  try (FileAppender<InternalRow> appender =
      Parquet.write(Files.localOutput(dataFile))
          .writeSupport(new ParquetWriteSupport())
          .set("org.apache.spark.sql.parquet.row.attributes", sparkSchemaJson)
          .set("spark.sql.parquet.writeLegacyFormat", "false")
          .set("spark.sql.parquet.binaryAsString", "false")
          .set("spark.sql.parquet.int96AsTimestamp", "false")
          .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
          .schema(SCHEMA)
          .build()) {
    // try-with-resources closes the appender once every row has been added.
    appender.addAll(rows);
  }
}

Example #3

Source File: Writer.java From iceberg with Apache License 2.0

5 votes

/**
 * Creates an appender that writes {@code InternalRow}s to {@code file} in the given format.
 *
 * <p>The write schema is taken from {@code spec}. Parquet and Avro writers also receive
 * {@code properties}; the ORC path, as written, does not apply them.
 *
 * @param file   destination for the written data
 * @param format file format to write; PARQUET, AVRO and ORC are handled
 * @return an appender for the requested format
 * @throws UnsupportedOperationException if {@code format} is any other value
 * @throws RuntimeIOException wrapping any {@code IOException} raised while building the writer
 */
public FileAppender<InternalRow> newAppender(OutputFile file, FileFormat format) {
  Schema writeSchema = spec.schema();
  try {
    switch (format) {
      case PARQUET: {
        // The converted schema is passed to the write support as JSON configuration.
        String sparkSchemaJson = convert(writeSchema).json();
        return Parquet.write(file)
            .writeSupport(new ParquetWriteSupport())
            .set("org.apache.spark.sql.parquet.row.attributes", sparkSchemaJson)
            .set("spark.sql.parquet.writeLegacyFormat", "false")
            .set("spark.sql.parquet.binaryAsString", "false")
            .set("spark.sql.parquet.int96AsTimestamp", "false")
            .set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MICROS")
            // Applied after the fixed settings above, so it is the last word on any shared key.
            .setAll(properties)
            .schema(writeSchema)
            .build();
      }

      case AVRO: {
        // The writer factory ignores its argument and always builds from the write schema.
        return Avro.write(file)
            .createWriterFunc(unused -> new SparkAvroWriter(writeSchema))
            .setAll(properties)
            .schema(writeSchema)
            .build();
      }

      case ORC: {
        // Wraps the plain ORC writer in the Spark-specific adapter.
        @SuppressWarnings("unchecked")
        SparkOrcWriter orcWriter = new SparkOrcWriter(
            ORC.write(file)
                .schema(writeSchema)
                .build());
        return orcWriter;
      }

      default:
        throw new UnsupportedOperationException("Cannot write unknown format: " + format);
    }
  } catch (IOException ioe) {
    // Rethrow as unchecked, keeping the original exception as the cause.
    throw new RuntimeIOException(ioe);
  }
}