org.apache.parquet.column.ParquetProperties Scala Examples

The following examples show how to use org.apache.parquet.column.ParquetProperties. Each example is taken from an open-source project; the originating project, source file and license are noted above the snippet.
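Before the project examples, here is a minimal sketch of the class itself. ParquetProperties carries the writer-side tuning options (writer version, page size, dictionary encoding). Recent parquet-column releases also expose a builder; the snippet below assumes such a release, and the concrete values are purely illustrative.

import org.apache.parquet.column.ParquetProperties

// Minimal sketch, assuming a parquet-column release that provides ParquetProperties.builder().
// The values here are illustrative, not recommendations.
val props: ParquetProperties = ParquetProperties.builder()
  .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0) // or PARQUET_2_0
  .withDictionaryEncoding(true)                                   // dictionary-encode columns
  .withPageSize(1024 * 1024)                                      // 1 MiB data pages
  .build()

In the examples that follow, ParquetProperties is referenced only for its WriterVersion enum, which is passed to ParquetWriter builders via withWriterVersion.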
Example 1
Source File: RowParquetWriterFn.scala    From eel-sdk    with Apache License 2.0
package io.eels.component.parquet

import io.eels.Row
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}
import org.apache.parquet.schema.MessageType

import scala.math.BigDecimal.RoundingMode.RoundingMode


object RowParquetWriterFn {

  // Builder that plugs the eel-specific RowWriteSupport into Parquet's generic ParquetWriter.Builder.
  class RowParquetWriterBuilder(path: Path,
                                schema: MessageType,
                                roundingMode: RoundingMode,
                                metadata: Map[String, String])
    extends ParquetWriter.Builder[Row, RowParquetWriterBuilder](path) {
    override def getWriteSupport(conf: Configuration): WriteSupport[Row] = new RowWriteSupport(schema, roundingMode, metadata)
    override def self(): RowParquetWriterBuilder = this
  }

  // Builds a ParquetWriter[Row] for the given eel schema, applying the configured
  // compression codec, page/row-group sizes, dictionary encoding and writer version.
  def apply(path: Path,
            schema: StructType,
            metadata: Map[String, String],
            dictionary: Boolean,
            roundingMode: RoundingMode,
            fsConfig: Configuration): ParquetWriter[Row] = {
    val config = ParquetWriterConfig()
    val messageType = ParquetSchemaFns.toParquetMessageType(schema)
    new RowParquetWriterBuilder(path, messageType, roundingMode, metadata)
      .withCompressionCodec(config.compressionCodec)
      .withDictionaryEncoding(dictionary)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withValidation(config.validating)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
      .withConf(fsConfig)
      .build()
  }
} 
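A rough usage sketch for the factory above. The eel-sdk StructType, Field and Row constructors used here are assumed from the library's usual API, and the path, field name and value are hypothetical:

import io.eels.Row
import io.eels.schema.{Field, StringType, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import scala.math.BigDecimal.RoundingMode

// Hypothetical schema and output path; the Field/StructType/Row apply signatures are assumptions.
val schema = StructType(Field("name", StringType))

val writer = RowParquetWriterFn(
  path = new Path("/tmp/people.parquet"),
  schema = schema,
  metadata = Map.empty,
  dictionary = true,
  roundingMode = RoundingMode.HALF_UP,
  fsConfig = new Configuration()
)

writer.write(Row(schema, Vector("sam")))
writer.close()

Note that the builder uses ParquetFileWriter.Mode.CREATE, so the target file must not already exist.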
Example 2
Source File: package.scala    From stream-reactor    with Apache License 2.0
package com.landoop.streamreactor.connect.hive

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetReader, ParquetWriter}

package object parquet {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  // Lists the files directly under `path` (non-recursively), or returns the path itself if it is a single file.
  def listFiles(path: Path)(implicit fs: FileSystem): List[Path] = {
    if (fs.isDirectory(path)) {
      logger.debug(s"$path is a directory, reading constituent files")
      val remote = fs.listFiles(path, false)
      new Iterator[Path] {
        override def hasNext: Boolean = remote.hasNext
        override def next(): Path = remote.next().getPath
      }.toList
    } else {
      logger.debug(s"Reading $path as a single file")
      List(path)
    }
  }

  // Opens a ParquetReader that materialises records as Kafka Connect Structs.
  def parquetReader(file: Path)(implicit fs: FileSystem): ParquetReader[Struct] = {
    ParquetReader.builder(new StructReadSupport, file)
      .withConf(fs.getConf)
      .build()
  }

  // Creates a ParquetWriter for Kafka Connect Structs, honouring the sink's codec,
  // dictionary, validation and overwrite settings.
  def parquetWriter(path: Path,
                    schema: Schema,
                    config: ParquetSinkConfig): ParquetWriter[Struct] = {
    new StructParquetWriterBuilder(path, schema)
      .withCompressionCodec(config.compressionCodec)
      .withDictionaryEncoding(config.enableDictionary)
      .withValidation(config.validation)
      .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
      .withWriteMode(if (config.overwrite) {
        ParquetFileWriter.Mode.OVERWRITE
      } else {
        ParquetFileWriter.Mode.CREATE
      }).build()
  }
} 
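A rough usage sketch for parquetWriter. ParquetSinkConfig is project-specific, so the assumption below is that it can be constructed with sensible defaults; the schema, path and field values are hypothetical:

import org.apache.hadoop.fs.Path
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

// Hypothetical Kafka Connect schema and record.
val schema: Schema = SchemaBuilder.struct()
  .field("name", Schema.STRING_SCHEMA)
  .field("age", Schema.INT32_SCHEMA)
  .build()

// Assumption: ParquetSinkConfig provides defaults for codec, dictionary, validation and overwrite.
val writer = parquetWriter(new Path("/tmp/users.parquet"), schema, ParquetSinkConfig())

writer.write(new Struct(schema).put("name", "sam").put("age", 49))
writer.close()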