org.apache.parquet.hadoop.ParquetFileReader Scala Examples
The following examples show how to use org.apache.parquet.hadoop.ParquetFileReader.
Each example is shown with its originating project and license.
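Before the examples, here is a minimal, self-contained sketch of the pattern most of them share: reading only a file's footer with ParquetFileReader.readFooter and inspecting the schema and per-block row counts. The object name FooterRowCount and the path /tmp/example.parquet are placeholders, and the sketch assumes the same older readFooter API used throughout the examples below.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.format.converter.ParquetMetadataConverter
import org.apache.parquet.hadoop.ParquetFileReader

import scala.collection.JavaConverters._

object FooterRowCount {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val path = new Path("/tmp/example.parquet") // placeholder path
    // readFooter fetches only the file metadata (footer), not the row data
    val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER)
    val schema = footer.getFileMetaData.getSchema
    // each block (row group) reports its own row count; summing them gives the file total
    val rowCount = footer.getBlocks.asScala.map(_.getRowCount).sum
    println(s"schema=$schema rows=$rowCount")
  }
}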
Example 1
Source File: DirectParquetOutputCommitter.scala From utils with Apache License 2.0
package com.indix.utils.spark.parquet

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

class DirectParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {

  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath

  override def abortTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitTask(taskContext: TaskAttemptContext): Unit = {}

  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true

  override def setupJob(jobContext: JobContext): Unit = {}

  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = ContextUtil.getConfiguration(jobContext)
    val fileSystem = outputPath.getFileSystem(configuration)
    LOG.info("Using DirectParquetOutputCommitter to commit parquet files")

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception =>
            LOG.warn("Could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
        }
      } catch {
        case e: Exception => LOG.warn("Could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception => LOG.warn("Could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 2
Source File: RowParquetReaderFn.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet

import com.sksamuel.exts.Logging
import io.eels.schema.StructType
import io.eels.{Predicate, Row}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.filter2.compat.FilterCompat
import org.apache.parquet.format.converter.ParquetMetadataConverter
import org.apache.parquet.hadoop.api.ReadSupport
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetReader}
import org.apache.parquet.schema.Type

  def apply(path: Path,
            predicate: Option[Predicate],
            readSchema: Option[Type],
            dictionaryFiltering: Boolean)(implicit conf: Configuration): ParquetReader[Row] = {
    logger.debug(s"Opening parquet reader for $path")

    // The parquet reader can use a projection by setting a projected schema onto the supplied conf object
    def configuration(): Configuration = {
      val newconf = new Configuration(conf)
      readSchema.foreach { it =>
        newconf.set(ReadSupport.PARQUET_READ_SCHEMA, it.toString)
      }
      //newconf.set(ParquetInputFormat.DICTIONARY_FILTERING_ENABLED, dictionaryFiltering.toString)
      newconf.set(org.apache.parquet.hadoop.ParquetFileReader.PARQUET_READ_PARALLELISM, config.parallelism.toString)
      newconf
    }

    // a filter is set when we have a predicate for the read
    def filter(): FilterCompat.Filter = predicate.map(ParquetPredicateBuilder.build)
      .map(FilterCompat.get)
      .getOrElse(FilterCompat.NOOP)

    ParquetReader.builder(new RowReadSupport, path)
      .withConf(configuration())
      .withFilter(filter())
      .build()
  }
}
Example 3
Source File: ParquetPublisher.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet

import java.util.concurrent.atomic.AtomicBoolean

import com.sksamuel.exts.Logging
import com.sksamuel.exts.OptionImplicits._
import com.sksamuel.exts.io.Using
import io.eels.component.parquet.util.ParquetIterator
import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription}
import io.eels.schema.StructType
import io.eels.{Predicate, Row}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.format.converter.ParquetMetadataConverter
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.schema.MessageType

class ParquetPublisher(path: Path,
                       predicate: Option[Predicate],
                       projection: Seq[String],
                       caseSensitive: Boolean,
                       dictionaryFiltering: Boolean)
                      (implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using {

  def readSchema: Option[MessageType] = {
    if (projection.isEmpty) None
    else {
      val fileSchema = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER).getFileMetaData.getSchema
      val structType = ParquetSchemaFns.fromParquetMessageType(fileSchema)

      if (caseSensitive) {
        assert(
          structType.fieldNames.toSet.size == structType.fieldNames.map(_.toLowerCase).toSet.size,
          "Cannot use case sensitive = true when this would result in a clash of field names"
        )
      }

      val projectionSchema = StructType(projection.map { field =>
        structType.field(field, caseSensitive).getOrError(s"Requested field $field does not exist in the parquet schema")
      })

      ParquetSchemaFns.toParquetMessageType(projectionSchema).some
    }
  }

  override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = {
    try {
      using(RowParquetReaderFn(path, predicate, readSchema, dictionaryFiltering)) { reader =>
        val running = new AtomicBoolean(true)
        subscriber.subscribed(Subscription.fromRunning(running))
        ParquetIterator(reader)
          .grouped(DataStream.DefaultBatchSize)
          .takeWhile(_ => running.get)
          .foreach(subscriber.next)
        subscriber.completed()
      }
    } catch {
      case t: Throwable => subscriber.error(t)
    }
  }
}
Example 4
Source File: AvroParquetSource.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet.avro

import com.sksamuel.exts.Logging
import com.sksamuel.exts.OptionImplicits._
import com.sksamuel.exts.io.Using
import io.eels.component.avro.{AvroSchemaFns, AvroSchemaMerge}
import io.eels.component.parquet._
import io.eels.datastream.Publisher
import io.eels.schema.StructType
import io.eels.{FilePattern, Predicate, _}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.format.converter.ParquetMetadataConverter
import org.apache.parquet.hadoop.{Footer, ParquetFileReader}

import scala.collection.JavaConverters._

object AvroParquetSource {

  def apply(uri: java.net.URI)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource =
    apply(FilePattern(new Path(uri.toString)))

  def apply(path: java.nio.file.Path)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource =
    apply(FilePattern(path))

  def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource =
    apply(FilePattern(path))
}

case class AvroParquetSource(pattern: FilePattern,
                             predicate: Option[Predicate] = None)
                            (implicit fs: FileSystem, conf: Configuration) extends Source with Logging with Using {

  private lazy val paths = pattern.toPaths()

  def withPredicate(pred: Predicate): AvroParquetSource = copy(predicate = pred.some)

  // the schema returned by the parquet source should be a merged version of the
  // schemas contained in all the files.
  override def schema: StructType = {
    val schemas = paths.map { path =>
      using(AvroParquetReaderFn.apply(path, predicate, None)) { reader =>
        val record = Option(reader.read()).getOrElse {
          sys.error(s"Cannot read $path for schema; file contains no records")
        }
        record.getSchema
      }
    }
    val avroSchema = AvroSchemaMerge("record", "namspace", schemas)
    AvroSchemaFns.fromAvroSchema(avroSchema)
  }

  // returns the count of all records in this source, predicate is ignored
  def countNoPredicate(): Long = statistics().count

  // returns stats, predicate is ignored
  def statistics(): Statistics = {
    if (paths.isEmpty) Statistics.Empty
    else {
      paths.foldLeft(Statistics.Empty) { (stats, path) =>
        val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER)
        footer.getBlocks.asScala.foldLeft(stats) { (stats, block) =>
          stats.copy(
            count = stats.count + block.getRowCount,
            compressedSize = stats.compressedSize + block.getCompressedSize,
            uncompressedSize = stats.uncompressedSize + block.getTotalByteSize
          )
        }
      }
    }
  }

  override def parts(): Seq[Publisher[Seq[Row]]] = {
    logger.debug(s"AvroParquetSource source has ${paths.size} files: $paths")
    paths.map { it => new AvroParquetPublisher(it, predicate) }
  }

  def footers(): List[Footer] = {
    logger.debug(s"AvroParquetSource source will read footers from $paths")
    paths.flatMap { it =>
      val status = fs.getFileStatus(it)
      logger.debug(s"status=$status; path=$it")
      ParquetFileReader.readAllFootersInParallel(fs.getConf, status).asScala
    }
  }
}
Example 5
Source File: ParquetSource.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet

import com.sksamuel.exts.Logging
import com.sksamuel.exts.OptionImplicits._
import com.sksamuel.exts.io.Using
import io.eels.datastream.Publisher
import io.eels.{Predicate, _}
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.format.converter.ParquetMetadataConverter
import org.apache.parquet.hadoop.{Footer, ParquetFileReader}

import scala.collection.JavaConverters._

object ParquetSource {

  def apply(string: String)(implicit fs: FileSystem, conf: Configuration): ParquetSource =
    apply(FilePattern(string))

  def apply(uri: java.net.URI)(implicit fs: FileSystem, conf: Configuration): ParquetSource =
    apply(FilePattern(new Path(uri.toString)))

  def apply(path: java.nio.file.Path)(implicit fs: FileSystem, conf: Configuration): ParquetSource =
    apply(FilePattern(path))

  def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): ParquetSource =
    apply(FilePattern(path))
}

case class ParquetSource(pattern: FilePattern,
                         predicate: Option[Predicate] = None,
                         projection: Seq[String] = Nil,
                         dictionaryFiltering: Boolean = true,
                         caseSensitive: Boolean = true)
                        (implicit fs: FileSystem, conf: Configuration) extends Source with Logging with Using {
  logger.debug(s"Created parquet source with pattern=$pattern")

  lazy val paths: List[Path] = pattern.toPaths()

  def withDictionaryFiltering(dictionary: Boolean): ParquetSource = copy(dictionaryFiltering = dictionary)
  def withCaseSensitivity(caseSensitive: Boolean): ParquetSource = copy(caseSensitive = caseSensitive)
  def withPredicate(pred: => Predicate): ParquetSource = copy(predicate = pred.some)
  def withProjection(first: String, rest: String*): ParquetSource = withProjection(first +: rest)
  def withProjection(fields: Seq[String]): ParquetSource = {
    require(fields.nonEmpty)
    copy(projection = fields.toList)
  }

  // returns the metadata in the parquet file, or an empty map if none
  def metadata(): Map[String, String] = {
    paths.foldLeft(Map.empty[String, String]) { (metadata, path) =>
      val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER)
      metadata ++ footer.getFileMetaData.getKeyValueMetaData.asScala
    }
  }

  // todo should take the merged schema from all files
  lazy val schema: StructType = RowParquetReaderFn.schema(paths.headOption.getOrError("No paths found for source"))

  // returns the count of all records in this source, predicate is ignored
  def countNoPredicate(): Long = statistics().count

  // returns stats, predicate is ignored
  def statistics(): Statistics = {
    if (paths.isEmpty) Statistics.Empty
    else {
      paths.foldLeft(Statistics.Empty) { (stats, path) =>
        val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER)
        footer.getBlocks.asScala.foldLeft(stats) { (stats, block) =>
          stats.copy(
            count = stats.count + block.getRowCount,
            compressedSize = stats.compressedSize + block.getCompressedSize,
            uncompressedSize = stats.uncompressedSize + block.getTotalByteSize
          )
        }
      }
    }
  }

  override def parts(): Seq[Publisher[Seq[Row]]] = {
    logger.debug(s"Parquet source has ${paths.size} files: ${paths.mkString(", ")}")
    paths.map { it => new ParquetPublisher(it, predicate, projection, caseSensitive, dictionaryFiltering) }
  }

  def footers(): List[Footer] = {
    logger.debug(s"Parquet source will read footers from $paths")
    paths.flatMap { it =>
      val status = fs.getFileStatus(it)
      logger.debug(s"status=$status; path=$it")
      ParquetFileReader.readAllFootersInParallel(fs.getConf, status).asScala
    }
  }
}
Example 6
Source File: HiveStats.scala From eel-sdk with Apache License 2.0
package io.eels.component.hive

import com.sksamuel.exts.Logging
import com.sksamuel.exts.OptionImplicits._
import io.eels.schema.PartitionConstraint
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.metastore.IMetaStoreClient
import org.apache.parquet.format.converter.ParquetMetadataConverter
import org.apache.parquet.hadoop.ParquetFileReader

import scala.collection.JavaConverters._

trait HiveStats {

  // total number of records
  def count: Long = count(Nil)

  // total number of records in the partitions that match the constraints
  def count(constraints: Seq[PartitionConstraint]): Long

  // returns the minimum value of this field
  def min(field: String): Any = min(field, Nil)

  // returns the maximum value of this field
  def max(field: String): Any = max(field, Nil)

  // returns the minimum value of this field for the partitions that match the constraints
  def min(field: String, constraints: Seq[PartitionConstraint]): Any

  // returns the maximum value of this field for the partitions that match the constraints
  def max(field: String, constraints: Seq[PartitionConstraint]): Any
}

class ParquetHiveStats(dbName: String,
                       tableName: String,
                       table: HiveTable)
                      (implicit fs: FileSystem,
                       conf: Configuration,
                       client: IMetaStoreClient) extends HiveStats with Logging {

  private val ops = new HiveOps(client)

  private def count(path: Path) = {
    val blocks = ParquetFileReader.readFooter(fs.getConf, path, ParquetMetadataConverter.NO_FILTER).getBlocks.asScala
    blocks.map(_.getRowCount).sum
  }

  override def count(constraints: Seq[PartitionConstraint]): Long = {
    val counts = HiveTableFilesFn(dbName, tableName, table.location, constraints)
      .flatMap(_._2)
      .map(_.getPath).map(count)
    if (counts.isEmpty) 0 else counts.sum
  }

  private def minmax(field: String, constraints: Seq[PartitionConstraint]): (Any, Any) = {
    def stats[T]: (Any, Any) = {
      def min(ts: Seq[Comparable[T]]): Any = ts.reduceLeft { (a, b) => if (a.compareTo(b.asInstanceOf[T]) <= 0) a else b }
      def max(ts: Seq[Comparable[T]]): Any = ts.reduceLeft { (a, b) => if (a.compareTo(b.asInstanceOf[T]) >= 0) a else b }
      val location = new Path(ops.location(dbName, tableName))
      val (mins, maxes) = HiveTableFilesFn(dbName, tableName, location, constraints).toSeq.flatMap { case (_, files) =>
        logger.debug(s"Calculating min,max in file $files")
        files.flatMap { file =>
          val footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER)
          footer.getBlocks.asScala.map { block =>
            val column = block.getColumns.asScala
              .find(_.getPath.toDotString == field)
              .getOrError(s"Unknown column $field")
            val min = column.getStatistics.genericGetMin.asInstanceOf[Comparable[T]]
            val max = column.getStatistics.genericGetMax.asInstanceOf[Comparable[T]]
            (min, max)
          }
        }
      }.unzip
      (min(mins), max(maxes))
    }
    stats[Any]
  }

  override def min(field: String, constraints: Seq[PartitionConstraint]): Any = minmax(field, constraints)._1
  override def max(field: String, constraints: Seq[PartitionConstraint]): Any = minmax(field, constraints)._2
}
Example 7
Source File: TikaParquetParser.scala From project-matt with MIT License
package org.datafy.aws.app.matt.extras

import java.io.{File, FileOutputStream, IOException, InputStream}
import java.util

import scala.collection.JavaConverters._
import org.xml.sax.{ContentHandler, SAXException}
import org.apache.tika.metadata.Metadata
import org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE
import org.apache.tika.mime.MediaType
import org.apache.tika.parser.{AbstractParser, ParseContext}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.hadoop.ParquetReader
import org.apache.parquet.format.converter.ParquetMetadataConverter
import org.apache.parquet.hadoop.util.HadoopInputFile
import org.apache.parquet.tools.json.JsonRecordFormatter
import org.apache.parquet.tools.read.{SimpleReadSupport, SimpleRecord}
import org.apache.tika.exception.TikaException
import org.apache.tika.sax.XHTMLContentHandler

import scala.util.Random

class TikaParquetParser extends AbstractParser {
  // make some stuff here
  final val PARQUET_RAW = MediaType.application("x-parquet")
  private val SUPPORTED_TYPES: Set[MediaType] = Set(PARQUET_RAW)

  def getSupportedTypes(context: ParseContext): util.Set[MediaType] = {
    SUPPORTED_TYPES.asJava
  }

  @throws(classOf[IOException])
  @throws(classOf[SAXException])
  @throws(classOf[TikaException])
  def parse(stream: InputStream, handler: ContentHandler,
            metadata: Metadata, context: ParseContext): Unit = {
    // create temp file from stream
    val fileNamePrefix = Random.alphanumeric.take(5).mkString
    val tempFile = File.createTempFile(s"parquet-${fileNamePrefix}", ".parquet")
    IOUtils.copy(stream, new FileOutputStream(tempFile))

    val conf = new Configuration()
    val path = new Path(tempFile.getAbsolutePath)
    val parquetMetadata = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER)
    var defaultReader: ParquetReader[SimpleRecord] = null

    val columns = parquetMetadata.getFileMetaData.getSchema.getFields
    metadata.set(CONTENT_TYPE, PARQUET_RAW.toString)
    metadata.set("Total Number of Columns", columns.size.toString)
    metadata.set("Parquet Column Names", columns.toString)

    val xhtml = new XHTMLContentHandler(handler, metadata)
    xhtml.startDocument()
    xhtml.startElement("p")

    // ::TODO:: ensure parquet reader reads all files not only file row
    try {
      defaultReader = ParquetReader.builder(new SimpleReadSupport(), new Path(tempFile.getAbsolutePath)).build()
      if (defaultReader.read() != null) {
        val values: SimpleRecord = defaultReader.read()
        val jsonFormatter = JsonRecordFormatter.fromSchema(parquetMetadata.getFileMetaData.getSchema)

        val textContent: String = jsonFormatter.formatRecord(values)
        xhtml.characters(textContent)
        xhtml.endElement("p")
        xhtml.endDocument()
      }
    } catch {
      case e: Throwable =>
        e.printStackTrace()
        if (defaultReader != null) {
          try {
            defaultReader.close()
          } catch {
            case _: Throwable =>
          }
        }
    } finally {
      if (tempFile != null) tempFile.delete()
    }
  }
}
Example 8
Source File: DirectParquetOutputCommitter.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[datasources] class DirectParquetOutputCommitter(
    outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {

  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath

  override def abortTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitTask(taskContext: TaskAttemptContext): Unit = {}

  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true

  override def setupJob(jobContext: JobContext): Unit = {}

  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = ContextUtil.getConfiguration(jobContext)
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception =>
            LOG.warn("could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
        }
      } catch {
        case e: Exception => LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception => LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 9
Source File: ParquetCompatibilityTest.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.schema.MessageType

import org.apache.spark.sql.QueryTest

private[sql] abstract class ParquetCompatibilityTest extends QueryTest with ParquetTest {

  protected def readParquetSchema(path: String): MessageType = {
    readParquetSchema(path, { path => !path.getName.startsWith("_") })
  }

  // read the Parquet schema
  protected def readParquetSchema(path: String, pathFilter: Path => Boolean): MessageType = {
    val fsPath = new Path(path)
    val fs = fsPath.getFileSystem(configuration)
    val parquetFiles = fs.listStatus(fsPath, new PathFilter {
      override def accept(path: Path): Boolean = pathFilter(path)
    }).toSeq

    val footers = ParquetFileReader.readAllFootersInParallel(configuration, parquetFiles, true)
    footers.head.getParquetMetadata.getFileMetaData.getSchema
  }

  protected def logParquetSchema(path: String): Unit = {
    logInfo(
      // the schema of the Parquet file written by parquet-avro
      s"""Schema of the Parquet file written by parquet-avro:
         |${readParquetSchema(path)}
       """.stripMargin)
  }
}

// Parquet compatibility test helpers
object ParquetCompatibilityTest {
  def makeNullable[T <: AnyRef](i: Int)(f: => T): T = {
    if (i % 3 == 0) null.asInstanceOf[T] else f
  }
}
Example 10
Source File: DirectParquetOutputCommitter.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
import org.apache.parquet.Log
import org.apache.parquet.hadoop.util.ContextUtil
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat}

private[datasources] class DirectParquetOutputCommitter(
    outputPath: Path, context: TaskAttemptContext)
  extends ParquetOutputCommitter(outputPath, context) {

  val LOG = Log.getLog(classOf[ParquetOutputCommitter])

  override def getWorkPath: Path = outputPath

  override def abortTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitTask(taskContext: TaskAttemptContext): Unit = {}

  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true

  override def setupJob(jobContext: JobContext): Unit = {}

  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def commitJob(jobContext: JobContext) {
    val configuration = {
      // scalastyle:off jobcontext
      ContextUtil.getConfiguration(jobContext)
      // scalastyle:on jobcontext
    }
    val fileSystem = outputPath.getFileSystem(configuration)

    if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) {
      try {
        val outputStatus = fileSystem.getFileStatus(outputPath)
        val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus)
        try {
          ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers)
        } catch {
          case e: Exception =>
            LOG.warn("could not write summary file for " + outputPath, e)
            val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE)
            if (fileSystem.exists(metadataPath)) {
              fileSystem.delete(metadataPath, true)
            }
        }
      } catch {
        case e: Exception => LOG.warn("could not write summary file for " + outputPath, e)
      }
    }

    if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) {
      try {
        val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME)
        fileSystem.create(successPath).close()
      } catch {
        case e: Exception => LOG.warn("could not write success file for " + outputPath, e)
      }
    }
  }
}
Example 11
Source File: ParquetCompatibilityTest.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsJavaMapConverter, seqAsJavaListConverter}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetWriter}
import org.apache.parquet.io.api.RecordConsumer
import org.apache.parquet.schema.{MessageType, MessageTypeParser}

import org.apache.spark.sql.QueryTest

  def writeDirect(
      path: String,
      schema: String,
      metadata: Map[String, String],
      recordWriters: (RecordConsumer => Unit)*): Unit = {
    val messageType = MessageTypeParser.parseMessageType(schema)
    val writeSupport = new DirectWriteSupport(messageType, metadata)
    val parquetWriter = new ParquetWriter[RecordConsumer => Unit](new Path(path), writeSupport)
    try recordWriters.foreach(parquetWriter.write) finally parquetWriter.close()
  }
}