org.apache.parquet.hadoop.api.ReadSupport Scala Examples
The following examples show how to use org.apache.parquet.hadoop.api.ReadSupport.
Example 1
Source File: RowParquetReaderFn.scala, from eel-sdk (Apache License 2.0)
package io.eels.component.parquet

import com.sksamuel.exts.Logging
import io.eels.schema.StructType
import io.eels.{Predicate, Row}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.filter2.compat.FilterCompat
import org.apache.parquet.format.converter.ParquetMetadataConverter
import org.apache.parquet.hadoop.api.ReadSupport
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetReader}
import org.apache.parquet.schema.Type

// Enclosing object declaration restored for readability; the original listing elides it.
// `config` below refers to the reader configuration defined elsewhere in this object.
object RowParquetReaderFn extends Logging {

  def apply(path: Path,
            predicate: Option[Predicate],
            readSchema: Option[Type],
            dictionaryFiltering: Boolean)(implicit conf: Configuration): ParquetReader[Row] = {
    logger.debug(s"Opening parquet reader for $path")

    // The parquet reader can use a projection by setting a projected schema onto the supplied conf object
    def configuration(): Configuration = {
      val newconf = new Configuration(conf)
      readSchema.foreach { it =>
        newconf.set(ReadSupport.PARQUET_READ_SCHEMA, it.toString)
      }
      //newconf.set(ParquetInputFormat.DICTIONARY_FILTERING_ENABLED, dictionaryFiltering.toString)
      newconf.set(org.apache.parquet.hadoop.ParquetFileReader.PARQUET_READ_PARALLELISM, config.parallelism.toString)
      newconf
    }

    // a filter is set when we have a predicate for the read
    def filter(): FilterCompat.Filter = predicate.map(ParquetPredicateBuilder.build)
      .map(FilterCompat.get)
      .getOrElse(FilterCompat.NOOP)

    ParquetReader.builder(new RowReadSupport, path)
      .withConf(configuration())
      .withFilter(filter())
      .build()
  }
}
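For context, here is a minimal sketch of how this reader function might be called, assuming an implicit Hadoop Configuration and an existing Parquet file; the file path, the absence of a predicate and projection, and the print loop are illustrative and not part of the original source:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import io.eels.Row
import org.apache.parquet.hadoop.ParquetReader

object RowParquetReaderFnUsage {
  def main(args: Array[String]): Unit = {
    implicit val conf: Configuration = new Configuration()

    // Open the reader with no predicate and no projected schema (hypothetical file path).
    val reader: ParquetReader[Row] =
      RowParquetReaderFn(new Path("/tmp/example.parquet"), None, None, dictionaryFiltering = true)

    // ParquetReader.read() returns null once the file is exhausted.
    Iterator.continually(reader.read()).takeWhile(_ != null).foreach(println)
    reader.close()
  }
}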
Example 2
Source File: ParquetWriterTaskSpec.scala, from gearpump-examples (Apache License 2.0)
package io.gearpump.examples.kafka_hdfs_pipeline

import akka.actor.ActorSystem
import org.apache.avro.Schema
import io.gearpump.Message
import io.gearpump.cluster.UserConfig
import io.gearpump.streaming.MockUtil
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.parquet.avro.{AvroParquetReader, AvroParquetWriter}
import org.apache.parquet.hadoop.ParquetReader
import org.apache.parquet.hadoop.api.ReadSupport
import org.mockito.Mockito
import org.mockito.Mockito._
import org.scalatest.prop.PropertyChecks
import org.scalatest.{BeforeAndAfterAll, Matchers, PropSpec}

class ParquetWriterTaskSpec extends PropSpec with PropertyChecks with Matchers with BeforeAndAfterAll {

  implicit var system: ActorSystem = ActorSystem("PipeLineSpec")
  val context = MockUtil.mockTaskContext
  val appName = "KafkaHdfsPipeLine"
  when(context.appName).thenReturn(appName)
  val fs = FileSystem.get(new YarnConfiguration)
  val homeDir = fs.getHomeDirectory.toUri.getPath
  val parquetDir = new Path(homeDir, "gearpump") + "/parquet/"
  val parquetPath = parquetDir + appName + ".parquet"
  val parquetCrc = parquetDir + "." + appName + ".parquet.crc"
  val parquetWriter = Mockito.mock(classOf[AvroParquetWriter[SpaceShuttleRecord]])

  val anomaly = 0.252
  val now = System.currentTimeMillis

  val userConfig = UserConfig.empty.withString(ParquetWriterTask.PARQUET_OUTPUT_DIRECTORY, "/parquet")

  override def afterAll(): Unit = {
    List(parquetPath, parquetCrc, parquetDir).foreach(new java.io.File(_).delete)
    system.shutdown()
  }

  property("ParquetWriterTask should initialize with local parquet file opened for writing") {
    val parquetWriterTask = new ParquetWriterTask(context, userConfig)
    val path = parquetWriterTask.absolutePath.stripPrefix("file:")
    assert(parquetPath.equals(path))
    parquetWriterTask.onStop
  }

  property("ParquetWriterTask should write records to a parquet file") {
    val message = Message(SpaceShuttleRecord(now, anomaly), now)
    val parquetWriterTask = new ParquetWriterTask(context, userConfig)
    parquetWriterTask.parquetWriter = parquetWriter
    parquetWriterTask.onNext(message)
    verify(parquetWriterTask.parquetWriter).write(message.msg.asInstanceOf[SpaceShuttleRecord])
    parquetWriterTask.onStop
  }

  property("ParquetWriterTask should have verifiable written record") {
    val message = Message(SpaceShuttleRecord(now, anomaly), now)
    val parquetWriterTask = new ParquetWriterTask(context, userConfig)
    parquetWriterTask.onNext(message)
    parquetWriterTask.onStop

    val reader = new AvroParquetReader[SpaceShuttleRecord](new Path(parquetPath))
    val record = reader.read()
    assert(message.msg.asInstanceOf[SpaceShuttleRecord].anomaly == record.anomaly)
    assert(message.msg.asInstanceOf[SpaceShuttleRecord].ts == record.ts)
  }
}
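The last test reads the file back with AvroParquetReader's constructor form. A builder-style read, which installs parquet-avro's AvroReadSupport internally rather than taking an explicit ReadSupport, is sketched below; the file path and the GenericRecord element type are assumptions for illustration:

import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetReader
import org.apache.parquet.hadoop.ParquetReader

object ReadBackSketch {
  def main(args: Array[String]): Unit = {
    // AvroParquetReader wires in its own ReadSupport (AvroReadSupport), so none is passed explicitly.
    val reader: ParquetReader[GenericRecord] =
      AvroParquetReader.builder[GenericRecord](new Path("/tmp/KafkaHdfsPipeLine.parquet")).build()

    // Read until the reader returns null, i.e. end of file.
    Iterator.continually(reader.read()).takeWhile(_ != null).foreach(println)
    reader.close()
  }
}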
Example 3
Source File: ParquetReadSupportWrapper.scala, from OAP (Apache License 2.0)
package org.apache.spark.sql.execution.datasources.parquet

import java.util.{Map => JMap}

import org.apache.hadoop.conf.Configuration
import org.apache.parquet.hadoop.api.{InitContext, ReadSupport}
import org.apache.parquet.hadoop.api.ReadSupport.ReadContext
import org.apache.parquet.io.api.RecordMaterializer
import org.apache.parquet.schema._

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow

// The class declaration and delegate field are elided in the original listing; they are
// restored here so the snippet reads as a whole. The wrapper simply proxies Spark's
// ParquetReadSupport, held as `readSupport`.
class ParquetReadSupportWrapper extends ReadSupport[InternalRow] with Logging {

  private val readSupport = new ParquetReadSupport()

  override def init(context: InitContext): ReadContext = readSupport.init(context)

  override def prepareForRead(
      conf: Configuration,
      keyValueMetaData: JMap[String, String],
      fileSchema: MessageType,
      readContext: ReadContext): RecordMaterializer[InternalRow] = {
    readSupport.prepareForRead(conf, keyValueMetaData, fileSchema, readContext)
  }
}

object ParquetReadSupportWrapper {
  // Proxy ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA value.
  val SPARK_ROW_REQUESTED_SCHEMA: String = ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA
}
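To show what the proxied constant is for: Spark's read support looks up the requested (projected) schema under this configuration key as a JSON-encoded StructType. A minimal sketch of setting it, with a hypothetical two-column projection (exact behaviour depends on the Spark version):

import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object RequestedSchemaSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical projection: ask the read support for just two columns.
    val requestedSchema = StructType(Seq(
      StructField("id", IntegerType),
      StructField("name", StringType)))

    val conf = new Configuration()
    // The requested schema travels to prepareForRead via this configuration key.
    conf.set(ParquetReadSupportWrapper.SPARK_ROW_REQUESTED_SCHEMA, requestedSchema.json)
  }
}

The configured ReadSupport (here, the wrapper) can then be handed to a ParquetReader builder together with this Configuration.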
Example 4
Source File: StructReadSupport.scala, from stream-reactor (Apache License 2.0)
package com.landoop.streamreactor.connect.hive.parquet

import java.util

import org.apache.hadoop.conf.Configuration
import org.apache.kafka.connect.data.Struct
import org.apache.parquet.hadoop.api.{InitContext, ReadSupport}
import org.apache.parquet.io.api.RecordMaterializer
import org.apache.parquet.schema.MessageType

class StructReadSupport extends ReadSupport[Struct] {

  override def prepareForRead(configuration: Configuration,
                              metaData: util.Map[String, String],
                              fileSchema: MessageType,
                              context: ReadSupport.ReadContext): RecordMaterializer[Struct] = {
    // the file schema in here comes from the footer of the parquet file
    val schema = ParquetSchemas.toKafka(fileSchema)
    new StructMaterializer(schema)
  }

  override def init(context: InitContext): ReadSupport.ReadContext = {
    new ReadSupport.ReadContext(context.getFileSchema)
  }
}
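A custom ReadSupport like this is typically handed straight to ParquetReader.builder. A minimal sketch of reading Kafka Connect Structs from a file follows; the path is hypothetical, and ParquetSchemas and StructMaterializer are helpers from the same project:

import org.apache.hadoop.fs.Path
import org.apache.kafka.connect.data.Struct
import org.apache.parquet.hadoop.ParquetReader

object StructReadSupportUsage {
  def main(args: Array[String]): Unit = {
    // Materialize each row as a Kafka Connect Struct via StructReadSupport.
    val reader: ParquetReader[Struct] =
      ParquetReader.builder(new StructReadSupport, new Path("/tmp/hive/mytable/part-00000.parquet"))
        .build()

    // read() returns null once the file is exhausted.
    Iterator.continually(reader.read()).takeWhile(_ != null).foreach(println)
    reader.close()
  }
}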