org.apache.parquet.avro.AvroParquetWriter Scala Examples
The following examples show how to use org.apache.parquet.avro.AvroParquetWriter.
Each example notes its source file, the project it comes from, and that project's license.
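Before the project examples, here is a minimal, self-contained sketch of the builder-based API that most of them rely on. It is not taken from any of the projects below; the object name, output path, and schema are illustrative only.

import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.metadata.CompressionCodecName

object MinimalWriteSketch extends App {
  // A trivial Avro schema with one required string field (illustrative).
  val schema = SchemaBuilder.record("example").fields()
    .requiredString("name")
    .endRecord()

  // Build the writer; the output path and codec are arbitrary choices here.
  val writer = AvroParquetWriter.builder[GenericRecord](new Path("example.parquet"))
    .withSchema(schema)
    .withCompressionCodec(CompressionCodecName.SNAPPY)
    .build()

  val record = new GenericData.Record(schema)
  record.put("name", "example")
  writer.write(record)
  writer.close()
}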
Example 1
Source File: AvroParquetSourceTest.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet

import java.nio.file.Paths

import io.eels.component.parquet.avro.AvroParquetSource
import io.eels.component.parquet.util.ParquetLogMute
import io.eels.schema._
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.avro.AvroParquetWriter
import org.scalatest.{Matchers, WordSpec}

class AvroParquetSourceTest extends WordSpec with Matchers {
  ParquetLogMute()

  private implicit val conf = new Configuration()
  private implicit val fs = FileSystem.get(conf)

  private val personFile = Paths.get(getClass.getResource("/io/eels/component/parquet/person.avro.pq").toURI)
  private val resourcesDir = personFile.getParent

  "AvroParquetSource" should {
    "read schema" in {
      val people = AvroParquetSource(personFile)
      people.schema shouldBe StructType(
        Field("name", StringType, nullable = false),
        Field("job", StringType, nullable = false),
        Field("location", StringType, nullable = false)
      )
    }
    "read parquet files" in {
      val people = AvroParquetSource(personFile.toAbsolutePath()).toDataStream().toSet.map(_.values)
      people shouldBe Set(
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner")
      )
    }
    "read multiple parquet files using file expansion" in {
      import io.eels.FilePattern._
      val people = AvroParquetSource(s"${resourcesDir.toUri.toString}/*.pq").toDataStream().toSet.map(_.values)
      people shouldBe Set(
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner"),
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner")
      )
    }
    // todo add merge to parquet source
    "merge schemas" ignore {

      try {
        fs.delete(new Path("merge1.pq"), false)
      } catch {
        case t: Throwable =>
      }
      try {
        fs.delete(new Path("merge2.pq"), false)
      } catch {
        case t: Throwable =>
      }

      val schema1 = SchemaBuilder.builder().record("schema1").fields().requiredString("a").requiredDouble("b").endRecord()
      val schema2 = SchemaBuilder.builder().record("schema2").fields().requiredInt("a").requiredBoolean("c").endRecord()

      val writer1 = AvroParquetWriter.builder[GenericRecord](new Path("merge1.pq")).withSchema(schema1).build()
      val record1 = new GenericData.Record(schema1)
      record1.put("a", "aaaaa")
      record1.put("b", 124.3)
      writer1.write(record1)
      writer1.close()

      val writer2 = AvroParquetWriter.builder[GenericRecord](new Path("merge2.pq")).withSchema(schema2).build()
      val record2 = new GenericData.Record(schema2)
      record2.put("a", 111)
      record2.put("c", true)
      writer2.write(record2)
      writer2.close()

      ParquetSource(new Path("merge*")).schema shouldBe
        StructType(
          Field("a", StringType, nullable = false),
          Field("b", DoubleType, nullable = false),
          Field("c", BooleanType, nullable = false)
        )

      fs.delete(new Path(".merge1.pq.crc"), false)
      fs.delete(new Path(".merge2.pq.crc"), false)
      fs.delete(new Path("merge1.pq"), false)
      fs.delete(new Path("merge2.pq"), false)
    }
  }
}
Example 2
Source File: AvroParquetWriterFn.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet.avro

import com.sksamuel.exts.Logging
import io.eels.component.parquet.ParquetWriterConfig
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}

object AvroParquetWriterFn extends Logging {
  def apply(path: Path, avroSchema: Schema): ParquetWriter[GenericRecord] = {
    val config = ParquetWriterConfig()
    AvroParquetWriter.builder[GenericRecord](path)
      .withSchema(avroSchema)
      .withCompressionCodec(config.compressionCodec)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withDictionaryEncoding(config.enableDictionary)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withValidation(config.validating)
      .build()
  }
}
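A hypothetical call site for the helper above might look like the sketch below; the schema, path, and field name are illustrative, not part of eel-sdk. Note that ParquetFileWriter.Mode.CREATE means the call fails if the target file already exists, rather than silently overwriting it.

import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericData
import org.apache.hadoop.fs.Path

// Illustrative only: build a schema, obtain a writer from the helper, write one record.
val schema = SchemaBuilder.record("person").fields().requiredString("name").endRecord()
val writer = AvroParquetWriterFn(new Path("people.parquet"), schema)
val record = new GenericData.Record(schema)
record.put("name", "clint eastwood")
writer.write(record)
writer.close()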
Example 3
Source File: AvroParquetReaderFnTest.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet

import java.util.UUID

import io.eels.component.avro.AvroSchemaFns
import io.eels.component.parquet.avro.AvroParquetReaderFn
import io.eels.schema.{DoubleType, Field, LongType, StructType}
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.avro.util.Utf8
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.avro.AvroParquetWriter
import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpec}

class AvroParquetReaderFnTest extends WordSpec with Matchers with BeforeAndAfterAll {

  private implicit val conf = new Configuration()
  private implicit val fs = FileSystem.get(new Configuration())

  private val path = new Path(UUID.randomUUID().toString())

  override def afterAll(): Unit = {
    val fs = FileSystem.get(new Configuration())
    fs.delete(path, false)
  }

  private val avroSchema = SchemaBuilder.record("com.chuckle").fields()
    .requiredString("str").requiredLong("looong").requiredDouble("dooble").endRecord()

  private val writer = AvroParquetWriter.builder[GenericRecord](path)
    .withSchema(avroSchema)
    .build()

  private val record = new GenericData.Record(avroSchema)
  record.put("str", "wibble")
  record.put("looong", 999L)
  record.put("dooble", 12.34)

  writer.write(record)
  writer.close()

  val schema = StructType(Field("str"), Field("looong", LongType(true), true), Field("dooble", DoubleType, true))

  "AvroParquetReaderFn" should {
    "support projections on doubles" in {

      val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("looong"))))
      val record = reader.read()
      reader.close()

      record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble"
      record.get("dooble") shouldBe 12.34
    }
    "support projections on longs" in {

      val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("str"))))
      val record = reader.read()
      reader.close()

      record.get("looong") shouldBe 999L
    }
    "support full projections" in {

      val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema)))
      val record = reader.read()
      reader.close()

      record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble"
      record.get("looong") shouldBe 999L
      record.get("dooble") shouldBe 12.34
    }
    "support non projections" in {

      val reader = AvroParquetReaderFn(path, None, None)
      val group = reader.read()
      reader.close()

      group.get("str").asInstanceOf[Utf8].toString shouldBe "wibble"
      group.get("looong") shouldBe 999L
      group.get("dooble") shouldBe 12.34
    }
  }
}
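The projections in this test go through eel's AvroParquetReaderFn. With raw parquet-avro, a comparable effect is typically achieved by registering the requested projection schema on the Hadoop Configuration via AvroReadSupport; the following is a sketch under that assumption (the file path is illustrative, the column names follow the test above).

import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.{AvroParquetReader, AvroReadSupport}

// Request only the "str" and "dooble" columns.
val projection = SchemaBuilder.record("com.chuckle").fields()
  .requiredString("str").requiredDouble("dooble").endRecord()

val conf = new Configuration()
AvroReadSupport.setRequestedProjection(conf, projection)

val reader = AvroParquetReader.builder[GenericRecord](new Path("some-file.parquet"))
  .withConf(conf)
  .build()
val record = reader.read()
reader.close()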
Example 4
Source File: ParquetWriterTask.scala From gearpump-examples with Apache License 2.0
package io.gearpump.examples.kafka_hdfs_pipeline

import org.apache.avro.Schema
import io.gearpump.Message
import io.gearpump.cluster.UserConfig
import io.gearpump.examples.kafka_hdfs_pipeline.ParquetWriterTask._
import io.gearpump.streaming.task.{StartTime, Task, TaskContext}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.parquet.avro.AvroParquetWriter

import scala.util.{Failure, Success, Try}

class ParquetWriterTask(taskContext: TaskContext, config: UserConfig) extends Task(taskContext, config) {

  val outputFileName = taskContext.appName + ".parquet"
  val absolutePath = Option(getHdfs + config.getString(PARQUET_OUTPUT_DIRECTORY).getOrElse("/parquet") + "/" + outputFileName).map(deleteFile(_)).get
  val outputPath = new Path(absolutePath)
  var parquetWriter = new AvroParquetWriter[SpaceShuttleRecord](outputPath, SpaceShuttleRecord.SCHEMA$)

  def getYarnConf = new YarnConfiguration
  def getFs = FileSystem.get(getYarnConf)
  def getHdfs = new Path(getFs.getHomeDirectory, "gearpump")

  private def deleteFile(fileName: String): String = {
    val file = new Path(fileName)
    getFs.exists(file) match {
      case true => getFs.delete(file, false)
      case false =>
    }
    fileName
  }

  override def onStart(startTime: StartTime): Unit = {
    LOG.info(s"ParquetWriter.onStart $absolutePath")
  }

  override def onNext(msg: Message): Unit = {
    Try({
      parquetWriter.write(msg.msg.asInstanceOf[SpaceShuttleRecord])
    }) match {
      case Success(ok) =>
      case Failure(throwable) => LOG.error(s"failed ${throwable.getMessage}")
    }
  }

  override def onStop(): Unit = {
    LOG.info("ParquetWriter.onStop")
    parquetWriter.close()
  }
}

object ParquetWriterTask {
  val PARQUET_OUTPUT_DIRECTORY = "parquet.output.directory"
  val PARQUET_WRITER = "parquet.writer"
}
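Note that the two-argument AvroParquetWriter constructor used above is deprecated in more recent parquet-avro releases in favor of the builder. A builder-based equivalent is sketched below; since the builder returns a ParquetWriter[SpaceShuttleRecord] rather than an AvroParquetWriter, the field's declared type would change accordingly.

import org.apache.parquet.hadoop.ParquetWriter

// Sketch of a builder-based replacement (assumes parquet-avro 1.8+).
var parquetWriter: ParquetWriter[SpaceShuttleRecord] =
  AvroParquetWriter.builder[SpaceShuttleRecord](outputPath)
    .withSchema(SpaceShuttleRecord.SCHEMA$)
    .build()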
Example 5
Source File: ParquetWriterTaskSpec.scala From gearpump-examples with Apache License 2.0
package io.gearpump.examples.kafka_hdfs_pipeline

import akka.actor.ActorSystem
import org.apache.avro.Schema
import io.gearpump.Message
import io.gearpump.cluster.UserConfig
import io.gearpump.streaming.MockUtil
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.parquet.avro.{AvroParquetReader, AvroParquetWriter}
import org.apache.parquet.hadoop.ParquetReader
import org.apache.parquet.hadoop.api.ReadSupport
import org.mockito.Mockito
import org.mockito.Mockito._
import org.scalatest.prop.PropertyChecks
import org.scalatest.{BeforeAndAfterAll, Matchers, PropSpec}

class ParquetWriterTaskSpec extends PropSpec with PropertyChecks with Matchers with BeforeAndAfterAll {

  implicit var system: ActorSystem = ActorSystem("PipeLineSpec")
  val context = MockUtil.mockTaskContext
  val appName = "KafkaHdfsPipeLine"
  when(context.appName).thenReturn(appName)

  val fs = FileSystem.get(new YarnConfiguration)
  val homeDir = fs.getHomeDirectory.toUri.getPath
  val parquetDir = new Path(homeDir, "gearpump") + "/parquet/"
  val parquetPath = parquetDir + appName + ".parquet"
  val parquetCrc = parquetDir + "." + appName + ".parquet.crc"
  val parquetWriter = Mockito.mock(classOf[AvroParquetWriter[SpaceShuttleRecord]])

  val anomaly = 0.252
  val now = System.currentTimeMillis

  val userConfig = UserConfig.empty.withString(ParquetWriterTask.PARQUET_OUTPUT_DIRECTORY, "/parquet")

  override def afterAll(): Unit = {
    List(parquetPath, parquetCrc, parquetDir).foreach(new java.io.File(_).delete)
    system.shutdown()
  }

  property("ParquetWriterTask should initialize with local parquet file opened for writing") {
    val parquetWriterTask = new ParquetWriterTask(context, userConfig)
    val path = parquetWriterTask.absolutePath.stripPrefix("file:")
    assert(parquetPath.equals(path))
    parquetWriterTask.onStop
  }

  property("ParquetWriterTask should write records to a parquet file") {
    val message = Message(SpaceShuttleRecord(now, anomaly), now)
    val parquetWriterTask = new ParquetWriterTask(context, userConfig)
    parquetWriterTask.parquetWriter = parquetWriter
    parquetWriterTask.onNext(message)
    verify(parquetWriterTask.parquetWriter).write(message.msg.asInstanceOf[SpaceShuttleRecord])
    parquetWriterTask.onStop
  }

  property("ParquetWriterTask should have verifiable written record") {
    val message = Message(SpaceShuttleRecord(now, anomaly), now)
    val parquetWriterTask = new ParquetWriterTask(context, userConfig)
    parquetWriterTask.onNext(message)
    parquetWriterTask.onStop

    val reader = new AvroParquetReader[SpaceShuttleRecord](new Path(parquetPath))
    val record = reader.read()
    assert(message.msg.asInstanceOf[SpaceShuttleRecord].anomaly == record.anomaly)
    assert(message.msg.asInstanceOf[SpaceShuttleRecord].ts == record.ts)
  }
}
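As with the writer, the single-argument AvroParquetReader constructor used in the last property is deprecated in newer parquet-avro releases; a builder-based sketch of the same read-back check would be:

// Sketch of a builder-based read-back (assumes parquet-avro 1.8+).
val reader = AvroParquetReader.builder[SpaceShuttleRecord](new Path(parquetPath)).build()
val record = reader.read()
reader.close()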
Example 6
Source File: AvroToParquetWriter.scala From etl-light with MIT License
package yamrcraft.etlite.writers

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.slf4j.LoggerFactory
import yamrcraft.etlite.utils.FileUtils

class AvroToParquetWriter(tempFile: String, outputFile: String) extends Writer[GenericRecord] {

  val logger = LoggerFactory.getLogger(this.getClass)

  // lazy initialization
  var writer: Option[AvroParquetWriter[GenericRecord]] = None

  val tempPath = new Path(tempFile + ".parquet")
  val outputPath = new Path(outputFile + ".parquet")

  logger.info(s"creating writer for working file: ${tempPath.toString}, outputFile: ${outputPath.toString}")

  override def write(event: GenericRecord): Unit = {
    logger.info(s"ParquetWriter.write, event type: ${event.getSchema.getName}")
    if (writer.isEmpty) {
      writer = Some(createWriter(tempPath.toString, event.getSchema))
    }
    writer.get.write(event)
  }

  override def commit(): Unit = {
    writer.get.close()

    val fs = FileUtils.getFS(outputPath.toString)
    fs.mkdirs(outputPath.getParent)
    if (fs.exists(outputPath)) {
      fs.rename(outputPath, new Path(outputPath.getParent, s"__${outputPath.getName}.${System.currentTimeMillis()}.old.__"))
    }

    // copy temp file to output file (typically temp file would be on local file system).
    if (tempFile.startsWith("file")) {
      logger.info(s"copy file from: ${tempPath.toString} to $outputPath")
      fs.copyFromLocalFile(true, true, tempPath, outputPath)
    } else {
      logger.info(s"renaming file from: ${tempPath.toString} to $outputPath")
      fs.rename(tempPath, outputPath)
    }
  }

  private def createWriter(file: String, schema: Schema) = {
    val fs = FileUtils.getFS(file)
    val path = new Path(file)
    if (fs.exists(path)) {
      fs.delete(path, true)
    }
    fs.mkdirs(path.getParent)
    new AvroParquetWriter[GenericRecord](path, schema)
  }
}
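createWriter above also relies on the deprecated two-argument constructor. A builder-based variant is sketched below; setting ParquetFileWriter.Mode.OVERWRITE lets the writer replace an existing file, so the explicit delete-if-exists step could likely be dropped. The writer field's type would then become ParquetWriter[GenericRecord]. This is an assumption-laden sketch, not the project's code.

import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}

// Sketch of a builder-based createWriter (assumes parquet-avro 1.8+).
private def createWriter(file: String, schema: Schema): ParquetWriter[GenericRecord] = {
  val path = new Path(file)
  FileUtils.getFS(file).mkdirs(path.getParent)
  // OVERWRITE replaces an existing file instead of failing.
  AvroParquetWriter.builder[GenericRecord](path)
    .withSchema(schema)
    .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
    .build()
}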