org.apache.hadoop.io.NullWritable Scala Examples
The following examples show how to use org.apache.hadoop.io.NullWritable.
Each example lists its source file, originating project, and license.
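NullWritable is a zero-byte singleton Writable, typically used wherever Hadoop's key/value API demands a key (or value) that carries no information. As a minimal sketch of the pattern most of the examples below share (the path and data are made up), a Spark RDD can be written to and read back from a SequenceFile with NullWritable keys:

import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.{SparkConf, SparkContext}

object NullWritableRoundTrip {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("null-writable-demo").setMaster("local[*]"))

    // NullWritable.get() is the shared singleton; it serializes to zero bytes,
    // so the SequenceFile effectively stores values only.
    sc.parallelize(Seq("a", "b", "c"))
      .map(line => (NullWritable.get(), new Text(line)))
      .saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]]("/tmp/null-writable-demo")

    // Reading it back, the keys carry no information and are simply dropped.
    val back = sc.sequenceFile[NullWritable, Text]("/tmp/null-writable-demo").map(_._2.toString)
    back.collect().foreach(println)

    sc.stop()
  }
}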
Example 1
Source File: TextFileFormat.scala from drizzle-spark (Apache License 2.0)

package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration

  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
}
Example 2
Source File: CarbonTaskCompletionListener.scala from carbondata (Apache License 2.0)

package org.apache.spark.sql.carbondata.execution.datasources.tasklisteners

import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext}
import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.RecordReaderIterator
import org.apache.spark.util.TaskCompletionListener

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.memory.UnsafeMemoryManager
import org.apache.carbondata.core.util.{DataTypeUtil, ThreadLocalTaskInfo}
import org.apache.carbondata.hadoop.internal.ObjectArrayWritable

trait CarbonCompactionTaskCompletionListener extends TaskCompletionListener

case class CarbonQueryTaskCompletionListenerImpl(iter: RecordReaderIterator[InternalRow],
    freeMemory: Boolean = false) extends CarbonQueryTaskCompletionListener {
  override def onTaskCompletion(context: TaskContext): Unit = {
    if (iter != null) {
      try {
        iter.close()
      } catch {
        case e: Exception =>
          LogServiceFactory.getLogService(this.getClass.getCanonicalName).error(e)
      }
    }
    if (freeMemory) {
      UnsafeMemoryManager.INSTANCE
        .freeMemoryAll(ThreadLocalTaskInfo.getCarbonTaskInfo.getTaskId)
      ThreadLocalTaskInfo.clearCarbonTaskInfo()
    }
    DataTypeUtil.clearFormatter()
  }
}

case class CarbonLoadTaskCompletionListenerImpl(
    recordWriter: RecordWriter[NullWritable, ObjectArrayWritable],
    taskAttemptContext: TaskAttemptContext)
  extends CarbonLoadTaskCompletionListener {

  override def onTaskCompletion(context: TaskContext): Unit = {
    try {
      recordWriter.close(taskAttemptContext)
    } finally {
      UnsafeMemoryManager.INSTANCE
        .freeMemoryAll(ThreadLocalTaskInfo.getCarbonTaskInfo.getTaskId)
      ThreadLocalTaskInfo.clearCarbonTaskInfo()
      DataTypeUtil.clearFormatter()
    }
  }
}
Example 3
Source File: IOCommon.scala from Swallow (Apache License 2.0)

package com.intel.hibench.sparkbench.common

import java.io.{File, FileInputStream, IOException, InputStreamReader}
import java.util.Properties

import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkException}

import scala.collection.JavaConversions._
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

class IOCommon(val sc: SparkContext) {

  def load[T: ClassTag: TypeTag](filename: String, force_format: Option[String] = None) = {
    val input_format = force_format.getOrElse(
      IOCommon.getProperty("sparkbench.inputformat").getOrElse("Text"))
    input_format match {
      case "Text" =>
        sc.textFile(filename)
      case "Sequence" =>
        sc.sequenceFile[NullWritable, Text](filename).map(_._2.toString)
      case _ =>
        throw new UnsupportedOperationException(s"Unknown input format: $input_format")
    }
  }

  def save(filename: String, data: RDD[_], prefix: String) = {
    val output_format = IOCommon.getProperty(prefix).getOrElse("Text")
    val output_format_codec =
      loadClassByName[CompressionCodec](IOCommon.getProperty(prefix + ".codec"))
    output_format match {
      case "Text" =>
        if (output_format_codec.isEmpty) data.saveAsTextFile(filename)
        else data.saveAsTextFile(filename, output_format_codec.get)
      case "Sequence" =>
        val sequence_data = data.map(x => (NullWritable.get(), new Text(x.toString)))
        if (output_format_codec.isEmpty) {
          sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename)
        } else {
          sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename,
            output_format_codec.get)
        }
      case _ =>
        throw new UnsupportedOperationException(s"Unknown output format: $output_format")
    }
  }

  def save(filename: String, data: RDD[_]): Unit = save(filename, data, "sparkbench.outputformat")

  private def loadClassByName[T](name: Option[String]) = {
    if (!name.isEmpty) Some(Class.forName(name.get)
      .newInstance.asInstanceOf[T].getClass) else None
  }

  private def callMethod[T, R](obj: T, method_name: String) =
    obj.getClass.getMethod(method_name).invoke(obj).asInstanceOf[R]
}

object IOCommon {
  private val sparkbench_conf: HashMap[String, String] =
    getPropertiesFromFile(System.getenv("SPARKBENCH_PROPERTIES_FILES"))

  def getPropertiesFromFile(filenames: String): HashMap[String, String] = {
    val result = new HashMap[String, String]
    filenames.split(',').filter(_.stripMargin.length > 0).foreach { filename =>
      val file = new File(filename)
      require(file.exists, s"Properties file $file does not exist")
      require(file.isFile, s"Properties file $file is not a normal file")

      val inReader = new InputStreamReader(new FileInputStream(file), "UTF-8")
      try {
        val properties = new Properties()
        properties.load(inReader)
        result ++= properties.stringPropertyNames()
          .map(k => (k, properties(k).trim)).toMap
      } catch {
        case e: IOException =>
          val message = s"Failed when loading Sparkbench properties file $file"
          throw new SparkException(message, e)
      } finally {
        inReader.close()
      }
    }
    result.filter { case (key, value) => value.toLowerCase != "none" }
  }

  def getProperty(key: String): Option[String] = sparkbench_conf.get(key)

  def dumpProperties(): Unit = sparkbench_conf
    .foreach { case (key, value) => println(s"$key\t\t$value") }
}
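A hedged usage sketch of the helper above (the HDFS paths and the forced "Sequence" format are hypothetical): when the format resolves to "Sequence", every record is paired with the NullWritable singleton before being written.

import com.intel.hibench.sparkbench.common.IOCommon
import org.apache.spark.{SparkConf, SparkContext}

object IOCommonUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("iocommon-usage").setMaster("local[*]"))
    val io = new IOCommon(sc)

    // Load either a text file or a NullWritable/Text SequenceFile, depending on
    // the sparkbench.inputformat property (here forced to "Sequence").
    val records = io.load[String]("hdfs:///tmp/bench/input", force_format = Some("Sequence"))

    // Save through the configured output format; with "Sequence" each record
    // becomes a (NullWritable.get(), Text) pair on disk.
    io.save("hdfs:///tmp/bench/output", records.map(_.toUpperCase))

    sc.stop()
  }
}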
Example 4
Source File: TensorflowRelation.scala from ecosystem (Apache License 2.0)

package org.tensorflow.spark.datasources.tfrecords

import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, SQLContext, SparkSession}
import org.tensorflow.example.{SequenceExample, Example}
import org.tensorflow.hadoop.io.TFRecordFileInputFormat
import org.tensorflow.spark.datasources.tfrecords.serde.DefaultTfRecordRowDecoder

case class TensorflowRelation(options: Map[String, String], customSchema: Option[StructType] = None)
    (@transient val session: SparkSession) extends BaseRelation with TableScan {

  // Import of TFRecords as a DataFrame happens here
  lazy val (tfRdd, tfSchema) = {
    val rdd = session.sparkContext.newAPIHadoopFile(options("path"), classOf[TFRecordFileInputFormat],
      classOf[BytesWritable], classOf[NullWritable])

    val recordType = options.getOrElse("recordType", "Example")

    recordType match {
      case "Example" =>
        val exampleRdd = rdd.map { case (bytesWritable, nullWritable) =>
          Example.parseFrom(bytesWritable.getBytes)
        }
        val finalSchema = customSchema.getOrElse(TensorFlowInferSchema(exampleRdd))
        val rowRdd = exampleRdd.map(example => DefaultTfRecordRowDecoder.decodeExample(example, finalSchema))
        (rowRdd, finalSchema)
      case "SequenceExample" =>
        val sequenceExampleRdd = rdd.map { case (bytesWritable, nullWritable) =>
          SequenceExample.parseFrom(bytesWritable.getBytes)
        }
        val finalSchema = customSchema.getOrElse(TensorFlowInferSchema(sequenceExampleRdd))
        val rowRdd = sequenceExampleRdd.map(example =>
          DefaultTfRecordRowDecoder.decodeSequenceExample(example, finalSchema))
        (rowRdd, finalSchema)
      case _ =>
        throw new IllegalArgumentException(
          s"Unsupported recordType ${recordType}: recordType can be Example or SequenceExample")
    }
  }

  override def sqlContext: SQLContext = session.sqlContext

  override def schema: StructType = tfSchema

  override def buildScan(): RDD[Row] = tfRdd
}
Example 5
Source File: LasOutputWriter.scala from spark-iqmulus (Apache License 2.0)

package fr.ign.spark.iqmulus.las

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext }
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class LasOutputWriter(
  name: String,
  context: TaskAttemptContext,
  dataSchema: StructType,
  formatOpt: Option[Byte] = None,
  version: Version = Version(),
  offset: Array[Double] = Array(0F, 0F, 0F),
  scale: Array[Double] = Array(0.01F, 0.01F, 0.01F)
) extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile("/1.pdr")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private val pmin = Array.fill[Double](3)(Double.PositiveInfinity)
  private val pmax = Array.fill[Double](3)(Double.NegativeInfinity)
  private val countByReturn = Array.fill[Long](15)(0)
  private def count = countByReturn.sum

  private val format = formatOpt.getOrElse(LasHeader.formatFromSchema(dataSchema))

  // todo, extra bytes
  private val schema = LasHeader.schema(format)
  private def header =
    new LasHeader(name, format, count, pmin, pmax, scale, offset, countByReturn)

  private val recordWriter =
    new RowOutputStream(new DataOutputStream(file), littleEndian = true, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)

    // gather statistics for the header
    val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble
    val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble
    val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble
    val ret = row.getAs[Byte]("flags") & 0x3
    countByReturn(ret) += 1
    pmin(0) = Math.min(pmin(0), x)
    pmin(1) = Math.min(pmin(1), y)
    pmin(2) = Math.min(pmin(2), z)
    pmax(0) = Math.max(pmax(0), x)
    pmax(1) = Math.max(pmax(1), y)
    pmax(2) = Math.max(pmax(2), z)
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile("/0.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    header.write(dos)
    dos.close

    // copy header and pdr to a final las file (1 per split)
    org.apache.hadoop.fs.FileUtil.copyMerge(
      fs, getDefaultWorkFile("/"),
      fs, getDefaultWorkFile(".las"),
      true, context.getConfiguration, ""
    )
  }
}
Example 6
Source File: PlyOutputWriter.scala from spark-iqmulus (Apache License 2.0)

package fr.ign.spark.iqmulus.ply

import org.apache.spark.sql.types._
import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext, JobContext }
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import java.io.DataOutputStream
import org.apache.spark.sql.sources.OutputWriter
import org.apache.hadoop.io.{ NullWritable, BytesWritable }
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.fs.Path
import java.text.NumberFormat
import org.apache.spark.sql.{ Row, SQLContext, sources }
import fr.ign.spark.iqmulus.RowOutputStream

class PlyOutputWriter(
  name: String,
  context: TaskAttemptContext,
  dataSchema: StructType,
  element: String,
  littleEndian: Boolean
) extends OutputWriter {

  private val file = {
    val path = getDefaultWorkFile(s".ply.$element")
    val fs = path.getFileSystem(context.getConfiguration)
    fs.create(path)
  }

  private var count = 0L

  // strip out ids
  private val schema = StructType(dataSchema.filterNot { Seq("fid", "pid") contains _.name })

  private val recordWriter =
    new RowOutputStream(new DataOutputStream(file), littleEndian, schema, dataSchema)

  def getDefaultWorkFile(extension: String): Path = {
    val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID")
    val taskAttemptId: TaskAttemptID = context.getTaskAttemptID
    val split = taskAttemptId.getTaskID.getId
    new Path(name, f"$split%05d-$uniqueWriteJobId$extension")
  }

  override def write(row: Row): Unit = {
    recordWriter.write(row)
    count += 1
  }

  override def close(): Unit = {
    recordWriter.close

    // write header
    val path = getDefaultWorkFile(".ply.header")
    val fs = path.getFileSystem(context.getConfiguration)
    val dos = new java.io.DataOutputStream(fs.create(path))
    val header = new PlyHeader(path.toString, littleEndian, Map(element -> ((count, schema))))
    header.write(dos)
    dos.close
  }
}
Example 7
Source File: ReadingWritingData.scala from Spark-RSVD (Apache License 2.0)

package com.criteo.rsvd

import java.nio.ByteBuffer

import com.esotericsoftware.kryo.Kryo
import com.typesafe.scalalogging.slf4j.StrictLogging
import de.javakaffee.kryoserializers.UnmodifiableCollectionsSerializer
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
import org.apache.spark.{SparkConf, SparkContext}

import scala.reflect.ClassTag

object ReadingWritingData extends StrictLogging {

  def getInputDataSizeMB(inputPathPattern: String, sc: SparkContext): Int = {
    val fs = FileSystem.get(sc.hadoopConfiguration)
    val path = new Path(inputPathPattern)
    (fs.globStatus(path).map(f => f.getLen).sum / 1024 / 1024).toInt
  }

  def loadMatrixEntries(inputPath: String,
                        singlePartitionSizeMB: Int,
                        sc: SparkContext): RDD[MatrixEntry] = {
    logger.info(s"Input matrix path: $inputPath")
    val inputDataSizeMB = getInputDataSizeMB(inputPath + "

  def makeRddFromKryoFile[T: ClassTag](
      sc: SparkContext,
      path: String,
      minPartitionsOpt: Option[Int] = None): RDD[T] = {
    val minPartitions = minPartitionsOpt.getOrElse(sc.defaultMinPartitions)
    val serializer = new KryoSerializer(sc.getConf)
    sc.sequenceFile(path, classOf[NullWritable], classOf[BytesWritable], minPartitions)
      .mapPartitions { it =>
        val instance = serializer.newInstance()
        it.flatMap { case (_, v) =>
          instance.deserialize[Array[T]](ByteBuffer.wrap(v.getBytes))
        }
      }
  }

  object RandomizedSVDKryoRegistrator extends KryoRegistrator {
    def registerClasses(kryo: Kryo): Unit = {
      UnmodifiableCollectionsSerializer.registerSerializers(kryo)
      kryo.register(classOf[MatrixEntry])
      kryo.register(classOf[Array[MatrixEntry]])
    }
  }

  def appendBasicRegistratorToSparkConf(sparkConf: SparkConf): SparkConf =
    appendRegistratorToSparkConf(sparkConf, RandomizedSVDKryoRegistrator.getClass.getName)

  def appendRegistratorToSparkConf(sparkConf: SparkConf,
                                   registratorName: String): SparkConf = {
    val oldValue = sparkConf.get("spark.kryo.registrator", "")
    if (oldValue == "") {
      sparkConf.set("spark.kryo.registrator", registratorName)
    } else {
      sparkConf.set("spark.kryo.registrator", oldValue + "," + registratorName)
    }
  }
}
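A short, hypothetical sketch of reading the Kryo-encoded SequenceFiles back (the path is made up, and it assumes the files store NullWritable keys with Kryo-serialized Array[MatrixEntry] values, which is the layout makeRddFromKryoFile expects):

import com.criteo.rsvd.ReadingWritingData
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.{SparkConf, SparkContext}

object KryoSequenceFileRead {
  def main(args: Array[String]): Unit = {
    // Point spark.kryo.registrator at the project's registrator before creating the context.
    val conf = ReadingWritingData.appendBasicRegistratorToSparkConf(
      new SparkConf().setAppName("kryo-read").setMaster("local[*]"))
    val sc = new SparkContext(conf)

    // Each SequenceFile record is (NullWritable, BytesWritable); the bytes hold a
    // Kryo-serialized Array[MatrixEntry] that is flattened into the resulting RDD.
    val entries = ReadingWritingData.makeRddFromKryoFile[MatrixEntry](sc, "/tmp/matrix-entries")
    println(entries.count())

    sc.stop()
  }
}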
Example 8
Source File: TimelyImplicits.scala from Mastering-Spark-for-Data-Science (MIT License)

package io.gzet.timeseries.timely

import io.gzet.utils.spark.accumulo.AccumuloConfig
import org.apache.accumulo.core.client.ClientConfiguration
import org.apache.accumulo.core.client.mapreduce.{AbstractInputFormat, InputFormatBase}
import org.apache.accumulo.core.client.security.tokens.PasswordToken
import org.apache.accumulo.core.data.Range
import org.apache.accumulo.core.security.Authorizations
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.JavaConversions._

object TimelyImplicits {

  implicit class AccumuloReader(sc: SparkContext) {

    def timely(accumuloConfig: AccumuloConfig, rowPrefix: Option[String] = None): RDD[Metric] = {

      val conf = sc.hadoopConfiguration
      val job = Job.getInstance(conf)

      val clientConfig: ClientConfiguration = new ClientConfiguration()
        .withInstance(accumuloConfig.accumuloInstance)
        .withZkHosts(accumuloConfig.zookeeperHosts)

      val authorizations =
        new Authorizations(List("INTERNAL", "CONFIDENTIAL", "SECRET").map(_.getBytes()))

      AbstractInputFormat.setConnectorInfo(job, accumuloConfig.accumuloUser,
        new PasswordToken(accumuloConfig.accumuloPassword))
      AbstractInputFormat.setZooKeeperInstance(job, clientConfig)
      AbstractInputFormat.setScanAuthorizations(job, authorizations)
      InputFormatBase.setInputTableName(job, "timely.metrics")

      if (rowPrefix.isDefined) {
        val ranges = List(Range.prefix(rowPrefix.get))
        InputFormatBase.setRanges(job, ranges)
      }

      val rdd = sc.newAPIHadoopRDD(job.getConfiguration,
        classOf[AccumuloTimelyInputFormat],
        classOf[NullWritable],
        classOf[TimelyWritable]
      ) values

      rdd map { timely =>
        val Array(tagK, tagV) = timely.getMetricType.split("=", 2)
        Metric(
          timely.getMetric,
          timely.getTime,
          timely.getMetricValue,
          Map(tagK -> tagV)
        )
      }
    }
  }
}
Example 9
Source File: AccumuloReader.scala from Mastering-Spark-for-Data-Science (MIT License)

package io.gzet.community.accumulo

import org.apache.accumulo.core.client.{IteratorSetting, ClientConfiguration}
import org.apache.accumulo.core.client.mapreduce.{AccumuloInputFormat, AbstractInputFormat, InputFormatBase}
import org.apache.accumulo.core.client.security.tokens.PasswordToken
import org.apache.accumulo.core.security.Authorizations
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.language.postfixOps

class AccumuloReader(config: AccumuloConfig) extends Serializable {

  def read(sc: SparkContext, accumuloTable: String,
           authorization: Option[String] = None): RDD[EdgeWritable] = {

    val conf = sc.hadoopConfiguration
    val job = Job.getInstance(conf)

    val clientConfig: ClientConfiguration = new ClientConfiguration()
      .withInstance(config.accumuloInstance)
      .withZkHosts(config.zookeeperHosts)

    AbstractInputFormat.setConnectorInfo(job, config.accumuloUser,
      new PasswordToken(config.accumuloPassword))
    AbstractInputFormat.setZooKeeperInstance(job, clientConfig)
    if (authorization.isDefined)
      AbstractInputFormat.setScanAuthorizations(job, new Authorizations(authorization.get))

    val is = new IteratorSetting(
      1,
      "summingCombiner",
      "org.apache.accumulo.core.iterators.user.SummingCombiner"
    )

    is.addOption("all", "")
    is.addOption("columns", "associated")
    is.addOption("lossy", "TRUE")
    is.addOption("type", "STRING")

    InputFormatBase.addIterator(job, is)
    InputFormatBase.setInputTableName(job, accumuloTable)

    sc.newAPIHadoopRDD(job.getConfiguration,
      classOf[AccumuloGraphxInputFormat],
      classOf[NullWritable],
      classOf[EdgeWritable]
    ) values
  }
}
Example 10
Source File: RecordIOOutputFormatTests.scala from sagemaker-spark (Apache License 2.0)

package com.amazonaws.services.sagemaker.sparksdk.protobuf

import java.io.ByteArrayOutputStream

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path}
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.mockito.Matchers.any
import org.mockito.Mockito.{verify, when}
import org.scalatest.{BeforeAndAfter, FlatSpec}
import org.scalatest.mock.MockitoSugar

import com.amazonaws.services.sagemaker.sparksdk.protobuf.RecordIOOutputFormat.SageMakerProtobufRecordWriter

class RecordIOOutputFormatTests extends FlatSpec with MockitoSugar with BeforeAndAfter {

  var sagemakerProtobufRecordWriter: SageMakerProtobufRecordWriter = _
  var mockOutputStream: FSDataOutputStream = _
  var byteArrayOutputStream: ByteArrayOutputStream = _
  var mockTaskAttemptContext: TaskAttemptContext = _
  var mockPath: Path = _
  var mockFileSystem: FileSystem = _

  before {
    byteArrayOutputStream = new ByteArrayOutputStream()
    mockOutputStream = mock[FSDataOutputStream]
    sagemakerProtobufRecordWriter = new SageMakerProtobufRecordWriter(mockOutputStream)
    mockTaskAttemptContext = mock[TaskAttemptContext]
    mockPath = mock[Path]
    mockFileSystem = mock[FileSystem]
  }

  it should "write an empty array of bytes" in {
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)

    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)
    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes" in {
    val byteArray = Array[Byte](0, 0, 0, 0)
    byteArrayOutputStream.write(byteArray)
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)

    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)
    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes, padding as necessary" in {
    byteArrayOutputStream.write(5)
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)

    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)
    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "write an array of bytes, padding only as much as necessary" in {
    byteArrayOutputStream.write(Array[Byte](0, 0, 0, 0, 0))
    val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray)

    val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes)
    sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable)

    verify(mockOutputStream).write(bytes, 0, bytes.length)
  }

  it should "create a record writer from a FSDataOutputStream created by the filesystem" in {
    val mockTaskAttemptContext = mock[TaskAttemptContext]
    val mockPath = mock[Path]
    val mockFileSystem = mock[FileSystem]
    when(mockPath.getFileSystem(any[Configuration])).thenReturn(mockFileSystem)

    new RecordIOOutputFormat() {
      override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
        mockPath
      }
    }.getRecordWriter(mockTaskAttemptContext)

    verify(mockFileSystem).create(mockPath, true)
  }
}
Example 11
Source File: SageMakerProtobufWriter.scala from sagemaker-spark (Apache License 2.0)

package com.amazonaws.services.sagemaker.sparksdk.protobuf

import java.io.ByteArrayOutputStream

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext}

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.execution.datasources.OutputWriter
import org.apache.spark.sql.types.StructType

  def write(row: Row): Unit = {
    val labelColumnName = options.getOrElse("labelColumnName", "label")
    val featuresColumnName = options.getOrElse("featuresColumnName", "features")

    val record = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Some(labelColumnName))
    record.writeTo(byteArrayOutputStream)

    recordWriter.write(NullWritable.get(), new BytesWritable(byteArrayOutputStream.toByteArray))
    byteArrayOutputStream.reset()
  }

  override def close(): Unit = {
    recordWriter.close(context)
  }
}
Example 12
Source File: OrcOutputWriter.scala from Spark-2.3.1 (Apache License 2.0)

package org.apache.spark.sql.execution.datasources.orc

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.orc.mapred.OrcStruct
import org.apache.orc.mapreduce.OrcOutputFormat

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.OutputWriter
import org.apache.spark.sql.types._

private[orc] class OrcOutputWriter(
    path: String,
    dataSchema: StructType,
    context: TaskAttemptContext)
  extends OutputWriter {

  private[this] val serializer = new OrcSerializer(dataSchema)

  private val recordWriter = {
    new OrcOutputFormat[OrcStruct]() {
      override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
        new Path(path)
      }
    }.getRecordWriter(context)
  }

  override def write(row: InternalRow): Unit = {
    recordWriter.write(NullWritable.get(), serializer.serialize(row))
  }

  override def close(): Unit = {
    recordWriter.close(context)
  }
}
Example 13
Source File: Job.scala from spark-avro-compactor (Apache License 2.0)

package ie.ianduffy.spark.avro.compactor

import ie.ianduffy.spark.avro.compactor.Utils._
import io.confluent.kafka.schemaregistry.client.{SchemaMetadata, SchemaRegistryClient}
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.avro.mapred.AvroKey
import org.apache.avro.mapreduce.AvroKeyOutputFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.NullWritable
import org.apache.spark.sql.SparkSession
import org.slf4j.LoggerFactory

object Job {

  private val log = LoggerFactory.getLogger(Job.getClass.getName.replace("$", ""))

  def run(spark: SparkSession, schemaRegistry: SchemaRegistryClient, jobConfig: JobConfig): Unit = {
    val schema: Schema = {
      val latestSchemaMetadata: SchemaMetadata =
        schemaRegistry.getLatestSchemaMetadata(jobConfig.schemaRegistrySubject)
      val id: Int = latestSchemaMetadata.getId
      schemaRegistry.getById(id)
    }

    implicit val sparkConfig: Configuration = spark.sparkContext.hadoopConfiguration
    sparkConfig.set("avro.schema.input.key", schema.toString())
    sparkConfig.set("avro.schema.output.key", schema.toString())

    val inputPath: Path = new Path(jobConfig.input)
    val outputPath: Path = new Path(jobConfig.output)

    val fs: FileSystem = inputPath.getFileSystem(sparkConfig)

    // avoid raising org.apache.hadoop.mapred.FileAlreadyExistsException
    if (jobConfig.overrideOutput) fs.delete(outputPath, true)

    // for filesystems with the s3 prefix the default is 64MB and can be overridden via fs.s3.block.size;
    // for the s3a prefix the default is 32MB and can be overridden via fs.s3a.block.size
    val outputBlocksize: Long = fs.getDefaultBlockSize(outputPath)

    // where inputPath is of the form s3://some/path
    val inputPathSize: Long = fs.getContentSummary(inputPath).getSpaceConsumed

    val numPartitions: Int =
      Math.max(1, Math.floor((inputPathSize / CompressionRatio.AVRO_SNAPPY) / outputBlocksize).toInt)

    log.debug(
      s"""outputBlocksize: $outputBlocksize
         | inputPathSize: $inputPathSize
         | splitSize: $numPartitions
       """.stripMargin)

    val rdd = readHadoopFile(spark, inputPath.toString)

    rdd.coalesce(numPartitions)
      .saveAsNewAPIHadoopFile(
        outputPath.toString,
        classOf[AvroKey[GenericRecord]],
        classOf[NullWritable],
        classOf[AvroKeyOutputFormat[GenericRecord]],
        sparkConfig
      )
  }
}
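The partition count above is just the input size, adjusted by the project's AVRO_SNAPPY compression-ratio constant, divided by the output block size, with a floor of 1. A standalone sketch with made-up numbers (the 1.6 ratio and the sizes are hypothetical, not the values used by the project):

object PartitionSizingSketch {
  def main(args: Array[String]): Unit = {
    val avroSnappyRatio = 1.6                    // hypothetical compression-ratio constant
    val inputPathSize = 10L * 1024 * 1024 * 1024 // 10 GiB of input, per getContentSummary
    val outputBlocksize = 128L * 1024 * 1024     // 128 MiB output block size

    // Same shape as the formula in Job.run: adjusted size divided by block size, at least 1.
    val numPartitions =
      Math.max(1, Math.floor((inputPathSize / avroSnappyRatio) / outputBlocksize).toInt)

    println(numPartitions) // prints 50 for these numbers
  }
}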
Example 14
Source File: Utils.scala from spark-avro-compactor (Apache License 2.0)

package ie.ianduffy.spark.avro.compactor

import org.apache.avro.generic.GenericRecord
import org.apache.avro.mapred.AvroKey
import org.apache.avro.mapreduce.AvroKeyInputFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.NullWritable
import org.apache.spark.sql.SparkSession

object Utils {

  def createSparkSession: SparkSession =
    SparkSession
      .builder
      .appName("avro-compactor")
      .getOrCreate

  def readHadoopFile(spark: SparkSession, path: String)(implicit sparkConfig: Configuration) = {
    spark.sparkContext.newAPIHadoopFile(
      path,
      classOf[AvroKeyInputFormat[GenericRecord]],
      classOf[AvroKey[GenericRecord]],
      classOf[NullWritable],
      sparkConfig
    )
  }
}
Example 15
Source File: BigQueryDataFrame.scala from spark-bigquery (Apache License 2.0)

package com.samelamin.spark.bigquery

import com.google.api.services.bigquery.model.{TableReference, TableSchema}
import com.google.cloud.hadoop.io.bigquery._
import com.google.gson._
import com.samelamin.spark.bigquery.converters.{BigQueryAdapter, SchemaConverters}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{LongWritable, NullWritable}
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
import org.apache.spark.sql.DataFrame
import org.slf4j.LoggerFactory

import scala.util.Random

  def saveAsBigQueryTable(fullyQualifiedOutputTableId: String,
                          isPartitionedByDay: Boolean = false,
                          timePartitionExpiration: Long = 0,
                          writeDisposition: WriteDisposition.Value = null,
                          createDisposition: CreateDisposition.Value = null): Unit = {
    val destinationTable = BigQueryStrings.parseTableReference(fullyQualifiedOutputTableId)
    val bigQuerySchema = SchemaConverters.SqlToBQSchema(adaptedDf)
    val gcsPath = writeDFToGoogleStorage(adaptedDf, destinationTable, bigQuerySchema)
    bq.load(destinationTable,
      bigQuerySchema,
      gcsPath,
      isPartitionedByDay,
      timePartitionExpiration,
      writeDisposition,
      createDisposition)
    delete(new Path(gcsPath))
  }

  def writeDFToGoogleStorage(adaptedDf: DataFrame,
                             destinationTable: TableReference,
                             bqSchema: TableSchema): String = {
    val tableName = BigQueryStrings.toString(destinationTable)

    BigQueryConfiguration.configureBigQueryOutput(hadoopConf, tableName, bqSchema.toPrettyString())
    hadoopConf.set("mapreduce.job.outputformat.class", classOf[BigQueryOutputFormat[_, _]].getName)
    val bucket = self.sparkSession.conf.get(BigQueryConfiguration.GCS_BUCKET_KEY)
    val temp = s"spark-bigquery-${System.currentTimeMillis()}=${Random.nextInt(Int.MaxValue)}"
    val gcsPath = s"gs://$bucket/hadoop/tmp/spark-bigquery/$temp"
    if (hadoopConf.get(BigQueryConfiguration.TEMP_GCS_PATH_KEY) == null) {
      hadoopConf.set(BigQueryConfiguration.TEMP_GCS_PATH_KEY, gcsPath)
    }

    logger.info(s"Loading $gcsPath into $tableName")
    adaptedDf
      .toJSON
      .rdd
      .map(json => (null, jsonParser.parse(json)))
      .saveAsNewAPIHadoopFile(gcsPath,
        classOf[GsonBigQueryInputFormat],
        classOf[LongWritable],
        classOf[TextOutputFormat[NullWritable, JsonObject]],
        hadoopConf)
    gcsPath
  }

  private def delete(path: Path): Unit = {
    val fs = FileSystem.get(path.toUri, hadoopConf)
    fs.delete(path, true)
  }
}
Example 16
Source File: DefaultSource.scala from spark-tensorflow-connector (Apache License 2.0)

package org.trustedanalytics.spark.datasources.tensorflow

import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.sql._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.tensorflow.hadoop.io.TFRecordFileOutputFormat
import org.trustedanalytics.spark.datasources.tensorflow.serde.DefaultTfRecordRowEncoder

  override def shortName(): String = "tensorflow"

  // Writes a DataFrame as TensorFlow Records
  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {

    val path = parameters("path")

    // Export the DataFrame as TFRecords
    val features = data.rdd.map(row => {
      val example = DefaultTfRecordRowEncoder.encodeTfRecord(row)
      (new BytesWritable(example.toByteArray), NullWritable.get())
    })
    features.saveAsNewAPIHadoopFile[TFRecordFileOutputFormat](path)

    TensorflowRelation(parameters)(sqlContext.sparkSession)
  }

  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String],
                              schema: StructType): BaseRelation = {
    TensorflowRelation(parameters, Some(schema))(sqlContext.sparkSession)
  }

  // Reads TensorFlow Records into a DataFrame
  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String]): TensorflowRelation = {
    TensorflowRelation(parameters)(sqlContext.sparkSession)
  }
}
Example 17
Source File: WholeFileReaderSuite.scala from magellan (Apache License 2.0)

package magellan.mapreduce

import magellan.TestSparkContext
import org.apache.hadoop.io.{NullWritable, Text}
import org.scalatest.FunSuite

class WholeFileReaderSuite extends FunSuite with TestSparkContext {

  test("Read Whole File") {
    val path = this.getClass.getClassLoader.getResource("geojson/point").getPath
    val data = sc.newAPIHadoopFile(
      path,
      classOf[WholeFileInputFormat],
      classOf[NullWritable],
      classOf[Text]
    )
    assert(data.count() === 1)
  }
}
Example 18
Source File: WholeFileReader.scala from magellan (Apache License 2.0)

package magellan.mapreduce

import java.io.InputStream

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.hadoop.io.compress.{CodecPool, CompressionCodecFactory, Decompressor}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}

class WholeFileReader extends RecordReader[NullWritable, Text] {

  private val key = NullWritable.get()
  private val value = new Text()
  private var split: FileSplit = _
  private var conf: Configuration = _
  private var path: Path = _
  private var done: Boolean = false

  override def getProgress: Float = ???

  override def nextKeyValue(): Boolean = {
    if (done) {
      false
    } else {
      val fs = path.getFileSystem(conf)
      var is: FSDataInputStream = null
      var in: InputStream = null
      var decompressor: Decompressor = null
      try {
        is = fs.open(split.getPath)
        val codec = new CompressionCodecFactory(conf).getCodec(path)
        if (codec != null) {
          decompressor = CodecPool.getDecompressor(codec)
          in = codec.createInputStream(is, decompressor)
        } else {
          in = is
        }
        val result = IOUtils.toByteArray(in)
        value.clear()
        value.set(result)
        done = true
        true
      } finally {
        if (in != null) {
          IOUtils.closeQuietly(in)
        }
        if (decompressor != null) {
          CodecPool.returnDecompressor(decompressor)
        }
      }
    }
  }

  override def getCurrentValue: Text = value

  override def initialize(inputSplit: InputSplit,
                          taskAttemptContext: TaskAttemptContext): Unit = {
    this.split = inputSplit.asInstanceOf[FileSplit]
    this.conf = MapReduceUtils.getConfigurationFromContext(taskAttemptContext)
    this.path = this.split.getPath
  }

  override def getCurrentKey: NullWritable = key

  override def close() {}
}
Example 19
Source File: GeoJSONRelation.scala from magellan (Apache License 2.0)

package magellan

import magellan.mapreduce.WholeFileInputFormat
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.json4s._
import org.json4s.jackson.JsonMethods._

case class GeoJSONRelation(
    path: String,
    parameters: Map[String, String])
    (@transient val sqlContext: SQLContext) extends SpatialRelation {

  protected override def _buildScan(): RDD[Array[Any]] = {
    val conf = sc.hadoopConfiguration
    FileSystem.getLocal(conf)
    sc.newAPIHadoopFile(
      path,
      classOf[WholeFileInputFormat],
      classOf[NullWritable],
      classOf[Text]).flatMap {
      case (k, v) =>
        val line = v.toString()
        parseShapeWithMeta(line)
    }.map {
      case (shape: Shape, meta: Option[Map[String, String]]) =>
        Array(shape, meta)
    }
  }

  private def parseShapeWithMeta(line: String) = {
    val tree = parse(line)
    implicit val formats = org.json4s.DefaultFormats
    val result = tree.extract[GeoJSON]
    result.features.flatMap { f =>
      f.geometry.shapes.map(shape => (shape, f.properties))
    }
  }
}
Example 20
Source File: NodesWithGeohash.scala from schedoscope (Apache License 2.0)

package schedoscope.example.osm.processed

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, LazyOutputFormat, TextOutputFormat}
import org.schedoscope.dsl.View
import org.schedoscope.dsl.storageformats.TextFile
import org.schedoscope.dsl.transformations.MapreduceTransformation
import schedoscope.example.osm.mapreduce.GeohashMapper

case class NodesWithGeohash() extends View {

  val id = fieldOf[Long]("The node ID")
  val version = fieldOf[Int]("OSM version - ignored")
  val userId = fieldOf[Int]("OSM user ID - ignored")
  val tstamp = fieldOf[String]("Timestamp of node creation")
  val longitude = fieldOf[Double]("Longitude of the node")
  val latitude = fieldOf[Double]("Latitude of the node")
  val geohash = fieldOf[String]("A geoencoded area string")

  val stageNodes = dependsOn { () =>
    schedoscope.example.osm.stage.Nodes()
      .affects(n => Seq(
        n.id -> id,
        n.version -> version,
        n.userId -> userId,
        n.tstamp -> tstamp,
        n.longitude -> longitude,
        n.longitude -> geohash,
        n.latitude -> latitude,
        n.latitude -> geohash
      ))
  }

  transformVia(() =>
    MapreduceTransformation(
      this,
      (conf: Map[String, Any]) => {
        val job = Job.getInstance
        LazyOutputFormat.setOutputFormatClass(job, classOf[TextOutputFormat[Text, NullWritable]])
        job.setJobName(this.urlPath)
        job.setJarByClass(classOf[GeohashMapper])
        job.setMapperClass(classOf[GeohashMapper])
        job.setNumReduceTasks(0)
        FileInputFormat.setInputPaths(job, conf.get("input_path").get.toString)
        FileOutputFormat.setOutputPath(job, new Path(conf.get("output_path").get.toString))
        val cfg = job.getConfiguration()
        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
          cfg.set("mapreduce.job.credentials.binary",
            System.getenv("HADOOP_TOKEN_FILE_LOCATION"))
        }
        job
      }).configureWith(
      Map(
        "input_path" -> stageNodes().fullPath,
        "output_path" -> fullPath)))

  comment("nodes, extended with geohash")

  storedAs(TextFile(fieldTerminator = "\\t", lineTerminator = "\\n"))
}
Example 21
Source File: UnsplittableSequenceFileInputFormatTest.scala from spark-util (Apache License 2.0)

package org.hammerlab.hadoop.splits

import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapred
import org.apache.hadoop.mapred.{ FileInputFormat, JobConf }
import FileInputFormat.setInputPaths
import org.hammerlab.test.Suite
import org.hammerlab.test.resources.File

class UnsplittableSequenceFileInputFormatTest extends Suite {

  test("part files") {
    val ifmt = new UnsplittableSequenceFileInputFormat[NullWritable, NullWritable]
    val jc = new JobConf()
    setInputPaths(jc, File("rdd"))
    val paths =
      ifmt
        .getSplits(jc, 2)
        .map(_.asInstanceOf[mapred.FileSplit])
        .map(FileSplit(_).path)

    paths should be(
      0 to 5 map(
        File("rdd") / PartFileBasename(_)
      )
    )
  }

  test("non-part file error") {
    val ifmt = new UnsplittableSequenceFileInputFormat[NullWritable, NullWritable]
    val jc = new JobConf()
    setInputPaths(jc, File("bad"))
    intercept[IllegalArgumentException] {
      ifmt.getSplits(jc, 2)
    }
    .getMessage should be(s"Bad partition file: error")
  }
}
Example 22
Source File: ExcelOutputWriter.scala from spark-hadoopoffice-ds (Apache License 2.0)

package org.zuinnote.spark.office.excel

import java.math.BigDecimal
import java.sql.Date
import java.sql.Timestamp
import java.text.DateFormat
import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.io.ArrayWritable
import org.apache.hadoop.mapreduce.RecordWriter
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.catalyst.{ CatalystTypeConverters, InternalRow }
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.datasources.OutputWriter
import org.apache.spark.sql.types._

import org.zuinnote.hadoop.office.format.common.dao.SpreadSheetCellDAO
import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration
import org.zuinnote.hadoop.office.format.common.util.msexcel.MSExcelUtil
import org.zuinnote.hadoop.office.format.common.converter.ExcelConverterSimpleSpreadSheetCellDAO
import org.zuinnote.hadoop.office.format.mapreduce._

import org.apache.commons.logging.LogFactory
import org.apache.commons.logging.Log
import java.util.Locale
import java.text.DecimalFormat
import java.text.NumberFormat

// NOTE: This class is instantiated and used on executor side only, no need to be serializable.
private[excel] class ExcelOutputWriter(
    path: String,
    dataSchema: StructType,
    context: TaskAttemptContext,
    options: Map[String, String]) extends OutputWriter {

  def write(row: Row): Unit = {
    // check useHeader
    if (useHeader) {
      val headers = row.schema.fieldNames
      var i = 0
      for (x <- headers) {
        val headerColumnSCD =
          new SpreadSheetCellDAO(x, "", "", MSExcelUtil.getCellAddressA1Format(currentRowNum, i), defaultSheetName)
        recordWriter.write(NullWritable.get(), headerColumnSCD)
        i += 1
      }
      currentRowNum += 1
      useHeader = false
    }
    // for each value in the row
    if (row.size > 0) {
      var currentColumnNum = 0
      val simpleObject = new Array[AnyRef](row.size)
      for (i <- 0 to row.size - 1) { // for each element of the row
        val obj = row.get(i)
        if ((obj.isInstanceOf[Seq[String]]) && (obj.asInstanceOf[Seq[String]].length == 5)) {
          val formattedValue = obj.asInstanceOf[Seq[String]](0)
          val comment = obj.asInstanceOf[Seq[String]](1)
          val formula = obj.asInstanceOf[Seq[String]](2)
          val address = obj.asInstanceOf[Seq[String]](3)
          val sheetName = obj.asInstanceOf[Seq[String]](4)
          simpleObject(i) = new SpreadSheetCellDAO(formattedValue, comment, formula, address, sheetName)
        } else {
          simpleObject(i) = obj.asInstanceOf[AnyRef]
        }
      }
      // convert the row to SpreadSheetCellDAOs
      val spreadSheetCellDAORow =
        simpleConverter.getSpreadSheetCellDAOfromSimpleDataType(simpleObject, defaultSheetName, currentRowNum)
      // write it
      for (x <- spreadSheetCellDAORow) {
        recordWriter.write(NullWritable.get(), x)
      }
    }
    currentRowNum += 1
  }

  override def close(): Unit = {
    recordWriter.close(context)
    currentRowNum = 0
  }
}
Example 23
Source File: OrcOutputWriter.scala from XSQL (Apache License 2.0)

package org.apache.spark.sql.execution.datasources.orc

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.orc.mapred.OrcStruct
import org.apache.orc.mapreduce.OrcOutputFormat

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.OutputWriter
import org.apache.spark.sql.types._

private[orc] class OrcOutputWriter(
    path: String,
    dataSchema: StructType,
    context: TaskAttemptContext)
  extends OutputWriter {

  private[this] val serializer = new OrcSerializer(dataSchema)

  private val recordWriter = {
    new OrcOutputFormat[OrcStruct]() {
      override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
        new Path(path)
      }
    }.getRecordWriter(context)
  }

  override def write(row: InternalRow): Unit = {
    recordWriter.write(NullWritable.get(), serializer.serialize(row))
  }

  override def close(): Unit = {
    recordWriter.close(context)
  }
}
Example 24
Source File: DeltaRecordReaderWrapper.scala from connectors (Apache License 2.0)

package io.delta.hive

import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory
import org.apache.hadoop.io.ArrayWritable
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.Reporter
import org.apache.parquet.hadoop.ParquetInputFormat
import org.slf4j.LoggerFactory

  private def insertPartitionValues(value: ArrayWritable): Unit = {
    val valueArray = value.get()
    var i = 0
    val n = partitionValues.length
    // Using while loop for better performance since this method is called for each row.
    while (i < n) {
      val partition = partitionValues(i)
      // The schema of `valueArray` is the Hive schema, and it's the same as the Delta
      // schema since we have verified it in `DeltaInputFormat`. Hence, the position of a partition
      // column in `valueArray` is the same as its position in the Delta schema.
      valueArray(partition._1) = partition._2
      i += 1
    }
  }
}
Example 25
Source File: DeltaOutputFormat.scala from connectors (Apache License 2.0)

package io.delta.hive

import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.io.{ArrayWritable, NullWritable}
import org.apache.hadoop.mapred.{JobConf, OutputFormat, RecordWriter}
import org.apache.hadoop.util.Progressable

class DeltaOutputFormat extends OutputFormat[NullWritable, ArrayWritable] {

  private def writingNotSupported[T](): T = {
    throw new UnsupportedOperationException(
      "Writing to a Delta table in Hive is not supported. Please use Spark to write.")
  }

  override def getRecordWriter(
      ignored: FileSystem,
      job: JobConf,
      name: String,
      progress: Progressable): RecordWriter[NullWritable, ArrayWritable] = writingNotSupported()

  override def checkOutputSpecs(ignored: FileSystem, job: JobConf): Unit = writingNotSupported()
}
Example 26
Source File: FileLocalityInputFormat.scala from ArchiveSpark (MIT License)

package org.archive.archivespark.sparkling.util

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}

class FileLocalityInputFormat extends FileInputFormat[NullWritable, Text] {

  class FileLocalityRecordReader extends RecordReader[NullWritable, Text] {
    private var filePath: Text = new Text()
    private var read: Boolean = true

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      filePath.set(split.asInstanceOf[FileSplit].getPath.toString)
      read = false
    }

    override def nextKeyValue(): Boolean = {
      if (read) false
      else {
        read = true
        true
      }
    }

    override def getCurrentKey: NullWritable = NullWritable.get
    override def getCurrentValue: Text = filePath
    override def getProgress: Float = if (read) 1.0f else 0.0f
    override def close(): Unit = read = true
  }

  override def isSplitable(context: JobContext, filename: Path): Boolean = false

  override def createRecordReader(split: InputSplit,
                                  context: TaskAttemptContext): RecordReader[NullWritable, Text] =
    new FileLocalityRecordReader
}
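A hedged usage sketch (the input directory is hypothetical): because the format is unsplittable and each record reader emits exactly one record, the resulting RDD has one (NullWritable, file path) pair per file, with Hadoop's locality information preserved for scheduling.

import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.spark.{SparkConf, SparkContext}
import org.archive.archivespark.sparkling.util.FileLocalityInputFormat

object FileLocalityExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("file-locality").setMaster("local[*]"))

    // One record per input file; the value is the file's path, the key is NullWritable.
    val filePaths = sc.newAPIHadoopFile(
      "/tmp/warc-files",
      classOf[FileLocalityInputFormat],
      classOf[NullWritable],
      classOf[Text]
    ).map { case (_, path) => path.toString }

    filePaths.collect().foreach(println)
    sc.stop()
  }
}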
Example 27
Source File: TFRecordInputFormat.scala from BigDL (Apache License 2.0)

package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.fs.FSDataInputStream

class TFRecordInputFormat extends FileInputFormat[BytesWritable, NullWritable] {

  override def createRecordReader(inputSplit: InputSplit, context: TaskAttemptContext):
      RecordReader[BytesWritable, NullWritable] = new RecordReader[BytesWritable, NullWritable] {

    private var inputStream: FSDataInputStream = null
    private var reader: TFRecordIterator = null
    private var length: Long = 0L
    private var begin: Long = 0L
    private var current: Array[Byte] = null

    override def getCurrentKey: BytesWritable = {
      new BytesWritable(current)
    }

    override def getProgress: Float = {
      (inputStream.getPos - begin) / (length + 1e-6f)
    }

    override def nextKeyValue(): Boolean = {
      if (reader.hasNext) {
        current = reader.next()
        true
      } else {
        false
      }
    }

    override def getCurrentValue: NullWritable = {
      NullWritable.get()
    }

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      val conf = context.getConfiguration
      val fileSplit = split.asInstanceOf[FileSplit]
      length = fileSplit.getLength
      begin = fileSplit.getStart

      val file = fileSplit.getPath
      val fs = file.getFileSystem(conf)
      inputStream = fs.open(file, 4096)

      reader = new TFRecordIterator(inputStream)
    }

    override def close(): Unit = {
      inputStream.close()
    }
  }

  override protected def isSplitable(context: JobContext, filename: Path): Boolean = false
}
Example 28
Source File: TFRecordOutputFormat.scala from BigDL (Apache License 2.0)

package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.io.BytesWritable
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.RecordWriter
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

class TFRecordOutputFormat extends FileOutputFormat[BytesWritable, NullWritable] {

  override def getRecordWriter(taskAttemptContext: TaskAttemptContext):
      RecordWriter[BytesWritable, NullWritable] = {
    val conf = taskAttemptContext.getConfiguration
    val file = getDefaultWorkFile(taskAttemptContext, "")
    val fs = file.getFileSystem(conf)

    val bufferSize = 4096
    val outStream = fs.create(file, true, bufferSize)

    val writer = new TFRecordWriter(outStream)

    new RecordWriter[BytesWritable, NullWritable]() {
      override def close(context: TaskAttemptContext): Unit = {
        outStream.close()
      }

      override def write(k: BytesWritable, v: NullWritable): Unit = {
        writer.write(k.getBytes, 0, k.getLength)
      }
    }
  }
}
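A hedged sketch of writing records with this output format (the output path and payload bytes are made up): each record's key carries the bytes, and the NullWritable value is ignored by the writer above.

import com.intel.analytics.bigdl.utils.tf.TFRecordOutputFormat
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.{SparkConf, SparkContext}

object TFRecordWriteExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("tfrecord-write").setMaster("local[*]"))

    // Serialized tf.Example protos would normally go here; plain bytes keep the sketch self-contained.
    val payloads = sc.parallelize(Seq("record-1", "record-2").map(_.getBytes("UTF-8")))

    payloads
      .map(bytes => (new BytesWritable(bytes), NullWritable.get()))
      .saveAsNewAPIHadoopFile(
        "/tmp/tfrecord-out",
        classOf[BytesWritable],
        classOf[NullWritable],
        classOf[TFRecordOutputFormat])

    sc.stop()
  }
}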