org.apache.hadoop.mapred.SequenceFileOutputFormat Scala Examples
The following examples show how to use org.apache.hadoop.mapred.SequenceFileOutputFormat.
Each example is taken from an open-source project; the original project and source file are noted above its code.
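Before the project examples, here is a minimal orientation sketch of the common pattern they all build on: passing SequenceFileOutputFormat to Spark's saveAsHadoopFile for a pair RDD whose key and value types are already Hadoop Writables. The application name, master setting, and output path are placeholders, not taken from any of the projects below.

import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.{SparkConf, SparkContext}

object SequenceFileQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("seqfile-demo").setMaster("local[*]"))

    // A pair RDD whose key/value types are already Hadoop Writables.
    val pairs = sc.parallelize(Seq((1, "a"), (2, "b")))
      .map { case (k, v) => (new IntWritable(k), new Text(v)) }

    // Write the pairs as a SequenceFile through the old (mapred) output format.
    pairs.saveAsHadoopFile(
      "/tmp/seqfile-demo",  // placeholder output path
      classOf[IntWritable],
      classOf[Text],
      classOf[SequenceFileOutputFormat[IntWritable, Text]])

    sc.stop()
  }
}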
Example 1
Source File: SequenceFileRDDFunctions.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.{classTag, ClassTag}

import org.apache.hadoop.io.Writable
import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.SequenceFileOutputFormat

import org.apache.spark.internal.Logging

  def saveAsSequenceFile(
      path: String,
      codec: Option[Class[_ <: CompressionCodec]] = None): Unit = self.withScope {
    def anyToWritable[U <% Writable](u: U): Writable = u

    // TODO We cannot force the return type of `anyToWritable` be same as keyWritableClass and
    // valueWritableClass at the compile time. To implement that, we need to add type parameters to
    // SequenceFileRDDFunctions. however, SequenceFileRDDFunctions is a public class so it will be a
    // breaking change.
    val convertKey = self.keyClass != keyWritableClass
    val convertValue = self.valueClass != valueWritableClass

    logInfo("Saving as sequence file of type (" + keyWritableClass.getSimpleName + "," +
      valueWritableClass.getSimpleName + ")")
    val format = classOf[SequenceFileOutputFormat[Writable, Writable]]
    val jobConf = new JobConf(self.context.hadoopConfiguration)
    if (!convertKey && !convertValue) {
      self.saveAsHadoopFile(path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (!convertKey && convertValue) {
      self.map(x => (x._1, anyToWritable(x._2))).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (convertKey && !convertValue) {
      self.map(x => (anyToWritable(x._1), x._2)).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (convertKey && convertValue) {
      self.map(x => (anyToWritable(x._1), anyToWritable(x._2))).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    }
  }
}
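This method is what backs rdd.saveAsSequenceFile. As a point of reference, here is a hypothetical call site (not part of the source file above); Int keys and String values are converted to IntWritable and Text by Spark's implicit Writable conversions, so both convertKey and convertValue evaluate to true, and the paths are placeholders.

import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.SparkContext

// Assumes an existing SparkContext `sc`; output paths are placeholders.
def writeSequenceFiles(sc: SparkContext): Unit = {
  val rdd = sc.parallelize(Seq((1, "one"), (2, "two"), (3, "three")))

  // Uncompressed SequenceFile; keys/values are converted to IntWritable/Text.
  rdd.saveAsSequenceFile("/tmp/sequencefile-plain")

  // Same data, compressed via the optional codec argument.
  rdd.saveAsSequenceFile("/tmp/sequencefile-gzip", Some(classOf[GzipCodec]))
}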
Example 2
Source File: SequenceFileRDDFunctions.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.hadoop.io.Writable
import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.SequenceFileOutputFormat

import org.apache.spark.internal.Logging

  def saveAsSequenceFile(
      path: String,
      codec: Option[Class[_ <: CompressionCodec]] = None): Unit = self.withScope {
    def anyToWritable[U <% Writable](u: U): Writable = u

    // TODO We cannot force the return type of `anyToWritable` be same as keyWritableClass and
    // valueWritableClass at the compile time. To implement that, we need to add type parameters to
    // SequenceFileRDDFunctions. however, SequenceFileRDDFunctions is a public class so it will be a
    // breaking change.
    val convertKey = self.keyClass != _keyWritableClass
    val convertValue = self.valueClass != _valueWritableClass

    logInfo("Saving as sequence file of type " +
      s"(${_keyWritableClass.getSimpleName},${_valueWritableClass.getSimpleName})")
    val format = classOf[SequenceFileOutputFormat[Writable, Writable]]
    val jobConf = new JobConf(self.context.hadoopConfiguration)
    if (!convertKey && !convertValue) {
      self.saveAsHadoopFile(path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (!convertKey && convertValue) {
      self.map(x => (x._1, anyToWritable(x._2))).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (convertKey && !convertValue) {
      self.map(x => (anyToWritable(x._1), x._2)).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (convertKey && convertValue) {
      self.map(x => (anyToWritable(x._1), anyToWritable(x._2))).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    }
  }
}
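Reading such output back is the mirror operation. The sketch below is not from the source file above; it assumes the placeholder path was written by saveAsSequenceFile with Int keys and String values.

import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.spark.SparkContext

// Assumes an existing SparkContext `sc`; the path is a placeholder.
def readSequenceFiles(sc: SparkContext): Unit = {
  // Ask for the Writable types explicitly and unwrap them before collecting.
  val raw = sc.sequenceFile("/tmp/sequencefile-plain", classOf[IntWritable], classOf[Text])
  raw.map { case (k, v) => (k.get, v.toString) }.collect().foreach(println)

  // Or let Spark's WritableConverters produce plain Scala types directly.
  sc.sequenceFile[Int, String]("/tmp/sequencefile-plain").collect().foreach(println)
}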
Example 3
Source File: IOCommon.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.common

import java.io.{File, FileInputStream, IOException, InputStreamReader}
import java.util.Properties

import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkException}

import scala.collection.JavaConversions._
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

class IOCommon(val sc: SparkContext) {

  def load[T: ClassTag: TypeTag](filename: String, force_format: Option[String] = None) = {
    val input_format = force_format.getOrElse(
      IOCommon.getProperty("sparkbench.inputformat").getOrElse("Text"))
    input_format match {
      case "Text" =>
        sc.textFile(filename)

      case "Sequence" =>
        sc.sequenceFile[NullWritable, Text](filename).map(_._2.toString)

      case _ => throw new UnsupportedOperationException(s"Unknown input format: $input_format")
    }
  }

  def save(filename: String, data: RDD[_], prefix: String) = {
    val output_format = IOCommon.getProperty(prefix).getOrElse("Text")
    val output_format_codec =
      loadClassByName[CompressionCodec](IOCommon.getProperty(prefix + ".codec"))

    output_format match {
      case "Text" =>
        if (output_format_codec.isEmpty) data.saveAsTextFile(filename)
        else data.saveAsTextFile(filename, output_format_codec.get)

      case "Sequence" =>
        val sequence_data = data.map(x => (NullWritable.get(), new Text(x.toString)))
        if (output_format_codec.isEmpty) {
          sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename)
        } else {
          sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename,
            output_format_codec.get)
        }

      case _ => throw new UnsupportedOperationException(s"Unknown output format: $output_format")
    }
  }

  def save(filename: String, data: RDD[_]): Unit = save(filename, data, "sparkbench.outputformat")

  private def loadClassByName[T](name: Option[String]) = {
    if (!name.isEmpty) Some(Class.forName(name.get)
      .newInstance.asInstanceOf[T].getClass) else None
  }

  private def callMethod[T, R](obj: T, method_name: String) =
    obj.getClass.getMethod(method_name).invoke(obj).asInstanceOf[R]
}

object IOCommon {
  private val sparkbench_conf: HashMap[String, String] =
    getPropertiesFromFile(System.getenv("SPARKBENCH_PROPERTIES_FILES"))

  def getPropertiesFromFile(filenames: String): HashMap[String, String] = {
    val result = new HashMap[String, String]
    filenames.split(',').filter(_.stripMargin.length > 0).foreach { filename =>
      val file = new File(filename)
      require(file.exists, s"Properties file $file does not exist")
      require(file.isFile, s"Properties file $file is not a normal file")

      val inReader = new InputStreamReader(new FileInputStream(file), "UTF-8")
      try {
        val properties = new Properties()
        properties.load(inReader)
        result ++= properties.stringPropertyNames()
          .map(k => (k, properties(k).trim)).toMap
      } catch {
        case e: IOException =>
          val message = s"Failed when loading Sparkbench properties file $file"
          throw new SparkException(message, e)
      } finally {
        inReader.close()
      }
    }
    result.filter { case (key, value) => value.toLowerCase != "none" }
  }

  def getProperty(key: String): Option[String] = sparkbench_conf.get(key)

  def dumpProperties(): Unit = sparkbench_conf
    .foreach { case (key, value) => println(s"$key\t\t$value") }
}
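A hedged sketch of how this helper might be driven follows; it is not part of the source file above. The object name, master setting, and paths are placeholders, the SPARKBENCH_PROPERTIES_FILES environment variable must point at a readable properties file for the companion object to initialise, and sparkbench.outputformat must be set to "Sequence" for the SequenceFileOutputFormat branch of save to run.

import com.intel.hibench.sparkbench.common.IOCommon
import org.apache.spark.{SparkConf, SparkContext}

object IOCommonDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("iocommon-demo").setMaster("local[*]"))
    val io = new IOCommon(sc)

    // Force the "Sequence" branch instead of relying on sparkbench.inputformat.
    val lines = io.load[String]("/tmp/iocommon-input", force_format = Some("Sequence"))

    // Writes (NullWritable, Text) pairs through SequenceFileOutputFormat when the
    // configured output format is "Sequence".
    io.save("/tmp/iocommon-output", lines.map(_.toUpperCase))

    sc.stop()
  }
}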