org.apache.hadoop.mapred.FileInputFormat Scala Examples
The following examples show how to use org.apache.hadoop.mapred.FileInputFormat.
Each example comes from an open-source project; the project, source file, and license are noted above the code.
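Before the project examples, here is a minimal, self-contained sketch of the core pattern they all build on: registering input paths on a JobConf via FileInputFormat.setInputPaths and reading them back with SparkContext.hadoopRDD through the old (mapred) API. The object name, application name, and the /tmp/input path are illustrative placeholders, not taken from any of the projects below.

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.spark.{SparkConf, SparkContext}

object FileInputFormatSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("file-input-format-sketch").setMaster("local[*]"))

    // Register one or more input paths on an old-API (mapred) JobConf.
    val jobConf = new JobConf(sc.hadoopConfiguration)
    FileInputFormat.setInputPaths(jobConf, "/tmp/input") // placeholder path

    // Read the registered paths; TextInputFormat yields (byte offset, line) pairs.
    val lines =
      sc.hadoopRDD(
        jobConf,
        classOf[TextInputFormat],
        classOf[LongWritable],
        classOf[Text]
      ).map(_._2.toString)

    println(lines.count())
    sc.stop()
  }
}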
Example 1
Source File: UnsplittableSequenceFileInputFormatTest.scala, from the spark-util project (Apache License 2.0)
package org.hammerlab.hadoop.splits

import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapred
import org.apache.hadoop.mapred.{ FileInputFormat, JobConf }
import FileInputFormat.setInputPaths
import org.hammerlab.test.Suite
import org.hammerlab.test.resources.File

class UnsplittableSequenceFileInputFormatTest extends Suite {

  test("part files") {
    val ifmt = new UnsplittableSequenceFileInputFormat[NullWritable, NullWritable]

    val jc = new JobConf()
    setInputPaths(jc, File("rdd"))

    val paths =
      ifmt
        .getSplits(jc, 2)
        .map(_.asInstanceOf[mapred.FileSplit])
        .map(FileSplit(_).path)

    paths should be(
      0 to 5 map (
        File("rdd") / PartFileBasename(_)
      )
    )
  }

  test("non-part file error") {
    val ifmt = new UnsplittableSequenceFileInputFormat[NullWritable, NullWritable]

    val jc = new JobConf()
    setInputPaths(jc, File("bad"))

    intercept[IllegalArgumentException] {
      ifmt.getSplits(jc, 2)
    }
    .getMessage should be(s"Bad partition file: error")
  }
}
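Outside the test, the input format would plug into a Spark job through sc.hadoopRDD like any other mapred input format. The following is a hypothetical sketch, not code from spark-util: the /data/rdd path is a placeholder, and the NullWritable key/value types simply mirror the test above.

import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.FileInputFormat.setInputPaths
import org.apache.spark.SparkContext
import org.hammerlab.hadoop.splits.UnsplittableSequenceFileInputFormat

object UnsplittableReadSketch {
  // `sc` is assumed to be an existing SparkContext.
  def readWholePartFiles(sc: SparkContext): Unit = {
    val jobConf = new JobConf(sc.hadoopConfiguration)
    setInputPaths(jobConf, "/data/rdd") // placeholder directory of part-* sequence files

    // Files are reported as unsplittable, so each part file becomes exactly one split/partition.
    val rdd =
      sc.hadoopRDD(
        jobConf,
        classOf[UnsplittableSequenceFileInputFormat[NullWritable, NullWritable]],
        classOf[NullWritable],
        classOf[NullWritable]
      )
    println(rdd.getNumPartitions)
  }
}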
Example 2
Source File: ProtoParquetRDD.scala, from the sparksql-protobuf project (Apache License 2.0)
package com.github.saurfang.parquet.proto.spark

import com.github.saurfang.parquet.proto.ProtoMessageParquetInputFormat
import com.google.protobuf.AbstractMessage
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
import org.apache.parquet.proto.ProtoReadSupport
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.{NewHadoopRDD, RDD}
import org.apache.spark.{Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag

class ProtoParquetRDD[T <: AbstractMessage : ClassTag](
    sc: SparkContext,
    input: String,
    protoClass: Class[T],
    @transient conf: Configuration
) extends RDD[T](sc, Nil) {

  def this(sc: SparkContext, input: String, protoClass: Class[T]) = {
    this(sc, input, protoClass, sc.hadoopConfiguration)
  }

  lazy private[this] val rdd = {
    val jconf = new JobConf(conf)
    FileInputFormat.setInputPaths(jconf, input)
    ProtoReadSupport.setProtobufClass(jconf, protoClass.getName)

    new NewHadoopRDD(sc, classOf[ProtoMessageParquetInputFormat[T]], classOf[Void], protoClass, jconf)
  }

  @DeveloperApi
  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    rdd.compute(split, context).map(_._2)

  override protected def getPartitions: Array[Partition] = rdd.getPartitions
}
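A hypothetical call site for the RDD above, not taken from the project: MyProtoMessage stands in for a protobuf-generated class extending AbstractMessage, the Parquet path is a placeholder, and `sc` is assumed to be an existing SparkContext.

import com.github.saurfang.parquet.proto.spark.ProtoParquetRDD
import org.apache.spark.SparkContext

object ProtoParquetUsageSketch {
  // MyProtoMessage is a placeholder for a protobuf-generated class (extends AbstractMessage).
  def readMessages(sc: SparkContext): Unit = {
    val rdd = new ProtoParquetRDD(sc, "/data/messages.parquet", classOf[MyProtoMessage]) // placeholder path and class
    println(rdd.count())
  }
}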
Example 3
Source File: TextFileOverwrite.scala, from the spark_helper project (Apache License 2.0)
package org.apache.spark

import org.apache.spark.rdd.{RDD, HadoopRDD}
import org.apache.spark.util.SerializableConfiguration
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.fs.Path

object TextFileOverwrite {

  def textFile(
      paths: Seq[String],
      minPartitions: Int,
      sc: SparkContext
  ): RDD[String] = {

    val confBroadcast =
      sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration))

    val setInputPathsFunc =
      (jobConf: JobConf) =>
        FileInputFormat.setInputPaths(jobConf, paths.map(p => new Path(p)): _*)

    new HadoopRDD(
      sc,
      confBroadcast,
      Some(setInputPathsFunc),
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text],
      minPartitions
    ).map(pair => pair._2.toString)
  }
}
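A hypothetical driver for the helper above, not part of spark_helper; the object name and the two input paths are placeholders. Because TextFileOverwrite is declared in the org.apache.spark package, a call site imports it directly from there.

import org.apache.spark.{SparkConf, SparkContext, TextFileOverwrite}

object TextFileOverwriteUsageSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("text-file-overwrite-usage").setMaster("local[*]"))

    // Read several files/directories as a single RDD[String] in one call; 2 is the minPartitions hint.
    val lines = TextFileOverwrite.textFile(Seq("/data/a.txt", "/data/b.txt"), 2, sc)
    println(lines.count())

    sc.stop()
  }
}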