org.apache.hadoop.mapred.FileInputFormat Scala Examples
The following examples show how to use org.apache.hadoop.mapred.FileInputFormat.
Each example comes from an open-source project; the project, source file, and license are noted above the code.
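Before the project examples, here is a minimal, self-contained sketch of the core pattern they all build on: registering input paths on a JobConf via FileInputFormat.setInputPaths and reading them back with SparkContext.hadoopRDD through the old (mapred) API. The object name, application name, and the /tmp/input path are illustrative placeholders, not taken from any of the projects below.

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.spark.{SparkConf, SparkContext}

object FileInputFormatSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("file-input-format-sketch").setMaster("local[*]"))

    // Register one or more input paths on an old-API (mapred) JobConf.
    val jobConf = new JobConf(sc.hadoopConfiguration)
    FileInputFormat.setInputPaths(jobConf, "/tmp/input") // placeholder path

    // Read the registered paths; TextInputFormat yields (byte offset, line) pairs.
    val lines =
      sc.hadoopRDD(
        jobConf,
        classOf[TextInputFormat],
        classOf[LongWritable],
        classOf[Text]
      ).map(_._2.toString)

    println(lines.count())
    sc.stop()
  }
}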
Example 1
Source File: UnsplittableSequenceFileInputFormatTest.scala, from the spark-util project (Apache License 2.0)
package org.hammerlab.hadoop.splits

import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapred
import org.apache.hadoop.mapred.{ FileInputFormat, JobConf }
import FileInputFormat.setInputPaths
import org.hammerlab.test.Suite
import org.hammerlab.test.resources.File

class UnsplittableSequenceFileInputFormatTest extends Suite {

  test("part files") {
    val ifmt = new UnsplittableSequenceFileInputFormat[NullWritable, NullWritable]

    val jc = new JobConf()
    setInputPaths(jc, File("rdd"))

    val paths =
      ifmt
        .getSplits(jc, 2)
        .map(_.asInstanceOf[mapred.FileSplit])
        .map(FileSplit(_).path)

    paths should be(
      0 to 5 map (
        File("rdd") / PartFileBasename(_)
      )
    )
  }

  test("non-part file error") {
    val ifmt = new UnsplittableSequenceFileInputFormat[NullWritable, NullWritable]

    val jc = new JobConf()
    setInputPaths(jc, File("bad"))

    intercept[IllegalArgumentException] {
      ifmt.getSplits(jc, 2)
    }
    .getMessage should be(s"Bad partition file: error")
  }
}
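Outside the test, the input format would plug into a Spark job through sc.hadoopRDD like any other mapred input format. The following is a hypothetical sketch, not code from spark-util: the /data/rdd path is a placeholder, and the NullWritable key/value types simply mirror the test above.

import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.FileInputFormat.setInputPaths
import org.apache.spark.SparkContext
import org.hammerlab.hadoop.splits.UnsplittableSequenceFileInputFormat

object UnsplittableReadSketch {
  // `sc` is assumed to be an existing SparkContext.
  def readWholePartFiles(sc: SparkContext): Unit = {
    val jobConf = new JobConf(sc.hadoopConfiguration)
    setInputPaths(jobConf, "/data/rdd") // placeholder directory of part-* sequence files

    // Files are reported as unsplittable, so each part file becomes exactly one split/partition.
    val rdd =
      sc.hadoopRDD(
        jobConf,
        classOf[UnsplittableSequenceFileInputFormat[NullWritable, NullWritable]],
        classOf[NullWritable],
        classOf[NullWritable]
      )
    println(rdd.getNumPartitions)
  }
}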
Example 2
Source File: ProtoParquetRDD.scala, from the sparksql-protobuf project (Apache License 2.0)
package com.github.saurfang.parquet.proto.spark

import com.github.saurfang.parquet.proto.ProtoMessageParquetInputFormat
import com.google.protobuf.AbstractMessage
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
import org.apache.parquet.proto.ProtoReadSupport
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.{NewHadoopRDD, RDD}
import org.apache.spark.{Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag

class ProtoParquetRDD[T <: AbstractMessage : ClassTag](
    sc: SparkContext,
    input: String,
    protoClass: Class[T],
    @transient conf: Configuration
) extends RDD[T](sc, Nil) {

  def this(sc: SparkContext, input: String, protoClass: Class[T]) = {
    this(sc, input, protoClass, sc.hadoopConfiguration)
  }

  lazy private[this] val rdd = {
    val jconf = new JobConf(conf)
    FileInputFormat.setInputPaths(jconf, input)
    ProtoReadSupport.setProtobufClass(jconf, protoClass.getName)

    new NewHadoopRDD(sc, classOf[ProtoMessageParquetInputFormat[T]], classOf[Void], protoClass, jconf)
  }

  @DeveloperApi
  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    rdd.compute(split, context).map(_._2)

  override protected def getPartitions: Array[Partition] = rdd.getPartitions
}
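A hypothetical call site for the RDD above, not taken from the project: MyProtoMessage stands in for a protobuf-generated class extending AbstractMessage, the Parquet path is a placeholder, and `sc` is assumed to be an existing SparkContext.

import com.github.saurfang.parquet.proto.spark.ProtoParquetRDD
import org.apache.spark.SparkContext

object ProtoParquetUsageSketch {
  // MyProtoMessage is a placeholder for a protobuf-generated class (extends AbstractMessage).
  def readMessages(sc: SparkContext): Unit = {
    val rdd = new ProtoParquetRDD(sc, "/data/messages.parquet", classOf[MyProtoMessage]) // placeholder path and class
    println(rdd.count())
  }
}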
Example 3
Source File: TextFileOverwrite.scala, from the spark_helper project (Apache License 2.0)
package org.apache.spark

import org.apache.spark.rdd.{RDD, HadoopRDD}
import org.apache.spark.util.SerializableConfiguration
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.fs.Path

object TextFileOverwrite {

  def textFile(
      paths: Seq[String],
      minPartitions: Int,
      sc: SparkContext
  ): RDD[String] = {

    val confBroadcast =
      sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration))

    val setInputPathsFunc =
      (jobConf: JobConf) =>
        FileInputFormat.setInputPaths(jobConf, paths.map(p => new Path(p)): _*)

    new HadoopRDD(
      sc,
      confBroadcast,
      Some(setInputPathsFunc),
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text],
      minPartitions
    ).map(pair => pair._2.toString)
  }
}
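A hypothetical driver for the helper above, not part of spark_helper; the object name and the two input paths are placeholders. Because TextFileOverwrite is declared in the org.apache.spark package, a call site imports it directly from there.

import org.apache.spark.{SparkConf, SparkContext, TextFileOverwrite}

object TextFileOverwriteUsageSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("text-file-overwrite-usage").setMaster("local[*]"))

    // Read several files/directories as a single RDD[String] in one call; 2 is the minPartitions hint.
    val lines = TextFileOverwrite.textFile(Seq("/data/a.txt", "/data/b.txt"), 2, sc)
    println(lines.count())

    sc.stop()
  }
}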