org.apache.hadoop.mapred.InputFormat Scala Examples

The following examples show how to use org.apache.hadoop.mapred.InputFormat. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
Example 1
Source File: PailDataSource.scala, from the utils project, licensed under Apache License 2.0 (5 votes).
package com.indix.utils.spark.pail

import com.backtype.hadoop.pail._
import com.backtype.support.{Utils => PailUtils}
import org.apache.hadoop.io.{BytesWritable, Text}
import org.apache.hadoop.mapred.{InputFormat, JobConf}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

/**
  * Spark helpers for Pail datasets.
  *
  * Mixing this trait in brings two implicit enrichments into scope: readers on a
  * `SparkContext` (`pailFile`, `pailFileWithInfo`) and a writer on an `RDD`
  * (`saveAsPail`).
  */
trait PailDataSource {

  /** Adds Pail reading methods to a `SparkContext`. */
  implicit class PailBasedReader(sc: SparkContext) {

    /**
      * Reads the pail at `inputLocation` and returns its deserialized records only.
      *
      * NOTE(review): implemented as the record half of [[pailFileWithInfo]], which is
      * what the declared `RDD[R]` return type implies -- confirm against upstream.
      *
      * @tparam R type each record is cast to after deserialization
      * @param inputLocation path of the pail to read
      * @return the records, without their `PailRecordInfo`
      */
    def pailFile[R: ClassTag](inputLocation: String): RDD[R] =
      pailFileWithInfo[R](inputLocation).map { case (_, record) => record }

    /**
      * Reads the pail at `inputLocation`, pairing every record with its `PailRecordInfo`.
      *
      * The Hadoop input format comes from the pail's own format, and each value is
      * deserialized with the structure declared in the pail's spec. The result is cast
      * to `R` without a runtime check of the element type.
      *
      * @tparam R type each record is cast to after deserialization
      * @param inputLocation path of the pail to read
      * @return pairs of record info and deserialized record
      */
    def pailFileWithInfo[R: ClassTag](inputLocation: String): RDD[(PailRecordInfo, R)] = {
      val pail = new Pail(inputLocation)
      val pailSpec = pail.getSpec
      val inputFormat = pail.getFormat.getInputFormatClass.asSubclass(classOf[InputFormat[PailRecordInfo, BytesWritable]])
      sc.hadoopFile(inputLocation, inputFormat, classOf[PailRecordInfo], classOf[BytesWritable])
        .map {
          case (recordInfo, recordInBytes) =>
            // BytesWritable.getBytes exposes the whole backing buffer, which can be longer
            // than the payload; only the first getLength bytes belong to this record.
            val payload = java.util.Arrays.copyOf(recordInBytes.getBytes, recordInBytes.getLength)
            recordInfo -> pailSpec.getStructure.deserialize(payload).asInstanceOf[R]
        }
    }
  }

  /** Adds Pail writing to an `RDD`. */
  implicit class PailBasedWriter[R: ClassTag](rdd: RDD[R]) {

    /**
      * Writes the RDD to `outputLocation` through `PailOutputFormat`.
      *
      * Each record becomes a (`Text`, `BytesWritable`) pair: the key is the record's
      * target from the spec's structure joined with "/", and the value is the
      * serialized record.
      *
      * @param outputLocation destination path handed to `saveAsHadoopFile`
      * @param pailSpec spec whose structure supplies `getTarget` and `serialize`; its
      *                 structure is cast to `PailStructure[R]` without a runtime check
      */
    def saveAsPail(outputLocation: String, pailSpec: PailSpec): Unit = {
      val jobConf = new JobConf(rdd.context.hadoopConfiguration)

      // Store the spec in the job configuration under PailOutputFormat.SPEC_ARG
      // (presumably where PailOutputFormat looks it up -- confirm).
      PailUtils.setObject(jobConf, PailOutputFormat.SPEC_ARG, pailSpec)

      rdd.map { record =>
        val pailStruct = pailSpec.getStructure.asInstanceOf[PailStructure[R]]

        val attr = PailUtils.join(pailStruct.getTarget(record), "/")
        val recordInBytes = pailStruct.serialize(record)
        new Text(attr) -> new BytesWritable(recordInBytes)
      }.saveAsHadoopFile(outputLocation, classOf[Text], classOf[BytesWritable], classOf[PailOutputFormat], jobConf)
    }
  }
}
