org.apache.hadoop.mapred.JobConf Scala Examples

The following examples show how to use org.apache.hadoop.mapred.JobConf in Scala. Each example is taken from an open-source project; the source file, project name, and license are listed above the code.
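
Before the project-specific examples, here is a minimal, self-contained sketch of the pattern most of them share: wrapping a Hadoop Configuration in a JobConf and configuring it through the old mapred API. The input path and job name below are placeholders.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}

object JobConfBasics {
  def main(args: Array[String]): Unit = {
    // A JobConf is a Configuration specialized for the old (mapred) API;
    // it is usually built from an existing Configuration.
    val jobConf = new JobConf(new Configuration())

    // Old-API input format and input paths (the path is a placeholder).
    jobConf.setInputFormat(classOf[TextInputFormat])
    FileInputFormat.setInputPaths(jobConf, new Path("/tmp/input"))

    // JobConf is still a plain Configuration underneath.
    jobConf.set("mapreduce.job.name", "jobconf-basics")
    println(jobConf.get("mapreduce.job.name"))
  }
}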
Example 1
Source File: SerializableJobConf.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.mapred.JobConf

private[spark]
class SerializableJobConf(@transient var value: JobConf) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new JobConf(false)
    value.readFields(in)
  }
} 
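
JobConf is not java-serializable, so Spark wraps it in the class above before capturing it in a closure or broadcast variable. A minimal usage sketch, assuming an existing SparkContext (the class is private[spark], so the caller must live under the org.apache.spark package):

package org.apache.spark.util

import org.apache.hadoop.mapred.JobConf
import org.apache.spark.SparkContext

object SerializableJobConfUsage {
  // Hypothetical helper; `sc` is assumed to be an already-created SparkContext.
  def wrap(sc: SparkContext): SerializableJobConf = {
    val jobConf = new JobConf(sc.hadoopConfiguration)
    // The wrapper can be captured by a closure or broadcast; call .value to unwrap.
    new SerializableJobConf(jobConf)
  }
}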
Example 2
Source File: SerializableJobConf.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.mapred.JobConf

private[spark]
class SerializableJobConf(@transient var value: JobConf) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new JobConf(false)
    value.readFields(in)
  }
} 
Example 3
Source File: TextFileOverwrite.scala    From spark_helper   with Apache License 2.0
package org.apache.spark

import org.apache.spark.rdd.{RDD, HadoopRDD}
import org.apache.spark.util.SerializableConfiguration
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.fs.Path

object TextFileOverwrite {

  def textFile(
      paths: Seq[String],
      minPartitions: Int,
      sc: SparkContext
  ): RDD[String] = {

    // Mirrors SparkContext.textFile, but builds the HadoopRDD explicitly so the
    // input paths are set on the JobConf by the caller-provided function below.

    val confBroadcast =
      sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration))

    val setInputPathsFunc =
      (jobConf: JobConf) =>
        FileInputFormat.setInputPaths(jobConf, paths.map(p => new Path(p)): _*)

    new HadoopRDD(
      sc,
      confBroadcast,
      Some(setInputPathsFunc),
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text],
      minPartitions
    ).map(pair => pair._2.toString)
  }
} 
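
A short usage sketch for the helper above; the paths are placeholders and the SparkContext is assumed to exist:

import org.apache.spark.{SparkContext, TextFileOverwrite}
import org.apache.spark.rdd.RDD

object TextFileOverwriteUsage {
  // Hypothetical usage: read several text files as a single RDD[String].
  def read(sc: SparkContext): RDD[String] =
    TextFileOverwrite.textFile(Seq("/data/a.txt", "/data/b.txt"), minPartitions = 4, sc)
}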
Example 4
Source File: SequenceFileRDDFunctions.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.hadoop.io.Writable
import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.SequenceFileOutputFormat

import org.apache.spark.internal.Logging

// NOTE: excerpt from SequenceFileRDDFunctions; `self`, `_keyWritableClass` and
// `_valueWritableClass` are members of the enclosing class, which is not shown here.
  def saveAsSequenceFile(
      path: String,
      codec: Option[Class[_ <: CompressionCodec]] = None): Unit = self.withScope {
    def anyToWritable[U <% Writable](u: U): Writable = u

    // TODO We cannot force the return type of `anyToWritable` be same as keyWritableClass and
    // valueWritableClass at the compile time. To implement that, we need to add type parameters to
    // SequenceFileRDDFunctions. however, SequenceFileRDDFunctions is a public class so it will be a
    // breaking change.
    val convertKey = self.keyClass != _keyWritableClass
    val convertValue = self.valueClass != _valueWritableClass

    logInfo("Saving as sequence file of type " +
      s"(${_keyWritableClass.getSimpleName},${_valueWritableClass.getSimpleName})" )
    val format = classOf[SequenceFileOutputFormat[Writable, Writable]]
    val jobConf = new JobConf(self.context.hadoopConfiguration)
    if (!convertKey && !convertValue) {
      self.saveAsHadoopFile(path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (!convertKey && convertValue) {
      self.map(x => (x._1, anyToWritable(x._2))).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (convertKey && !convertValue) {
      self.map(x => (anyToWritable(x._1), x._2)).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (convertKey && convertValue) {
      self.map(x => (anyToWritable(x._1), anyToWritable(x._2))).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    }
  }
} 
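
These functions back the saveAsSequenceFile call that Spark adds to pair RDDs through an implicit conversion. A minimal usage sketch with a placeholder app name and output path:

import org.apache.spark.{SparkConf, SparkContext}

object SaveAsSequenceFileUsage {
  def main(args: Array[String]): Unit = {
    // Hypothetical app name, master and output path.
    val sc = new SparkContext(new SparkConf().setAppName("seqfile-demo").setMaster("local[*]"))
    // Int and String have Writable converters, so the implicit conversion to
    // SequenceFileRDDFunctions applies and the pairs are written as (IntWritable, Text).
    sc.parallelize(Seq(1 -> "a", 2 -> "b", 3 -> "c")).saveAsSequenceFile("/tmp/seqfile-demo")
    sc.stop()
  }
}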
Example 5
Source File: SerializableJobConf.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.mapred.JobConf

private[spark]
class SerializableJobConf(@transient var value: JobConf) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new JobConf(false)
    value.readFields(in)
  }
} 
Example 6
Source File: SerializableJobConf.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.mapred.JobConf

private[spark]
class SerializableJobConf(@transient var value: JobConf) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new JobConf(false)
    value.readFields(in)
  }
} 
Example 7
Source File: RichRDDTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.utils.spark

import java.io.File

import com.holdenkarau.spark.testing.RDDGenerator
import com.salesforce.op.test.TestSparkContext
import org.apache.hadoop.io.compress.DefaultCodec
import org.apache.hadoop.mapred.JobConf
import org.joda.time.DateTime
import org.junit.runner.RunWith
import org.scalacheck.Arbitrary
import org.scalatest.PropSpec
import org.scalatest.junit.JUnitRunner
import org.scalatest.prop.PropertyChecks


@RunWith(classOf[JUnitRunner])
class RichRDDTest extends PropSpec with PropertyChecks with TestSparkContext {
  import com.salesforce.op.utils.spark.RichRDD._

  val data = RDDGenerator.genRDD[(Int, Int)](sc)(Arbitrary.arbitrary[(Int, Int)])

  property("save as a text file") {
    forAll(data) { rdd =>
      val out = new File(tempDir, "op-richrdd-" + DateTime.now().getMillis).toString
      rdd.saveAsTextFile(out, None, new JobConf(rdd.context.hadoopConfiguration))
      spark.read.textFile(out).count() shouldBe rdd.count()
    }
  }
  property("save as a compressed text file") {
    forAll(data) { rdd =>
      val out = new File(tempDir, "op-richrdd-" + DateTime.now().getMillis).toString
      rdd.saveAsTextFile(out, Some(classOf[DefaultCodec]), new JobConf(rdd.context.hadoopConfiguration))
      spark.read.textFile(out).count() shouldBe rdd.count()
    }
  }

} 
Example 8
Source File: SerializableJobConf.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.mapred.JobConf

private[spark]
class SerializableJobConf(@transient var value: JobConf) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new JobConf(false)
    value.readFields(in)
  }
} 
Example 9
Source File: ProtoParquetRDD.scala    From sparksql-protobuf   with Apache License 2.0
package com.github.saurfang.parquet.proto.spark

import com.github.saurfang.parquet.proto.ProtoMessageParquetInputFormat
import com.google.protobuf.AbstractMessage
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
import org.apache.parquet.proto.ProtoReadSupport
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.{NewHadoopRDD, RDD}
import org.apache.spark.{Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag

class ProtoParquetRDD[T <: AbstractMessage : ClassTag](
                                                        sc: SparkContext,
                                                        input: String,
                                                        protoClass: Class[T],
                                                        @transient conf: Configuration
                                                        ) extends RDD[T](sc, Nil) {

  def this(sc: SparkContext, input: String, protoClass: Class[T]) = {
    this(sc, input, protoClass, sc.hadoopConfiguration)
  }

  lazy private[this] val rdd = {
    val jconf = new JobConf(conf)
    FileInputFormat.setInputPaths(jconf, input)
    ProtoReadSupport.setProtobufClass(jconf, protoClass.getName)

    new NewHadoopRDD(sc, classOf[ProtoMessageParquetInputFormat[T]], classOf[Void], protoClass, jconf)
  }

  @DeveloperApi
  override def compute(split: Partition, context: TaskContext): Iterator[T] = rdd.compute(split, context).map(_._2)

  override protected def getPartitions: Array[Partition] = rdd.getPartitions
} 
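
A hedged usage sketch for the RDD above; the input path is a placeholder and the message class is whatever protobuf-generated class the Parquet file was written with:

import com.github.saurfang.parquet.proto.spark.ProtoParquetRDD
import com.google.protobuf.AbstractMessage
import org.apache.spark.SparkContext

import scala.reflect.ClassTag

object ProtoParquetUsage {
  // Hypothetical usage: count protobuf records stored in a Parquet file.
  def count[T <: AbstractMessage : ClassTag](sc: SparkContext, clazz: Class[T]): Long =
    new ProtoParquetRDD(sc, "/data/events.parquet", clazz).count()
}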
Example 10
Source File: SerializableJobConf.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.mapred.JobConf

private[spark]
class SerializableJobConf(@transient var value: JobConf) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new JobConf(false)
    value.readFields(in)
  }
} 
Example 11
Source File: UnsplittableSequenceFileInputFormatTest.scala    From spark-util   with Apache License 2.0
package org.hammerlab.hadoop.splits

import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapred
import org.apache.hadoop.mapred.{ FileInputFormat, JobConf }
import FileInputFormat.setInputPaths
import org.hammerlab.test.Suite
import org.hammerlab.test.resources.File

class UnsplittableSequenceFileInputFormatTest
  extends Suite {

  test("part files") {
    val ifmt = new UnsplittableSequenceFileInputFormat[NullWritable, NullWritable]

    val jc = new JobConf()
    setInputPaths(jc, File("rdd"))

    val paths =
      ifmt
        .getSplits(jc, 2)
        .map(_.asInstanceOf[mapred.FileSplit])
        .map(FileSplit(_).path)

    paths should be(
      0 to 5 map(
        File("rdd") / PartFileBasename(_)
      )
    )
  }

  test("non-part file error") {
    val ifmt = new UnsplittableSequenceFileInputFormat[NullWritable, NullWritable]

    val jc = new JobConf()
    setInputPaths(jc, File("bad"))

    intercept[IllegalArgumentException] {
      ifmt.getSplits(jc, 2)
    }
    .getMessage should be(s"Bad partition file: error")

  }
} 
Example 12
Source File: UnsplittableSequenceFileInputFormat.scala    From spark-util   with Apache License 2.0
package org.hammerlab.hadoop.splits

import java.io.IOException
import java.util

import org.apache.hadoop.fs.{ FileStatus, FileSystem, Path ⇒ HPath }
import org.apache.hadoop.mapred.{ JobConf, SequenceFileInputFormat }
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.lib.input

import scala.collection.JavaConverters._

// NOTE: excerpt from UnsplittableSequenceFileInputFormat; only the listStatus
// override is shown. It sorts part files by their index and rejects any file
// whose name is not a part-file name.
  override def listStatus(job: JobContext): util.List[FileStatus] =
    super
      .listStatus(job)
      .asScala
      .sortBy {
        _.getPath.getName match {
          case PartFileBasename(idx) ⇒
            idx
          case basename ⇒
            throw new IllegalArgumentException(s"Bad partition file: $basename")
        }
      }
      .asJava
} 
Example 13
Source File: HiveMRSuite.scala    From connectors   with Apache License 2.0
package io.delta.hive

import java.io.{Closeable, File}

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.{JobConf, MiniMRCluster}
import org.apache.hadoop.mapreduce.MRJobConfig
import org.apache.hadoop.yarn.conf.YarnConfiguration

class HiveMRSuite extends HiveConnectorTest {

  override val engine: String = "mr"

  override def createCluster(namenode: String, conf: Configuration, tempPath: File): Closeable = {
    val jConf = new JobConf(conf)
    jConf.set("yarn.scheduler.capacity.root.queues", "default")
    jConf.set("yarn.scheduler.capacity.root.default.capacity", "100")
    jConf.setInt(MRJobConfig.MAP_MEMORY_MB, 512)
    jConf.setInt(MRJobConfig.REDUCE_MEMORY_MB, 512)
    jConf.setInt(MRJobConfig.MR_AM_VMEM_MB, 128)
    jConf.setInt(YarnConfiguration.YARN_MINICLUSTER_NM_PMEM_MB, 512)
    jConf.setInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, 128)
    jConf.setInt(YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB, 512)
    val mr = new MiniMRCluster(2, namenode, 1, null, null, jConf)

    new Closeable {
      override def close(): Unit = {
        mr.shutdown()
      }
    }
  }
} 
Example 14
Source File: HiveTezSuite.scala    From connectors   with Apache License 2.0
package io.delta.hive

import java.io.{Closeable, File}

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.MRJobConfig
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.tez.dag.api.TezConfiguration
import org.apache.tez.runtime.library.api.TezRuntimeConfiguration
import org.apache.tez.test.MiniTezCluster

class HiveTezSuite extends HiveConnectorTest {

  override val engine: String = "tez"

  private var tezConf: Configuration = _

  // Copies the Tez configuration captured in `tezConf` into `conf` and lowers
  // memory settings so the suite can run in a small test environment.
  override def setupConfiguration(conf: Configuration): Unit = {
    tezConf.asScala.foreach { e =>
      conf.set(e.getKey, e.getValue)
    }
    // Overrides values from the hive/tez-site.
    conf.setInt("hive.tez.container.size", 256)
    conf.setInt(TezConfiguration.TEZ_AM_RESOURCE_MEMORY_MB, 256)
    conf.setInt(TezConfiguration.TEZ_TASK_RESOURCE_MEMORY_MB, 256)
    conf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB, 24)
    conf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_UNORDERED_OUTPUT_BUFFER_SIZE_MB, 10)
    conf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_BUFFER_PERCENT, 0.4f)
    conf.setBoolean(TezConfiguration.TEZ_IGNORE_LIB_URIS, true)
  }
} 
Example 15
Source File: DeltaRecordReaderWrapper.scala    From connectors   with Apache License 2.0
package io.delta.hive

import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory
import org.apache.hadoop.io.ArrayWritable
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.Reporter
import org.apache.parquet.hadoop.ParquetInputFormat
import org.slf4j.LoggerFactory

// NOTE: excerpt from DeltaRecordReaderWrapper; `partitionValues` (an array of
// (position, Writable) pairs) is a field of the enclosing class, not shown here.
  private def insertPartitionValues(value: ArrayWritable): Unit = {
    val valueArray = value.get()
    var i = 0
    val n = partitionValues.length
    // Using while loop for better performance since this method is called for each row.
    while (i < n) {
      val partition = partitionValues(i)
      // The schema of `valueArray` is the Hive schema, and it's the same as the Delta
      // schema since we have verified it in `DeltaInputFormat`. Hence, the position of a partition
      // column in `valueArray` is the same as its position in Delta schema.
      valueArray(partition._1) = partition._2
      i += 1
    }
  }
} 
Example 16
Source File: HiveInputFormat.scala    From connectors   with Apache License 2.0
package io.delta.hive

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.JobConf

class HiveInputFormat extends org.apache.hadoop.hive.ql.io.HiveInputFormat {

  override def pushProjectionsAndFilters(
      jobConf: JobConf,
      inputFormatClass: Class[_],
      splitPath: Path,
      nonNative: Boolean): Unit = {
    if (inputFormatClass == classOf[DeltaInputFormat]) {
      super.pushProjectionsAndFilters(jobConf, inputFormatClass, splitPath, false)
    } else {
      super.pushProjectionsAndFilters(jobConf, inputFormatClass, splitPath, nonNative)
    }
  }
} 
Example 17
Source File: DeltaOutputFormat.scala    From connectors   with Apache License 2.0
package io.delta.hive

import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.io.{ArrayWritable, NullWritable}
import org.apache.hadoop.mapred.{JobConf, OutputFormat, RecordWriter}
import org.apache.hadoop.util.Progressable


class DeltaOutputFormat extends OutputFormat[NullWritable, ArrayWritable] {

  private def writingNotSupported[T](): T = {
    throw new UnsupportedOperationException(
      "Writing to a Delta table in Hive is not supported. Please use Spark to write.")
  }

  override def getRecordWriter(
    ignored: FileSystem,
    job: JobConf,
    name: String,
    progress: Progressable): RecordWriter[NullWritable, ArrayWritable] = writingNotSupported()

  override def checkOutputSpecs(ignored: FileSystem, job: JobConf): Unit = writingNotSupported()
} 
Example 18
Source File: PailDataSource.scala    From utils   with Apache License 2.0
package com.indix.utils.spark.pail

import com.backtype.hadoop.pail._
import com.backtype.support.{Utils => PailUtils}
import org.apache.hadoop.io.{BytesWritable, Text}
import org.apache.hadoop.mapred.{InputFormat, JobConf}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

trait PailDataSource {

  implicit class PailBasedReader(sc: SparkContext) {
    def pailFile[R: ClassTag](inputLocation: String): RDD[R] = {
      pailFileWithInfo[R](inputLocation).map(_._2)
    }

    def pailFileWithInfo[R: ClassTag](inputLocation: String) = {
      val pail = new Pail(inputLocation)
      val pailSpec = pail.getSpec
      val inputFormat = pail.getFormat.getInputFormatClass.asSubclass(classOf[InputFormat[PailRecordInfo, BytesWritable]])
      sc.hadoopFile(inputLocation, inputFormat, classOf[PailRecordInfo], classOf[BytesWritable])
        .map {
          case (recordInfo, recordInBytes) =>
            recordInfo -> pailSpec.getStructure.deserialize(recordInBytes.getBytes).asInstanceOf[R]
        }
    }
  }

  implicit class PailBasedWriter[R: ClassTag](rdd: RDD[R]) {
    def saveAsPail(outputLocation: String, pailSpec: PailSpec) = {
      val jobConf = new JobConf(rdd.context.hadoopConfiguration)

      PailUtils.setObject(jobConf, PailOutputFormat.SPEC_ARG, pailSpec)

      rdd.map { record =>
        val pailStruct = pailSpec.getStructure.asInstanceOf[PailStructure[R]]

        val attr = PailUtils.join(pailStruct.getTarget(record), "/")
        val recordInBytes = pailStruct.serialize(record)
        new Text(attr) -> new BytesWritable(recordInBytes)
      }.saveAsHadoopFile(outputLocation, classOf[Text], classOf[BytesWritable], classOf[PailOutputFormat], jobConf)
    }
  }

} 
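A hedged usage sketch for the trait above; the paths, the record type, and the PailSpec are placeholders that depend on the pail's structure:

import com.backtype.hadoop.pail.PailSpec
import com.indix.utils.spark.pail.PailDataSource
import org.apache.spark.SparkContext

object PailUsage extends PailDataSource {
  // Hypothetical round trip: read a pail and write it back out elsewhere.
  def roundTrip(sc: SparkContext, spec: PailSpec): Unit = {
    val records = sc.pailFile[Array[Byte]]("/data/pail-in")
    records.saveAsPail("/data/pail-out", spec)
  }
}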
Example 19
Source File: SequenceFileRDDFunctions.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.{classTag, ClassTag}

import org.apache.hadoop.io.Writable
import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.SequenceFileOutputFormat

import org.apache.spark.internal.Logging

// NOTE: excerpt from SequenceFileRDDFunctions; `self`, `keyWritableClass` and
// `valueWritableClass` are members of the enclosing class, which is not shown here.
  def saveAsSequenceFile(
      path: String,
      codec: Option[Class[_ <: CompressionCodec]] = None): Unit = self.withScope {
    def anyToWritable[U <% Writable](u: U): Writable = u

    // TODO We cannot force the return type of `anyToWritable` be same as keyWritableClass and
    // valueWritableClass at the compile time. To implement that, we need to add type parameters to
    // SequenceFileRDDFunctions. however, SequenceFileRDDFunctions is a public class so it will be a
    // breaking change.
    val convertKey = self.keyClass != keyWritableClass
    val convertValue = self.valueClass != valueWritableClass

    logInfo("Saving as sequence file of type (" + keyWritableClass.getSimpleName + "," +
      valueWritableClass.getSimpleName + ")" )
    val format = classOf[SequenceFileOutputFormat[Writable, Writable]]
    val jobConf = new JobConf(self.context.hadoopConfiguration)
    if (!convertKey && !convertValue) {
      self.saveAsHadoopFile(path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (!convertKey && convertValue) {
      self.map(x => (x._1, anyToWritable(x._2))).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (convertKey && !convertValue) {
      self.map(x => (anyToWritable(x._1), x._2)).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (convertKey && convertValue) {
      self.map(x => (anyToWritable(x._1), anyToWritable(x._2))).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    }
  }
} 
Example 20
Source File: CarbonCountStar.scala    From carbondata   with Apache License 2.0
package org.apache.spark.sql

import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.optimizer.CarbonFilters
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier
import org.apache.carbondata.core.metadata.schema.table.CarbonTable
import org.apache.carbondata.core.mutate.CarbonUpdateUtil
import org.apache.carbondata.core.statusmanager.StageInputCollector
import org.apache.carbondata.core.util.{CarbonProperties, ThreadLocalSessionInfo}
import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}
import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil
import org.apache.carbondata.spark.load.DataLoadProcessBuilderOnSpark

case class CarbonCountStar(
    attributesRaw: Seq[Attribute],
    carbonTable: CarbonTable,
    sparkSession: SparkSession,
    outUnsafeRows: Boolean = true) extends LeafExecNode {

  override def doExecute(): RDD[InternalRow] = {
    ThreadLocalSessionInfo
      .setConfigurationToCurrentThread(sparkSession.sessionState.newHadoopConf())
    val absoluteTableIdentifier = carbonTable.getAbsoluteTableIdentifier
    val (job, tableInputFormat) = createCarbonInputFormat(absoluteTableIdentifier)
    CarbonInputFormat.setQuerySegment(job.getConfiguration, carbonTable)

    // get row count
    var rowCount = CarbonUpdateUtil.getRowCount(
      tableInputFormat.getBlockRowCount(
        job,
        carbonTable,
        CarbonFilters.getPartitions(
          Seq.empty,
          sparkSession,
          TableIdentifier(
            carbonTable.getTableName,
            Some(carbonTable.getDatabaseName))).map(_.asJava).orNull, false),
      carbonTable)

    if (CarbonProperties.isQueryStageInputEnabled) {
      // check for number of row for stage input
      val splits = StageInputCollector.createInputSplits(carbonTable, job.getConfiguration)
      if (!splits.isEmpty) {
        val df = DataLoadProcessBuilderOnSpark.createInputDataFrame(
          sparkSession, carbonTable, splits.asScala)
        rowCount += df.count()
      }
    }

    val valueRaw =
      attributesRaw.head.dataType match {
        case StringType => Seq(UTF8String.fromString(Long.box(rowCount).toString)).toArray
          .asInstanceOf[Array[Any]]
        case _ => Seq(Long.box(rowCount)).toArray.asInstanceOf[Array[Any]]
      }
    val value = new GenericInternalRow(valueRaw)
    val unsafeProjection = UnsafeProjection.create(output.map(_.dataType).toArray)
    val row = if (outUnsafeRows) unsafeProjection(value) else value
    sparkContext.parallelize(Seq(row))
  }

  override def output: Seq[Attribute] = {
    attributesRaw
  }

  private def createCarbonInputFormat(absoluteTableIdentifier: AbsoluteTableIdentifier
  ): (Job, CarbonTableInputFormat[Array[Object]]) = {
    val carbonInputFormat = new CarbonTableInputFormat[Array[Object]]()
    val jobConf: JobConf = new JobConf(FileFactory.getConfiguration)
    SparkHadoopUtil.get.addCredentials(jobConf)
    CarbonInputFormat.setTableInfo(jobConf, carbonTable.getTableInfo)
    val job = new Job(jobConf)
    FileInputFormat.addInputPath(job, new Path(absoluteTableIdentifier.getTablePath))
    CarbonInputFormat
      .setTransactionalTable(job.getConfiguration,
        carbonTable.getTableInfo.isTransactionalTable)
    CarbonInputFormatUtil.setIndexJobIfConfigured(job.getConfiguration)
    (job, carbonInputFormat)
  }
}