org.apache.hadoop.mapreduce.lib.input.FileInputFormat Scala Examples
The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileInputFormat.
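As context for the examples, here is a minimal sketch of how a FileInputFormat is usually wired into a MapReduce job driver. The job name, input format choice, and paths are placeholders for illustration, not taken from any project listed below.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, TextInputFormat}
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

// Hypothetical driver setup: attach an input format and input/output paths to a Job.
val job = Job.getInstance(new Configuration(), "example-job") // job name is a placeholder
job.setInputFormatClass(classOf[TextInputFormat])             // or any FileInputFormat subclass
FileInputFormat.setInputPaths(job, new Path("/data/input"))   // placeholder input path
FileOutputFormat.setOutputPath(job, new Path("/data/output")) // placeholder output path

The examples that follow show real FileInputFormat subclasses and call sites from open-source projects.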
Example 1
Source File: RosbagInputFormat.scala From ros_hadoop with Apache License 2.0
package de.valtech.foss

import scala.io.Source
import scala.collection.JavaConverters._
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, LongWritable, MapWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

object RosbagInputFormat {
  def getRosChunkIdx(context: JobContext): String = {
    context.getConfiguration.get("RosbagInputFormat.chunkIdx")
  }

  def getBlockSize(context: JobContext): Long = {
    context.getConfiguration.get("dfs.blocksize").toLong
  }
}

class RosbagBytesInputFormat extends FileInputFormat[LongWritable, BytesWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, BytesWritable] = {
    new RosbagBytesRecordReader
  }
}

class RosbagMapInputFormat extends FileInputFormat[LongWritable, MapWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, MapWritable] = {
    new RosbagMapRecordReader
  }
}
Example 2
Source File: TFRecordInputFormat.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.fs.FSDataInputStream

class TFRecordInputFormat extends FileInputFormat[BytesWritable, NullWritable] {

  override def createRecordReader(inputSplit: InputSplit, context: TaskAttemptContext):
      RecordReader[BytesWritable, NullWritable] = new RecordReader[BytesWritable, NullWritable] {

    private var inputStream: FSDataInputStream = null
    private var reader: TFRecordIterator = null
    private var length: Long = 0L
    private var begin: Long = 0L
    private var current: Array[Byte] = null

    override def getCurrentKey: BytesWritable = {
      new BytesWritable(current)
    }

    override def getProgress: Float = {
      (inputStream.getPos - begin) / (length + 1e-6f)
    }

    override def nextKeyValue(): Boolean = {
      if (reader.hasNext) {
        current = reader.next()
        true
      } else {
        false
      }
    }

    override def getCurrentValue: NullWritable = {
      NullWritable.get()
    }

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      val conf = context.getConfiguration
      val fileSplit = split.asInstanceOf[FileSplit]
      length = fileSplit.getLength
      begin = fileSplit.getStart
      val file = fileSplit.getPath
      val fs = file.getFileSystem(conf)
      inputStream = fs.open(file, 4096)
      reader = new TFRecordIterator(inputStream)
    }

    override def close(): Unit = {
      inputStream.close()
    }
  }

  override protected def isSplitable(context: JobContext, filename: Path): Boolean = false
}
Example 3
Source File: FileLocalityInputFormat.scala From ArchiveSpark with MIT License
package org.archive.archivespark.sparkling.util

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}

class FileLocalityInputFormat extends FileInputFormat[NullWritable, Text] {

  class FileLocalityRecordReader extends RecordReader[NullWritable, Text] {
    private var filePath: Text = new Text()
    private var read: Boolean = true

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      filePath.set(split.asInstanceOf[FileSplit].getPath.toString)
      read = false
    }

    override def nextKeyValue(): Boolean = {
      if (read) false
      else {
        read = true
        true
      }
    }

    override def getCurrentKey: NullWritable = NullWritable.get
    override def getCurrentValue: Text = filePath
    override def getProgress: Float = if (read) 1.0f else 0.0f
    override def close(): Unit = read = true
  }

  override def isSplitable(context: JobContext, filename: Path): Boolean = false

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext): RecordReader[NullWritable, Text] =
    new FileLocalityRecordReader
}
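Because this input format is not splitable and emits each input file's path exactly once, it is handy for building an RDD of file names while keeping HDFS data locality. A minimal usage sketch via Spark's newAPIHadoopFile follows; the helper name and the path glob are assumptions for illustration.

import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// Hypothetical helper: yields one record per matched file, whose value is the file's path.
def filePaths(sc: SparkContext, glob: String): RDD[String] =
  sc.newAPIHadoopFile[NullWritable, Text, FileLocalityInputFormat](glob)
    .map { case (_, path) => path.toString }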
Example 4
Source File: InputFormatConf.scala From flint with Apache License 2.0
package com.twosigma.flint.hadoop

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.hadoop.io.{ LongWritable, Text, Writable }
import org.apache.hadoop.mapreduce.{ InputFormat, InputSplit, Job, RecordReader }
import org.apache.hadoop.mapreduce.lib.input.{ FileInputFormat, FileSplit, TextInputFormat }

import scala.collection.immutable

trait InputFormatConf[K, V] extends Serializable {
  type IF <: InputFormat[K, V]
  type Split <: InputSplit with Writable

  type KExtract <: Extract[K]
  type VExtract <: Extract[V]

  def kExtract: KExtract
  def vExtract: VExtract

  def makeInputFormat(): IF

  // I'm unsure if we should WriSer them for them
  def makeSplits(hadoopConf: Configuration): IndexedSeq[WriSer[Split]]

  // TODO do we want to require typing of the RecordReader as well?
  final def createRecordReader(hadoopConf: Configuration, split: Split,
    inputFormat: IF = makeInputFormat()): RecordReader[K, V] = {
    val tac = ConfOnlyTAC(hadoopConf)
    val recordReader = inputFormat.createRecordReader(split, tac)
    recordReader.initialize(split, tac)
    recordReader
  }
}

case class TextInputFormatConf(file: String, partitions: Int)
  extends InputFormatConf[LongWritable, Text] {
  type IF = TextInputFormat
  type Split = FileSplit

  // TODO now that we figured out what's up, see if we can't eliminate the need for this...
  val internalK = Extract.unit[LongWritable]
  val internalV = Extract.text

  type KExtract = internalK.type
  type VExtract = internalV.type

  override val kExtract: KExtract = internalK
  override val vExtract: VExtract = internalV

  def makeInputFormat() = new TextInputFormat()

  def makeSplits(hadoopConf: Configuration): immutable.IndexedSeq[WriSer[FileSplit]] = {
    val job = Job.getInstance(hadoopConf)
    FileInputFormat.setInputPaths(job, file)
    val path = new Path(file)
    val len = FileSystem.get(hadoopConf).listStatus(path).head.getLen
    val size_per = math.round(len / partitions.toDouble)

    ((0 until partitions - 1).map { p =>
      new FileSplit(path, size_per * p, size_per, null)
    } :+ {
      val fin = size_per * (partitions - 1)
      new FileSplit(path, fin, len - fin, null)
    }).map(WriSer(_))
  }
}

// TODO do we really get much from having this as its own class? consider just making a def csv method in TextInputFormatConf
object CSVInputFormatConf {
  def apply[V](ifc: InputFormatConf[LongWritable, V] { type Split = FileSplit }): InputFormatConf[LongWritable, V] {
    type IF = ifc.IF
    type Split = ifc.Split
    type KExtract = ifc.KExtract
    type VExtract = ifc.VExtract
  } = new InputFormatConf[LongWritable, V] {
    type IF = ifc.IF
    type Split = ifc.Split
    type KExtract = ifc.KExtract
    type VExtract = ifc.VExtract

    override val kExtract: KExtract = ifc.kExtract
    override val vExtract: VExtract = ifc.vExtract

    override def makeInputFormat() = ifc.makeInputFormat()

    override def makeSplits(hadoopConf: Configuration) = {
      val splits = ifc.makeSplits(hadoopConf)
      splits.headOption.fold(IndexedSeq.empty[WriSer[Split]]) { case WriSer(head) =>
        val rr = createRecordReader(hadoopConf, head)
        require(rr.nextKeyValue, "csv has no header, first line was empty")
        val afterHeader = rr.getCurrentKey.get
        require(rr.nextKeyValue, "first split is empty")
        WriSer(new FileSplit(head.getPath, afterHeader, head.getLength - afterHeader, null)) +:
          splits.tail
      }
    }
  }
}
Example 5
Source File: MapreduceTransformation.scala From schedoscope with Apache License 2.0
package org.schedoscope.dsl.transformations

import java.net.URI

import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.{Job, MRJobConfig}
import org.schedoscope.Schedoscope
import org.schedoscope.dsl.View
import org.schedoscope.scheduler.driver.{DriverRunState, MapreduceDriver}
import org.schedoscope.scheduler.service.ViewTransformationStatus

case class MapreduceTransformation(v: View,
                                   createJob: (Map[String, Any]) => Job,
                                   cleanupAfterJob: (Job, MapreduceDriver, DriverRunState[MapreduceBaseTransformation]) => DriverRunState[MapreduceBaseTransformation] = (_, __, completionRunState) => completionRunState,
                                   dirsToDelete: List[String] = List(),
                                   deleteViewPath: Boolean = true) extends MapreduceBaseTransformation {

  lazy val job = createJob(configuration.toMap)

  var directoriesToDelete = dirsToDelete ++ (if (deleteViewPath) List(v.fullPath) else List())

  description = StringUtils.abbreviate(v.urlPath, 100)
}

trait MapreduceBaseTransformation extends Transformation {

  def name = "mapreduce"

  val cleanupAfterJob: (Job, MapreduceDriver, DriverRunState[MapreduceBaseTransformation]) => DriverRunState[MapreduceBaseTransformation]

  val v: View

  val job: Job

  var directoriesToDelete: List[String]

  override def fileResourcesToChecksum = {
    val jarName = try {
      job.getConfiguration().get(MRJobConfig.JAR).split("/").last
    } catch {
      case _: Throwable => null
    }

    Schedoscope.settings
      .getDriverSettings("mapreduce")
      .libJarsHdfs
      .filter(lj => jarName == null || lj.contains(jarName))
  }

  override def viewTransformationStatus = ViewTransformationStatus(
    name,
    Some(Map(
      "input" -> job.getConfiguration().get(FileInputFormat.INPUT_DIR),
      "output" -> job.getConfiguration().get(FileOutputFormat.OUTDIR))))

  def configure() {
    // if job jar hasn't been registered, add all mapreduce libjars
    // to distributed cache
    if (job.getConfiguration().get(MRJobConfig.JAR) == null) {
      fileResourcesToChecksum.foreach(r => {
        try {
          job.addCacheFile(new URI(r))
        } catch {
          case _: Throwable => Unit
        }
      })
    }

    configuration.foreach {
      case (k, v) => if (v == null) job.getConfiguration.unset(k) else job.getConfiguration.set(k, v.toString)
    }
  }
}
Example 6
Source File: MapreduceDriverTest.scala From schedoscope with Apache License 2.0
package org.schedoscope.scheduler.driver

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.scalatest.{FlatSpec, Matchers}
import org.schedoscope.dsl.View
import org.schedoscope.dsl.transformations.{FailingMapper, MapreduceTransformation}
import org.schedoscope.test.resources.LocalTestResources
import org.schedoscope.test.resources.TestDriverRunCompletionHandlerCallCounter._

class MapreduceDriverTest extends FlatSpec with Matchers with TestFolder {
  lazy val driver = new LocalTestResources().driverFor[MapreduceTransformation]("mapreduce")

  def invalidJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => Job.getInstance

  def failingJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => {
    writeData()
    val job = Job.getInstance
    job.setMapperClass(classOf[FailingMapper])
    FileInputFormat.setInputPaths(job, new Path(inputPath("")))
    FileOutputFormat.setOutputPath(job, new Path(outputPath(System.nanoTime.toString)))
    job
  }

  def identityJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => {
    writeData()
    val job = Job.getInstance
    FileInputFormat.setInputPaths(job, new Path(inputPath("")))
    FileOutputFormat.setOutputPath(job, new Path(outputPath(System.nanoTime.toString)))
    job
  }

  case class DummyView() extends View

  def writeData() {
    Files.write(Paths.get(s"${inputPath("")}/file.txt"), "some data".getBytes(StandardCharsets.UTF_8))
  }

  "MapreduceDriver" should "have transformation name Mapreduce" in {
    driver.transformationName shouldBe "mapreduce"
  }

  it should "execute Mapreduce transformations synchronously" in {
    val driverRunState = driver.runAndWait(MapreduceTransformation(new DummyView(), identityJob))
    driverRunState shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute another Mapreduce transformations synchronously" in {
    val driverRunState = driver.runAndWait(MapreduceTransformation(new DummyView(), identityJob))
    driverRunState shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute Mapreduce transformations asynchronously" in {
    val driverRunHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))
    var runWasAsynchronous = false
    while (driver.getDriverRunState(driverRunHandle).isInstanceOf[DriverRunOngoing[_]])
      runWasAsynchronous = true
    runWasAsynchronous shouldBe true
    driver.getDriverRunState(driverRunHandle) shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute Mapreduce transformations and return errors when running asynchronously" in {
    val driverRunHandle = driver.run(MapreduceTransformation(new DummyView(), failingJob))
    var runWasAsynchronous = false
    while (driver.getDriverRunState(driverRunHandle).isInstanceOf[DriverRunOngoing[_]])
      runWasAsynchronous = true
    // runWasAsynchronous shouldBe true FIXME: isn't asynchronous, why?
    driver.getDriverRunState(driverRunHandle) shouldBe a[DriverRunFailed[_]]
  }

  it should "call its DriverRunCompletitionHandlers' driverRunCompleted upon request" in {
    val runHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))
    while (driver.getDriverRunState(runHandle).isInstanceOf[DriverRunOngoing[_]]) {}
    driver.driverRunCompleted(runHandle)
    driverRunCompletedCalled(runHandle, driver.getDriverRunState(runHandle)) shouldBe true
  }

  it should "call its DriverRunCompletitionHandlers' driverRunStarted upon request" in {
    val runHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))
    driver.driverRunStarted(runHandle)
    driverRunStartedCalled(runHandle) shouldBe true
  }
}
Example 7
Source File: NodesWithGeohash.scala From schedoscope with Apache License 2.0
package schedoscope.example.osm.processed

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, LazyOutputFormat, TextOutputFormat}
import org.schedoscope.dsl.View
import org.schedoscope.dsl.storageformats.TextFile
import org.schedoscope.dsl.transformations.MapreduceTransformation
import schedoscope.example.osm.mapreduce.GeohashMapper

case class NodesWithGeohash() extends View {
  val id = fieldOf[Long]("The node ID")
  val version = fieldOf[Int]("OSM version - ignored")
  val userId = fieldOf[Int]("OSM user ID - ignored")
  val tstamp = fieldOf[String]("Timestamp of node creation")
  val longitude = fieldOf[Double]("Longitude of the node")
  val latitude = fieldOf[Double]("Latitude of the node")
  val geohash = fieldOf[String]("A geoencoded area string")

  val stageNodes = dependsOn { () =>
    schedoscope.example.osm.stage.Nodes()
      .affects(n => Seq(
        n.id -> id,
        n.version -> version,
        n.userId -> userId,
        n.tstamp -> tstamp,
        n.longitude -> longitude,
        n.longitude -> geohash,
        n.latitude -> latitude,
        n.latitude -> geohash
      ))
  }

  transformVia(() =>
    MapreduceTransformation(
      this,
      (conf: Map[String, Any]) => {
        val job = Job.getInstance
        LazyOutputFormat.setOutputFormatClass(job, classOf[TextOutputFormat[Text, NullWritable]])
        job.setJobName(this.urlPath)
        job.setJarByClass(classOf[GeohashMapper])
        job.setMapperClass(classOf[GeohashMapper])
        job.setNumReduceTasks(0)
        FileInputFormat.setInputPaths(job, conf.get("input_path").get.toString)
        FileOutputFormat.setOutputPath(job, new Path(conf.get("output_path").get.toString))
        val cfg = job.getConfiguration()
        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
          cfg.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"))
        }
        job
      }).configureWith(
      Map(
        "input_path" -> stageNodes().fullPath,
        "output_path" -> fullPath)))

  comment("nodes, extended with geohash")

  storedAs(TextFile(fieldTerminator = "\\t", lineTerminator = "\\n"))
}
Example 8
Source File: DBInputFormat.scala From magellan with Apache License 2.0
package magellan.mapreduce

import java.util

import scala.collection.JavaConversions.seqAsJavaList

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.MapWritable
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, TaskAttemptContext}

import magellan.io.ShapeKey

private[magellan] class DBInputFormat extends FileInputFormat[ShapeKey, MapWritable] {

  override def createRecordReader(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext) = {
    new DBReader
  }

  override def isSplitable(context: JobContext, filename: Path): Boolean = false

  override def getSplits(job: JobContext): util.List[InputSplit] = {
    try {
      super.getSplits(job)
    } catch {
      case e: Exception => seqAsJavaList(List[InputSplit]())
    }
  }
}
Example 9
Source File: HadoopUtils.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.image

import scala.language.existentials
import scala.util.Random

import org.apache.commons.io.FilenameUtils
import org.apache.hadoop.conf.{Configuration, Configured}
import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

import org.apache.spark.sql.SparkSession

private object RecursiveFlag {

  def withPathFilter[T](
      sampleRatio: Double,
      spark: SparkSession,
      seed: Long)(f: => T): T = {
    val sampleImages = sampleRatio < 1
    if (sampleImages) {
      val flagName = FileInputFormat.PATHFILTER_CLASS
      val hadoopConf = spark.sparkContext.hadoopConfiguration
      val old = Option(hadoopConf.getClass(flagName, null))
      hadoopConf.setDouble(SamplePathFilter.ratioParam, sampleRatio)
      hadoopConf.setLong(SamplePathFilter.seedParam, seed)
      hadoopConf.setClass(flagName, classOf[SamplePathFilter], classOf[PathFilter])
      try f finally {
        hadoopConf.unset(SamplePathFilter.ratioParam)
        hadoopConf.unset(SamplePathFilter.seedParam)
        old match {
          case Some(v) => hadoopConf.setClass(flagName, v, classOf[PathFilter])
          case None => hadoopConf.unset(flagName)
        }
      }
    } else {
      f
    }
  }
}
Example 10
Source File: WholeTextFileRDD.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat

private[spark] class WholeTextFileRDD(
    sc: SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val conf = getConf
    // setMinPartitions below will call FileInputFormat.listStatus(), which can be quite slow when
    // traversing a large number of directories and files. Parallelize it.
    conf.setIfUnset(FileInputFormat.LIST_STATUS_NUM_THREADS,
      Runtime.getRuntime.availableProcessors().toString)
    val inputFormat = inputFormatClass.newInstance
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
Example 11
Source File: BinaryFileRDD.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.StreamFileInputFormat

private[spark] class BinaryFileRDD[T](
    @transient private val sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val conf = getConf
    // setMinPartitions below will call FileInputFormat.listStatus(), which can be quite slow when
    // traversing a large number of directories and files. Parallelize it.
    conf.setIfUnset(FileInputFormat.LIST_STATUS_NUM_THREADS,
      Runtime.getRuntime.availableProcessors().toString)
    val inputFormat = inputFormatClass.newInstance
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(sc, jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
Example 12
Source File: HadoopUtils.scala From spark-images with Apache License 2.0
package org.apache.spark.image

import java.nio.file.Paths

import org.apache.commons.io.FilenameUtils

import scala.sys.process._

import org.apache.hadoop.conf.{Configuration, Configured}
import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

import org.apache.spark.sql.SparkSession

import scala.language.existentials
import scala.util.Random

object RecursiveFlag {

  def setPathFilter(value: Option[Class[_]], sampleRatio: Option[Double] = None, spark: SparkSession)
    : Option[Class[_]] = {
    val flagName = FileInputFormat.PATHFILTER_CLASS
    val hadoopConf = spark.sparkContext.hadoopConfiguration
    val old = Option(hadoopConf.getClass(flagName, null))
    if (sampleRatio.isDefined) {
      hadoopConf.setDouble(SamplePathFilter.ratioParam, sampleRatio.get)
    } else {
      hadoopConf.unset(SamplePathFilter.ratioParam)
      None
    }

    value match {
      case Some(v) => hadoopConf.setClass(flagName, v, classOf[PathFilter])
      case None => hadoopConf.unset(flagName)
    }
    old
  }
}
Example 13
Source File: GcsConnectorUtil.scala From scio with Apache License 2.0
package com.spotify.scio.parquet

import com.spotify.scio.ScioContext
import com.spotify.scio.util.ScioUtil
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

private[parquet] object GcsConnectorUtil {

  def setCredentials(job: Job): Unit =
    // These are needed since `FileInputFormat.setInputPaths` validates paths locally and
    // requires the user's GCP credentials.
    sys.env.get("GOOGLE_APPLICATION_CREDENTIALS") match {
      case Some(json) =>
        job.getConfiguration
          .set("fs.gs.auth.service.account.json.keyfile", json)
      case None =>
        // Client id/secret of Google-managed project associated with the Cloud SDK
        job.getConfiguration
          .setBoolean("fs.gs.auth.service.account.enable", false)
        job.getConfiguration.set("fs.gs.auth.client.id", "32555940559.apps.googleusercontent.com")
        job.getConfiguration
          .set("fs.gs.auth.client.secret", "ZmssLNjJy2998hD4CTg2ejr2")
    }

  def unsetCredentials(job: Job): Unit = {
    job.getConfiguration.unset("fs.gs.auth.service.account.json.keyfile")
    job.getConfiguration.unset("fs.gs.auth.service.account.enable")
    job.getConfiguration.unset("fs.gs.auth.client.id")
    job.getConfiguration.unset("fs.gs.auth.client.secret")
  }

  def setInputPaths(sc: ScioContext, job: Job, path: String): Unit = {
    // This is needed since `FileInputFormat.setInputPaths` validates paths locally and requires
    // the user's GCP credentials.
    GcsConnectorUtil.setCredentials(job)

    FileInputFormat.setInputPaths(job, path)

    // It will interfere with credentials in Dataflow workers
    if (!ScioUtil.isLocalRunner(sc.options.getRunner)) {
      GcsConnectorUtil.unsetCredentials(job)
    }
  }
}
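As a rough idea of how the helper above is driven (note that the object is private to the parquet package, so this sketch only applies inside it): the ScioContext, the gs:// glob, and the function name below are assumptions for illustration.

import com.spotify.scio.ScioContext
import org.apache.hadoop.mapreduce.Job

// Hypothetical call site: sets GCS credentials, registers the input glob, then
// removes the credentials again for non-local runners.
def configureInput(sc: ScioContext): Job = {
  val job = Job.getInstance()
  GcsConnectorUtil.setInputPaths(sc, job, "gs://my-bucket/records/*.parquet")
  job
}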
Example 14
Source File: CarbonCountStar.scala From carbondata with Apache License 2.0
package org.apache.spark.sql

import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.optimizer.CarbonFilters
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier
import org.apache.carbondata.core.metadata.schema.table.CarbonTable
import org.apache.carbondata.core.mutate.CarbonUpdateUtil
import org.apache.carbondata.core.statusmanager.StageInputCollector
import org.apache.carbondata.core.util.{CarbonProperties, ThreadLocalSessionInfo}
import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}
import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil
import org.apache.carbondata.spark.load.DataLoadProcessBuilderOnSpark

case class CarbonCountStar(
    attributesRaw: Seq[Attribute],
    carbonTable: CarbonTable,
    sparkSession: SparkSession,
    outUnsafeRows: Boolean = true) extends LeafExecNode {

  override def doExecute(): RDD[InternalRow] = {
    ThreadLocalSessionInfo
      .setConfigurationToCurrentThread(sparkSession.sessionState.newHadoopConf())
    val absoluteTableIdentifier = carbonTable.getAbsoluteTableIdentifier
    val (job, tableInputFormat) = createCarbonInputFormat(absoluteTableIdentifier)
    CarbonInputFormat.setQuerySegment(job.getConfiguration, carbonTable)

    // get row count
    var rowCount = CarbonUpdateUtil.getRowCount(
      tableInputFormat.getBlockRowCount(
        job,
        carbonTable,
        CarbonFilters.getPartitions(
          Seq.empty,
          sparkSession,
          TableIdentifier(
            carbonTable.getTableName,
            Some(carbonTable.getDatabaseName))).map(_.asJava).orNull, false),
      carbonTable)

    if (CarbonProperties.isQueryStageInputEnabled) {
      // check for number of row for stage input
      val splits = StageInputCollector.createInputSplits(carbonTable, job.getConfiguration)
      if (!splits.isEmpty) {
        val df = DataLoadProcessBuilderOnSpark.createInputDataFrame(
          sparkSession, carbonTable, splits.asScala)
        rowCount += df.count()
      }
    }

    val valueRaw =
      attributesRaw.head.dataType match {
        case StringType => Seq(UTF8String.fromString(Long.box(rowCount).toString)).toArray
          .asInstanceOf[Array[Any]]
        case _ => Seq(Long.box(rowCount)).toArray.asInstanceOf[Array[Any]]
      }
    val value = new GenericInternalRow(valueRaw)
    val unsafeProjection = UnsafeProjection.create(output.map(_.dataType).toArray)
    val row = if (outUnsafeRows) unsafeProjection(value) else value
    sparkContext.parallelize(Seq(row))
  }

  override def output: Seq[Attribute] = {
    attributesRaw
  }

  private def createCarbonInputFormat(absoluteTableIdentifier: AbsoluteTableIdentifier
  ): (Job, CarbonTableInputFormat[Array[Object]]) = {
    val carbonInputFormat = new CarbonTableInputFormat[Array[Object]]()
    val jobConf: JobConf = new JobConf(FileFactory.getConfiguration)
    SparkHadoopUtil.get.addCredentials(jobConf)
    CarbonInputFormat.setTableInfo(jobConf, carbonTable.getTableInfo)
    val job = new Job(jobConf)
    FileInputFormat.addInputPath(job, new Path(absoluteTableIdentifier.getTablePath))
    CarbonInputFormat
      .setTransactionalTable(job.getConfiguration, carbonTable.getTableInfo.isTransactionalTable)
    CarbonInputFormatUtil.setIndexJobIfConfigured(job.getConfiguration)
    (job, carbonInputFormat)
  }
}