org.apache.hadoop.mapreduce.Job Scala Examples
The following examples show how to use org.apache.hadoop.mapreduce.Job.
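All of the snippets below revolve around the same basic lifecycle: obtain a Job (usually via Job.getInstance), configure input/output formats and paths through the Configuration it wraps, and either submit it or hand its configuration to another framework. As a point of reference, here is a minimal, self-contained sketch of that lifecycle; the object name and the paths are illustrative placeholders, not taken from any of the projects below.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

object MinimalJobSketch {
  def main(args: Array[String]): Unit = {
    // Most examples below start exactly like this: wrap a Configuration in a Job.
    val job = Job.getInstance(new Configuration(), "minimal-job-sketch")
    // With no mapper or reducer configured, Hadoop falls back to the identity mapper.
    job.setNumReduceTasks(0)
    // Input and output locations are placeholders.
    FileInputFormat.setInputPaths(job, new Path("/tmp/job-sketch/input"))
    FileOutputFormat.setOutputPath(job, new Path("/tmp/job-sketch/output"))
    // Submit and block until the job finishes; `true` enables progress logging.
    val ok = job.waitForCompletion(true)
    println(s"Job succeeded: $ok")
  }
}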
Example 1
Source File: TextFileFormat.scala From drizzle-spark with Apache License 2.0 | 12 votes |
package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration

  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
}
Example 2
Source File: HBaseUtils.scala From bigdata-examples with Apache License 2.0 | 5 votes |
package com.timeyang.common.util

import com.timeyang.common.config.BaseConf
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
import org.apache.hadoop.hbase.util.Base64
import org.apache.hadoop.mapreduce.Job

  def createHbaseOutputJob(tableName: String): Job = {
    val conf = HBaseUtils.newConf()
    conf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
    val job = Job.getInstance(conf)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setOutputValueClass(classOf[Put])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    job
  }
}
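The Job returned by createHbaseOutputJob is typically consumed by handing its configuration to Spark's saveAsNewAPIHadoopDataset. The snippet above does not show that step, so here is a hypothetical usage sketch; the table name, column family, and qualifier are placeholders, and Put.addColumn assumes HBase 1.x or later.

import com.timeyang.common.util.HBaseUtils
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD

object HBaseWriteSketch {
  // Hypothetical helper: writes (rowKey, value) pairs into the table targeted by the Job above.
  def writePairs(rdd: RDD[(String, String)], tableName: String): Unit = {
    val job = HBaseUtils.createHbaseOutputJob(tableName)
    val puts = rdd.map { case (rowKey, value) =>
      val put = new Put(Bytes.toBytes(rowKey))
      // Column family "cf" and qualifier "col" are illustrative assumptions.
      put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("col"), Bytes.toBytes(value))
      (new ImmutableBytesWritable(), put)
    }
    puts.saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
}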
Example 3
Source File: CommitFailureTestSource.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.TaskContext
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

class CommitFailureTestSource extends SimpleTextSource {
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    new OutputWriterFactory {
      override def newInstance(
          stagingDir: String,
          fileNamePrefix: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SimpleTextOutputWriter(stagingDir, fileNamePrefix, context) {
          var failed = false
          TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) =>
            failed = true
            SimpleTextRelation.callbackCalled = true
          }

          override val path: String = new Path(stagingDir, fileNamePrefix).toString

          override def write(row: Row): Unit = {
            if (SimpleTextRelation.failWriter) {
              sys.error("Intentional task writer failure for testing purpose.")
            }
            super.write(row)
          }

          override def close(): Unit = {
            super.close()
            sys.error("Intentional task commitment failure for testing purpose.")
          }
        }
      }
    }

  override def shortName(): String = "commit-failure-test"
}
Example 4
Source File: GcsConnectorUtil.scala From scio with Apache License 2.0 | 5 votes |
package com.spotify.scio.parquet

import com.spotify.scio.ScioContext
import com.spotify.scio.util.ScioUtil
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

private[parquet] object GcsConnectorUtil {
  def setCredentials(job: Job): Unit =
    // These are needed since `FileInputFormat.setInputPaths` validates paths locally and
    // requires the user's GCP credentials.
    sys.env.get("GOOGLE_APPLICATION_CREDENTIALS") match {
      case Some(json) =>
        job.getConfiguration
          .set("fs.gs.auth.service.account.json.keyfile", json)
      case None =>
        // Client id/secret of Google-managed project associated with the Cloud SDK
        job.getConfiguration
          .setBoolean("fs.gs.auth.service.account.enable", false)
        job.getConfiguration.set("fs.gs.auth.client.id", "32555940559.apps.googleusercontent.com")
        job.getConfiguration
          .set("fs.gs.auth.client.secret", "ZmssLNjJy2998hD4CTg2ejr2")
    }

  def unsetCredentials(job: Job): Unit = {
    job.getConfiguration.unset("fs.gs.auth.service.account.json.keyfile")
    job.getConfiguration.unset("fs.gs.auth.service.account.enable")
    job.getConfiguration.unset("fs.gs.auth.client.id")
    job.getConfiguration.unset("fs.gs.auth.client.secret")
  }

  def setInputPaths(sc: ScioContext, job: Job, path: String): Unit = {
    // This is needed since `FileInputFormat.setInputPaths` validates paths locally and requires
    // the user's GCP credentials.
    GcsConnectorUtil.setCredentials(job)
    FileInputFormat.setInputPaths(job, path)
    // It will interfere with credentials in Dataflow workers
    if (!ScioUtil.isLocalRunner(sc.options.getRunner)) {
      GcsConnectorUtil.unsetCredentials(job)
    }
  }
}
Example 5
Source File: LasRelation.scala From spark-iqmulus with Apache License 2.0 | 5 votes |
package fr.ign.spark.iqmulus.las

import fr.ign.spark.iqmulus.{ BinarySectionRelation, BinarySection }
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.OutputWriterFactory
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.sql.types._
import scala.util.{ Try, Success, Failure }

class LasRelation(
  override val paths: Array[String],
  override val maybeDataSchema: Option[StructType],
  override val userDefinedPartitionColumns: Option[StructType],
  parameters: Map[String, String]
)(@transient val sqlContext: SQLContext)
    extends BinarySectionRelation(parameters) {

  def format = parameters.get("lasformat").map(_.toByte)

  def minor = parameters.get("minor").map(_.toByte).getOrElse(Version.minorDefault)

  def major = parameters.get("major").map(_.toByte).getOrElse(Version.majorDefault)

  def version = parameters.get("version").map(Version.fromString)
    .getOrElse(Version(major, minor))

  lazy val headers: Array[LasHeader] = paths flatMap { location =>
    Try {
      val path = new Path(location)
      val fs = FileSystem.get(path.toUri, sqlContext.sparkContext.hadoopConfiguration)
      val dis = fs.open(path)
      try LasHeader.read(location, dis)
      finally {
        dis.close
        fs.close
      }
    } match {
      case Success(h) => Some(h)
      case Failure(e) => logWarning(s"Skipping $location : ${e.getMessage}"); None
    }
  }

  override def sections: Array[BinarySection] = headers.map(_.toBinarySection(paths))

  override def prepareJobForWrite(job: Job): OutputWriterFactory = {
    new LasOutputWriterFactory(format, version)
  }
}
Example 6
Source File: TimelyImplicits.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.timeseries.timely

import io.gzet.utils.spark.accumulo.AccumuloConfig
import org.apache.accumulo.core.client.ClientConfiguration
import org.apache.accumulo.core.client.mapreduce.{AbstractInputFormat, InputFormatBase}
import org.apache.accumulo.core.client.security.tokens.PasswordToken
import org.apache.accumulo.core.data.Range
import org.apache.accumulo.core.security.Authorizations
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.JavaConversions._

object TimelyImplicits {

  implicit class AccumuloReader(sc: SparkContext) {

    def timely(accumuloConfig: AccumuloConfig, rowPrefix: Option[String] = None): RDD[Metric] = {

      val conf = sc.hadoopConfiguration
      val job = Job.getInstance(conf)

      val clientConfig: ClientConfiguration = new ClientConfiguration()
        .withInstance(accumuloConfig.accumuloInstance)
        .withZkHosts(accumuloConfig.zookeeperHosts)

      val authorizations = new Authorizations(List("INTERNAL", "CONFIDENTIAL", "SECRET").map(_.getBytes()))

      AbstractInputFormat.setConnectorInfo(job, accumuloConfig.accumuloUser, new PasswordToken(accumuloConfig.accumuloPassword))
      AbstractInputFormat.setZooKeeperInstance(job, clientConfig)
      AbstractInputFormat.setScanAuthorizations(job, authorizations)
      InputFormatBase.setInputTableName(job, "timely.metrics")

      if(rowPrefix.isDefined) {
        val ranges = List(Range.prefix(rowPrefix.get))
        InputFormatBase.setRanges(job, ranges)
      }

      val rdd = sc.newAPIHadoopRDD(job.getConfiguration,
        classOf[AccumuloTimelyInputFormat],
        classOf[NullWritable],
        classOf[TimelyWritable]
      ) values

      rdd map { timely =>
        val Array(tagK, tagV) = timely.getMetricType.split("=", 2)
        Metric(
          timely.getMetric,
          timely.getTime,
          timely.getMetricValue,
          Map(tagK -> tagV)
        )
      }
    }
  }
}
Example 7
Source File: AccumuloLoader.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.community.accumulo

import org.apache.accumulo.core.client.mapreduce.AccumuloOutputFormat
import org.apache.accumulo.core.client.security.tokens.PasswordToken
import org.apache.accumulo.core.client.{BatchWriterConfig, ClientConfiguration}
import org.apache.accumulo.core.data.Mutation
import org.apache.accumulo.core.security.ColumnVisibility
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

class AccumuloLoader(config: AccumuloConfig) extends Serializable {

  def persist(sc: SparkContext, accumuloTable: String, rdd: RDD[(String, String)], blacklist: Set[String] = Set()) = {

    val conf = sc.hadoopConfiguration
    val job = Job.getInstance(conf)

    val clientConfig: ClientConfiguration = new ClientConfiguration()
      .withInstance(config.accumuloInstance)
      .withZkHosts(config.zookeeperHosts)

    AccumuloOutputFormat.setConnectorInfo(job, config.accumuloUser, new PasswordToken(config.accumuloPassword))
    AccumuloOutputFormat.setBatchWriterOptions(job, new BatchWriterConfig)
    AccumuloOutputFormat.setZooKeeperInstance(job, clientConfig)
    AccumuloOutputFormat.setCreateTables(job, true)

    val bList = sc.broadcast(blacklist)

    val mutationRdd = rdd.map({ case (from, to) =>
      val visibility = {
        if(bList.value.contains(from) || bList.value.contains(to)){
          new ColumnVisibility(AccumuloAuthorization.BLACKLIST)
        } else {
          new ColumnVisibility("")
        }
      }
      val mutation = new Mutation(from)
      mutation.put("associated", to, visibility, "1")
      (new Text(accumuloTable), mutation)
    })

    mutationRdd.saveAsNewAPIHadoopFile(
      "",
      classOf[Text],
      classOf[Mutation],
      classOf[AccumuloOutputFormat],
      job.getConfiguration
    )
  }
}

object AccumuloAuthorization {
  final val BLACKLIST = "BLACKLIST"
}
Example 8
Source File: AccumuloReader.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.community.accumulo

import org.apache.accumulo.core.client.{IteratorSetting, ClientConfiguration}
import org.apache.accumulo.core.client.mapreduce.{AccumuloInputFormat, AbstractInputFormat, InputFormatBase}
import org.apache.accumulo.core.client.security.tokens.PasswordToken
import org.apache.accumulo.core.security.Authorizations
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.language.postfixOps

class AccumuloReader(config: AccumuloConfig) extends Serializable {

  def read(sc: SparkContext, accumuloTable: String, authorization: Option[String] = None): RDD[EdgeWritable] = {

    val conf = sc.hadoopConfiguration
    val job = Job.getInstance(conf)

    val clientConfig: ClientConfiguration = new ClientConfiguration()
      .withInstance(config.accumuloInstance)
      .withZkHosts(config.zookeeperHosts)

    AbstractInputFormat.setConnectorInfo(job, config.accumuloUser, new PasswordToken(config.accumuloPassword))
    AbstractInputFormat.setZooKeeperInstance(job, clientConfig)
    if(authorization.isDefined)
      AbstractInputFormat.setScanAuthorizations(job, new Authorizations(authorization.get))

    val is = new IteratorSetting(
      1,
      "summingCombiner",
      "org.apache.accumulo.core.iterators.user.SummingCombiner"
    )

    is.addOption("all", "")
    is.addOption("columns", "associated")
    is.addOption("lossy", "TRUE")
    is.addOption("type", "STRING")

    InputFormatBase.addIterator(job, is)
    InputFormatBase.setInputTableName(job, accumuloTable)

    sc.newAPIHadoopRDD(job.getConfiguration,
      classOf[AccumuloGraphxInputFormat],
      classOf[NullWritable],
      classOf[EdgeWritable]
    ) values
  }
}
Example 9
Source File: SageMakerProtobufFileFormat.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.types.StructType

class SageMakerProtobufFileFormat extends FileFormat with DataSourceRegister {

  override def inferSchema(sparkSession: SparkSession,
                           options: Map[String, String],
                           files: Seq[FileStatus]): Option[StructType] = {
    Option.empty
  }

  override def shortName(): String = "sagemaker"

  override def toString: String = "SageMaker"

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {

    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SageMakerProtobufWriter(path, context, dataSchema, options)
      }

      override def getFileExtension(context: TaskAttemptContext): String = {
        ".pbr"
      }
    }
  }
}
Example 10
Source File: CommitFailureTestSource.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.TaskContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

class CommitFailureTestSource extends SimpleTextSource {
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SimpleTextOutputWriter(path, dataSchema, context) {
          var failed = false
          TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) =>
            failed = true
            SimpleTextRelation.callbackCalled = true
          }

          override def write(row: InternalRow): Unit = {
            if (SimpleTextRelation.failWriter) {
              sys.error("Intentional task writer failure for testing purpose.")
            }
            super.write(row)
          }

          override def close(): Unit = {
            super.close()
            sys.error("Intentional task commitment failure for testing purpose.")
          }
        }
      }

      override def getFileExtension(context: TaskAttemptContext): String = ""
    }

  override def shortName(): String = "commit-failure-test"
}
Example 11
Source File: CassandraTest.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import java.nio.ByteBuffer
import java.util.SortedMap

import scala.collection.JavaConversions._

import org.apache.cassandra.db.IColumn
import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat
import org.apache.cassandra.hadoop.ConfigHelper
import org.apache.cassandra.hadoop.ColumnFamilyInputFormat
import org.apache.cassandra.thrift._
import org.apache.cassandra.utils.ByteBufferUtil
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.{SparkConf, SparkContext}

object CassandraTest {

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("casDemo")
    // Get a SparkContext
    val sc = new SparkContext(sparkConf)

    // Build the job configuration with ConfigHelper provided by Cassandra
    val job = new Job()
    job.setInputFormatClass(classOf[ColumnFamilyInputFormat])

    val host: String = args(1)
    val port: String = args(2)

    ConfigHelper.setInputInitialAddress(job.getConfiguration(), host)
    ConfigHelper.setInputRpcPort(job.getConfiguration(), port)
    ConfigHelper.setOutputInitialAddress(job.getConfiguration(), host)
    ConfigHelper.setOutputRpcPort(job.getConfiguration(), port)
    ConfigHelper.setInputColumnFamily(job.getConfiguration(), "casDemo", "Words")
    ConfigHelper.setOutputColumnFamily(job.getConfiguration(), "casDemo", "WordCount")

    sc.stop()
  }
}
// scalastyle:on println
Example 12
Source File: CommitFailureTestSource.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.TaskContext
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

class CommitFailureTestSource extends SimpleTextSource {
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SimpleTextOutputWriter(path, context) {
          var failed = false
          TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) =>
            failed = true
            SimpleTextRelation.callbackCalled = true
          }

          override def write(row: Row): Unit = {
            if (SimpleTextRelation.failWriter) {
              sys.error("Intentional task writer failure for testing purpose.")
            }
            super.write(row)
          }

          override def close(): Unit = {
            super.close()
            sys.error("Intentional task commitment failure for testing purpose.")
          }
        }
      }

      override def getFileExtension(context: TaskAttemptContext): String = ""
    }

  override def shortName(): String = "commit-failure-test"
}
Example 13
Source File: DistCpWrapper.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.util

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.tools.{DistCp, DistCpOptions}

import scala.collection.JavaConversions._

class DistCpWrapper(conf: Configuration, sources: Seq[Path], target: Path) {

  private val baseOptions = new DistCpOptions(sources, target)

  def run(mapsNum: Int = 10, atomic: Boolean = false, overwrite: Boolean = false): Job = {
    val options = new DistCpOptions(baseOptions)
    options.setAppend(false)
    options.setBlocking(true)
    options.setSyncFolder(false)
    options.setDeleteMissing(false)
    options.setMaxMaps(mapsNum)
    options.setOverwrite(overwrite)
    options.setAtomicCommit(atomic)

    new DistCp(conf, options).execute()
  }
}

object DistCpWrapper {

  def apply(conf: Configuration, sources: Seq[Path], target: Path): DistCpWrapper = {
    new DistCpWrapper(conf, sources, target)
  }
}
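Because run executes DistCp in blocking mode, the returned Job is already complete when the call returns. A possible invocation of the wrapper above might look like the following sketch; the paths and map count are placeholders, not taken from the m3d-engine project.

import com.adidas.analytics.util.DistCpWrapper
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object DistCpSketch {
  def main(args: Array[String]): Unit = {
    // Source and target paths are placeholders.
    val conf = new Configuration()
    val sources = Seq(new Path("/data/source/partition=2020"))
    val target = new Path("/data/target")
    // run() blocks until the underlying DistCp job finishes and returns the completed Job.
    val job = DistCpWrapper(conf, sources, target).run(mapsNum = 20, overwrite = true)
    println(s"DistCp finished successfully: ${job.isSuccessful}")
  }
}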
Example 14
Source File: FlinkExample.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.examples

import org.apache.flink.api.java.ExecutionEnvironment
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.Job

import org.apache.carbondata.examples.util.ExampleUtils
import org.apache.carbondata.hadoop.CarbonProjection
import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}

// Write carbondata file by spark and read it by flink
// scalastyle:off println
object FlinkExample {

  def main(args: Array[String]): Unit = {
    // write carbondata file by spark
    val cc = ExampleUtils.createCarbonSession("FlinkExample")
    val path = ExampleUtils.writeSampleCarbonFile(cc, "carbon1")

    // read two columns by flink
    val projection = new CarbonProjection
    projection.addColumn("c1")  // column c1
    projection.addColumn("c3")  // column c3

    val conf = new Configuration()
    CarbonInputFormat.setColumnProjection(conf, projection)

    val env = ExecutionEnvironment.getExecutionEnvironment
    val ds = env.readHadoopFile(
      new CarbonTableInputFormat[Array[Object]],
      classOf[Void],
      classOf[Array[Object]],
      path,
      new Job(conf)
    )

    // print result
    val result = ds.collect()
    for (i <- 0 until result.size()) {
      println(result.get(i).f1.mkString(","))
    }

    // delete carbondata file
    ExampleUtils.cleanSampleCarbonFile(cc, "carbon1")
  }
}
// scalastyle:on println
Example 15
Source File: CommitFailureTestSource.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.TaskContext
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

class CommitFailureTestSource extends SimpleTextSource {
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SimpleTextOutputWriter(path, context) {
          var failed = false
          TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) =>
            failed = true
            SimpleTextRelation.callbackCalled = true
          }

          override def write(row: Row): Unit = {
            if (SimpleTextRelation.failWriter) {
              sys.error("Intentional task writer failure for testing purpose.")
            }
            super.write(row)
          }

          override def close(): Unit = {
            super.close()
            sys.error("Intentional task commitment failure for testing purpose.")
          }
        }
      }

      override def getFileExtension(context: TaskAttemptContext): String = ""
    }

  override def shortName(): String = "commit-failure-test"
}
Example 16
Source File: ArrowFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.oap.spark.sql.execution.datasources.arrow

import scala.collection.JavaConverters._

import com.intel.oap.spark.sql.execution.datasources.arrow.ArrowFileFormat.UnsafeItr
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowFilters, ArrowOptions}
import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowSQLConf._
import org.apache.arrow.dataset.scanner.ScanOptions
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile}
import org.apache.spark.sql.execution.datasources.v2.arrow.ArrowUtils
import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

class ArrowFileFormat extends FileFormat with DataSourceRegister with Serializable {

  val batchSize = 4096

  def convert(files: Seq[FileStatus], options: Map[String, String]): Option[StructType] = {
    ArrowUtils.readSchema(files, new CaseInsensitiveStringMap(options.asJava))
  }

  override def inferSchema(
      sparkSession: SparkSession,
      options: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = {
    convert(files, options)
  }

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {
    throw new UnsupportedOperationException("Write is not supported for Arrow source")
  }

  override def supportBatch(sparkSession: SparkSession, dataSchema: StructType): Boolean = true

  override def buildReaderWithPartitionValues(sparkSession: SparkSession,
      dataSchema: StructType,
      partitionSchema: StructType,
      requiredSchema: StructType,
      filters: Seq[Filter],
      options: Map[String, String],
      hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
    (file: PartitionedFile) => {
      val sqlConf = sparkSession.sessionState.conf
      val enableFilterPushDown = sqlConf.arrowFilterPushDown
      val factory = ArrowUtils.makeArrowDiscovery(
        file.filePath, new ArrowOptions(
          new CaseInsensitiveStringMap(
            options.asJava).asScala.toMap))

      // todo predicate validation / pushdown
      val dataset = factory.finish()

      val filter = if (enableFilterPushDown) {
        ArrowFilters.translateFilters(filters)
      } else {
        org.apache.arrow.dataset.filter.Filter.EMPTY
      }

      val scanOptions = new ScanOptions(requiredSchema.map(f => f.name).toArray,
        filter, batchSize)
      val scanner = dataset.newScan(scanOptions)

      val itrList = scanner
        .scan()
        .iterator()
        .asScala
        .map(task => task.scan())
        .toList

      val itr = itrList
        .toIterator
        .flatMap(itr => itr.asScala)
        .map(vsr => ArrowUtils.loadVsr(vsr, file.partitionValues, partitionSchema, dataSchema))
      new UnsafeItr(itr).asInstanceOf[Iterator[InternalRow]]
    }
  }

  override def shortName(): String = "arrow"
}

object ArrowFileFormat {
  class UnsafeItr[T](delegate: Iterator[T]) extends Iterator[T] {
    override def hasNext: Boolean = delegate.hasNext

    override def next(): T = delegate.next()
  }
}
Example 17
Source File: OapIndexCommitProtocolSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.MRJobConfig
import org.apache.hadoop.mapreduce.TaskAttemptID
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.task.JobContextImpl
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

import org.apache.spark.sql.test.oap.SharedOapContext
import org.apache.spark.util.Utils

class OapIndexCommitProtocolSuite extends SharedOapContext {

  test("newTaskTempFile") {
    val attempt = "attempt_200707121733_0001_m_000000_0"
    val taskID = TaskAttemptID.forName(attempt)
    val jobID = taskID.getJobID.toString
    val outDir = Utils.createTempDir().getAbsolutePath
    val job = Job.getInstance()
    FileOutputFormat.setOutputPath(job, new Path(outDir))
    val conf = job.getConfiguration()
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt)
    val jobContext = new JobContextImpl(conf, taskID.getJobID())
    val taskContext = new TaskAttemptContextImpl(conf, taskID)

    val commitProtocol = new OapIndexCommitProtocol(jobID, outDir)

    // test task temp path
    val pendingDirName = "_temporary_" + jobID
    commitProtocol.setupJob(jobContext)
    commitProtocol.setupTask(taskContext)
    val tempFile = new Path(commitProtocol.newTaskTempFile(taskContext, None, "test"))
    val expectedJobAttemptPath = new Path(new Path(outDir, pendingDirName), "0")
    val expectedTaskWorkPath = new Path(new Path(expectedJobAttemptPath, pendingDirName), attempt)
    assert(tempFile.getParent == expectedTaskWorkPath)
  }
}
Example 18
Source File: ReadOnlyParquetFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.OutputWriterFactory
import org.apache.spark.sql.types.StructType

class ReadOnlyParquetFileFormat extends ParquetFileFormat {

  override def isSplitable(
      sparkSession: SparkSession,
      options: Map[String, String],
      path: Path): Boolean = false

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    throw new UnsupportedOperationException("ReadOnlyParquetFileFormat not support write operation")
}
Example 19
Source File: OapIndexFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.FileStatus
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
import org.apache.parquet.hadoop.util.ContextUtil

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory}
import org.apache.spark.sql.execution.datasources.oap.OapFileFormat
import org.apache.spark.sql.types.StructType

private[index] class OapIndexFileFormat extends FileFormat with Logging with Serializable {

  override def inferSchema(
      sparkSession: SparkSession,
      options: Map[String, String],
      files: Seq[FileStatus]): Option[StructType] = None

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory = {

    val configuration = ContextUtil.getConfiguration(job)

    configuration.set(OapIndexFileFormat.ROW_SCHEMA, dataSchema.json)
    configuration.set(OapIndexFileFormat.INDEX_TYPE, options("indexType"))
    configuration.set(OapIndexFileFormat.INDEX_NAME, options("indexName"))
    configuration.set(OapIndexFileFormat.INDEX_TIME, options("indexTime"))
    configuration.set(OapIndexFileFormat.IS_APPEND, options("isAppend"))

    new OutputWriterFactory {
      override def getFileExtension(context: TaskAttemptContext): String =
        OapFileFormat.OAP_INDEX_EXTENSION

      override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext) =
        new OapIndexOutputWriter(path, context)
    }
  }
}

private[index] object OapIndexFileFormat {
  val ROW_SCHEMA: String = "org.apache.spark.sql.oap.row.attributes"
  val INDEX_TYPE: String = "org.apache.spark.sql.oap.index.type"
  val INDEX_NAME: String = "org.apache.spark.sql.oap.index.name"
  val INDEX_TIME: String = "org.apache.spark.sql.oap.index.time"
  val IS_APPEND: String = "org.apache.spark.sql.oap.index.append"
}

case class IndexBuildResult(dataFile: String, rowCount: Long, fingerprint: String, parent: String)
Example 20
Source File: ReadOnlyNativeOrcFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.orc

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.OutputWriterFactory
import org.apache.spark.sql.types.StructType

class ReadOnlyNativeOrcFileFormat extends OrcFileFormat {

  override def isSplitable(
      sparkSession: SparkSession,
      options: Map[String, String],
      path: Path): Boolean = false

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    throw new UnsupportedOperationException(
      "ReadOnlyNativeOrcFileFormat doesn't support write operation")
}
Example 21
Source File: ReadOnlyOrcFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.OutputWriterFactory
import org.apache.spark.sql.types.StructType

class ReadOnlyOrcFileFormat extends OrcFileFormat {

  override def isSplitable(
      sparkSession: SparkSession,
      options: Map[String, String],
      path: Path): Boolean = false

  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    throw new UnsupportedOperationException("ReadOnlyOrcFileFormat not support write operation")
}
Example 22
Source File: NodesWithGeohash.scala From schedoscope with Apache License 2.0 | 5 votes |
package schedoscope.example.osm.processed

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, LazyOutputFormat, TextOutputFormat}
import org.schedoscope.dsl.View
import org.schedoscope.dsl.storageformats.TextFile
import org.schedoscope.dsl.transformations.MapreduceTransformation
import schedoscope.example.osm.mapreduce.GeohashMapper

case class NodesWithGeohash() extends View {

  val id = fieldOf[Long]("The node ID")
  val version = fieldOf[Int]("OSM version - ignored")
  val userId = fieldOf[Int]("OSM user ID - ignored")
  val tstamp = fieldOf[String]("Timestamp of node creation")
  val longitude = fieldOf[Double]("Longitude of the node")
  val latitude = fieldOf[Double]("Latitude of the node")
  val geohash = fieldOf[String]("A geoencoded area string")

  val stageNodes = dependsOn { () =>
    schedoscope.example.osm.stage.Nodes()
      .affects(n => Seq(
        n.id -> id,
        n.version -> version,
        n.userId -> userId,
        n.tstamp -> tstamp,
        n.longitude -> longitude,
        n.longitude -> geohash,
        n.latitude -> latitude,
        n.latitude -> geohash
      ))
  }

  transformVia(() =>
    MapreduceTransformation(
      this,
      (conf: Map[String, Any]) => {
        val job = Job.getInstance
        LazyOutputFormat.setOutputFormatClass(job, classOf[TextOutputFormat[Text, NullWritable]])
        job.setJobName(this.urlPath)
        job.setJarByClass(classOf[GeohashMapper])
        job.setMapperClass(classOf[GeohashMapper])
        job.setNumReduceTasks(0)
        FileInputFormat.setInputPaths(job, conf.get("input_path").get.toString)
        FileOutputFormat.setOutputPath(job, new Path(conf.get("output_path").get.toString))
        val cfg = job.getConfiguration()
        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
          cfg.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"))
        }
        job
      }).configureWith(
      Map(
        "input_path" -> stageNodes().fullPath,
        "output_path" -> fullPath)))

  comment("nodes, extended with geohash")

  storedAs(TextFile(fieldTerminator = "\\t", lineTerminator = "\\n"))
}
Example 23
Source File: MapreduceDriverTest.scala From schedoscope with Apache License 2.0 | 5 votes |
package org.schedoscope.scheduler.driver

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.scalatest.{FlatSpec, Matchers}
import org.schedoscope.dsl.View
import org.schedoscope.dsl.transformations.{FailingMapper, MapreduceTransformation}
import org.schedoscope.test.resources.LocalTestResources
import org.schedoscope.test.resources.TestDriverRunCompletionHandlerCallCounter._

class MapreduceDriverTest extends FlatSpec with Matchers with TestFolder {

  lazy val driver = new LocalTestResources().driverFor[MapreduceTransformation]("mapreduce")

  def invalidJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => Job.getInstance

  def failingJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => {
    writeData()
    val job = Job.getInstance
    job.setMapperClass(classOf[FailingMapper])
    FileInputFormat.setInputPaths(job, new Path(inputPath("")))
    FileOutputFormat.setOutputPath(job, new Path(outputPath(System.nanoTime.toString)))
    job
  }

  def identityJob: (Map[String, Any]) => Job = (m: Map[String, Any]) => {
    writeData()
    val job = Job.getInstance
    FileInputFormat.setInputPaths(job, new Path(inputPath("")))
    FileOutputFormat.setOutputPath(job, new Path(outputPath(System.nanoTime.toString)))
    job
  }

  case class DummyView() extends View

  def writeData() {
    Files.write(Paths.get(s"${inputPath("")}/file.txt"), "some data".getBytes(StandardCharsets.UTF_8))
  }

  "MapreduceDriver" should "have transformation name Mapreduce" in {
    driver.transformationName shouldBe "mapreduce"
  }

  it should "execute Mapreduce transformations synchronously" in {
    val driverRunState = driver.runAndWait(MapreduceTransformation(new DummyView(), identityJob))
    driverRunState shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute another Mapreduce transformations synchronously" in {
    val driverRunState = driver.runAndWait(MapreduceTransformation(new DummyView(), identityJob))
    driverRunState shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute Mapreduce transformations asynchronously" in {
    val driverRunHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))
    var runWasAsynchronous = false
    while (driver.getDriverRunState(driverRunHandle).isInstanceOf[DriverRunOngoing[_]])
      runWasAsynchronous = true
    runWasAsynchronous shouldBe true
    driver.getDriverRunState(driverRunHandle) shouldBe a[DriverRunSucceeded[_]]
  }

  it should "execute Mapreduce transformations and return errors when running asynchronously" in {
    val driverRunHandle = driver.run(MapreduceTransformation(new DummyView(), failingJob))
    var runWasAsynchronous = false
    while (driver.getDriverRunState(driverRunHandle).isInstanceOf[DriverRunOngoing[_]])
      runWasAsynchronous = true
    // runWasAsynchronous shouldBe true FIXME: isn't asynchronous, why?
    driver.getDriverRunState(driverRunHandle) shouldBe a[DriverRunFailed[_]]
  }

  it should "call its DriverRunCompletitionHandlers' driverRunCompleted upon request" in {
    val runHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))
    while (driver.getDriverRunState(runHandle).isInstanceOf[DriverRunOngoing[_]]) {}
    driver.driverRunCompleted(runHandle)
    driverRunCompletedCalled(runHandle, driver.getDriverRunState(runHandle)) shouldBe true
  }

  it should "call its DriverRunCompletitionHandlers' driverRunStarted upon request" in {
    val runHandle = driver.run(MapreduceTransformation(new DummyView(), identityJob))
    driver.driverRunStarted(runHandle)
    driverRunStartedCalled(runHandle) shouldBe true
  }
}
Example 24
Source File: DistCpTransformation.scala From schedoscope with Apache License 2.0 | 5 votes |
package org.schedoscope.dsl.transformations

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.tools.{DistCp, DistCpOptions}
import org.schedoscope.dsl.View
import org.schedoscope.scheduler.driver.{DriverRunState, MapreduceDriver}

import scala.collection.JavaConverters._

object DistCpTransformation {

  def copyToView(sourceView: View, targetView: View): DistCpTransformation = {
    val target = targetView.fullPath.split("/").dropRight(1).mkString("/")
    DistCpTransformation(targetView, List(sourceView.fullPath), target)
  }

  def copyToDirToView(sourcePath: String, targetView: View): DistCpTransformation = {
    val target = targetView.fullPath.split("/").drop(1).mkString("/")
    DistCpTransformation(targetView, List(sourcePath), target)
  }

  def copyToFileToView(sourceFile: String, targetView: View): DistCpTransformation = {
    DistCpTransformation(targetView, List(sourceFile), targetView.fullPath)
  }
}

case class DistCpTransformation(v: View,
                                var sources: List[String],
                                var target: String,
                                deleteViewPath: Boolean = false,
                                config: Configuration = new Configuration())
  extends MapreduceBaseTransformation {

  var directoriesToDelete = if (deleteViewPath) List(v.fullPath) else List()

  override def stringsToChecksum: List[String] = target :: sources

  override def fileResourcesToChecksum = List()

  override val cleanupAfterJob: (Job, MapreduceDriver, DriverRunState[MapreduceBaseTransformation]) => DriverRunState[MapreduceBaseTransformation] = (_, __, completionRunState) => completionRunState

  lazy val job: Job = {
    val distCp = new DistCp(config, distCpOptions)
    val createJob = distCp.getClass.getDeclaredMethod("createJob")
    createJob.setAccessible(true)
    val job = createJob.invoke(distCp).asInstanceOf[Job]
    val prepareFileListing = distCp.getClass.getDeclaredMethod("prepareFileListing", job.getClass)
    prepareFileListing.setAccessible(true)
    prepareFileListing.invoke(distCp, job)
    job
  }

  def distCpOptions: DistCpOptions = if (configuration.nonEmpty) {
    DistCpConfiguration
      .fromConfig(configuration.toMap)
      .toDistCpOptions(sources.map(new Path(_)), new Path(target))
  } else {
    val s = sources.map(new Path(_)).asJava
    new DistCpOptions(s, new Path(target))
  }
}
Example 25
Source File: MapreduceTransformation.scala From schedoscope with Apache License 2.0 | 5 votes |
package org.schedoscope.dsl.transformations

import java.net.URI

import org.apache.commons.lang3.StringUtils
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.{Job, MRJobConfig}
import org.schedoscope.Schedoscope
import org.schedoscope.dsl.View
import org.schedoscope.scheduler.driver.{DriverRunState, MapreduceDriver}
import org.schedoscope.scheduler.service.ViewTransformationStatus

case class MapreduceTransformation(v: View,
                                   createJob: (Map[String, Any]) => Job,
                                   cleanupAfterJob: (Job, MapreduceDriver, DriverRunState[MapreduceBaseTransformation]) => DriverRunState[MapreduceBaseTransformation] = (_, __, completionRunState) => completionRunState,
                                   dirsToDelete: List[String] = List(),
                                   deleteViewPath: Boolean = true) extends MapreduceBaseTransformation {

  lazy val job = createJob(configuration.toMap)

  var directoriesToDelete = dirsToDelete ++ (if (deleteViewPath) List(v.fullPath) else List())

  description = StringUtils.abbreviate(v.urlPath, 100)
}

trait MapreduceBaseTransformation extends Transformation {

  def name = "mapreduce"

  val cleanupAfterJob: (Job, MapreduceDriver, DriverRunState[MapreduceBaseTransformation]) => DriverRunState[MapreduceBaseTransformation]

  val v: View

  val job: Job

  var directoriesToDelete: List[String]

  override def fileResourcesToChecksum = {
    val jarName = try {
      job.getConfiguration().get(MRJobConfig.JAR).split("/").last
    } catch {
      case _: Throwable => null
    }

    Schedoscope.settings
      .getDriverSettings("mapreduce")
      .libJarsHdfs
      .filter(lj => jarName == null || lj.contains(jarName))
  }

  override def viewTransformationStatus = ViewTransformationStatus(
    name,
    Some(Map(
      "input" -> job.getConfiguration().get(FileInputFormat.INPUT_DIR),
      "output" -> job.getConfiguration().get(FileOutputFormat.OUTDIR))))

  def configure() {
    // if job jar hasn't been registered, add all mapreduce libjars
    // to distributed cache
    if (job.getConfiguration().get(MRJobConfig.JAR) == null) {
      fileResourcesToChecksum.foreach(r => {
        try {
          job.addCacheFile(new URI(r))
        } catch {
          case _: Throwable => Unit
        }
      })
    }

    configuration.foreach {
      case (k, v) =>
        if (v == null)
          job.getConfiguration.unset(k)
        else
          job.getConfiguration.set(k, v.toString)
    }
  }
}
Example 26
Source File: CommitFailureTestSource.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.TaskContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

class CommitFailureTestSource extends SimpleTextSource {
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    new OutputWriterFactory {
      override def newInstance(
          path: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SimpleTextOutputWriter(path, dataSchema, context) {
          var failed = false
          TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) =>
            failed = true
            SimpleTextRelation.callbackCalled = true
          }

          override def write(row: InternalRow): Unit = {
            if (SimpleTextRelation.failWriter) {
              sys.error("Intentional task writer failure for testing purpose.")
            }
            super.write(row)
          }

          override def close(): Unit = {
            super.close()
            sys.error("Intentional task commitment failure for testing purpose.")
          }
        }
      }

      override def getFileExtension(context: TaskAttemptContext): String = ""
    }

  override def shortName(): String = "commit-failure-test"
}
Example 27
Source File: ConfOnlyTAC.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.hadoop

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapreduce.{ Counter, TaskAttemptID, Job, TaskAttemptContext }

// This exists just because of a quirk of the record reader api.
case class ConfOnlyTAC(_conf: Configuration) extends Job with TaskAttemptContext {
  // JobContextImpl and JobContext
  override def getConfiguration: Configuration = _conf

  // TaskAttemptContext
  override def getTaskAttemptID: TaskAttemptID = sys.error("not implemented")
  override def setStatus(msg: String): Unit = sys.error("not implemented")
  override def getStatus = sys.error("not implemented")
  override def getProgress: Float = sys.error("not implemented")
  override def getCounter(counterName: Enum[_]): Counter = sys.error("not implemented")
  override def getCounter(groupName: String, counterName: String): Counter = sys.error("not implemented")

  // Progressable
  override def progress(): Unit = sys.error("not implemented")
}
Example 28
Source File: InputFormatConf.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.hadoop

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.hadoop.io.{ LongWritable, Text, Writable }
import org.apache.hadoop.mapreduce.{ InputFormat, InputSplit, Job, RecordReader }
import org.apache.hadoop.mapreduce.lib.input.{ FileInputFormat, FileSplit, TextInputFormat }

import scala.collection.immutable

trait InputFormatConf[K, V] extends Serializable {
  type IF <: InputFormat[K, V]
  type Split <: InputSplit with Writable

  type KExtract <: Extract[K]
  type VExtract <: Extract[V]

  def kExtract: KExtract
  def vExtract: VExtract

  def makeInputFormat(): IF

  // I'm unsure if we should WriSer them for them
  def makeSplits(hadoopConf: Configuration): IndexedSeq[WriSer[Split]]

  // TODO do we want to require typing of the RecordReader as well?
  final def createRecordReader(hadoopConf: Configuration, split: Split,
    inputFormat: IF = makeInputFormat()): RecordReader[K, V] = {
    val tac = ConfOnlyTAC(hadoopConf)
    val recordReader = inputFormat.createRecordReader(split, tac)
    recordReader.initialize(split, tac)
    recordReader
  }
}

case class TextInputFormatConf(file: String, partitions: Int)
  extends InputFormatConf[LongWritable, Text] {
  type IF = TextInputFormat
  type Split = FileSplit

  // TODO now that we figured out what's up, see if we can't eliminate the need for this...
  val internalK = Extract.unit[LongWritable]
  val internalV = Extract.text

  type KExtract = internalK.type
  type VExtract = internalV.type

  override val kExtract: KExtract = internalK
  override val vExtract: VExtract = internalV

  def makeInputFormat() = new TextInputFormat()

  def makeSplits(hadoopConf: Configuration): immutable.IndexedSeq[WriSer[FileSplit]] = {
    val job = Job.getInstance(hadoopConf)
    FileInputFormat.setInputPaths(job, file)
    val path = new Path(file)
    val len = FileSystem.get(hadoopConf).listStatus(path).head.getLen
    val size_per = math.round(len / partitions.toDouble)

    ((0 until partitions - 1).map { p =>
      new FileSplit(path, size_per * p, size_per, null)
    } :+ {
      val fin = size_per * (partitions - 1)
      new FileSplit(path, fin, len - fin, null)
    }).map(WriSer(_))
  }
}

// TODO do we really get much from having this as its own class? consider just making a def csv method in TextInputFormatConf
object CSVInputFormatConf {
  def apply[V](ifc: InputFormatConf[LongWritable, V] { type Split = FileSplit }): InputFormatConf[LongWritable, V] {
    type IF = ifc.IF
    type Split = ifc.Split
    type KExtract = ifc.KExtract
    type VExtract = ifc.VExtract
  } = new InputFormatConf[LongWritable, V] {
    type IF = ifc.IF
    type Split = ifc.Split
    type KExtract = ifc.KExtract
    type VExtract = ifc.VExtract

    override val kExtract: KExtract = ifc.kExtract
    override val vExtract: VExtract = ifc.vExtract

    override def makeInputFormat() = ifc.makeInputFormat()

    override def makeSplits(hadoopConf: Configuration) = {
      val splits = ifc.makeSplits(hadoopConf)
      splits.headOption.fold(IndexedSeq.empty[WriSer[Split]]) {
        case WriSer(head) =>
          val rr = createRecordReader(hadoopConf, head)
          require(rr.nextKeyValue, "csv has no header, first line was empty")
          val afterHeader = rr.getCurrentKey.get
          require(rr.nextKeyValue, "first split is empty")
          WriSer(new FileSplit(head.getPath, afterHeader, head.getLength - afterHeader, null)) +:
            splits.tail
      }
    }
  }
}
Example 29
Source File: CarbonCountStar.scala From carbondata with Apache License 2.0 | 4 votes |
package org.apache.spark.sql

import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.optimizer.CarbonFilters
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier
import org.apache.carbondata.core.metadata.schema.table.CarbonTable
import org.apache.carbondata.core.mutate.CarbonUpdateUtil
import org.apache.carbondata.core.statusmanager.StageInputCollector
import org.apache.carbondata.core.util.{CarbonProperties, ThreadLocalSessionInfo}
import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}
import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil
import org.apache.carbondata.spark.load.DataLoadProcessBuilderOnSpark

case class CarbonCountStar(
    attributesRaw: Seq[Attribute],
    carbonTable: CarbonTable,
    sparkSession: SparkSession,
    outUnsafeRows: Boolean = true) extends LeafExecNode {

  override def doExecute(): RDD[InternalRow] = {
    ThreadLocalSessionInfo
      .setConfigurationToCurrentThread(sparkSession.sessionState.newHadoopConf())
    val absoluteTableIdentifier = carbonTable.getAbsoluteTableIdentifier
    val (job, tableInputFormat) = createCarbonInputFormat(absoluteTableIdentifier)
    CarbonInputFormat.setQuerySegment(job.getConfiguration, carbonTable)

    // get row count
    var rowCount = CarbonUpdateUtil.getRowCount(
      tableInputFormat.getBlockRowCount(
        job,
        carbonTable,
        CarbonFilters.getPartitions(
          Seq.empty,
          sparkSession,
          TableIdentifier(
            carbonTable.getTableName,
            Some(carbonTable.getDatabaseName))).map(_.asJava).orNull, false),
      carbonTable)

    if (CarbonProperties.isQueryStageInputEnabled) {
      // check for number of row for stage input
      val splits = StageInputCollector.createInputSplits(carbonTable, job.getConfiguration)
      if (!splits.isEmpty) {
        val df = DataLoadProcessBuilderOnSpark.createInputDataFrame(
          sparkSession, carbonTable, splits.asScala)
        rowCount += df.count()
      }
    }

    val valueRaw =
      attributesRaw.head.dataType match {
        case StringType => Seq(UTF8String.fromString(Long.box(rowCount).toString)).toArray
          .asInstanceOf[Array[Any]]
        case _ => Seq(Long.box(rowCount)).toArray.asInstanceOf[Array[Any]]
      }
    val value = new GenericInternalRow(valueRaw)
    val unsafeProjection = UnsafeProjection.create(output.map(_.dataType).toArray)
    val row = if (outUnsafeRows) unsafeProjection(value) else value
    sparkContext.parallelize(Seq(row))
  }

  override def output: Seq[Attribute] = {
    attributesRaw
  }

  private def createCarbonInputFormat(absoluteTableIdentifier: AbsoluteTableIdentifier
  ): (Job, CarbonTableInputFormat[Array[Object]]) = {
    val carbonInputFormat = new CarbonTableInputFormat[Array[Object]]()
    val jobConf: JobConf = new JobConf(FileFactory.getConfiguration)
    SparkHadoopUtil.get.addCredentials(jobConf)
    CarbonInputFormat.setTableInfo(jobConf, carbonTable.getTableInfo)
    val job = new Job(jobConf)
    FileInputFormat.addInputPath(job, new Path(absoluteTableIdentifier.getTablePath))
    CarbonInputFormat
      .setTransactionalTable(job.getConfiguration, carbonTable.getTableInfo.isTransactionalTable)
    CarbonInputFormatUtil.setIndexJobIfConfigured(job.getConfiguration)
    (job, carbonInputFormat)
  }
}