org.apache.hadoop.fs.Path Scala Examples
The following examples show how to use org.apache.hadoop.fs.Path.
Each example comes from an open-source project; the source file, project, and license are noted above each snippet.
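Before the examples, here is a minimal, self-contained sketch of the core Path operations most of the snippets below rely on: constructing paths, resolving a child against a parent, and obtaining the owning FileSystem. The directory names are placeholders.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object PathBasics {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    val base = new Path("/tmp/path-demo")           // an absolute path
    val child = new Path(base, "data/part-00000")   // resolve a child against a parent
    val fs: FileSystem = base.getFileSystem(conf)   // the FileSystem that owns this path

    fs.mkdirs(base)
    println(child.getName)             // part-00000
    println(child.getParent)           // /tmp/path-demo/data
    println(fs.makeQualified(child))   // e.g. file:/tmp/path-demo/data/part-00000
  }
}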
Example 1
Source File: TextFileFormat.scala (from drizzle-spark, Apache License 2.0, 12 votes)

package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration

  // (Snippet truncated at the source; this helper sits inside the elided TextFileFormat classes.)
  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
}
Example 2
Source File: MultilayerPerceptronClassifierWrapper.scala (from drizzle-spark, Apache License 2.0, 8 votes)

package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel,
    val labelCount: Long,
    val layers: Array[Int],
    val weights: Array[Double]
  ) extends MLWritable {

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
  }

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val labelCount = (rMetadata \ "labelCount").extract[Long]
      val layers = (rMetadata \ "layers").extract[Array[Int]]
      val weights = (rMetadata \ "weights").extract[Array[Double]]

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline, labelCount, layers, weights)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("labelCount" -> instance.labelCount) ~
        ("layers" -> instance.layers.toSeq) ~
        ("weights" -> instance.weights.toArray.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 3
Source File: DirectOutputCommitter.scala (from spark-snowflake, Apache License 2.0, 6 votes)

package net.snowflake.spark.snowflake

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred._
import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter, FileOutputFormat}

class DirectOutputCommitter extends OutputCommitter {

  override def setupJob(jobContext: JobContext): Unit = {}

  override def setupTask(taskContext: TaskAttemptContext): Unit = {}

  override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = {
    // We return true here to guard against implementations that do not handle false correctly.
    // The meaning of returning false is not entirely clear, so it's possible to be interpreted
    // as an error. Returning true just means that commitTask() will be called, which is a no-op.
    true
  }

  override def commitTask(taskContext: TaskAttemptContext): Unit = {}

  override def abortTask(taskContext: TaskAttemptContext): Unit = {}

  private def shouldCreateSuccessFile(conf: Configuration): Boolean = {
    conf.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)
  }
}
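To experiment with a committer like the one above, one option (a sketch, not taken from the spark-snowflake project; the output path is a placeholder and the configuration key targets the old mapred API used by RDD text output) is to register it on the Hadoop configuration before writing:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("direct-committer-demo").master("local[*]").getOrCreate()
// Assumed wiring: point the old-API output committer setting at DirectOutputCommitter.
spark.sparkContext.hadoopConfiguration.set(
  "mapred.output.committer.class",
  classOf[net.snowflake.spark.snowflake.DirectOutputCommitter].getName)
spark.range(10).rdd.map(_.toString).saveAsTextFile("/tmp/direct-committer-demo")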
Example 4
Source File: RWrappers.scala (from drizzle-spark, Apache License 2.0, 6 votes)

package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods._

import org.apache.spark.SparkException
import org.apache.spark.ml.util.MLReader

private[r] object RWrappers extends MLReader[Object] {

  override def load(path: String): Object = {
    implicit val format = DefaultFormats
    val rMetadataPath = new Path(path, "rMetadata").toString
    val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
    val rMetadata = parse(rMetadataStr)
    val className = (rMetadata \ "class").extract[String]
    className match {
      case "org.apache.spark.ml.r.NaiveBayesWrapper" => NaiveBayesWrapper.load(path)
      case "org.apache.spark.ml.r.AFTSurvivalRegressionWrapper" =>
        AFTSurvivalRegressionWrapper.load(path)
      case "org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper" =>
        GeneralizedLinearRegressionWrapper.load(path)
      case "org.apache.spark.ml.r.KMeansWrapper" =>
        KMeansWrapper.load(path)
      case "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper" =>
        MultilayerPerceptronClassifierWrapper.load(path)
      case "org.apache.spark.ml.r.LDAWrapper" =>
        LDAWrapper.load(path)
      case "org.apache.spark.ml.r.IsotonicRegressionWrapper" =>
        IsotonicRegressionWrapper.load(path)
      case "org.apache.spark.ml.r.GaussianMixtureWrapper" =>
        GaussianMixtureWrapper.load(path)
      case "org.apache.spark.ml.r.ALSWrapper" =>
        ALSWrapper.load(path)
      case "org.apache.spark.ml.r.LogisticRegressionWrapper" =>
        LogisticRegressionWrapper.load(path)
      case _ =>
        throw new SparkException(s"SparkR read.ml does not support load $className")
    }
  }
}
Example 5
Source File: OrcFileOperator.scala (from drizzle-spark, Apache License 2.0, 6 votes)

package org.apache.spark.sql.hive.orc

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader}
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.types.StructType

private[orc] object OrcFileOperator extends Logging {

  def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = {
    def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = {
      reader.getObjectInspector match {
        case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 =>
          logInfo(
            s"ORC file $path has empty schema, it probably contains no rows. " +
              "Trying to read another ORC file to figure out the schema.")
          false
        case _ => true
      }
    }

    val conf = config.getOrElse(new Configuration)
    val fs = {
      val hdfsPath = new Path(basePath)
      hdfsPath.getFileSystem(conf)
    }

    listOrcFiles(basePath, conf).iterator.map { path =>
      path -> OrcFile.createReader(fs, path)
    }.collectFirst {
      case (path, reader) if isWithNonEmptySchema(path, reader) => reader
    }
  }

  def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = {
    // Take the first file where we can open a valid reader if we can find one. Otherwise just
    // return None to indicate we can't infer the schema.
    paths.flatMap(getFileReader(_, conf)).headOption.map { reader =>
      val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector]
      val schema = readerInspector.getTypeName
      logDebug(s"Reading schema from file $paths, got Hive schema string: $schema")
      CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType]
    }
  }

  def getObjectInspector(
      path: String, conf: Option[Configuration]): Option[StructObjectInspector] = {
    getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector])
  }

  def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = {
    // TODO: Check if the paths coming in are already qualified and simplify.
    val origPath = new Path(pathStr)
    val fs = origPath.getFileSystem(conf)
    val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath)
      .filterNot(_.isDirectory)
      .map(_.getPath)
      .filterNot(_.getName.startsWith("_"))
      .filterNot(_.getName.startsWith("."))
    paths
  }
}
Example 6
Source File: HDFSCredentialProvider.scala (from drizzle-spark, Apache License 2.0, 6 votes)

package org.apache.spark.deploy.yarn.security

import java.io.{ByteArrayInputStream, DataInputStream}

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier
import org.apache.hadoop.mapred.Master
import org.apache.hadoop.security.Credentials

import org.apache.spark.{SparkConf, SparkException}
import org.apache.spark.deploy.yarn.config._
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config._

private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging {
  // Token renewal interval, this value will be set in the first call,
  // if None means no token renewer specified, so cannot get token renewal interval.
  private var tokenRenewalInterval: Option[Long] = null

  override val serviceName: String = "hdfs"

  override def obtainCredentials(
      hadoopConf: Configuration,
      sparkConf: SparkConf,
      creds: Credentials): Option[Long] = {
    // NameNode to access, used to get tokens from different FileSystems
    nnsToAccess(hadoopConf, sparkConf).foreach { dst =>
      val dstFs = dst.getFileSystem(hadoopConf)
      logInfo("getting token for namenode: " + dst)
      dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds)
    }

    // Get the token renewal interval if it is not set. It will only be called once.
    if (tokenRenewalInterval == null) {
      tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf)
    }

    // Get the time of next renewal.
    tokenRenewalInterval.map { interval =>
      creds.getAllTokens.asScala
        .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND)
        .map { t =>
          val identifier = new DelegationTokenIdentifier()
          identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier)))
          identifier.getIssueDate + interval
        }.foldLeft(0L)(math.max)
    }
  }

  private def getTokenRenewalInterval(
      hadoopConf: Configuration,
      sparkConf: SparkConf): Option[Long] = {
    // We cannot use the tokens generated with renewer yarn. Trying to renew
    // those will fail with an access control issue. So create new tokens with the logged in
    // user as renewer.
    sparkConf.get(PRINCIPAL).map { renewer =>
      val creds = new Credentials()
      nnsToAccess(hadoopConf, sparkConf).foreach { dst =>
        val dstFs = dst.getFileSystem(hadoopConf)
        dstFs.addDelegationTokens(renewer, creds)
      }
      val t = creds.getAllTokens.asScala
        .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND)
        .head
      val newExpiration = t.renew(hadoopConf)
      val identifier = new DelegationTokenIdentifier()
      identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier)))
      val interval = newExpiration - identifier.getIssueDate
      logInfo(s"Renewal Interval is $interval")
      interval
    }
  }

  private def getTokenRenewer(conf: Configuration): String = {
    val delegTokenRenewer = Master.getMasterPrincipal(conf)
    logDebug("delegation token renewer is: " + delegTokenRenewer)
    if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) {
      val errorMessage = "Can't get Master Kerberos principal for use as renewer"
      logError(errorMessage)
      throw new SparkException(errorMessage)
    }
    delegTokenRenewer
  }

  private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = {
    sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet +
      sparkConf.get(STAGING_DIR).map(new Path(_))
        .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory)
  }
}
Example 7
Source File: AvroParquetSourceTest.scala (from eel-sdk, Apache License 2.0, 6 votes)

package io.eels.component.parquet

import java.nio.file.Paths

import io.eels.component.parquet.avro.AvroParquetSource
import io.eels.component.parquet.util.ParquetLogMute
import io.eels.schema._
import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.parquet.avro.AvroParquetWriter
import org.scalatest.{Matchers, WordSpec}

class AvroParquetSourceTest extends WordSpec with Matchers {
  ParquetLogMute()

  private implicit val conf = new Configuration()
  private implicit val fs = FileSystem.get(conf)

  private val personFile = Paths.get(getClass.getResource("/io/eels/component/parquet/person.avro.pq").toURI)
  private val resourcesDir = personFile.getParent

  "AvroParquetSource" should {
    "read schema" in {
      val people = AvroParquetSource(personFile)
      people.schema shouldBe StructType(
        Field("name", StringType, nullable = false),
        Field("job", StringType, nullable = false),
        Field("location", StringType, nullable = false)
      )
    }
    "read parquet files" in {
      val people = AvroParquetSource(personFile.toAbsolutePath()).toDataStream().toSet.map(_.values)
      people shouldBe Set(
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner")
      )
    }
    "read multiple parquet files using file expansion" in {
      import io.eels.FilePattern._
      val people = AvroParquetSource(s"${resourcesDir.toUri.toString}/*.pq").toDataStream().toSet.map(_.values)
      people shouldBe Set(
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner"),
        Vector("clint eastwood", "actor", "carmel"),
        Vector("elton john", "musician", "pinner")
      )
    }
    // todo add merge to parquet source
    "merge schemas" ignore {

      try {
        fs.delete(new Path("merge1.pq"), false)
      } catch {
        case t: Throwable =>
      }
      try {
        fs.delete(new Path("merge2.pq"), false)
      } catch {
        case t: Throwable =>
      }

      val schema1 = SchemaBuilder.builder().record("schema1").fields().requiredString("a").requiredDouble("b").endRecord()
      val schema2 = SchemaBuilder.builder().record("schema2").fields().requiredInt("a").requiredBoolean("c").endRecord()

      val writer1 = AvroParquetWriter.builder[GenericRecord](new Path("merge1.pq")).withSchema(schema1).build()
      val record1 = new GenericData.Record(schema1)
      record1.put("a", "aaaaa")
      record1.put("b", 124.3)
      writer1.write(record1)
      writer1.close()

      val writer2 = AvroParquetWriter.builder[GenericRecord](new Path("merge2.pq")).withSchema(schema2).build()
      val record2 = new GenericData.Record(schema2)
      record2.put("a", 111)
      record2.put("c", true)
      writer2.write(record2)
      writer2.close()

      ParquetSource(new Path("merge*")).schema shouldBe
        StructType(
          Field("a", StringType, nullable = false),
          Field("b", DoubleType, nullable = false),
          Field("c", BooleanType, nullable = false)
        )

      fs.delete(new Path(".merge1.pq.crc"), false)
      fs.delete(new Path(".merge2.pq.crc"), false)
      fs.delete(new Path("merge1.pq"), false)
      fs.delete(new Path("merge2.pq"), false)
    }
  }
}
Example 8
Source File: IDF.scala (from drizzle-spark, Apache License 2.0, 5 votes)

package org.apache.spark.ml.feature

import org.apache.hadoop.fs.Path

import org.apache.spark.annotation.Since
import org.apache.spark.ml._
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.StructType

  // (Snippet truncated at the source; the two members below belong to the elided IDFModel class.)
  @Since("2.0.0")
  def idf: Vector = idfModel.idf.asML

  @Since("1.6.0")
  override def write: MLWriter = new IDFModelWriter(this)
}

@Since("1.6.0")
object IDFModel extends MLReadable[IDFModel] {

  private[IDFModel] class IDFModelWriter(instance: IDFModel) extends MLWriter {

    private case class Data(idf: Vector)

    override protected def saveImpl(path: String): Unit = {
      DefaultParamsWriter.saveMetadata(instance, path, sc)
      val data = Data(instance.idf)
      val dataPath = new Path(path, "data").toString
      sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath)
    }
  }

  private class IDFModelReader extends MLReader[IDFModel] {

    private val className = classOf[IDFModel].getName

    override def load(path: String): IDFModel = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
      val dataPath = new Path(path, "data").toString
      val data = sparkSession.read.parquet(dataPath)
      val Row(idf: Vector) = MLUtils.convertVectorColumnsToML(data, "idf")
        .select("idf")
        .head()
      val model = new IDFModel(metadata.uid, new feature.IDFModel(OldVectors.fromML(idf)))
      DefaultParamsReader.getAndSetParams(model, metadata)
      model
    }
  }

  @Since("1.6.0")
  override def read: MLReader[IDFModel] = new IDFModelReader

  @Since("1.6.0")
  override def load(path: String): IDFModel = super.load(path)
}
Example 9
Source File: CommitFailureTestSource.scala (from drizzle-spark, Apache License 2.0, 5 votes)

package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}

import org.apache.spark.TaskContext
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.execution.datasources.{OutputWriter, OutputWriterFactory}
import org.apache.spark.sql.types.StructType

class CommitFailureTestSource extends SimpleTextSource {
  override def prepareWrite(
      sparkSession: SparkSession,
      job: Job,
      options: Map[String, String],
      dataSchema: StructType): OutputWriterFactory =
    new OutputWriterFactory {
      override def newInstance(
          stagingDir: String,
          fileNamePrefix: String,
          dataSchema: StructType,
          context: TaskAttemptContext): OutputWriter = {
        new SimpleTextOutputWriter(stagingDir, fileNamePrefix, context) {
          var failed = false
          TaskContext.get().addTaskFailureListener { (t: TaskContext, e: Throwable) =>
            failed = true
            SimpleTextRelation.callbackCalled = true
          }

          override val path: String = new Path(stagingDir, fileNamePrefix).toString

          override def write(row: Row): Unit = {
            if (SimpleTextRelation.failWriter) {
              sys.error("Intentional task writer failure for testing purpose.")
            }
            super.write(row)
          }

          override def close(): Unit = {
            super.close()
            sys.error("Intentional task commitment failure for testing purpose.")
          }
        }
      }
    }

  override def shortName(): String = "commit-failure-test"
}
Example 10
Source File: JsonHadoopFsRelationSuite.scala (from drizzle-spark, Apache License 2.0, 5 votes)

package org.apache.spark.sql.sources

import java.math.BigDecimal

import org.apache.hadoop.fs.Path

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

class JsonHadoopFsRelationSuite extends HadoopFsRelationTest {
  override val dataSourceName: String = "json"

  // JSON does not write data of NullType and does not play well with BinaryType.
  override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
    case _: NullType => false
    case _: BinaryType => false
    case _: CalendarIntervalType => false
    case _ => true
  }

  test("save()/load() - partitioned table - simple queries - partition columns in data") {
    withTempDir { file =>
      val basePath = new Path(file.getCanonicalPath)
      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
      val qualifiedBasePath = fs.makeQualified(basePath)

      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
        sparkContext
          .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""")
          .saveAsTextFile(partitionDir.toString)
      }

      val dataSchemaWithPartition =
        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      checkQueries(
        spark.read.format(dataSourceName)
          .option("dataSchema", dataSchemaWithPartition.json)
          .load(file.getCanonicalPath))
    }
  }

  test("SPARK-9894: save complex types to JSON") {
    withTempDir { file =>
      file.delete()

      val schema = new StructType()
        .add("array", ArrayType(LongType))
        .add("map", MapType(StringType, new StructType().add("innerField", LongType)))

      val data =
        Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) ::
          Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil
      val df = spark.createDataFrame(sparkContext.parallelize(data), schema)

      // Write the data out.
      df.write.format(dataSourceName).save(file.getCanonicalPath)

      // Read it back and check the result.
      checkAnswer(
        spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath),
        df
      )
    }
  }

  test("SPARK-10196: save decimal type to JSON") {
    withTempDir { file =>
      file.delete()

      val schema = new StructType()
        .add("decimal", DecimalType(7, 2))

      val data =
        Row(new BigDecimal("10.02")) ::
          Row(new BigDecimal("20000.99")) ::
          Row(new BigDecimal("10000")) :: Nil
      val df = spark.createDataFrame(sparkContext.parallelize(data), schema)

      // Write the data out.
      df.write.format(dataSourceName).save(file.getCanonicalPath)

      // Read it back and check the result.
      checkAnswer(
        spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath),
        df
      )
    }
  }
}
Example 11
Source File: CommitFailureTestRelationSuite.scala (from drizzle-spark, Apache License 2.0, 5 votes)

package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path

import org.apache.spark.SparkException
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils

class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton {

  // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose.
  val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName

  test("SPARK-7684: commitTask() failure should fallback to abortTask()") {
    withTempPath { file =>
      // Here we coalesce partition number to 1 to ensure that only a single task is issued. This
      // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary`
      // directory while committing/aborting the job. See SPARK-8513 for more details.
      val df = spark.range(0, 10).coalesce(1)
      intercept[SparkException] {
        df.write.format(dataSourceName).save(file.getCanonicalPath)
      }

      val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf)
      assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary")))
    }
  }

  test("call failure callbacks before close writer - default") {
    SimpleTextRelation.failCommitter = false
    withTempPath { file =>
      // fail the job in the middle of writing
      val divideByZero = udf((x: Int) => { x / (x - 1) })
      val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id")))

      SimpleTextRelation.callbackCalled = false
      intercept[SparkException] {
        df.write.format(dataSourceName).save(file.getCanonicalPath)
      }
      assert(SimpleTextRelation.callbackCalled, "failure callback should be called")

      val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf)
      assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary")))
    }
  }

  test("call failure callbacks before close writer - partitioned") {
    SimpleTextRelation.failCommitter = false
    withTempPath { file =>
      // fail the job in the middle of writing
      val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id"))

      SimpleTextRelation.callbackCalled = false
      SimpleTextRelation.failWriter = true
      intercept[SparkException] {
        df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath)
      }
      assert(SimpleTextRelation.callbackCalled, "failure callback should be called")

      val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf)
      assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary")))
    }
  }
}
Example 12
Source File: SimpleTextHadoopFsRelationSuite.scala (from drizzle-spark, Apache License 2.0, 5 votes)

package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.catalyst.expressions.PredicateHelper
import org.apache.spark.sql.types._

class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with PredicateHelper {
  override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName

  // We have a very limited number of supported types at here since it is just for a
  // test relation and we do very basic testing at here.
  override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
    case _: BinaryType => false
    // We are using random data generator and the generated strings are not really valid string.
    case _: StringType => false
    case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442
    case _: CalendarIntervalType => false
    case _: DateType => false
    case _: TimestampType => false
    case _: ArrayType => false
    case _: MapType => false
    case _: StructType => false
    case _: UserDefinedType[_] => false
    case _ => true
  }

  test("save()/load() - partitioned table - simple queries - partition columns in data") {
    withTempDir { file =>
      val basePath = new Path(file.getCanonicalPath)
      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
      val qualifiedBasePath = fs.makeQualified(basePath)

      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
        sparkContext
          .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1")
          .saveAsTextFile(partitionDir.toString)
      }

      val dataSchemaWithPartition =
        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      checkQueries(
        spark.read.format(dataSourceName)
          .option("dataSchema", dataSchemaWithPartition.json)
          .load(file.getCanonicalPath))
    }
  }

  test("test hadoop conf option propagation") {
    withTempPath { file =>
      // Test write side
      val df = spark.range(10).selectExpr("cast(id as string)")
      df.write
        .option("some-random-write-option", "hahah-WRITE")
        .option("some-null-value-option", null) // test null robustness
        .option("dataSchema", df.schema.json)
        .format(dataSourceName).save(file.getAbsolutePath)
      assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-write-option") == "hahah-WRITE")

      // Test read side
      val df1 = spark.read
        .option("some-random-read-option", "hahah-READ")
        .option("some-null-value-option", null) // test null robustness
        .option("dataSchema", df.schema.json)
        .format(dataSourceName)
        .load(file.getAbsolutePath)
      df1.count()
      assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-read-option") == "hahah-READ")
    }
  }
}
Example 13
Source File: TableFileCatalog.scala (from drizzle-spark, Apache License 2.0, 5 votes)

package org.apache.spark.sql.execution.datasources

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.StructType

private class PrunedTableFileCatalog(
    sparkSession: SparkSession,
    tableBasePath: Path,
    fileStatusCache: FileStatusCache,
    override val partitionSpec: PartitionSpec)
  extends ListingFileCatalog(
    sparkSession,
    partitionSpec.partitions.map(_.path),
    Map.empty,
    Some(partitionSpec.partitionColumns),
    fileStatusCache)
Example 14
Source File: HadoopFileLinesReader.scala (from drizzle-spark, Apache License 2.0, 5 votes)

package org.apache.spark.sql.execution.datasources

import java.io.Closeable
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl

class HadoopFileLinesReader(
    file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable {
  private val iterator = {
    val fileSplit = new FileSplit(
      new Path(new URI(file.filePath)),
      file.start,
      file.length,
      // TODO: Implement Locality
      Array.empty)
    val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0)
    val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId)
    val reader = new LineRecordReader()
    reader.initialize(fileSplit, hadoopAttemptContext)
    new RecordReaderIterator(reader)
  }

  override def hasNext: Boolean = iterator.hasNext

  override def next(): Text = iterator.next()

  override def close(): Unit = iterator.close()
}
Example 15
Source File: InsertIntoHadoopFsRelationCommand.scala (from drizzle-spark, Apache License 2.0, 5 votes)

package org.apache.spark.sql.execution.datasources

import java.io.IOException

import org.apache.hadoop.fs.Path

import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand

    // (Snippet truncated at the source; it resumes inside the SaveMode match that decides
    // whether the insertion should run.)
          )) {
            throw new IOException(s"Unable to clear output " +
              s"directory $qualifiedOutputPath prior to writing to it")
          }
          true
        case (SaveMode.Append, _) | (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) =>
          true
        case (SaveMode.Ignore, exists) =>
          !exists
        case (s, exists) =>
          throw new IllegalStateException(s"unsupported save mode $s ($exists)")
      }

    // If we are appending data to an existing dir.
    val isAppend = pathExists && (mode == SaveMode.Append)

    if (doInsertion) {
      WriteOutput.write(
        sparkSession,
        query,
        fileFormat,
        qualifiedOutputPath,
        hadoopConf,
        partitionColumns,
        bucketSpec,
        refreshFunction,
        options,
        isAppend)
    } else {
      logInfo("Skipping insertion into a relation that already exists.")
    }

    Seq.empty[Row]
  }
}
Example 16
Source File: resources.scala (from drizzle-spark, Apache License 2.0, 5 votes)

package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("Results", StringType, nullable = false)() :: Nil
  }

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val jarList = sparkSession.sparkContext.listJars()
    if (jars.nonEmpty) {
      for {
        jarName <- jars.map(f => new Path(f).getName)
        jarPath <- jarList
        if jarPath.contains(jarName)
      } yield Row(jarPath)
    } else {
      jarList.map(Row(_))
    }
  }
}
Example 17
Source File: FileStreamSinkLog.scala (from drizzle-spark, Apache License 2.0, 5 votes)

package org.apache.spark.sql.execution.streaming

import org.apache.hadoop.fs.{FileStatus, Path}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization
import org.json4s.jackson.Serialization.{read, write}

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

class FileStreamSinkLog(
    metadataLogVersion: String,
    sparkSession: SparkSession,
    path: String)
  extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) {

  private implicit val formats = Serialization.formats(NoTypeHints)

  protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay

  protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion

  protected override val compactInterval = sparkSession.sessionState.conf.fileSinkLogCompactInterval
  require(compactInterval > 0,
    s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $compactInterval) " +
      "to a positive value.")

  protected override def serializeData(data: SinkFileStatus): String = {
    write(data)
  }

  protected override def deserializeData(encodedString: String): SinkFileStatus = {
    read[SinkFileStatus](encodedString)
  }

  override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = {
    val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet
    if (deletedFiles.isEmpty) {
      logs
    } else {
      logs.filter(f => !deletedFiles.contains(f.path))
    }
  }
}

object FileStreamSinkLog {
  val VERSION = "v1"
  val DELETE_ACTION = "delete"
  val ADD_ACTION = "add"
}
Example 18
Source File: FileStreamSourceSuite.scala (from drizzle-spark, Apache License 2.0, 5 votes)

package org.apache.spark.sql.execution.streaming

import java.io.File
import java.net.URI

import scala.util.Random

import org.apache.hadoop.fs.{FileStatus, Path, RawLocalFileSystem}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.execution.streaming.ExistsThrowsExceptionFileSystem._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.StructType

class FileStreamSourceSuite extends SparkFunSuite with SharedSQLContext {

  import FileStreamSource._

  test("SeenFilesMap") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 5)
    assert(map.size == 1)
    map.purge()
    assert(map.size == 1)

    // Add a new entry and purge should be no-op, since the gap is exactly 10 ms.
    map.add("b", 15)
    assert(map.size == 2)
    map.purge()
    assert(map.size == 2)

    // Add a new entry that's more than 10 ms than the first entry. We should be able to purge now.
    map.add("c", 16)
    assert(map.size == 3)
    map.purge()
    assert(map.size == 2)

    // Override existing entry shouldn't change the size
    map.add("c", 25)
    assert(map.size == 2)

    // Not a new file because we have seen c before
    assert(!map.isNewFile("c", 20))

    // Not a new file because timestamp is too old
    assert(!map.isNewFile("d", 5))

    // Finally a new file: never seen and not too old
    assert(map.isNewFile("e", 20))
  }

  test("SeenFilesMap should only consider a file old if it is earlier than last purge time") {
    val map = new SeenFilesMap(maxAgeMs = 10)

    map.add("a", 20)
    assert(map.size == 1)

    // Timestamp 5 should still considered a new file because purge time should be 0
    assert(map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))

    // Once purge, purge time should be 10 and then b would be a old file if it is less than 10.
    map.purge()
    assert(!map.isNewFile("b", 9))
    assert(map.isNewFile("b", 10))
  }

  testWithUninterruptibleThread("do not recheck that files exist during getBatch") {
    withTempDir { temp =>
      spark.conf.set(
        s"fs.$scheme.impl",
        classOf[ExistsThrowsExceptionFileSystem].getName)
      // add the metadata entries as a pre-req
      val dir = new File(temp, "dir") // use non-existent directory to test whether log make the dir
      val metadataLog =
        new FileStreamSourceLog(FileStreamSourceLog.VERSION, spark, dir.getAbsolutePath)
      assert(metadataLog.add(0, Array(FileEntry(s"$scheme:///file1", 100L, 0))))

      val newSource = new FileStreamSource(spark, s"$scheme:///", "parquet", StructType(Nil), Nil,
        dir.getAbsolutePath, Map.empty)
      // this method should throw an exception if `fs.exists` is called during resolveRelation
      newSource.getBatch(None, LongOffset(1))
    }
  }
}

  // (Snippet truncated at the source; the method below belongs to the elided
  // ExistsThrowsExceptionFileSystem class.)
  override def listStatus(file: Path): Array[FileStatus] = {
    val emptyFile = new FileStatus()
    emptyFile.setPath(file)
    Array(emptyFile)
  }
}

object ExistsThrowsExceptionFileSystem {
  val scheme = s"FileStreamSourceSuite${math.abs(Random.nextInt)}fs"
}
Example 19
Source File: HDFSCredentialProviderSuite.scala (from drizzle-spark, Apache License 2.0, 5 votes)

package org.apache.spark.deploy.yarn.security

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.scalatest.{Matchers, PrivateMethodTester}

import org.apache.spark.{SparkConf, SparkException, SparkFunSuite}

class HDFSCredentialProviderSuite
    extends SparkFunSuite
    with PrivateMethodTester
    with Matchers {
  private val _getTokenRenewer = PrivateMethod[String]('getTokenRenewer)

  private def getTokenRenewer(
      hdfsCredentialProvider: HDFSCredentialProvider, conf: Configuration): String = {
    hdfsCredentialProvider invokePrivate _getTokenRenewer(conf)
  }

  private var hdfsCredentialProvider: HDFSCredentialProvider = null

  override def beforeAll() {
    super.beforeAll()

    if (hdfsCredentialProvider == null) {
      hdfsCredentialProvider = new HDFSCredentialProvider()
    }
  }

  override def afterAll() {
    if (hdfsCredentialProvider != null) {
      hdfsCredentialProvider = null
    }

    super.afterAll()
  }

  test("check token renewer") {
    val hadoopConf = new Configuration()
    hadoopConf.set("yarn.resourcemanager.address", "myrm:8033")
    hadoopConf.set("yarn.resourcemanager.principal", "yarn/myrm:[email protected]")
    val renewer = getTokenRenewer(hdfsCredentialProvider, hadoopConf)
    renewer should be ("yarn/myrm:[email protected]")
  }

  test("check token renewer default") {
    val hadoopConf = new Configuration()
    val caught =
      intercept[SparkException] {
        getTokenRenewer(hdfsCredentialProvider, hadoopConf)
      }
    assert(caught.getMessage === "Can't get Master Kerberos principal for use as renewer")
  }
}
Example 20
Source File: DStreamCheckpointData.scala (from drizzle-spark, Apache License 2.0, 5 votes)

package org.apache.spark.streaming.dstream

import java.io.{IOException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.internal.Logging
import org.apache.spark.streaming.Time
import org.apache.spark.util.Utils

private[streaming]
class DStreamCheckpointData[T: ClassTag](dstream: DStream[T])
  extends Serializable with Logging {
  protected val data = new HashMap[Time, AnyRef]()

  // Mapping of the batch time to the checkpointed RDD file of that time
  @transient private var timeToCheckpointFile = new HashMap[Time, String]
  // Mapping of the batch time to the time of the oldest checkpointed RDD
  // in that batch's checkpoint data
  @transient private var timeToOldestCheckpointFileTime = new HashMap[Time, Time]

  @transient private var fileSystem: FileSystem = null
  protected[streaming] def currentCheckpointFiles = data.asInstanceOf[HashMap[Time, String]]

  def restore() {
    // Create RDDs from the checkpoint data
    currentCheckpointFiles.foreach {
      case (time, file) =>
        logInfo("Restoring checkpointed RDD for time " + time + " from file '" + file + "'")
        dstream.generatedRDDs += ((time, dstream.context.sparkContext.checkpointFile[T](file)))
    }
  }

  override def toString: String = {
    "[\n" + currentCheckpointFiles.size + " checkpoint files \n" +
      currentCheckpointFiles.mkString("\n") + "\n]"
  }

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    logDebug(this.getClass().getSimpleName + ".writeObject used")
    if (dstream.context.graph != null) {
      dstream.context.graph.synchronized {
        if (dstream.context.graph.checkpointInProgress) {
          oos.defaultWriteObject()
        } else {
          val msg = "Object of " + this.getClass.getName + " is being serialized " +
            " possibly as a part of closure of an RDD operation. This is because " +
            " the DStream object is being referred to from within the closure. " +
            " Please rewrite the RDD operation inside this DStream to avoid this. " +
            " This has been enforced to avoid bloating of Spark tasks " +
            " with unnecessary objects."
          throw new java.io.NotSerializableException(msg)
        }
      }
    } else {
      throw new java.io.NotSerializableException(
        "Graph is unexpectedly null when DStream is being serialized.")
    }
  }

  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    logDebug(this.getClass().getSimpleName + ".readObject used")
    ois.defaultReadObject()
    timeToOldestCheckpointFileTime = new HashMap[Time, Time]
    timeToCheckpointFile = new HashMap[Time, String]
  }
}
Example 21
Source File: PortableDataStream.scala (from drizzle-spark, Apache License 2.0, 5 votes)

package org.apache.spark.input

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

import scala.collection.JavaConverters._

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{CombineFileInputFormat, CombineFileRecordReader, CombineFileSplit}

  // (Snippet truncated at the source; these members belong to the elided PortableDataStream class.)
  def toArray(): Array[Byte] = {
    val stream = open()
    try {
      ByteStreams.toByteArray(stream)
    } finally {
      Closeables.close(stream, true)
    }
  }

  def getPath(): String = path
}
Example 22
Source File: RosbagInputFormat.scala (from ros_hadoop, Apache License 2.0, 5 votes)

package de.valtech.foss

import scala.io.Source
import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, LongWritable, MapWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat

object RosbagInputFormat {
  def getRosChunkIdx(context: JobContext): String = {
    context.getConfiguration.get("RosbagInputFormat.chunkIdx")
  }
  def getBlockSize(context: JobContext): Long = {
    context.getConfiguration.get("dfs.blocksize").toLong
  }
}

class RosbagBytesInputFormat
  extends FileInputFormat[LongWritable, BytesWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, BytesWritable] = {
    new RosbagBytesRecordReader
  }
}

class RosbagMapInputFormat
  extends FileInputFormat[LongWritable, MapWritable] {

  private var rosChunkIdx = ""
  private var recordLength = -1L

  override def isSplitable(context: JobContext, filename: Path): Boolean = {
    rosChunkIdx = RosbagInputFormat.getRosChunkIdx(context)
    recordLength = RosbagInputFormat.getBlockSize(context)
    true
  }

  override def computeSplitSize(blockSize: Long, minSize: Long, maxSize: Long): Long = {
    val defaultSize = super.computeSplitSize(blockSize, minSize, maxSize)
    defaultSize
  }

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext)
      : RecordReader[LongWritable, MapWritable] = {
    new RosbagMapRecordReader
  }
}
Example 23
Source File: SentenceTokenizer.scala (from BigDL, Apache License 2.0, 5 votes)

package com.intel.analytics.bigdl.dataset.text

import java.io.FileInputStream
import java.net.{URI, URL}

import com.intel.analytics.bigdl.dataset.Transformer

import scala.collection.Iterator

import opennlp.tools.tokenize.{SimpleTokenizer, Tokenizer, TokenizerME, TokenizerModel}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

class SentenceTokenizer(tokenFile: Option[String] = None)
  extends Transformer[String, Array[String]] {

  var modelIn: FileInputStream = _
  var model: TokenizerModel = _
  var tokenizer: Tokenizer = _

  def this(tokenFile: URL) {
    this(Some(tokenFile.getPath))
  }

  def close(): Unit = {
    if (modelIn != null) {
      modelIn.close()
    }
  }

  override def apply(prev: Iterator[String]): Iterator[Array[String]] =
    prev.map(x => {
      if (tokenizer == null) {
        if (!tokenFile.isDefined) {
          tokenizer = SimpleTokenizer.INSTANCE
        } else {
          val src: Path = new Path(tokenFile.get)
          val fs = src.getFileSystem(new Configuration())
          val in = fs.open(src)
          model = new TokenizerModel(in)
          tokenizer = new TokenizerME(model)
        }
      }
      val words = tokenizer.tokenize(x)
      words
    })
}

object SentenceTokenizer {
  def apply(tokenFile: Option[String] = None): SentenceTokenizer =
    new SentenceTokenizer(tokenFile)

  def apply(tokenFile: URL): SentenceTokenizer =
    new SentenceTokenizer(tokenFile)
}
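A brief usage sketch, assuming the BigDL Transformer API shown above: with no model file supplied, the OpenNLP SimpleTokenizer fallback is used, so no external resources are needed.

val tokenizer = SentenceTokenizer()
// Transformer.apply consumes an Iterator of sentences and yields token arrays.
val tokens: Array[String] = tokenizer(Iterator("BigDL runs deep learning on Spark")).next()
tokens.foreach(println)   // BigDL, runs, deep, learning, on, Spark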
Example 24
Source File: SentenceSplitter.scala (from BigDL, Apache License 2.0, 5 votes)

package com.intel.analytics.bigdl.dataset.text

import java.io.FileInputStream
import java.net.{URI, URL}

import com.intel.analytics.bigdl.dataset.Transformer
import opennlp.tools.sentdetect.{SentenceDetector, SentenceDetectorME, SentenceModel}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

import scala.collection.Iterator

class SentenceSplitter(sentFile: Option[String] = None)
  extends Transformer[String, Array[String]] {

  var modelIn: FileInputStream = _
  var model: SentenceModel = _
  var sentenceDetector: SentenceDetector = _

  def this(sentFileURL: URL) {
    this(Some(sentFileURL.getPath))
  }

  def this(sentFile: String) {
    this(Some(sentFile))
  }

  def close(): Unit = {
    if (modelIn != null) {
      modelIn.close()
    }
  }

  override def apply(prev: Iterator[String]): Iterator[Array[String]] =
    prev.map(x => {
      if (!sentFile.isDefined) {
        x.split('.')
      } else {
        if (sentenceDetector == null) {
          val src: Path = new Path(sentFile.get)
          val fs = src.getFileSystem(new Configuration())
          val in = fs.open(src)
          model = new SentenceModel(in)
          sentenceDetector = new SentenceDetectorME(model)
        }
        sentenceDetector.sentDetect(x)
      }
    })
}

object SentenceSplitter {
  def apply(sentFile: Option[String] = None): SentenceSplitter =
    new SentenceSplitter(sentFile)

  def apply(sentFileURL: URL): SentenceSplitter =
    new SentenceSplitter(sentFileURL)

  def apply(sentFile: String): SentenceSplitter =
    new SentenceSplitter(sentFile)
}
Example 25
Source File: TFRecordInputFormat.scala (from BigDL, Apache License 2.0, 5 votes)

package com.intel.analytics.bigdl.utils.tf

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
import org.apache.hadoop.fs.FSDataInputStream

class TFRecordInputFormat extends FileInputFormat[BytesWritable, NullWritable] {

  override def createRecordReader(inputSplit: InputSplit, context: TaskAttemptContext):
      RecordReader[BytesWritable, NullWritable] = new RecordReader[BytesWritable, NullWritable] {

    private var inputStream: FSDataInputStream = null
    private var reader: TFRecordIterator = null
    private var length: Long = 0L
    private var begin: Long = 0L
    private var current: Array[Byte] = null

    override def getCurrentKey: BytesWritable = {
      new BytesWritable(current)
    }

    override def getProgress: Float = {
      (inputStream.getPos - begin) / (length + 1e-6f)
    }

    override def nextKeyValue(): Boolean = {
      if (reader.hasNext) {
        current = reader.next()
        true
      } else {
        false
      }
    }

    override def getCurrentValue: NullWritable = {
      NullWritable.get()
    }

    override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      val conf = context.getConfiguration
      val fileSplit = split.asInstanceOf[FileSplit]
      length = fileSplit.getLength
      begin = fileSplit.getStart

      val file = fileSplit.getPath
      val fs = file.getFileSystem(conf)
      inputStream = fs.open(file, 4096)
      reader = new TFRecordIterator(inputStream)
    }

    override def close(): Unit = {
      inputStream.close()
    }
  }

  override protected def isSplitable(context: JobContext, filename: Path): Boolean = false
}
Example 26
Source File: COCOSeqFileGenerator.scala (from BigDL, Apache License 2.0, 5 votes)

package com.intel.analytics.bigdl.models.utils

import com.intel.analytics.bigdl.dataset.segmentation.{COCODataset, COCOSerializeContext}
import java.io.File
import java.nio.file.{Files, Paths}
import java.util.concurrent.atomic.AtomicInteger
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.SequenceFile.Writer
import org.apache.hadoop.io.compress.BZip2Codec
import org.apache.hadoop.io.{BytesWritable, SequenceFile}
import scala.collection.parallel.ForkJoinTaskSupport
import scopt.OptionParser

object COCOSeqFileGenerator {

  case class COCOSeqFileGeneratorParams(
    folder: String = ".",
    metaPath: String = "instances_val2014.json",
    output: String = ".",
    parallel: Int = 1,
    blockSize: Int = 12800
  )

  private val parser = new OptionParser[COCOSeqFileGeneratorParams]("BigDL COCO " +
    "Sequence File Generator") {
    head("BigDL COCO Sequence File Generator")
    opt[String]('f', "folder")
      .text("where you put the COCO image files")
      .action((x, c) => c.copy(folder = x))
    opt[String]('o', "output folder")
      .text("where you put the generated seq files")
      .action((x, c) => c.copy(output = x))
    opt[Int]('p', "parallel")
      .text("parallel num")
      .action((x, c) => c.copy(parallel = x))
    opt[Int]('b', "blockSize")
      .text("block size")
      .action((x, c) => c.copy(blockSize = x))
    opt[String]('m', "metaPath")
      .text("metadata json file path")
      .action((x, c) => c.copy(metaPath = x))
  }

  def main(args: Array[String]): Unit = {
    parser.parse(args, COCOSeqFileGeneratorParams()).foreach { param =>
      println("Loading COCO metadata")
      val meta = COCODataset.load(param.metaPath, param.folder)
      println("Metadata loaded")
      val conf: Configuration = new Configuration
      val doneCount = new AtomicInteger(0)
      val tasks = meta.images.filter(img => {
        val path = img.path
        val valid = Files.exists(path) && !Files.isDirectory(path)
        if (!valid) {
          System.err.print(s"[Warning] The image file ${path.getFileName} does not exist.\n")
        }
        valid
      }).grouped(param.blockSize).zipWithIndex.toArray.par
      tasks.tasksupport = new ForkJoinTaskSupport(
        new scala.concurrent.forkjoin.ForkJoinPool(param.parallel))
      tasks.foreach { case (imgs, blkId) =>
        val outFile = new Path(param.output, s"coco-seq-$blkId.seq")
        val key = new BytesWritable
        val value = new BytesWritable
        val writer = SequenceFile.createWriter(conf, Writer.file(outFile), Writer.keyClass(key
          .getClass), Writer.valueClass(value.getClass), Writer.compression(SequenceFile
          .CompressionType.BLOCK, new BZip2Codec))
        val context = new COCOSerializeContext
        imgs.foreach { img =>
          context.clear()
          context.dump(img.fileName)
          img.dumpTo(context)
          context.dump(COCODataset.MAGIC_NUM)
          val keyBytes = context.toByteArray
          key.set(keyBytes, 0, keyBytes.length)
          val bytes = img.data
          value.set(bytes, 0, bytes.length)
          writer.append(key, value)
          val cnt = doneCount.incrementAndGet()
          if (cnt % 500 == 0) {
            System.err.print(s"\r$cnt / ${meta.images.length} = ${cnt.toFloat/meta.images.length}")
          }
        }
        writer.close()
      }
      System.err.print("\n")
    }
  }
}
Example 27
Source File: RecordWriter.scala (from BigDL, Apache License 2.0, 5 votes)

package com.intel.analytics.bigdl.visualization.tensorboard

import java.io.{File, FileOutputStream}

import com.google.common.primitives.{Ints, Longs}
import com.intel.analytics.bigdl.utils.Crc32
import netty.Crc32c
import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
import org.tensorflow.util.Event

private[bigdl] class RecordWriter(file: Path, fs: FileSystem) {
  val outputStream = if (file.toString.startsWith("hdfs://")) {
    // FSDataOutputStream couldn't flush data to localFileSystem in time. So reading summaries
    // will throw exception.
    fs.create(file, true, 1024)
  } else {
    // Using FileOutputStream when write to local.
    new FileOutputStream(new File(file.toString))
  }
  val crc32 = new Crc32c()

  def write(event: Event): Unit = {
    val eventString = event.toByteArray
    val header = Longs.toByteArray(eventString.length.toLong).reverse
    outputStream.write(header)
    outputStream.write(Ints.toByteArray(Crc32.maskedCRC32(crc32, header).toInt).reverse)
    outputStream.write(eventString)
    outputStream.write(Ints.toByteArray(Crc32.maskedCRC32(crc32, eventString).toInt).reverse)
    if (outputStream.isInstanceOf[FSDataOutputStream]) {
      // Flush data to HDFS.
      outputStream.asInstanceOf[FSDataOutputStream].hflush()
    }
  }

  def close(): Unit = {
    outputStream.close()
  }
}
Example 28
Source File: FileReader.scala (from BigDL, Apache License 2.0, 5 votes)

package com.intel.analytics.bigdl.visualization.tensorboard

import java.io.{BufferedInputStream}
import java.nio.ByteBuffer

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.tensorflow.util.Event

import scala.collection.mutable.ArrayBuffer
import scala.util.matching.Regex

private[bigdl] object FileReader {
  val fileNameRegex = """bigdl.tfevents.*""".r

  def readScalar(file: Path, tag: String, fs: FileSystem): Array[(Long, Float, Double)] = {
    require(fs.isFile(file), s"FileReader: ${file} should be a file")
    val bis = new BufferedInputStream(fs.open(file))
    val longBuffer = new Array[Byte](8)
    val crcBuffer = new Array[Byte](4)
    val bf = new ArrayBuffer[(Long, Float, Double)]
    while (bis.read(longBuffer) > 0) {
      val l = ByteBuffer.wrap(longBuffer.reverse).getLong()
      bis.read(crcBuffer)
      // TODO: checksum
      // val crc1 = ByteBuffer.wrap(crcBuffer.reverse).getInt()
      val eventBuffer = new Array[Byte](l.toInt)
      bis.read(eventBuffer)
      val e = Event.parseFrom(eventBuffer)
      if (e.getSummary.getValueCount == 1 &&
        tag.equals(e.getSummary.getValue(0).getTag())) {
        bf.append((e.getStep, e.getSummary.getValue(0).getSimpleValue, e.getWallTime))
      }
      bis.read(crcBuffer)
      // val crc2 = ByteBuffer.wrap(crcBuffer.reverse).getInt()
    }
    bis.close()
    bf.toArray.sortWith(_._1 < _._1)
  }
}
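As a usage sketch (the event-file path and tag are placeholders, and the object is private[bigdl], so a call like this would live inside that package):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

val fs = FileSystem.get(new Configuration())
val eventFile = new Path("/tmp/bigdl-logs/bigdl.tfevents.12345")   // placeholder path
val losses: Array[(Long, Float, Double)] = FileReader.readScalar(eventFile, "Loss", fs)
losses.foreach { case (step, value, wallTime) => println(s"step=$step loss=$value") }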
Example 29
Source File: WarcHdfsCdxPathRddSpec.scala (from ArchiveSpark, MIT License, 5 votes)

package org.archive.archivespark.specific.warc.specs

import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.archive.archivespark.sparkling.cdx.CdxRecord
import org.archive.archivespark.specific.warc.WarcRecord

class WarcHdfsCdxPathRddSpec private(cdx: RDD[(CdxRecord, String)])
  extends WarcHdfsCdxSpecBase[(CdxRecord, String)] {

  override def load(sc: SparkContext, minPartitions: Int): RDD[(CdxRecord, String)] = cdx

  override def parse(cdxPath: (CdxRecord, String)): Option[WarcRecord] = {
    val (cdx, dir) = cdxPath
    parse(cdx, new Path(dir, cdx.locationFromAdditionalFields._1))
  }
}

object WarcHdfsCdxPathRddSpec {
  def apply(cdxWarcPaths: RDD[(CdxRecord, String)]) = new WarcHdfsCdxPathRddSpec(cdxWarcPaths)
}
Example 30
Source File: HdfsFileSpec.scala (from ArchiveSpark, MIT License, 5 votes)

package org.archive.archivespark.specific.raw

import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.archive.archivespark.dataspecs.DataSpec
import org.archive.archivespark.dataspecs.access.HdfsFileAccessor
import org.archive.archivespark.sparkling.io.HdfsIO
import org.archive.archivespark.sparkling.util.RddUtil

class HdfsFileSpec private(path: String, filePatterns: Seq[String], decompress: Boolean, maxPartitions: Int)
  extends DataSpec[String, FileStreamRecord] {

  override def load(sc: SparkContext, minPartitions: Int): RDD[String] = {
    val files = HdfsIO.files(path, recursive = true)
    val filtered =
      if (filePatterns.isEmpty) files.toSeq
      else files.filter(path => filePatterns.exists(new Path(path).getName.matches)).toSeq
    RddUtil.parallelize(filtered, if (maxPartitions == 0) minPartitions else maxPartitions.min(minPartitions))
  }

  override def parse(file: String): Option[FileStreamRecord] =
    Some(new FileStreamRecord(file, new HdfsFileAccessor(file, decompress)))
}

object HdfsFileSpec {
  def apply(path: String, filePatterns: Seq[String] = Seq.empty, decompress: Boolean = true,
            maxPartitions: Int = 0): HdfsFileSpec = {
    new HdfsFileSpec(path, filePatterns, decompress, maxPartitions)
  }
}
Example 31
Source File: HdfsFileAccessor.scala (from ArchiveSpark, MIT License, 5 votes)

package org.archive.archivespark.dataspecs.access

import java.io.InputStream

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil
import org.archive.archivespark.sparkling.io.IOUtil

class HdfsFileAccessor(path: String, decompress: Boolean = true) extends CloseableDataAccessor[InputStream] {

  override def get: Option[InputStream] = {
    val fs = FileSystem.get(SparkHadoopUtil.get.conf)
    var stream: InputStream = null
    try {
      val raw = fs.open(new Path(path))
      stream = if (decompress) IOUtil.decompress(raw, Some(path)) else raw
      Some(stream)
    } catch {
      case e: Exception =>
        e.printStackTrace()
        if (stream != null) stream.close()
        None
    }
  }
}
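A short sketch of how an accessor like this might be consumed; the HDFS path is a placeholder, and the caller closes the stream once it has been read:

import scala.io.Source

val accessor = new HdfsFileAccessor("/data/sample.txt.gz")   // gzip is decompressed transparently
accessor.get match {
  case Some(in) =>
    try Source.fromInputStream(in).getLines().take(5).foreach(println)
    finally in.close()
  case None =>
    println("could not open stream")
}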
Example 32
Source File: HdfsStreamAccessor.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.dataspecs.access import java.io.InputStream import org.apache.commons.io.input.BoundedInputStream import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} import org.apache.spark.deploy.SparkHadoopUtil class HdfsStreamAccessor(location: HdfsLocationInfo) extends CloseableDataAccessor[InputStream] { override def get: Option[InputStream] = { if (location.length < 0 || location.offset < 0) None else { val fs = FileSystem.get(SparkHadoopUtil.get.conf) var stream: FSDataInputStream = null try { stream = fs.open(new Path(location.path)) stream.seek(location.offset) Some(new BoundedInputStream(stream, location.length)) } catch { case e: Exception => e.printStackTrace() if (stream != null) stream.close() None } } } }
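The core pattern is opening an FSDataInputStream, seeking to a record offset and bounding how many bytes the caller may read; a minimal sketch with a hypothetical file, offset, and length:
import org.apache.commons.io.input.BoundedInputStream
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

val fs = FileSystem.get(new Configuration())
val in = fs.open(new Path("/data/archive/records.warc.gz"))   // hypothetical file
in.seek(4096L)                                                // jump to the record offset
val record = new BoundedInputStream(in, 1024L)                // expose only the record's bytes downstream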
Example 33
Source File: FileLocalityInputFormat.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.sparkling.util import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{NullWritable, Text} import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit} import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext} class FileLocalityInputFormat extends FileInputFormat[NullWritable, Text] { class FileLocalityRecordReader extends RecordReader[NullWritable, Text] { private var filePath: Text = new Text() private var read: Boolean = true override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = { filePath.set(split.asInstanceOf[FileSplit].getPath.toString) read = false } override def nextKeyValue(): Boolean = { if (read) false else { read = true true } } override def getCurrentKey: NullWritable = NullWritable.get override def getCurrentValue: Text = filePath override def getProgress: Float = if (read) 1.0f else 0.0f override def close(): Unit = read = true } override def isSplitable(context: JobContext, filename: Path): Boolean = false override def createRecordReader(split: InputSplit, context: TaskAttemptContext): RecordReader[NullWritable, Text] = new FileLocalityRecordReader }
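Since isSplitable returns false and the record reader emits exactly one (NullWritable, Text) pair per file, this format turns a directory into an RDD of file paths while preserving HDFS locality. A hedged usage sketch; the input directory is an assumption:
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("file-paths").setMaster("local[*]"))
val pathsRdd = sc.newAPIHadoopFile(
    "/data/warcs",                                   // hypothetical input directory
    classOf[FileLocalityInputFormat],
    classOf[NullWritable],
    classOf[Text])
  .map(_._2.toString)                                // one record per file: its full path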
Example 34
Source File: HdfsFileWriter.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.sparkling.io import java.io.{FileInputStream, FileOutputStream, OutputStream} import org.apache.hadoop.fs.Path import org.archive.archivespark.sparkling.logging.{Log, LogContext} import scala.util.Try class HdfsFileWriter private(filename: String, append: Boolean, replication: Short) extends OutputStream { implicit val logContext: LogContext = LogContext(this) private val file = IOUtil.tmpFile Log.info("Writing to temporary local file " + file.getCanonicalPath + " (" + filename + ")...") val out = new FileOutputStream(file) override def close(): Unit = { Try { out.close() } Log.info("Copying from temporary file " + file.getCanonicalPath + " to " + filename + "...") if (append) { val in = new FileInputStream(file) val appendOut = HdfsIO.fs.append(new Path(filename)) IOUtil.copy(in, appendOut) appendOut.close() in.close() file.delete() } else HdfsIO.copyFromLocal(file.getCanonicalPath, filename, move = true, overwrite = true, replication) Log.info("Done. (" + filename + ")") } override def write(b: Int): Unit = out.write(b) override def write(b: Array[Byte]): Unit = out.write(b) override def write(b: Array[Byte], off: Int, len: Int): Unit = out.write(b, off, len) override def flush(): Unit = out.flush() } object HdfsFileWriter { def apply(filename: String, overwrite: Boolean = false, append: Boolean = false, replication: Short = 0): HdfsFileWriter = { if (!overwrite && !append) HdfsIO.ensureNewFile(filename) new HdfsFileWriter(filename, append, replication) } }
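A hedged usage sketch: writes go to a local temporary file first and are only copied to HDFS on close, so close() must always be called; the destination path is hypothetical:
val writer = HdfsFileWriter("hdfs://namenode/reports/summary.txt", overwrite = true)  // hypothetical destination
try writer.write("hello hdfs\n".getBytes("UTF-8"))
finally writer.close()   // copies the temp file to HDFS and logs the result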
Example 35
Source File: HdfsBlockStream.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.sparkling.io import java.io.{ByteArrayInputStream, InputStream} import org.apache.hadoop.fs.{FileSystem, Path} import org.archive.archivespark.sparkling.logging.LogContext import org.archive.archivespark.sparkling.util.Common import scala.util.Try class HdfsBlockStream (fs: FileSystem, file: String, offset: Long = 0, length: Long = -1, retries: Int = 60, sleepMillis: Int = 1000 * 60) extends InputStream { implicit val logContext: LogContext = LogContext(this) val path = new Path(file) val (blockSize: Int, fileSize: Long) = { val status = fs.getFileStatus(path) (status.getBlockSize.min(Int.MaxValue).toInt, status.getLen) } private var pos: Long = offset.max(0) private val max: Long = if (length > 0) fileSize.min(pos + length) else fileSize private val buffer = new Array[Byte](blockSize) private val emptyBlock = new ByteArrayInputStream(Array.emptyByteArray) private var block: ByteArrayInputStream = emptyBlock def ensureNextBlock(): InputStream = { if (block.available() == 0 && pos < max) { val end = pos + blockSize val blockLength = ((end - (end % blockSize)).min(max) - pos).toInt Common.retry(retries, sleepMillis, (retry, e) => { "File access failed (" + retry + "/" + retries + "): " + path + " (Offset: " + pos + ") - " + e.getMessage }) { retry => val in = fs.open(path, blockLength) if (retry > 0) Try(in.seekToNewSource(pos)) else if (pos > 0) in.seek(pos) var read = 0 while (read < blockLength) read += in.read(buffer, read, blockLength - read) Try(in.close()) } pos += blockLength block = new ByteArrayInputStream(buffer, 0, blockLength) } block } override def read(): Int = ensureNextBlock().read() override def read(b: Array[Byte]): Int = ensureNextBlock().read(b) override def read(b: Array[Byte], off: Int, len: Int): Int = ensureNextBlock().read(b, off, len) override def skip(n: Long): Long = { val available = block.available() if (n <= available) block.skip(n) else { block = emptyBlock val currentPos = pos - available val skip = n.min(max - currentPos) pos += skip - available skip } } override def available(): Int = block.available() override def close(): Unit = {} override def markSupported(): Boolean = false }
Example 36
Source File: FilePathMap.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.util import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.deploy.SparkHadoopUtil import scala.util.Try case class FilePathMap(path: String, patterns: Seq[String] = Seq.empty) { val pathMap: Map[String, String] = { var map = collection.mutable.Map[String, String]() val fs = FileSystem.get(SparkHadoopUtil.get.conf) val files = fs.listFiles(new Path(path), true) while (files.hasNext) { val path = files.next.getPath val filename = path.getName if (patterns.isEmpty || patterns.exists(filename.matches)) { if (map.contains(filename)) throw new RuntimeException("duplicate filename: " + filename) map += filename -> path.getParent.toString.intern } } map.toMap } def pathToFile(file: String): Option[Path] = Try {new Path(file).getName}.toOption match { case Some(f) => pathMap.get(f).map(dir => new Path(dir, f)) case None => None } }
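A hedged usage sketch; the root directory and pattern below are assumptions. The map is keyed by bare file name, so pathToFile re-resolves a name to its full HDFS location:
import org.apache.hadoop.fs.Path

val pathMap = FilePathMap("/data/warcs", Seq(""".*\.warc\.gz"""))    // hypothetical root and pattern
val resolved: Option[Path] = pathMap.pathToFile("crawl-00001.warc.gz")
// Some(<indexed parent dir>/crawl-00001.warc.gz) if the file was found during indexing, None otherwise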
Example 37
Source File: 2-CommonFunctions.scala From Azure-Databricks-NYC-Taxi-Workshop with MIT License | 5 votes |
// Databricks notebook source import org.apache.hadoop.fs.{ FileSystem, Path } import org.apache.hadoop.conf.Configuration // COMMAND ---------- val prqShrinkageFactor = 0.19 //We found a saving in space of 81% with Parquet // COMMAND ---------- def analyzeTables(databaseAndTable: String) { println("Table: " + databaseAndTable) println("....refresh table") sql("REFRESH TABLE " + databaseAndTable) println("....analyze table") sql("ANALYZE TABLE " + databaseAndTable + " COMPUTE STATISTICS") println("....done") } // COMMAND ---------- def calcOutputFileCountTxtToPrq(srcDataFile: String, targetedFileSizeMB: Int): Int = { val fs = FileSystem.get(new Configuration()) val estFileCount: Int = Math.floor((fs.getContentSummary(new Path(srcDataFile)).getLength * prqShrinkageFactor) / (targetedFileSizeMB * 1024 * 1024)).toInt if(estFileCount == 0) 1 else estFileCount } // COMMAND ---------- // Get recursive file collection you can iterate on def getRecursiveFileCollection(directoryPath: String): Seq[String] = dbutils.fs.ls(directoryPath).map(directoryItem => { // Work around double encoding bug val directoryItemPath = directoryItem.path.replace("%25", "%").replace("%25", "%") if (directoryItem.isDir) getRecursiveFileCollection(directoryItemPath) else Seq[String](directoryItemPath) }).reduce(_ ++ _) // COMMAND ---------- //Delete residual files from job operation (_SUCCESS, _start*, _committed*) def recursivelyDeleteSparkJobFlagFiles(directoryPath: String) { getRecursiveFileCollection(directoryPath).foreach(directoryItemPath => { if (directoryItemPath.indexOf("parquet") == -1) { println("Deleting...." + directoryItemPath) dbutils.fs.rm(directoryItemPath) }}) } // COMMAND ---------- dbutils.notebook.exit("Pass")
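The output-file-count helper hinges on FileSystem.getContentSummary, which reports the total size of a file or directory tree; a minimal sketch of the same estimate outside Databricks (the source path and target size are assumptions):
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

val fs = FileSystem.get(new Configuration())
val srcBytes = fs.getContentSummary(new Path("/mnt/raw/yellow-taxi")).getLength  // hypothetical source dir
val prqShrinkageFactor = 0.19
val targetedFileSizeMB = 128
val estFileCount = math.max(1, math.floor(srcBytes * prqShrinkageFactor / (targetedFileSizeMB * 1024 * 1024)).toInt)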
Example 38
Source File: TopWORDSApp.scala From topwords with GNU General Public License v3.0 | 5 votes |
package io.github.qf6101.topwords import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.log4j.Logger import org.apache.spark.sql.SparkSession object TopWORDSApp extends Serializable { @transient private[this] val LOGGER = Logger.getLogger(this.getClass.toString) def main(args: Array[String]) { // setup spark session val spark = SparkSession.builder().getOrCreate() try { TopWORDSParser.parse(args).foreach { args => // remove output location files if exist val files = FileSystem.get(spark.sparkContext.hadoopConfiguration) if (files.exists(new Path(args.outputLoc))) files.delete(new Path(args.outputLoc), true) // read input corpus val corpus = if (args.numPartitions > 0) spark.sparkContext.textFile(args.inputLoc).repartition(args.numPartitions) else spark.sparkContext.textFile(args.inputLoc) LOGGER.info("Number of lines of input corpus: " + corpus.count()) // run TopWORDS with the parsed arguments new TopWORDS( tauL = args.tauL, tauF = args.tauF, textLenThld = args.textLenThld, useProbThld = args.useProbThld, numIterations = args.numIterations, convergeTol = args.convergeTol, wordBoundaryThld = args.wordBoundaryThld) .run(corpus, args.outputLoc + "/dictionary", args.outputLoc + "/segmented_texts") } //exit normally LOGGER.info("Running TopWORDS successfully!") if (spark.sparkContext.master.contains("local")) sys.exit(0) } catch { case ex: Throwable => LOGGER.error("Running TopWORDS fail!", ex) //signal to external process if (spark.sparkContext.master.contains("local")) sys.exit(1) } finally spark.stop() } }
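The output-cleanup idiom used here (and in several later examples) is worth isolating; a minimal sketch, assuming an active SparkSession named spark and a hypothetical output location:
import org.apache.hadoop.fs.{FileSystem, Path}

val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
val outputLoc = new Path("/user/me/topwords-output")   // hypothetical
if (fs.exists(outputLoc)) fs.delete(outputLoc, true)   // recursive delete so reruns start clean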
Example 39
Source File: TestTopWORDS.scala From topwords with GNU General Public License v3.0 | 5 votes |
package io.github.qf6101.topwords import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.SparkSession object TestTopWORDS { def main(args: Array[String]) { // setup spark session val spark = SparkSession.builder().master("local[1]").appName(this.getClass.toString).getOrCreate() val inputFile = "test_data/story_of_stone.txt" val outputFile = "test_data/test_output" val files = FileSystem.get(spark.sparkContext.hadoopConfiguration) if (files.exists(new Path(outputFile))) files.delete(new Path(outputFile), true) val corpus = spark.sparkContext.textFile(inputFile) new TopWORDS( tauL = 10, tauF = 5, textLenThld = 2000, useProbThld = 1E-8, numIterations = 10, convergeTol = 1E-3, wordBoundaryThld = 0.0) .run(corpus, outputFile + "/dictionary", outputFile + "/segmented_texts") } }
Example 40
Source File: CountingApp.scala From robin-sparkles with Apache License 2.0 | 5 votes |
package com.highperformancespark.robinsparkles import java.nio.file.FileSystem import com.highperformancespark.robinsparkles.CountingLocalApp.conf import org.apache.hadoop.fs.Path import org.apache.spark.{SparkConf, SparkContext} import scala.reflect.io import scala.util.Try object CountingApp extends App{ val (inputFile, outputFile) = (args(0), args(1)) // spark-submit command should supply all necessary config elements Runner.run(SparkContext.getOrCreate(), 0, "/tmp", inputFile, outputFile) } object Runner { def getOptimizedConf(metricsDir: String, conf: SparkConf): (SparkConf, Int) = { val metricsReader = new MetricsReader(conf, metricsDir) // Load all of the previous runs until one isn't found val prevRuns = Stream.from(0) .map(id => metricsReader.getRunInfo(id)) .takeWhile(_.isDefined) .map(_.get) prevRuns.zipWithIndex.foreach(x=> println(x._2 + x._1.mkString(", "))) val partitions = ComputePartitions(conf) .fromStageMetricSharedCluster( StageInfo.stagesWithMostExpensiveShuffle(prevRuns) ) conf.set("spark.default.parallelism", partitions.toString) println(s"Found ${prevRuns.length} runs of historical data in metrics dir $metricsDir") println("Optimized conf is: --------------------------") println(conf.getAll.mkString("\n")) (conf, prevRuns.length) } def run(sc: SparkContext, id: Int, metricsDir: String, inputFile: String, outputFile: String): Unit = { val conf = sc.hadoopConfiguration val fs = org.apache.hadoop.fs.FileSystem.get(conf) if(fs.exists(new Path(outputFile))){ println(s"Output path $outputFile already exists, deleting it" ) fs.delete(new Path(outputFile), true) } val metricsCollector = new MetricsCollector(sc, metricsDir) metricsCollector.startSparkJobWithRecording(id) val rdd = sc.textFile(inputFile) val counts = WordCount.withStopWordsFiltered(rdd) counts.saveAsTextFile(outputFile) } }
Example 41
Source File: PostUrl.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.http import java.io.{BufferedReader, InputStreamReader} import java.net.URI import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.commons.httpclient.HttpClient import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} import org.apache.http.client.methods.HttpPost import org.apache.http.entity.StringEntity import org.apache.http.impl.client.HttpClients import org.apache.http.util.EntityUtils import org.apache.spark.sql.SparkSession class PostUrl extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) override val description: String = "Send a post request to the specified http" var url : String= _ var jsonPath : String = _ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val spark = pec.get[SparkSession]() //read json from hdfs val conf = new Configuration() val fs = FileSystem.get(URI.create(jsonPath),conf) val stream: FSDataInputStream = fs.open(new Path(jsonPath)) val bufferReader = new BufferedReader(new InputStreamReader(stream)) var lineTxt = bufferReader.readLine() val buffer = new StringBuffer() while (lineTxt != null ){ buffer.append(lineTxt.mkString) lineTxt=bufferReader.readLine() } // post val client = HttpClients.createDefault() val httpClient = new HttpClient() httpClient.getParams().setContentCharset("utf-8") val post = new HttpPost(url) post.addHeader("content-Type","application/json") post.setEntity(new StringEntity(buffer.toString)) val response = client.execute(post) val entity = response.getEntity val str = EntityUtils.toString(entity,"UTF-8") println("Code is " + str) } override def setProperties(map: Map[String, Any]): Unit = { url = MapUtil.get(map,key="url").asInstanceOf[String] jsonPath = MapUtil.get(map,key="jsonPath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val url = new PropertyDescriptor() .name("url") .displayName("Url") .defaultValue("") .description("http request address") .required(true) .example("http://master:8002/flow/start") val jsonPath = new PropertyDescriptor() .name("jsonPath") .displayName("JsonPath") .defaultValue("") .description("json parameter path for post request") .required(true) .example("hdfs://master:9000/work/flow.json") descriptor = url :: descriptor descriptor = jsonPath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/http/PostUrl.png") } override def getGroup(): List[String] = { List(StopGroup.HttpGroup.toString) } override def initialize(ctx: ProcessContext): Unit = { } }
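Reading the JSON body boils down to opening the Path through the FileSystem bound to its URI; a compact sketch of the same read, with the path mirroring the example value shown above:
import java.io.{BufferedReader, InputStreamReader}
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

val jsonPath = "hdfs://master:9000/work/flow.json"      // example value from the property descriptor above
val fs = FileSystem.get(URI.create(jsonPath), new Configuration())
val reader = new BufferedReader(new InputStreamReader(fs.open(new Path(jsonPath))))
val body = try Iterator.continually(reader.readLine()).takeWhile(_ != null).mkString("\n")
           finally reader.close()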
Example 42
Source File: Pathway.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.microorganism import java.io.{BufferedReader, InputStreamReader, OutputStreamWriter} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.json.JSONObject class Pathway extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse Pathway data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/pathway").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/Pathway.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val inDf: DataFrame = in.read() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val configuration: Configuration = new Configuration() val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/pathwayCache/pathwayCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) var fdis: FSDataInputStream = null var br: BufferedReader = null var doc: JSONObject = null var hasAnotherSequence:Boolean = true inDf.collect().foreach(row => { pathStr = row.get(0).asInstanceOf[String] fdis = fs.open(new Path(pathStr)) br = new BufferedReader(new InputStreamReader(fdis)) var count = 0 while (hasAnotherSequence) { count += 1 doc = new JSONObject hasAnotherSequence = util.KeggPathway.process(br, doc) doc.write(hdfsWriter) hdfsWriter.write("\n") } br.close() fdis.close() }) hdfsWriter.close() val df: DataFrame = pec.get[SparkSession]().read.json(hdfsPathTemporary) df.schema.printTreeString() println(df.count) out.write(df) } }
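The example derives the hdfs://host:port prefix by splitting the path string; an alternative sketch (not the project's code) that lets Path resolve its own FileSystem, with hypothetical paths:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

val inputPath = new Path("hdfs://master:9000/microorganism/pathway/part-00000")  // hypothetical input row value
val fs = inputPath.getFileSystem(new Configuration())    // scheme and authority come from the path itself
val cache = new Path("/pathway/pathwayCache/pathwayCache.json")
if (fs.exists(cache)) fs.delete(cache, true)
fs.create(cache).close()                                 // recreate the empty cache file, as in the example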
Example 43
Source File: PDBData.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.microorganism import java.io._ import cn.piflow.bundle.microorganism.util.PDB import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator} import org.json.JSONObject class PDBData extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse PDB data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val session = pec.get[SparkSession]() val inDf: DataFrame = in.read() val configuration: Configuration = new Configuration() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/PDBCache/PDBCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) var doc: JSONObject = null var pdb: PDB = null var count:Int=0 inDf.collect().foreach(row => { count += 1 pathStr = row.get(0).asInstanceOf[String] pdb = new PDB(pathStr,fs) doc = pdb.getDoc doc.write(hdfsWriter) hdfsWriter.write("\n") doc = null }) hdfsWriter.close() val df: DataFrame = session.read.json(hdfsPathTemporary) out.write(df) } def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/PDB").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/PDBData.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } }
Example 44
Source File: Ensembl.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.microorganism import java.io._ import cn.piflow.bundle.microorganism.util.ParserGff3Data import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.conf.{ConfigurableStop, Port, StopGroup} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, FSDataOutputStream, FileSystem, Path} import org.apache.spark.sql.{DataFrame, SparkSession} import org.biojavax.bio.seq.{RichSequence, RichSequenceIterator} import org.json.JSONObject class Ensembl extends ConfigurableStop{ override val authorEmail: String = "[email protected]" override val description: String = "Parse ensembl data" override val inportList: List[String] =List(Port.DefaultPort.toString) override val outportList: List[String] = List(Port.DefaultPort.toString) var cachePath:String = _ def setProperties(map: Map[String, Any]): Unit = { cachePath=MapUtil.get(map,key="cachePath").asInstanceOf[String] } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val cachePath = new PropertyDescriptor().name("cachePath").displayName("cachePath").description("Temporary Cache File Path") .defaultValue("/ensembl").required(true) descriptor = cachePath :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/microorganism/Ensembl.png") } override def getGroup(): List[String] = { List(StopGroup.MicroorganismGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val session = pec.get[SparkSession]() val inDf: DataFrame = in.read() val configuration: Configuration = new Configuration() var pathStr: String =inDf.take(1)(0).get(0).asInstanceOf[String] val pathARR: Array[String] = pathStr.split("\\/") var hdfsUrl:String="" for (x <- (0 until 3)){ hdfsUrl+=(pathARR(x) +"/") } configuration.set("fs.defaultFS",hdfsUrl) var fs: FileSystem = FileSystem.get(configuration) val hdfsPathTemporary = hdfsUrl+cachePath+"/ensemblCache/ensemblCache.json" val path: Path = new Path(hdfsPathTemporary) if(fs.exists(path)){ fs.delete(path) } fs.create(path).close() val hdfsWriter: OutputStreamWriter = new OutputStreamWriter(fs.append(path)) val parser: ParserGff3Data = new ParserGff3Data var fdis: FSDataInputStream =null var br: BufferedReader = null var doc: JSONObject = null var count:Int = 0 inDf.collect().foreach(row => { pathStr = row.get(0).asInstanceOf[String] fdis = fs.open(new Path(pathStr)) br = new BufferedReader(new InputStreamReader(fdis)) var eachStr:String=null while((eachStr = br.readLine()) != null && eachStr != null ){ doc = parser.parserGff3(eachStr) if(doc.toString.length > 2){ count += 1 doc.write(hdfsWriter) hdfsWriter.write("\n") } } br.close() fdis.close() }) hdfsWriter.close() out.write(session.read.json(hdfsPathTemporary)) } }
Example 45
Source File: DirectParquetOutputCommitter.scala From utils with Apache License 2.0 | 5 votes |
package com.indix.utils.spark.parquet import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.parquet.Log import org.apache.parquet.hadoop.util.ContextUtil import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat} class DirectParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext) extends ParquetOutputCommitter(outputPath, context) { val LOG = Log.getLog(classOf[ParquetOutputCommitter]) override def getWorkPath: Path = outputPath override def abortTask(taskContext: TaskAttemptContext): Unit = {} override def commitTask(taskContext: TaskAttemptContext): Unit = {} override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true override def setupJob(jobContext: JobContext): Unit = {} override def setupTask(taskContext: TaskAttemptContext): Unit = {} override def commitJob(jobContext: JobContext) { val configuration = ContextUtil.getConfiguration(jobContext) val fileSystem = outputPath.getFileSystem(configuration) LOG.info("Using DirectParquetOutputCommitter to commit parquet files") if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) { try { val outputStatus = fileSystem.getFileStatus(outputPath) val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus) try { ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers) } catch { case e: Exception => LOG.warn("Could not write summary file for " + outputPath, e) val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE) if (fileSystem.exists(metadataPath)) { fileSystem.delete(metadataPath, true) } } } catch { case e: Exception => LOG.warn("Could not write summary file for " + outputPath, e) } } if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) { try { val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME) fileSystem.create(successPath).close() } catch { case e: Exception => LOG.warn("Could not write success file for " + outputPath, e) } } } }
Example 46
Source File: MessageSink.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s.indefinite import java.sql.Timestamp import java.util.UUID import akka.Done import akka.kafka.CommitterSettings import akka.kafka.ConsumerMessage.CommittableOffsetBatch import akka.kafka.scaladsl.Committer import akka.stream.scaladsl.{Flow, Keep, Sink} import com.github.mjakubowski84.parquet4s.{ChunkPathBuilder, ParquetStreams, ParquetWriter} import com.google.common.io.Files import org.apache.hadoop.fs.Path import org.apache.parquet.hadoop.metadata.CompressionCodecName import scala.concurrent.Future import scala.concurrent.duration._ object MessageSink { case class Data(timestamp: Timestamp, word: String) val MaxChunkSize: Int = 128 val ChunkWriteTimeWindow: FiniteDuration = 10.seconds val WriteDirectoryName: String = "messages" } trait MessageSink { this: Akka => import MessageSink._ import MessageSource._ protected val baseWritePath: String = new Path(Files.createTempDir().getAbsolutePath, WriteDirectoryName).toString private val writerOptions = ParquetWriter.Options(compressionCodecName = CompressionCodecName.SNAPPY) private lazy val committerSink = Flow.apply[Seq[Message]].map { messages => CommittableOffsetBatch(messages.map(_.committableOffset)) }.toMat(Committer.sink(CommitterSettings(system)))(Keep.right) def chunkPath: ChunkPathBuilder[Message] = { case (basePath, chunk) => val lastElementDateTime = new Timestamp(chunk.last.record.timestamp()).toLocalDateTime val year = lastElementDateTime.getYear val month = lastElementDateTime.getMonthValue val day = lastElementDateTime.getDayOfMonth val uuid = UUID.randomUUID() basePath.suffix(s"/$year/$month/$day/part-$uuid.parquet") } lazy val messageSink: Sink[Message, Future[Done]] = ParquetStreams.toParquetIndefinite( path = baseWritePath, maxChunkSize = MaxChunkSize, chunkWriteTimeWindow = ChunkWriteTimeWindow, buildChunkPath = chunkPath, preWriteTransformation = { message: Message => Data( timestamp = new Timestamp(message.record.timestamp()), word = message.record.value() ) }, postWriteSink = committerSink, options = writerOptions ) }
Example 47
Source File: UnorderedParallelParquetSink.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s import java.util.UUID import akka.Done import akka.stream.scaladsl.{Flow, Keep, Sink} import org.apache.hadoop.fs.Path import org.apache.parquet.schema.MessageType import org.slf4j.{Logger, LoggerFactory} import scala.concurrent.Future private[parquet4s] object UnorderedParallelParquetSink extends IOOps { protected val logger: Logger = LoggerFactory.getLogger(this.getClass) def apply[T: ParquetRecordEncoder : ParquetSchemaResolver](path: Path, parallelism: Int, options: ParquetWriter.Options = ParquetWriter.Options() ): Sink[T, Future[Done]] = { val schema = ParquetSchemaResolver.resolveSchema[T] val valueCodecConfiguration = options.toValueCodecConfiguration validateWritePath(path, options) def encode(data: T): RowParquetRecord = ParquetRecordEncoder.encode[T](data, valueCodecConfiguration) Flow[T] .zipWithIndex .groupBy(parallelism, elemAndIndex => Math.floorMod(elemAndIndex._2, parallelism)) .map(elemAndIndex => encode(elemAndIndex._1)) .fold(UnorderedChunk(path, schema, options))(_.write(_)) .map(_.close()) .async .mergeSubstreamsWithParallelism(parallelism) .toMat(Sink.ignore)(Keep.right) } private trait UnorderedChunk { def write(record: RowParquetRecord): UnorderedChunk def close(): Unit } private object UnorderedChunk { def apply(basePath: Path, schema: MessageType, options: ParquetWriter.Options): UnorderedChunk = new PendingUnorderedChunk(basePath, schema, options) private[UnorderedChunk] class PendingUnorderedChunk(basePath: Path, schema: MessageType, options: ParquetWriter.Options) extends UnorderedChunk { override def write(record: RowParquetRecord): UnorderedChunk = { val chunkPath = Path.mergePaths(basePath, new Path(s"/part-${UUID.randomUUID()}.parquet")) val writer = ParquetWriter.internalWriter(chunkPath, schema, options) writer.write(record) new StartedUnorderedChunk(chunkPath, writer, acc = 1) } override def close(): Unit = () } private[UnorderedChunk] class StartedUnorderedChunk(chunkPath: Path, writer: ParquetWriter.InternalWriter, acc: Long ) extends UnorderedChunk { override def write(record: RowParquetRecord): UnorderedChunk = { writer.write(record) new StartedUnorderedChunk(chunkPath, writer, acc = acc + 1) } override def close(): Unit = { if (logger.isDebugEnabled) logger.debug(s"$acc records were successfully written to $chunkPath") writer.close() } } } }
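Chunk paths are built with Path.mergePaths, which appends the second path's path component to the first while keeping the first path's scheme and authority; a tiny sketch with a hypothetical base path:
import org.apache.hadoop.fs.Path

val base = new Path("hdfs://namenode/output/words")          // hypothetical base path
val chunk = Path.mergePaths(base, new Path("/part-0001.parquet"))
// chunk.toString == "hdfs://namenode/output/words/part-0001.parquet"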
Example 48
Source File: IndefiniteStreamParquetSink.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s import akka.stream.FlowShape import akka.stream.scaladsl.{Broadcast, Flow, GraphDSL, Keep, Sink, ZipWith} import com.github.mjakubowski84.parquet4s.ParquetWriter.ParquetWriterFactory import org.apache.hadoop.fs.Path import org.slf4j.{Logger, LoggerFactory} import scala.concurrent.duration.FiniteDuration private[parquet4s] object IndefiniteStreamParquetSink extends IOOps { protected val logger: Logger = LoggerFactory.getLogger(this.getClass) def apply[In, ToWrite: ParquetWriterFactory, Mat](path: Path, maxChunkSize: Int, chunkWriteTimeWindow: FiniteDuration, buildChunkPath: ChunkPathBuilder[In] = ChunkPathBuilder.default, preWriteTransformation: In => ToWrite = identity[In] _, postWriteSink: Sink[Seq[In], Mat] = Sink.ignore, options: ParquetWriter.Options = ParquetWriter.Options() ): Sink[In, Mat] = { validateWritePath(path, options) val internalFlow = Flow.fromGraph(GraphDSL.create() { implicit b => import GraphDSL.Implicits._ val inChunkFlow = b.add(Flow[In].groupedWithin(maxChunkSize, chunkWriteTimeWindow)) val broadcastChunks = b.add(Broadcast[Seq[In]](outputPorts = 2)) val writeFlow = Flow[Seq[In]].map { chunk => val toWrite = chunk.map(preWriteTransformation) val chunkPath = buildChunkPath(path, chunk) if (logger.isDebugEnabled()) logger.debug(s"Writing ${toWrite.size} records to $chunkPath") ParquetWriter.writeAndClose(chunkPath.toString, toWrite, options) } val zip = b.add(ZipWith[Seq[In], Unit, Seq[In]]((chunk, _) => chunk)) inChunkFlow ~> broadcastChunks ~> writeFlow ~> zip.in1 broadcastChunks ~> zip.in0 FlowShape(inChunkFlow.in, zip.out) }) internalFlow.toMat(postWriteSink)(Keep.right) } }
Example 49
Source File: SingleFileParquetSink.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s import akka.Done import akka.stream.scaladsl.{Flow, Keep, Sink} import org.apache.hadoop.fs.Path import org.slf4j.{Logger, LoggerFactory} import scala.concurrent.Future private[parquet4s] object SingleFileParquetSink { protected val logger: Logger = LoggerFactory.getLogger(this.getClass) def apply[T: ParquetRecordEncoder : ParquetSchemaResolver](path: Path, options: ParquetWriter.Options = ParquetWriter.Options() ): Sink[T, Future[Done]] = { val schema = ParquetSchemaResolver.resolveSchema[T] val writer = ParquetWriter.internalWriter(path, schema, options) val valueCodecConfiguration = options.toValueCodecConfiguration val isDebugEnabled = logger.isDebugEnabled def encode(data: T): RowParquetRecord = ParquetRecordEncoder.encode[T](data, valueCodecConfiguration) Flow[T] .map(encode) .fold(0) { case (acc, record) => writer.write(record); acc + 1} .map { count => if (isDebugEnabled) logger.debug(s"$count records were successfully written to $path") writer.close() } .toMat(Sink.ignore)(Keep.right) } }
Example 50
Source File: IOOps.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s import org.apache.hadoop.fs.Path import org.apache.hadoop.io.SecureIOUtils.AlreadyExistsException import org.apache.parquet.hadoop.ParquetFileWriter import org.slf4j.Logger import scala.concurrent.{ExecutionContext, Future} import scala.util.Try trait IOOps { protected val logger: Logger protected def validateWritePath(path: Path, writeOptions: ParquetWriter.Options): Unit = { val fs = path.getFileSystem(writeOptions.hadoopConf) try { if (fs.exists(path)) { if (writeOptions.writeMode == ParquetFileWriter.Mode.CREATE) throw new AlreadyExistsException(s"File or directory already exists: $path") else { if (logger.isDebugEnabled) logger.debug(s"Deleting $path in order to override with new data.") fs.delete(path, true) } } } finally fs.close() } protected def filesAtPath(path: Path, writeOptions: ParquetWriter.Options) (implicit ec: ExecutionContext): Future[List[String]] = Future { scala.concurrent.blocking { val fs = path.getFileSystem(writeOptions.hadoopConf) try { val iter = fs.listFiles(path, false) Stream .continually(Try(iter.next())) .takeWhile(_.isSuccess) .map(_.get) .map(_.getPath.getName) .toList } finally fs.close() } } protected def filesAtPath(path: String, writeOptions: ParquetWriter.Options) (implicit ec: ExecutionContext): Future[List[String]] = filesAtPath(new Path(path), writeOptions) }
Example 51
Source File: SequentialFileSplittingParquetSink.scala From parquet4s with MIT License | 5 votes |
package com.github.mjakubowski84.parquet4s import akka.Done import akka.stream.scaladsl.{Flow, Keep, Sink} import org.apache.hadoop.fs.Path import org.apache.parquet.schema.MessageType import org.slf4j.{Logger, LoggerFactory} import scala.concurrent.Future private[parquet4s] object SequentialFileSplittingParquetSink extends IOOps { protected val logger: Logger = LoggerFactory.getLogger(this.getClass) def apply[T: ParquetRecordEncoder : ParquetSchemaResolver](path: Path, maxRecordsPerFile: Long, options: ParquetWriter.Options = ParquetWriter.Options() ): Sink[T, Future[Done]] = { val schema = ParquetSchemaResolver.resolveSchema[T] val valueCodecConfiguration = options.toValueCodecConfiguration validateWritePath(path, options) def encode(data: T): RowParquetRecord = ParquetRecordEncoder.encode[T](data, valueCodecConfiguration) Flow[T] .zipWithIndex .map { case (elem, index) => OrderedChunkElem(encode(elem), index) } .fold(OrderedChunk(path, schema, maxRecordsPerFile, options))(_.write(_)) .map(_.close()) .toMat(Sink.ignore)(Keep.right) } private case class OrderedChunkElem(record: RowParquetRecord, index: Long) { def isSplit(maxRecordsPerFile: Long): Boolean = index % maxRecordsPerFile == 0 } private trait OrderedChunk { def write(elem: OrderedChunkElem): OrderedChunk def close(): Unit } private object OrderedChunk { def apply(basePath: Path, schema: MessageType, maxRecordsPerFile: Long, options: ParquetWriter.Options): OrderedChunk = new PendingOrderedChunk(basePath, schema, maxRecordsPerFile, options) private[OrderedChunk] class PendingOrderedChunk(basePath: Path, schema: MessageType, maxRecordsPerFile: Long, options: ParquetWriter.Options) extends OrderedChunk { override def write(elem: OrderedChunkElem): OrderedChunk = { val chunkNumber: Int = Math.floorDiv(elem.index, maxRecordsPerFile).toInt val chunkPath = Path.mergePaths(basePath, new Path(chunkFileName(chunkNumber))) val writer = ParquetWriter.internalWriter(chunkPath, schema, options) writer.write(elem.record) new StartedOrderedChunk(basePath, schema, maxRecordsPerFile, options, chunkPath, writer, acc = 1) } override def close(): Unit = () private def chunkFileName(chunkNumber: Int): String = f"/part-$chunkNumber%05d.parquet" } private[OrderedChunk] class StartedOrderedChunk(basePath: Path, schema: MessageType, maxRecordsPerFile: Long, options: ParquetWriter.Options, chunkPath: Path, writer: ParquetWriter.InternalWriter, acc: Long) extends OrderedChunk { override def write(elem: OrderedChunkElem): OrderedChunk = { if (elem.isSplit(maxRecordsPerFile)) { this.close() new PendingOrderedChunk(basePath, schema, maxRecordsPerFile, options).write(elem) } else { writer.write(elem.record) new StartedOrderedChunk(basePath, schema, maxRecordsPerFile, options, chunkPath, writer, acc = acc + 1) } } override def close(): Unit = { if (logger.isDebugEnabled) logger.debug(s"$acc records were successfully written to $chunkPath") writer.close() } } } }
Example 52
Source File: QueryExecution.scala From daf with BSD 3-Clause "New" or "Revised" License | 5 votes |
package controllers import akka.stream.scaladsl.Source import cats.syntax.show.toShow import daf.dataset._ import daf.dataset.query.jdbc.{ JdbcResult, QueryFragmentWriterSyntax, Writers } import daf.dataset.query.Query import daf.web._ import daf.filesystem._ import daf.instances.FileSystemInstance import it.gov.daf.common.utils._ import org.apache.hadoop.fs.Path import play.api.libs.json.JsValue import scala.concurrent.Future import scala.util.{ Failure, Success, Try } trait QueryExecution { this: DatasetController with DatasetExport with FileSystemInstance => private def extractDatabaseName(parent: String, params: FileDatasetParams) = parent.toLowerCase match { case "opendata" => params.extraParams.get("theme").map { s => s"opendata__${s.toLowerCase}" } getOrElse "opendata" // append __{theme} for opendata case other => other // use the parent dir for other data } private def extractTableName(path: Path, params: FileDatasetParams): Try[String] = Try { s"${extractDatabaseName(path.getParent.getName, params)}.${path.getName.toLowerCase}" } private def extractTableName(params: DatasetParams, userId: String): Try[String] = params match { case kudu: KuduDatasetParams => (proxyUser as userId) { downloadService.tableInfo(kudu.table) } map { _ => kudu.table } case file: FileDatasetParams => (proxyUser as userId) { extractTableName(file.path.asHadoop.resolve, file) } } private def prepareQuery(params: DatasetParams, query: Query, userId: String) = for { tableName <- extractTableName(params, userId) fragment <- Writers.sql(query, tableName).write } yield fragment.query[Unit].sql private def analyzeQuery(params: DatasetParams, query: Query, userId: String) = for { tableName <- extractTableName(params, userId) analysis <- queryService.explain(query, tableName, userId) } yield analysis private def transform(jdbcResult: JdbcResult, targetFormat: FileDataFormat) = targetFormat match { case CsvFileFormat => Try { Source[String](jdbcResult.toCsv).map { csv => s"$csv${System.lineSeparator}" } } case JsonFileFormat => Try { wrapJson { Source[JsValue](jdbcResult.toJson).map { _.toString } } } case _ => Failure { new IllegalArgumentException(s"Invalid target format [$targetFormat]; must be [csv | json]") } } // Web // Failure private def failQuickExec(params: DatasetParams, targetFormat: FileDataFormat) = Future.successful { TemporaryRedirect { s"${controllers.routes.DatasetController.queryDataset(params.catalogUri, targetFormat.show, "batch").url}" } } // Executions private def doBatchExec(params: DatasetParams, query: Query, targetFormat: FileDataFormat, userId: String) = prepareQuery(params, query, userId) match { case Success(sql) => prepareQueryExport(sql, targetFormat).map { formatExport(_, targetFormat) } case Failure(error) => Future.failed { error } } private def doQuickExec(params: DatasetParams, query: Query, targetFormat: FileDataFormat, userId: String) = for { tableName <- extractTableName(params, userId) jdbcResult <- queryService.exec(query, tableName, userId) data <- transform(jdbcResult, targetFormat) } yield data // API protected def quickExec(params: DatasetParams, query: Query, targetFormat: FileDataFormat, userId: String) = analyzeQuery(params, query, userId) match { case Success(analysis) if analysis.memoryEstimation <= impalaConfig.memoryEstimationLimit => doQuickExec(params, query, targetFormat, userId).~>[Future].map { respond(_, params.name, targetFormat) } case Success(_) => failQuickExec(params, targetFormat) case Failure(error) => Future.failed { error } } protected def 
batchExec(params: DatasetParams, query: Query, targetFormat: FileDataFormat, userId: String) = doBatchExec(params, query, targetFormat, userId).map { respond(_, params.name, targetFormat) } protected def exec(params: DatasetParams, query: Query, userId: String, targetFormat: FileDataFormat, method: DownloadMethod) = method match { case QuickDownloadMethod => quickExec(params, query, targetFormat, userId) case BatchDownloadMethod => batchExec(params, query, targetFormat, userId) } }
Example 53
Source File: FileExportJob.scala From daf with BSD 3-Clause "New" or "Revised" License | 5 votes |
package daf.dataset.export import daf.dataset.ExtraParams import daf.filesystem._ import org.apache.hadoop.fs.Path import org.apache.livy.{ Job, JobContext } import org.apache.spark.sql._ import scala.util.{ Failure, Success, Try } class FileExportJob(val from: FileExportInfo, val to: FileExportInfo, val extraParams: Map[String, String], limit: Option[Int]) extends Job[String] { private val csvDelimiter = extraParams.getOrElse("separator", ",") private val csvIncludeHeader = true private val csvInferSchema = true // Export private def prepareCsvReader(reader: DataFrameReader) = reader .option("inferSchema", csvInferSchema) .option("header", csvIncludeHeader) .option("delimiter", csvDelimiter) private def prepareCsvWriter(writer: DataFrameWriter[Row]) = writer .option("header", csvIncludeHeader) .option("delimiter", csvDelimiter) private def read(session: SparkSession) = from match { case FileExportInfo(path, RawFileFormat | CsvFileFormat) => prepareCsvReader(session.read).csv(path) case FileExportInfo(path, ParquetFileFormat) => session.read.parquet(path) case FileExportInfo(path, JsonFileFormat) => session.read.json(path) case FileExportInfo(_, unsupported) => throw new IllegalArgumentException(s"Input file format [$unsupported] is invalid") } private def addLimit(data: DataFrame) = limit match { case Some(value) => data.limit(value) case None => data } private def write(data: DataFrame) = to match { case FileExportInfo(path, CsvFileFormat) => prepareCsvWriter(data.write).csv(path) case FileExportInfo(path, JsonFileFormat) => data.write.json(path) case FileExportInfo(_, unsupported) => throw new IllegalArgumentException(s"Output file format [$unsupported] is invalid") } private def doExport(session: SparkSession) = for { data <- Try { read(session) } limited <- Try { addLimit(data) } _ <- Try { write(limited) } } yield () override def call(jobContext: JobContext) = doExport { jobContext.sqlctx().sparkSession } match { case Success(_) => to.path case Failure(error) => throw new RuntimeException("Export Job execution failed", error) } } object FileExportJob { def create(inputPath: String, outputPath: String, from: FileDataFormat, to: FileDataFormat, extraParams: ExtraParams = Map.empty[String, String], limit: Option[Int]) = new FileExportJob( FileExportInfo(inputPath, from), FileExportInfo(outputPath, to), extraParams, limit ) } case class FileExportInfo(path: String, format: FileDataFormat) object FileExportInfo { def apply(path: Path, format: FileDataFormat): FileExportInfo = apply(path.toUri.getPath, format) }
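FileExportInfo.apply converts a Path to a plain string with toUri.getPath, which drops the scheme and authority; a quick sketch of the difference:
import org.apache.hadoop.fs.Path

val p = new Path("hdfs://namenode:8020/exports/run-42/data.csv")   // hypothetical export path
p.toString        // "hdfs://namenode:8020/exports/run-42/data.csv"
p.toUri.getPath   // "/exports/run-42/data.csv" (scheme and authority removed)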
Example 54
Source File: MergeStrategySpec.scala From daf with BSD 3-Clause "New" or "Revised" License | 5 votes |
package daf.filesystem import java.io.{ Closeable, InputStream } import java.util.Scanner import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{ FSDataInputStream, FSDataOutputStream, FileSystem, Path } import org.scalatest.{ BeforeAndAfterAll, Matchers, WordSpec } import scala.collection.convert.decorateAsScala._ import scala.util.{ Random, Try } class MergeStrategySpec extends WordSpec with Matchers with BeforeAndAfterAll { private implicit val fileSystem = FileSystem.getLocal(new Configuration) private val numFiles = 10 private val baseDir = "test-dir".asHadoop private val workingDir = baseDir / f"merge-strategy-spec-${Random.nextInt(10000)}%05d" private def safely[A <: Closeable, U](f: A => U) = { stream: A => val attempt = Try { f(stream) } stream.close() attempt } private def readFile(path: Path) = safely[FSDataInputStream, Seq[String]] { _.scanner.asScala.toSeq } apply fileSystem.open(path) private def readFiles = Try { fileSystem.listStatus(workingDir).toSeq.flatMap { status => readFile(status.getPath).get } } private def openFiles = Try { fileSystem.listStatus(workingDir).toSeq.map { status => fileSystem.open(status.getPath) } } private def createFile(fileName: String) = safely[FSDataOutputStream, Unit] { stream => Random.alphanumeric.grouped(200).take(10).map { randomSplits(_) }.foreach { row => stream.writeUTF { row.mkString("", ",", "\n") } } } apply fileSystem.create { workingDir / fileName } private def randomSplits(chars: Stream[Char], strings: Seq[String] = Seq.empty): Seq[String] = chars.splitAt { Random.nextInt(10) + 5 } match { case (head, tail) if tail.isEmpty => head.drop(1).mkString +: strings case (head, tail) => randomSplits(tail, head.mkString +: strings) } private def createWorkingDir = Try { fileSystem.mkdirs(workingDir) } private def createFiles = Try { 0 until numFiles foreach { index => createFile(s"test-file-$index").get } // this is relatively nasty, and should be handled in a `traverse` } private def prepareData = for { _ <- createWorkingDir _ <- createFiles } yield () private def purgeData = Try { fileSystem.delete(workingDir, true) } override def beforeAll() = prepareData.get override def afterAll() = purgeData.get "MergeStrategies info" when { "given compressed format files" must { "throw an exception" in { an[IllegalArgumentException] must be thrownBy MergeStrategies.find { FileInfo(workingDir / "test-file-0", 0, FileDataFormats.raw, FileCompressionFormats.gzip) } } } "given data as csv" must { "drop one line and merge the rest" in { safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt => for { merged <- attempt expected <- readFiles } merged.size should be { expected.size - numFiles + 1 } } apply MergeStrategies.csv.merge { openFiles.get } } } "given data as json" must { "just merge the files into one" in { safely[InputStream, Seq[String]] { new Scanner(_).asScala.toList }.andThen { attempt => for { merged <- attempt expected <- readFiles } merged.size should be { expected.size } } apply MergeStrategies.json.merge { openFiles.get } } } } }
Example 55
Source File: HiveUtils.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.engine.hive.common import java.io.File import java.nio.file.Paths import com.webank.wedatasphere.linkis.common.conf.{Configuration => CommonConfiguration} import com.webank.wedatasphere.linkis.engine.hive.exception.HadoopConfSetFailedException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.conf import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.Driver object HiveUtils { def jarOfClass(cls: Class[_]):Option[String] = { val uri = cls.getResource("/" + cls.getName.replace('.', '/') + ".class") if (uri != null) { val uriStr = uri.toString if (uriStr.startsWith("jar:file:")) { Some(uriStr.substring("jar:file:".length, uriStr.indexOf("!"))) } else { None } } else { None } } def getHiveConf:HiveConf = { val confDir:File = new File(CommonConfiguration.hadoopConfDir) if (!confDir.exists() || confDir.isFile){ throw HadoopConfSetFailedException(41001, "hadoop conf set failed, reason: conf dir does not exist") } val hadoopConf:Configuration = new Configuration() hadoopConf.addResource(new Path(Paths.get(CommonConfiguration.hadoopConfDir, "core-site.xml").toAbsolutePath.toFile.getAbsolutePath)) hadoopConf.addResource(new Path(Paths.get(CommonConfiguration.hadoopConfDir, "hdfs-site.xml").toAbsolutePath.toFile.getAbsolutePath)) hadoopConf.addResource(new Path(Paths.get(CommonConfiguration.hadoopConfDir, "yarn-site.xml").toAbsolutePath.toFile.getAbsolutePath)) new conf.HiveConf(hadoopConf, classOf[Driver]) } def msDurationToString(ms: Long): String = { val second = 1000 val minute = 60 * second val hour = 60 * minute ms match { case t if t < second => "%d ms".format(t) case t if t < minute => "%.1f s".format(t.toFloat / second) case t if t < hour => "%.1f m".format(t.toFloat / minute) case t => "%.2f h".format(t.toFloat / hour) } } def main(args: Array[String]): Unit = { jarOfClass(classOf[Driver]).foreach(println) } }
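Hadoop Configuration accepts extra resource files as Paths; a minimal sketch of the same idea with a hypothetical conf directory:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

val conf = new Configuration()
conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"))   // hypothetical conf dir
conf.addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"))
conf.addResource(new Path("/etc/hadoop/conf/yarn-site.xml"))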
Example 56
Source File: DefaultSource.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.spark.excel import org.apache.hadoop.fs.Path import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { override def createRelation( sqlContext: SQLContext, parameters: Map[String, String], schema: StructType ): ExcelRelation = { ExcelRelation( location = checkParameter(parameters, "path"), sheetName = parameters.get("sheetName"), useHeader = checkParameter(parameters, "useHeader").toBoolean, treatEmptyValuesAsNulls = parameters.get("treatEmptyValuesAsNulls").fold(true)(_.toBoolean), userSchema = Option(schema), inferSheetSchema = parameters.get("inferSchema").fold(false)(_.toBoolean), addColorColumns = parameters.get("addColorColumns").fold(false)(_.toBoolean), startColumn = parameters.get("startColumn").fold(0)(_.toInt), endColumn = parameters.get("endColumn").fold(Int.MaxValue)(_.toInt), timestampFormat = parameters.get("timestampFormat"), maxRowsInMemory = parameters.get("maxRowsInMemory").map(_.toInt), excerptSize = parameters.get("excerptSize").fold(10)(_.toInt), parameters = parameters, dateFormat = parameters.get("dateFormats").getOrElse("yyyy-MM-dd").split(";").toList )(sqlContext) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame ): BaseRelation = { val path = checkParameter(parameters, "path") val sheetName = parameters.getOrElse("sheetName", "Sheet1") val useHeader = checkParameter(parameters, "useHeader").toBoolean val dateFormat = parameters.getOrElse("dateFormat", ExcelFileSaver.DEFAULT_DATE_FORMAT) val timestampFormat = parameters.getOrElse("timestampFormat", ExcelFileSaver.DEFAULT_TIMESTAMP_FORMAT) val filesystemPath = new Path(path) val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) fs.setWriteChecksum(false) val doSave = if (fs.exists(filesystemPath)) { mode match { case SaveMode.Append => sys.error(s"Append mode is not supported by ${this.getClass.getCanonicalName}") case SaveMode.Overwrite => fs.delete(filesystemPath, true) true case SaveMode.ErrorIfExists => sys.error(s"path $path already exists.") case SaveMode.Ignore => false } } else { true } if (doSave) { // Only save data when the save mode is not ignore. (new ExcelFileSaver(fs)).save( filesystemPath, data, sheetName = sheetName, useHeader = useHeader, dateFormat = dateFormat, timestampFormat = timestampFormat ) } createRelation(sqlContext, parameters, data.schema) } // Forces a Parameter to exist, otherwise an exception is thrown. private def checkParameter(map: Map[String, String], param: String): String = { if (!map.contains(param)) { throw new IllegalArgumentException(s"Parameter ${'"'}$param${'"'} is missing in options.") } else { map.apply(param) } } // Gets the Parameter if it exists, otherwise returns the default argument private def parameterOrDefault(map: Map[String, String], param: String, default: String) = map.getOrElse(param, default) }
Example 57
Source File: HDFSUtils.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.linkis.hadoop.common.utils import java.io.File import java.nio.file.Paths import java.security.PrivilegedExceptionAction import com.webank.wedatasphere.linkis.common.conf.Configuration.hadoopConfDir import com.webank.wedatasphere.linkis.hadoop.common.conf.HadoopConf._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.UserGroupInformation object HDFSUtils { def getConfiguration(user: String): Configuration = getConfiguration(user, hadoopConfDir) def getConfiguration(user: String, hadoopConfDir: String): Configuration = { val confPath = new File(hadoopConfDir) if(!confPath.exists() || confPath.isFile) { throw new RuntimeException(s"Create hadoop configuration failed, path $hadoopConfDir not exists.") } val conf = new Configuration() conf.addResource(new Path(Paths.get(hadoopConfDir, "core-site.xml").toAbsolutePath.toFile.getAbsolutePath)) conf.addResource(new Path(Paths.get(hadoopConfDir, "hdfs-site.xml").toAbsolutePath.toFile.getAbsolutePath)) conf.addResource(new Path(Paths.get(hadoopConfDir, "yarn-site.xml").toAbsolutePath.toFile.getAbsolutePath)) conf } def getHDFSRootUserFileSystem: FileSystem = getHDFSRootUserFileSystem(getConfiguration(HADOOP_ROOT_USER.getValue)) def getHDFSRootUserFileSystem(conf: org.apache.hadoop.conf.Configuration): FileSystem = getHDFSUserFileSystem(HADOOP_ROOT_USER.getValue, conf) def getHDFSUserFileSystem(userName: String): FileSystem = getHDFSUserFileSystem(userName, getConfiguration(userName)) def getHDFSUserFileSystem(userName: String, conf: org.apache.hadoop.conf.Configuration): FileSystem = getUserGroupInformation(userName) .doAs(new PrivilegedExceptionAction[FileSystem]{ def run = FileSystem.get(conf) }) def getUserGroupInformation(userName: String): UserGroupInformation ={ if(KERBEROS_ENABLE.getValue) { val path = new File(KEYTAB_FILE.getValue , userName + ".keytab").getPath val user = getKerberosUser(userName) UserGroupInformation.setConfiguration(getConfiguration(userName)) UserGroupInformation.loginUserFromKeytabAndReturnUGI(user, path) } else { UserGroupInformation.createRemoteUser(userName) } } def getKerberosUser(userName: String): String = { var user = userName if(KEYTAB_HOST_ENABLED.getValue){ user = user+ "/" + KEYTAB_HOST.getValue } user } }
Example 58
Source File: HiveInputFormat.scala From connectors with Apache License 2.0 | 5 votes |
package io.delta.hive import org.apache.hadoop.fs.Path import org.apache.hadoop.mapred.JobConf class HiveInputFormat extends org.apache.hadoop.hive.ql.io.HiveInputFormat { override def pushProjectionsAndFilters( jobConf: JobConf, inputFormatClass: Class[_], splitPath: Path, nonNative: Boolean): Unit = { if (inputFormatClass == classOf[DeltaInputFormat]) { super.pushProjectionsAndFilters(jobConf, inputFormatClass, splitPath, false) } else { super.pushProjectionsAndFilters(jobConf, inputFormatClass, splitPath, nonNative) } } }
Example 59
Source File: PointCloudRelation.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.datasource import geotrellis.pointcloud.spark.store.hadoop._ import geotrellis.pointcloud.spark.store.hadoop.HadoopPointCloudRDD.{Options => HadoopOptions} import geotrellis.pointcloud.util.Filesystem import geotrellis.proj4.CRS import geotrellis.store.hadoop.util.HdfsUtils import geotrellis.vector.Extent import cats.implicits._ import io.pdal._ import io.circe.syntax._ import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, SQLContext} import java.io.File import scala.collection.JavaConverters._ // This class has to be serializable since it is shipped over the network. class PointCloudRelation( val sqlContext: SQLContext, path: String, options: HadoopOptions ) extends BaseRelation with TableScan with Serializable { @transient implicit lazy val sc: SparkContext = sqlContext.sparkContext // TODO: switch between HadoopPointCloudRDD and S3PointcCloudRDD lazy val isS3: Boolean = path.startsWith("s3") override def schema: StructType = { lazy val (local, fixedPath) = if(path.startsWith("s3") || path.startsWith("hdfs")) { val tmpDir = Filesystem.createDirectory() val remotePath = new Path(path) // copy remote file into local tmp dir val localPath = new File(tmpDir, remotePath.getName) HdfsUtils.copyPath(remotePath, new Path(s"file:///${localPath.getAbsolutePath}"), sc.hadoopConfiguration) (true, localPath.toString) } else (false, path) val localPipeline = options.pipeline .hcursor .downField("pipeline").downArray .downField("filename").withFocus(_ => fixedPath.asJson) .top.fold(options.pipeline)(identity) val pl = Pipeline(localPipeline.noSpaces) if (pl.validate()) pl.execute() val pointCloud = try { pl.getPointViews().next().getPointCloud(0) } finally { pl.close() if(local) println(new File(fixedPath).delete) } val rdd = HadoopPointCloudRDD(new Path(path), options) val md: (Option[Extent], Option[CRS]) = rdd .map { case (header, _) => (header.projectedExtent3D.map(_.extent3d.toExtent), header.crs) } .reduce { case ((e1, c), (e2, _)) => ((e1, e2).mapN(_ combine _), c) } val metadata = new MetadataBuilder().putString("metadata", md.asJson.noSpaces).build pointCloud.deriveSchema(metadata) } override def buildScan(): RDD[Row] = { val rdd = HadoopPointCloudRDD(new Path(path), options) rdd.flatMap { _._2.flatMap { pc => pc.readAll.toList.map { k => Row(k: _*) } } } } }
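A sketch of instantiating the relation directly to inspect the inferred schema, assuming a live SQLContext and the default HadoopPointCloudRDD options; the HDFS path is hypothetical.

import geotrellis.pointcloud.spark.datasource.PointCloudRelation
import geotrellis.pointcloud.spark.store.hadoop.HadoopPointCloudRDD.{Options => HadoopOptions}
import org.apache.spark.sql.SQLContext

object PointCloudRelationSketch {
  // Builds the relation with the default PDAL pipeline options and prints the derived schema.
  def printSchema(sqlContext: SQLContext): Unit = {
    val relation = new PointCloudRelation(sqlContext, "hdfs:///data/lidar/tile.las", HadoopOptions.DEFAULT)
    relation.schema.printTreeString()
  }
}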
Example 60
Source File: HadoopPointCloudRDD.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.store.hadoop import geotrellis.pointcloud.spark.store.hadoop.formats._ import geotrellis.store.hadoop._ import geotrellis.vector.Extent import io.circe.Json import io.pdal._ import io.pdal.pipeline._ import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD def apply(path: Path, options: Options = Options.DEFAULT)(implicit sc: SparkContext): RDD[(HadoopPointCloudHeader, List[PointCloud])] = { val conf = sc.hadoopConfiguration.withInputDirectory(path, options.filesExtensions) options.tmpDir.foreach(PointCloudInputFormat.setTmpDir(conf, _)) options.dimTypes.foreach(PointCloudInputFormat.setDimTypes(conf, _)) PointCloudInputFormat.setPipeline(conf, options.pipeline) options.filterExtent match { case Some(filterExtent) => PointCloudInputFormat.setFilterExtent(conf, filterExtent) sc.newAPIHadoopRDD( conf, classOf[PointCloudInputFormat], classOf[HadoopPointCloudHeader], classOf[List[PointCloud]] ).filter { case (header, _) => header.extent3D.map(_.toExtent.intersects(filterExtent)).getOrElse(false) } case None => sc.newAPIHadoopRDD( conf, classOf[PointCloudInputFormat], classOf[HadoopPointCloudHeader], classOf[List[PointCloud]] ) } } }
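A sketch of reading point clouds through the RDD builder above, assuming an active SparkContext and that Options is a case class exposing filterExtent; the path and extent are hypothetical.

import geotrellis.pointcloud.spark.store.hadoop.HadoopPointCloudRDD
import geotrellis.pointcloud.spark.store.hadoop.HadoopPointCloudRDD.{Options => HadoopOptions}
import geotrellis.vector.Extent
import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext

object PointCloudRDDSketch {
  // Counts the input tiles whose headers intersect the given extent.
  def countInExtent(path: String, extent: Extent)(implicit sc: SparkContext): Long = {
    val options = HadoopOptions.DEFAULT.copy(filterExtent = Some(extent))
    HadoopPointCloudRDD(new Path(path), options).count()
  }
}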
Example 61
Source File: PointCloudTestEnvironment.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark import geotrellis.spark.testkit._ import org.apache.hadoop.fs.Path import org.scalatest.Suite import java.io.File trait PointCloudTestEnvironment extends TestEnvironment { self: Suite => val testResources = new File("src/test/resources") val lasPath = new Path(s"file://${testResources.getAbsolutePath}/las") val multipleLasPath = new Path(s"file://${testResources.getAbsolutePath}/las/files") def setS3Credentials: Unit = { try { val conf = ssc.sparkContext.hadoopConfiguration conf.set("fs.s3.impl", classOf[org.apache.hadoop.fs.s3a.S3AFileSystem].getName) conf.set("fs.s3a.aws.credentials.provider", classOf[com.amazonaws.auth.DefaultAWSCredentialsProviderChain].getName) conf.set("fs.s3a.endpoint", "s3.eu-west-2.amazonaws.com") } catch { case e: Throwable => println(e.getMessage) } } }
Example 62
Source File: ModelSource.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common import java.io.{InputStreamReader, BufferedReader} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{Path, FileSystem} case class ModelSource( root: String, fs: FileSystem ) { def readFile(path: String): String = { val fsPath = filePath(path) val reader = new BufferedReader(new InputStreamReader(fs.open(fsPath))) val builder = new StringBuilder() var line: String = null while ({ line = reader.readLine(); line != null }) { builder.append(line + "\n") } builder.mkString } def findFile(dir: String, recursive: Boolean, f: String => Boolean): Option[Path] = { val dirPath = filePath(dir) if (fs.exists(dirPath) & fs.isDirectory(dirPath)) { val iter = fs.listFiles(dirPath, recursive) while (iter.hasNext) { val st = iter.next() if (st.isFile && f(st.getPath.getName)) return Some(st.getPath) } None } else { None } } def filePath(path: String): Path = { new Path(s"$root/$path") } } object ModelSource { def local(path: String): ModelSource = { ModelSource(path, FileSystem.getLocal(new Configuration())) } def hadoop(path: String, conf: Configuration): ModelSource = { val fs = FileSystem.get(conf) ModelSource(path, fs) } }
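A short usage sketch for ModelSource, assuming a locally exported Spark ML model directory; the paths are hypothetical.

import io.hydrosphere.spark_ml_serving.common.ModelSource

object ModelSourceSketch extends App {
  val source = ModelSource.local("/tmp/exported-model")
  // Reads a small text file relative to the model root.
  val metadata = source.readFile("metadata/part-00000")
  println(metadata)
  // Locates the first parquet part file under data/, if any.
  val parquetPart = source.findFile("data", recursive = true, _.endsWith(".parquet"))
  parquetPart.foreach(p => println(s"found: $p"))
}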
Example 63
Source File: ModelDataReader.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common import io.hydrosphere.spark_ml_serving.common.reader._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import parquet.format.converter.ParquetMetadataConverter.NO_FILTER import parquet.hadoop.{ParquetFileReader, ParquetReader} import parquet.schema.MessageType import scala.collection.immutable.HashMap import scala.collection.mutable object ModelDataReader { def parse(source: ModelSource, path: String): LocalData = { source.findFile(path, recursive = true, _.endsWith(".parquet")) match { case Some(p) => readData(p) case None => LocalData.empty } } private def readData(p: Path): LocalData = { val conf: Configuration = new Configuration() val metaData = ParquetFileReader.readFooter(conf, p, NO_FILTER) val schema: MessageType = metaData.getFileMetaData.getSchema val reader = ParquetReader.builder[SimpleRecord](new SimpleReadSupport(), p.getParent).build() var result = LocalData.empty try { var value = reader.read() while (value != null) { val valMap = value.struct(HashMap.empty[String, Any], schema) result = mergeMaps(result, valMap) value = reader.read() } result } finally { if (reader != null) { reader.close() } } } private def mergeMaps(acc: LocalData, map: HashMap[String, Any]) = { var result = acc map.foreach { case (k, v) => result = result.appendToColumn(k, List(v)) } result } }
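A sketch tying ModelSource and ModelDataReader together, assuming an exported model whose data/ folder contains parquet part files; the path is hypothetical.

import io.hydrosphere.spark_ml_serving.common.{ModelDataReader, ModelSource}

object ModelDataReaderSketch extends App {
  val source = ModelSource.local("/tmp/exported-model")
  // parse() finds the first .parquet file under "data" and folds its rows into a LocalData column map.
  val localData = ModelDataReader.parse(source, "data")
  println(localData)
}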
Example 64
Source File: Util.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util import java.io._ import java.util.concurrent.atomic.AtomicInteger import java.util.zip.{DeflaterOutputStream, InflaterInputStream} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hbase.HBaseConfiguration object Util { val iteration = new AtomicInteger(0) def getTempFilePath(conf: Configuration, prefix: String): String = { val fileSystem = FileSystem.get(conf) val path = new Path(s"$prefix-${System.currentTimeMillis()}-${iteration.getAndIncrement}") if (fileSystem.exists(path)) { fileSystem.delete(path, true) } path.getName } def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = { val bos = new ByteArrayOutputStream val deflaterOutputStream = new DeflaterOutputStream(bos) val dos = new DataOutputStream(deflaterOutputStream) configuration.write(dos) dos.close() bos.toByteArray } def deserializeHBaseConfiguration(arr: Array[Byte]) = { val conf = HBaseConfiguration.create conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr)))) conf } }
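A round-trip sketch for the configuration helpers above; only standard Hadoop/HBase classes are assumed, and the quorum value is hypothetical.

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.sql.hbase.util.Util

object HBaseConfRoundTrip extends App {
  val conf = HBaseConfiguration.create()
  conf.set("hbase.zookeeper.quorum", "zk1,zk2,zk3") // hypothetical quorum
  // Compress the configuration so it can be shipped inside a Spark task, then restore it.
  val bytes = Util.serializeHBaseConfiguration(conf)
  val restored = Util.deserializeHBaseConfiguration(bytes)
  println(restored.get("hbase.zookeeper.quorum"))
}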
Example 65
Source File: RecommenderSystem.scala From recommendersystem with Apache License 2.0 | 5 votes |
package com.infosupport.recommendedcontent.core import java.io.Serializable import akka.actor.{Props, Actor, ActorLogging} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.mllib.recommendation.MatrixFactorizationModel private def generateRecommendations(userId: Int, count: Int) = { log.info(s"Generating ${count} recommendations for user with ID ${userId}") // Generate recommendations based on the machine learning model. // When there's no trained model return an empty list instead. val results = model match { case Some(m) => m.recommendProducts(userId,count) .map(rating => Recommendation(rating.product,rating.rating)) .toList case None => Nil } sender ! Recommendations(results) } }
Example 66
Source File: FilePattern.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import com.sksamuel.exts.Logging import io.eels.util.HdfsIterator import org.apache.hadoop.fs.{FileSystem, Path} import scala.language.implicitConversions object FilePattern { def apply(path: Path)(implicit fs: FileSystem): FilePattern = apply(path.toString()) def apply(path: java.nio.file.Path)(implicit fs: FileSystem): FilePattern = apply(path.toAbsolutePath().toString(), { _ => true }) implicit def stringToFilePattern(str: String)(implicit fs: FileSystem): FilePattern = FilePattern(str) } case class FilePattern(pattern: String, filter: org.apache.hadoop.fs.Path => Boolean = { _ => true }) extends Logging { def isRegex(): Boolean = pattern.contains("*") def isDirectory(): Boolean = pattern.endsWith("/") def toPaths()(implicit fs: FileSystem): List[Path] = { val paths = if (isRegex) { val regex = new Path(pattern).getName.replace("*", ".*?") val dir = new Path(pattern).getParent logger.debug(s"File expansion will check path $dir for files matching $regex") HdfsIterator.remote(fs.listFiles(dir, false)).toList .map(_.getPath) .filter { path => path.getName.matches(regex) } .filter(filter) } else if (fs.isDirectory(new Path(pattern))) { val path = new Path(pattern.stripSuffix("/")) logger.debug(s"File expansion will search directory $path") HdfsIterator.remote(fs.listFiles(path, false)).map(_.getPath).toList.filter(fs.isFile).filter(filter) } else { List(new Path(pattern)) } logger.debug(s"toPaths has returned ${paths.size} paths, first 5: ${paths.take(5).mkString(",")}") paths } def withFilter(p: Path => Boolean): FilePattern = copy(filter = p) }
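A usage sketch for FilePattern, assuming a FileSystem is available implicitly; the glob is hypothetical.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import io.eels.FilePattern

object FilePatternSketch extends App {
  implicit val fs: FileSystem = FileSystem.get(new Configuration())
  // Expands the glob and keeps only part files.
  val paths: List[Path] = FilePattern("hdfs:///data/csv/*")
    .withFilter(_.getName.startsWith("part-"))
    .toPaths()
  paths.foreach(println)
}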
Example 67
Source File: HdfsMkdir.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.util import org.apache.hadoop.fs.{FileSystem, Path} object HdfsMkdir { def apply(path: Path, inheritPermissionsDefault: Boolean)(implicit fs: FileSystem): Unit = { if (!fs.exists(path)) { // iterate through the parents until we hit a parent that exists, then take that, which will give // us the first folder that exists val parent = Iterator.iterate(path)(_.getParent).dropWhile(false == fs.exists(_)).take(1).toList.head // using the folder that exists, get its permissions val permission = fs.getFileStatus(parent).getPermission fs.create(path, false) if (inheritPermissionsDefault) fs.setPermission(path, permission) } } }
Example 68
Source File: HdfsOps.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import com.sksamuel.exts.Logging import io.eels.util.{HdfsIterator, PathIterator} import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path} object HdfsOps extends Logging { def makePathVisible(path: Path)(implicit fs: FileSystem): Unit = { if (path.getName.startsWith(".")) { logger.info(s"Making $path visible by stripping leading .") val dest = new Path(path.getParent, path.getName.drop(1)) fs.rename(path, dest) } } def findFiles(path: Path, recursive: Boolean, fs: FileSystem): Iterator[LocatedFileStatus] = { HdfsIterator.remote(fs.listFiles(path, recursive)) } def mkdirsp(path: Path, fs: FileSystem): Boolean = PathIterator(path).forall(fs.mkdirs) }
Example 69
Source File: AvroSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import java.io.File import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import com.sksamuel.exts.io.Using import io.eels._ import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} case class AvroSource(path: Path) (implicit conf: Configuration, fs: FileSystem) extends Source with Using { override lazy val schema: StructType = { using(AvroReaderFns.createAvroReader(path)) { reader => val record = reader.next() AvroSchemaFns.fromAvroSchema(record.getSchema) } } override def parts(): Seq[Publisher[Seq[Row]]] = Seq(AvroSourcePublisher(path)) } case class AvroSourcePublisher(path: Path) (implicit conf: Configuration, fs: FileSystem) extends Publisher[Seq[Row]] with Logging with Using { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { val deserializer = new AvroDeserializer() try { using(AvroReaderFns.createAvroReader(path)) { reader => val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) AvroRecordIterator(reader) .takeWhile(_ => running.get) .map(deserializer.toRow) .grouped(DataStream.DefaultBatchSize) .foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } } object AvroSource { def apply(file: File)(implicit conf: Configuration, fs: FileSystem): AvroSource = AvroSource(new Path(file.getAbsoluteFile.toString)) def apply(path: java.nio.file.Path)(implicit conf: Configuration, fs: FileSystem): AvroSource = apply(path.toFile) }
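A reading sketch for AvroSource, assuming an existing Avro file; the path is hypothetical.

import java.io.File
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import io.eels.component.avro.AvroSource

object AvroSourceSketch extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val source = AvroSource(new File("/tmp/people.avro"))
  println(source.schema) // derived from the Avro schema of the first record
  source.toDataStream().collect.take(5).foreach(println)
}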
Example 70
Source File: AvroSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import java.io.File import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} case class AvroSink(path: Path, overwrite: Boolean = false, permission: Option[FsPermission] = None, inheritPermissions: Option[Boolean] = None) (implicit conf: Configuration, fs: FileSystem) extends Sink { def withOverwrite(overwrite: Boolean): AvroSink = copy(overwrite = overwrite) def withPermission(permission: FsPermission): AvroSink = copy(permission = Option(permission)) def withInheritPermission(inheritPermissions: Boolean): AvroSink = copy(inheritPermissions = Option(inheritPermissions)) override def open(schema: StructType): SinkWriter = new SinkWriter { private val writer = new AvroWriter(schema, fs.create(path, overwrite)) override def write(row: Row): Unit = writer.write(row) override def close(): Unit = { writer.close() permission match { case Some(perm) => fs.setPermission(path, perm) case None => if (inheritPermissions.getOrElse(false)) { val permission = fs.getFileStatus(path.getParent).getPermission fs.setPermission(path, permission) } } } } } object AvroSink { def apply(file: File)(implicit conf: Configuration, fs: FileSystem): AvroSink = AvroSink(new Path(file.getAbsoluteFile.toString)) def apply(path: java.nio.file.Path)(implicit conf: Configuration, fs: FileSystem): AvroSink = apply(path.toFile) }
Example 71
Source File: JsonSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.json import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.scala.DefaultScalaModule import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.fs.{FileSystem, Path} case class JsonSink(path: Path)(implicit fs: FileSystem) extends Sink { override def open(schema: StructType): SinkWriter = new SinkWriter { private val lock = new AnyRef() private val out = fs.create(path) val mapper = new ObjectMapper with ScalaObjectMapper mapper.registerModule(DefaultScalaModule) override def write(row: Row) { val map = schema.fieldNames.zip(row.values).toMap val json = mapper.writeValueAsString(map) lock.synchronized { out.writeBytes(json) out.writeBytes("\n") } } override def close() { out.close() } } }
Example 72
Source File: SequenceSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import java.io.StringWriter import com.univocity.parsers.csv.{CsvWriter, CsvWriterSettings} import io.eels.{Row, Sink, SinkWriter} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile} case class SequenceSink(path: Path)(implicit conf: Configuration) extends Sink { override def open(schema: StructType): SinkWriter = new SequenceSinkWriter(schema, path) class SequenceSinkWriter(schema: StructType, path: Path) extends SinkWriter { val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(path), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[BytesWritable]) ) val key = new IntWritable(0) val headers = valuesToCsv(schema.fieldNames()) writer.append(key, new BytesWritable(headers.getBytes)) override def close(): Unit = writer.close() override def write(row: Row): Unit = { this.synchronized { val csv = valuesToCsv(row.values) writer.append(key, new BytesWritable(csv.getBytes())) key.set(key.get() + 1) } } private def valuesToCsv(values: Seq[Any]): String = { val swriter = new StringWriter() val csv = new CsvWriter(swriter, new CsvWriterSettings()) csv.writeRow(values.map { case null => null case other => other.toString }: _*) csv.close() swriter.toString().trim() } } }
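A small write sketch for SequenceSink, using the DataStream API the way the tests later in this listing do; the output path is hypothetical.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import io.eels.component.sequence.SequenceSink
import io.eels.datastream.DataStream
import io.eels.schema.StructType

object SequenceSinkSketch extends App {
  implicit val conf = new Configuration()
  val ds = DataStream.fromValues(
    StructType("a", "b"),
    Seq(Vector("1", "2"), Vector("3", "4"))
  )
  // The first appended record carries the CSV-encoded header row; later records carry the data rows.
  ds.to(SequenceSink(new Path("/tmp/example.seq")))
}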
Example 73
Source File: SequenceSupport.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import java.io.StringReader import java.nio.charset.Charset import com.sksamuel.exts.Logging import com.sksamuel.exts.io.Using import io.eels.component.csv.{CsvFormat, CsvSupport} import io.eels.schema.{Field, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile} object SequenceSupport extends Logging with Using { def createReader(path: Path)(implicit conf: Configuration): SequenceFile.Reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path)) def toValues(v: BytesWritable): Array[String] = toValues(new String(v.copyBytes(), Charset.forName("UTF8"))) def toValues(str: String): Array[String] = { val parser = CsvSupport.createParser(CsvFormat(), false, false, false, null, null) parser.beginParsing(new StringReader(str)) val record = parser.parseNext() parser.stopParsing() record } def schema(path: Path)(implicit conf: Configuration): StructType = { logger.debug(s"Fetching sequence schema for $path") using(createReader(path)) { it => val k = new IntWritable() val v = new BytesWritable() val fields: Array[Field] = { it.next(k, v) toValues(v).map { it => new Field(it) } } StructType(fields.toList) } } }
Example 74
Source File: SequenceSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import com.sksamuel.exts.io.Using import io.eels._ import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile} case class SequenceSource(path: Path)(implicit conf: Configuration) extends Source with Logging { logger.debug(s"Creating sequence source from $path") override def schema: StructType = SequenceSupport.schema(path) override def parts(): Seq[Publisher[Seq[Row]]] = List(new SequencePublisher(path)) } object SequenceReaderIterator { def apply(schema: StructType, reader: SequenceFile.Reader): Iterator[Row] = new Iterator[Row] { private val k = new IntWritable() private val v = new BytesWritable() // throw away the header reader.next(k, v) override def next(): Row = Row(schema, SequenceSupport.toValues(v).toVector) override def hasNext(): Boolean = reader.next(k, v) } } class SequencePublisher(val path: Path)(implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { using(SequenceSupport.createReader(path)) { reader => val schema = SequenceSupport.schema(path) val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) SequenceReaderIterator(schema, reader) .takeWhile(_ => running.get) .grouped(DataStream.DefaultBatchSize) .foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } }
Example 75
Source File: RowParquetReaderFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import com.sksamuel.exts.Logging import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.filter2.compat.FilterCompat import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.api.ReadSupport import org.apache.parquet.hadoop.{ParquetFileReader, ParquetReader} import org.apache.parquet.schema.Type def apply(path: Path, predicate: Option[Predicate], readSchema: Option[Type], dictionaryFiltering: Boolean)(implicit conf: Configuration): ParquetReader[Row] = { logger.debug(s"Opening parquet reader for $path") // The parquet reader can use a projection by setting a projected schema onto the supplied conf object def configuration(): Configuration = { val newconf = new Configuration(conf) readSchema.foreach { it => newconf.set(ReadSupport.PARQUET_READ_SCHEMA, it.toString) } //newconf.set(ParquetInputFormat.DICTIONARY_FILTERING_ENABLED, dictionaryFiltering.toString) newconf.set(org.apache.parquet.hadoop.ParquetFileReader.PARQUET_READ_PARALLELISM, config.parallelism.toString) newconf } // a filter is set when we have a predicate for the read def filter(): FilterCompat.Filter = predicate.map(ParquetPredicateBuilder.build) .map(FilterCompat.get) .getOrElse(FilterCompat.NOOP) ParquetReader.builder(new RowReadSupport, path) .withConf(configuration()) .withFilter(filter()) .build() } }
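A reading sketch for the reader factory above, assuming the enclosing object is RowParquetReaderFn as referenced elsewhere in this listing; the parquet path is hypothetical.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import io.eels.component.parquet.RowParquetReaderFn

object RowParquetReaderSketch extends App {
  implicit val conf = new Configuration()
  // No predicate, no projection, dictionary filtering left on.
  val reader = RowParquetReaderFn(new Path("/tmp/data.parquet"), None, None, dictionaryFiltering = true)
  try {
    Iterator.continually(reader.read()).takeWhile(_ != null).take(5).foreach(println)
  } finally {
    reader.close()
  }
}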
Example 76
Source File: ParquetPublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.component.parquet.util.ParquetIterator import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.ParquetFileReader import org.apache.parquet.schema.MessageType class ParquetPublisher(path: Path, predicate: Option[Predicate], projection: Seq[String], caseSensitive: Boolean, dictionaryFiltering: Boolean) (implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using { def readSchema: Option[MessageType] = { if (projection.isEmpty) None else { val fileSchema = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER).getFileMetaData.getSchema val structType = ParquetSchemaFns.fromParquetMessageType(fileSchema) if (caseSensitive) { assert( structType.fieldNames.toSet.size == structType.fieldNames.map(_.toLowerCase).toSet.size, "Cannot use case sensitive = true when this would result in a clash of field names" ) } val projectionSchema = StructType(projection.map { field => structType.field(field, caseSensitive).getOrError(s"Requested field $field does not exist in the parquet schema") }) ParquetSchemaFns.toParquetMessageType(projectionSchema).some } } override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { using(RowParquetReaderFn(path, predicate, readSchema, dictionaryFiltering)) { reader => val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) ParquetIterator(reader) .grouped(DataStream.DefaultBatchSize) .takeWhile(_ => running.get) .foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } }
Example 77
Source File: AvroParquetSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.component.avro.{AvroSchemaFns, AvroSchemaMerge} import io.eels.component.parquet._ import io.eels.datastream.Publisher import io.eels.schema.StructType import io.eels.{FilePattern, Predicate, _} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.{Footer, ParquetFileReader} import scala.collection.JavaConverters._ object AvroParquetSource { def apply(uri: java.net.URI)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(new Path(uri.toString))) def apply(path: java.nio.file.Path)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(path)) def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): AvroParquetSource = apply(FilePattern(path)) } case class AvroParquetSource(pattern: FilePattern, predicate: Option[Predicate] = None) (implicit fs: FileSystem, conf: Configuration) extends Source with Logging with Using { private lazy val paths = pattern.toPaths() def withPredicate(pred: Predicate): AvroParquetSource = copy(predicate = pred.some) // the schema returned by the parquet source should be a merged version of the // schemas contained in all the files. override def schema: StructType = { val schemas = paths.map { path => using(AvroParquetReaderFn.apply(path, predicate, None)) { reader => val record = Option(reader.read()).getOrElse { sys.error(s"Cannot read $path for schema; file contains no records") } record.getSchema } } val avroSchema = AvroSchemaMerge("record", "namspace", schemas) AvroSchemaFns.fromAvroSchema(avroSchema) } // returns the count of all records in this source, predicate is ignored def countNoPredicate(): Long = statistics().count // returns stats, predicate is ignored def statistics(): Statistics = { if (paths.isEmpty) Statistics.Empty else { paths.foldLeft(Statistics.Empty) { (stats, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.foldLeft(stats) { (stats, block) => stats.copy( count = stats.count + block.getRowCount, compressedSize = stats.compressedSize + block.getCompressedSize, uncompressedSize = stats.uncompressedSize + block.getTotalByteSize ) } } } } override def parts(): Seq[Publisher[Seq[Row]]] = { logger.debug(s"AvroParquetSource source has ${paths.size} files: $paths") paths.map { it => new AvroParquetPublisher(it, predicate) } } def footers(): List[Footer] = { logger.debug(s"AvroParquetSource source will read footers from $paths") paths.flatMap { it => val status = fs.getFileStatus(it) logger.debug(s"status=$status; path=$it") ParquetFileReader.readAllFootersInParallel(fs.getConf, status).asScala } } }
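A sketch of inspecting a parquet file through the Avro-backed source above; the path is hypothetical.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import io.eels.component.parquet.avro.AvroParquetSource

object AvroParquetSourceSketch extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val source = AvroParquetSource(new Path("/tmp/people.parquet"))
  println(source.schema)             // merged from the Avro schemas of all matched files
  println(source.statistics().count) // row count taken from the parquet footers
}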
Example 78
Source File: AvroParquetRowWriter.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import com.sksamuel.exts.Logging import com.typesafe.config.{Config, ConfigFactory} import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.hadoop.fs.{FileSystem, Path} class AvroParquetRowWriter(path: Path, avroSchema: Schema)(implicit fs: FileSystem) extends Logging { private val config: Config = ConfigFactory.load() private val skipCrc = config.getBoolean("eel.parquet.skipCrc") logger.info(s"Parquet writer will skipCrc = $skipCrc") private val writer = AvroParquetWriterFn(path, avroSchema) def write(record: GenericRecord): Unit = { writer.write(record) } def close(): Unit = { writer.close() if (skipCrc) { val crc = new Path("." + path.toString() + ".crc") logger.debug(s"Deleting crc $crc") if (fs.exists(crc)) fs.delete(crc, false) } } }
Example 79
Source File: AvroParquetSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import com.sksamuel.exts.Logging import com.typesafe.config.ConfigFactory import io.eels.component.avro.{AvroSchemaFns, RowSerializer} import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.fs.{FileSystem, Path} object AvroParquetSink { def apply(path: String)(implicit fs: FileSystem): AvroParquetSink = AvroParquetSink(new Path(path)) } case class AvroParquetSink(path: Path, overwrite: Boolean = false)(implicit fs: FileSystem) extends Sink with Logging { def withOverwrite(overwrite: Boolean): AvroParquetSink = copy(overwrite = overwrite) override def open(schema: StructType): SinkWriter = new SinkWriter { private val config = ConfigFactory.load() private val caseSensitive = config.getBoolean("eel.parquet.caseSensitive") if (overwrite && fs.exists(path)) fs.delete(path, false) private val avroSchema = AvroSchemaFns.toAvroSchema(schema, caseSensitive = caseSensitive) private val writer = new AvroParquetRowWriter(path, avroSchema) private val serializer = new RowSerializer(avroSchema) override def write(row: Row): Unit = { this.synchronized { val record = serializer.serialize(row) writer.write(record) } } override def close(): Unit = { writer.close() } } }
Example 80
Source File: AvroParquetWriterFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import com.sksamuel.exts.Logging import io.eels.component.parquet.ParquetWriterConfig import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.hadoop.fs.Path import org.apache.parquet.avro.AvroParquetWriter import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter} object AvroParquetWriterFn extends Logging { def apply(path: Path, avroSchema: Schema): ParquetWriter[GenericRecord] = { val config = ParquetWriterConfig() AvroParquetWriter.builder[GenericRecord](path) .withSchema(avroSchema) .withCompressionCodec(config.compressionCodec) .withPageSize(config.pageSize) .withRowGroupSize(config.blockSize) .withDictionaryEncoding(config.enableDictionary) .withWriteMode(ParquetFileWriter.Mode.CREATE) .withValidation(config.validating) .build() } }
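A writing sketch for the writer factory above, using the standard Avro SchemaBuilder/GenericRecordBuilder APIs; the output path and record fields are hypothetical.

import org.apache.avro.SchemaBuilder
import org.apache.avro.generic.GenericRecordBuilder
import org.apache.hadoop.fs.Path
import io.eels.component.parquet.avro.AvroParquetWriterFn

object AvroParquetWriterSketch extends App {
  val schema = SchemaBuilder.record("person").fields()
    .requiredString("name")
    .requiredInt("age")
    .endRecord()
  val writer = AvroParquetWriterFn(new Path("/tmp/person.parquet"), schema)
  val record = new GenericRecordBuilder(schema).set("name", "sam").set("age", 42).build()
  writer.write(record)
  writer.close() // flushes the parquet footer
}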
Example 81
Source File: AvroParquetReaderFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import io.eels.Predicate import io.eels.component.parquet.{ParquetPredicateBuilder, ParquetReaderConfig} import org.apache.avro.Schema import org.apache.avro.generic.GenericRecord import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.avro.{AvroParquetReader, AvroReadSupport} import org.apache.parquet.filter2.compat.FilterCompat import org.apache.parquet.hadoop.ParquetReader def apply(path: Path, predicate: Option[Predicate], projectionSchema: Option[Schema])(implicit conf: Configuration): ParquetReader[GenericRecord] = { // The parquet reader can use a projection by setting a projected schema onto a conf object def configuration(): Configuration = { val newconf = new Configuration(conf) projectionSchema.foreach { it => AvroReadSupport.setAvroReadSchema(newconf, it) AvroReadSupport.setRequestedProjection(newconf, it) } //conf.set(ParquetInputFormat.DICTIONARY_FILTERING_ENABLED, "true") newconf.set(org.apache.parquet.hadoop.ParquetFileReader.PARQUET_READ_PARALLELISM, config.parallelism.toString) newconf } // a filter is set when we have a predicate for the read def filter(): FilterCompat.Filter = predicate.map(ParquetPredicateBuilder.build) .map(FilterCompat.get) .getOrElse(FilterCompat.NOOP) AvroParquetReader.builder[GenericRecord](path) .withCompatibility(false) .withConf(configuration()) .withFilter(filter()) .build() .asInstanceOf[ParquetReader[GenericRecord]] } }
Example 82
Source File: AvroParquetPublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet.avro import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import com.sksamuel.exts.io.Using import io.eels.component.avro.AvroDeserializer import io.eels.component.parquet.util.ParquetIterator import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path class AvroParquetPublisher(path: Path, predicate: Option[Predicate])(implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { try { val deser = new AvroDeserializer() val running = new AtomicBoolean(true) subscriber.subscribed(Subscription.fromRunning(running)) using(AvroParquetReaderFn(path, predicate, None)) { reader => ParquetIterator(reader) .map(deser.toRow) .grouped(DataStream.DefaultBatchSize) .takeWhile(_ => running.get) .foreach(subscriber.next) } subscriber.completed() } catch { case t: Throwable => subscriber.error(t) } } }
Example 83
Source File: RowParquetWriterFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.Row import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.parquet.column.ParquetProperties import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter} import org.apache.parquet.schema.MessageType import scala.math.BigDecimal.RoundingMode.RoundingMode object RowParquetWriterFn { class RowParquetWriterBuilder(path: Path, schema: MessageType, roundingMode: RoundingMode, metadata: Map[String, String]) extends ParquetWriter.Builder[Row, RowParquetWriterBuilder](path) { override def getWriteSupport(conf: Configuration): WriteSupport[Row] = new RowWriteSupport(schema, roundingMode, metadata) override def self(): RowParquetWriterBuilder = this } def apply(path: Path, schema: StructType, metadata: Map[String, String], dictionary: Boolean, roundingMode: RoundingMode, fsConfig: Configuration): ParquetWriter[Row] = { val config = ParquetWriterConfig() val messageType = ParquetSchemaFns.toParquetMessageType(schema) new RowParquetWriterBuilder(path, messageType, roundingMode, metadata) .withCompressionCodec(config.compressionCodec) .withDictionaryEncoding(dictionary) .withPageSize(config.pageSize) .withRowGroupSize(config.blockSize) .withValidation(config.validating) .withWriteMode(ParquetFileWriter.Mode.CREATE) .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0) .withConf(fsConfig) .build() } }
Example 84
Source File: ParquetSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.datastream.Publisher import io.eels.{Predicate, _} import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.{Footer, ParquetFileReader} import scala.collection.JavaConverters._ object ParquetSource { def apply(string: String)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(string)) def apply(uri: java.net.URI)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(new Path(uri.toString))) def apply(path: java.nio.file.Path)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(path)) def apply(path: Path)(implicit fs: FileSystem, conf: Configuration): ParquetSource = apply(FilePattern(path)) } case class ParquetSource(pattern: FilePattern, predicate: Option[Predicate] = None, projection: Seq[String] = Nil, dictionaryFiltering: Boolean = true, caseSensitive: Boolean = true) (implicit fs: FileSystem, conf: Configuration) extends Source with Logging with Using { logger.debug(s"Created parquet source with pattern=$pattern") lazy val paths: List[Path] = pattern.toPaths() def withDictionaryFiltering(dictionary: Boolean): ParquetSource = copy(dictionaryFiltering = dictionary) def withCaseSensitivity(caseSensitive: Boolean): ParquetSource = copy(caseSensitive = caseSensitive) def withPredicate(pred: => Predicate): ParquetSource = copy(predicate = pred.some) def withProjection(first: String, rest: String*): ParquetSource = withProjection(first +: rest) def withProjection(fields: Seq[String]): ParquetSource = { require(fields.nonEmpty) copy(projection = fields.toList) } // returns the metadata in the parquet file, or an empty map if none def metadata(): Map[String, String] = { paths.foldLeft(Map.empty[String, String]) { (metadata, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) metadata ++ footer.getFileMetaData.getKeyValueMetaData.asScala } } // todo should take the merged schema from all files lazy val schema: StructType = RowParquetReaderFn.schema(paths.headOption.getOrError("No paths found for source")) // returns the count of all records in this source, predicate is ignored def countNoPredicate(): Long = statistics().count // returns stats, predicate is ignored def statistics(): Statistics = { if (paths.isEmpty) Statistics.Empty else { paths.foldLeft(Statistics.Empty) { (stats, path) => val footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.foldLeft(stats) { (stats, block) => stats.copy( count = stats.count + block.getRowCount, compressedSize = stats.compressedSize + block.getCompressedSize, uncompressedSize = stats.uncompressedSize + block.getTotalByteSize ) } } } } override def parts(): Seq[Publisher[Seq[Row]]] = { logger.debug(s"Parquet source has ${paths.size} files: ${paths.mkString(", ")}") paths.map { it => new ParquetPublisher(it, predicate, projection, caseSensitive, dictionaryFiltering) } } def footers(): List[Footer] = { logger.debug(s"Parquet source will read footers from $paths") paths.flatMap { it => val status = fs.getFileStatus(it) logger.debug(s"status=$status; path=$it") ParquetFileReader.readAllFootersInParallel(fs.getConf, status).asScala } } }
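A reading sketch for ParquetSource with a projection, assuming the pattern matches at least one file; the glob is hypothetical.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.FileSystem
import io.eels.component.parquet.ParquetSource

object ParquetSourceSketch extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val source = ParquetSource("hdfs:///warehouse/people/*.parquet")
    .withProjection("name", "age")
    .withCaseSensitivity(false)
  println(source.schema)             // taken from the first matched file
  println(source.statistics().count) // summed over all footers
}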
Example 85
Source File: ParquetSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import scala.math.BigDecimal.RoundingMode import scala.math.BigDecimal.RoundingMode.RoundingMode case class ParquetWriteOptions(overwrite: Boolean = false, permission: Option[FsPermission] = None, dictionary: Boolean = true, inheritPermissions: Option[Boolean] = None, roundingMode: RoundingMode = RoundingMode.UNNECESSARY, metadata: Map[String, String] = Map.empty) { def withOverwrite(overwrite: Boolean): ParquetWriteOptions = copy(overwrite = overwrite) def withDictionary(dictionary: Boolean): ParquetWriteOptions = copy(dictionary = dictionary) def withMetaData(map: Map[String, String]): ParquetWriteOptions = copy(metadata = map) def withPermission(permission: FsPermission): ParquetWriteOptions = copy(permission = permission.some) def withInheritPermission(inheritPermissions: Boolean): ParquetWriteOptions = copy(inheritPermissions = inheritPermissions.some) def withRoundingMode(mode: RoundingMode): ParquetWriteOptions = copy(roundingMode = mode) } case class ParquetSink(path: Path, options: ParquetWriteOptions = ParquetWriteOptions()) (implicit fs: FileSystem) extends Sink with Logging { // -- convenience methods -- def withOverwrite(overwrite: Boolean): ParquetSink = copy(options = options.withOverwrite(overwrite)) def withDictionary(dictionary: Boolean): ParquetSink = copy(options = options.copy(dictionary = dictionary)) def withMetaData(map: Map[String, String]): ParquetSink = copy(options = options.copy(metadata = map)) def withPermission(permission: FsPermission): ParquetSink = copy(options = options.copy(permission = permission.some)) def withInheritPermission(inheritPermissions: Boolean): ParquetSink = copy(options = options.copy(inheritPermissions = inheritPermissions.some)) def withRoundingMode(mode: RoundingMode): ParquetSink = copy(options = options.copy(roundingMode = mode)) private def create(schema: StructType, path: Path): SinkWriter = new SinkWriter { if (options.overwrite && fs.exists(path)) fs.delete(path, false) val writer = RowParquetWriterFn(path, schema, options.metadata, options.dictionary, options.roundingMode, fs.getConf) override def write(row: Row): Unit = { writer.write(row) } override def close(): Unit = { writer.close() options.permission match { case Some(perm) => fs.setPermission(path, perm) case None => if (options.inheritPermissions.getOrElse(false)) { val permission = fs.getFileStatus(path.getParent).getPermission fs.setPermission(path, permission) } } } } override def open(schema: StructType, n: Int): Seq[SinkWriter] = { if (n == 1) Seq(create(schema, path)) else List.tabulate(n) { k => create(schema, new Path(path.getParent, path.getName + "_" + k)) } } override def open(schema: StructType): SinkWriter = create(schema, path) } object ParquetSink { def apply(path: String)(implicit fs: FileSystem): ParquetSink = ParquetSink(new Path(path)) }
Example 86
Source File: HdfsWatcher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hdfs import java.util.concurrent.Executors import java.util.concurrent.atomic.AtomicBoolean import com.sksamuel.exts.Logging import io.eels.util.HdfsIterator import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.client.HdfsAdmin import org.apache.hadoop.hdfs.inotify.Event import scala.concurrent.duration._ import scala.util.control.NonFatal class HdfsWatcher(path: Path, callback: FileCallback) (implicit fs: FileSystem, conf: Configuration) extends Logging { private val files = HdfsIterator.remote(fs.listFiles(path, false)).map(_.getPath).toBuffer files.foreach(callback.onStart) private val executor = Executors.newSingleThreadExecutor() private val running = new AtomicBoolean(true) private val interval = 5.seconds private val admin = new HdfsAdmin(path.toUri, conf) private val eventStream = admin.getInotifyEventStream executor.submit(new Runnable { override def run(): Unit = { while (running.get) { try { Thread.sleep(interval.toMillis) val events = eventStream.take for (event <- events.getEvents) { event match { case create: Event.CreateEvent => callback.onCreate(create) case append: Event.AppendEvent => callback.onAppend(append) case rename: Event.RenameEvent => callback.onRename(rename) case close: Event.CloseEvent => callback.onClose(close) case _ => } } } catch { case NonFatal(e) => logger.error("Error while polling fs", e) } } } }) def stop(): Unit = { running.set(false) executor.shutdownNow() } } trait FileCallback { def onStart(path: Path): Unit def onClose(close: Event.CloseEvent): Unit def onRename(rename: Event.RenameEvent): Unit def onAppend(append: Event.AppendEvent): Unit def onCreate(path: Event.CreateEvent): Unit }
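A sketch wiring a FileCallback into the watcher above; the watched directory is hypothetical and only create events are printed.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hdfs.inotify.Event
import io.eels.component.hdfs.{FileCallback, HdfsWatcher}

object HdfsWatcherSketch extends App {
  implicit val conf = new Configuration()
  implicit val fs = FileSystem.get(conf)
  val watcher = new HdfsWatcher(new Path("/landing"), new FileCallback {
    override def onStart(path: Path): Unit = println(s"existing file: $path")
    override def onCreate(create: Event.CreateEvent): Unit = println(s"created: ${create.getPath}")
    override def onAppend(append: Event.AppendEvent): Unit = ()
    override def onRename(rename: Event.RenameEvent): Unit = ()
    override def onClose(close: Event.CloseEvent): Unit = ()
  })
  Thread.sleep(60000) // let the poller run for a minute
  watcher.stop()
}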
Example 87
Source File: HdfsSource.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hdfs import io.eels.FilePattern import org.apache.hadoop.fs.permission.{AclEntryScope, AclEntryType, FsAction, FsPermission, AclEntry => HdfsAclEntry} import org.apache.hadoop.fs.{BlockLocation, FileSystem, Path} import scala.collection.JavaConverters._ case class HdfsSource(pattern: FilePattern)(implicit fs: FileSystem) { def permissions(): Vector[(Path, FsPermission)] = pattern.toPaths().map(fs.getFileStatus) .map(status => (status.getPath, status.getPermission)).toVector def setPermissions(permission: FsPermission): Unit = { pattern.toPaths().foreach(fs.setPermission(_, permission)) } def blocks(): Map[Path, Seq[BlockLocation]] = pattern.toPaths().map { path => path -> fs.getFileBlockLocations(path, 0, fs.getFileLinkStatus(path).getLen).toSeq }.toMap def setAcl(spec: AclSpec): Unit = { pattern.toPaths().foreach { path => val hadoopAclEntries = spec.entries.map { entry => val `type` = entry.`type`.toLowerCase match { case "user" => AclEntryType.USER case "group" => AclEntryType.GROUP case "other" => AclEntryType.OTHER } new HdfsAclEntry.Builder().setName(entry.name).setPermission(FsAction.getFsAction(entry.action)).setType(`type`).setScope(AclEntryScope.ACCESS).build() } fs.setAcl(path, hadoopAclEntries.asJava) } } } object HdfsSource { def apply(path: String)(implicit fs: FileSystem): HdfsSource = apply(FilePattern(path)) def apply(path: Path)(implicit fs: FileSystem): HdfsSource = HdfsSource(FilePattern(path)) }
Example 88
Source File: CsvSink.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.csv import com.univocity.parsers.csv.CsvWriter import io.eels.schema.StructType import io.eels.{Row, Sink, SinkWriter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} case class CsvSink(path: Path, overwrite: Boolean = false, headers: Header = Header.FirstRow, format: CsvFormat = CsvFormat(), ignoreLeadingWhitespaces: Boolean = false, ignoreTrailingWhitespaces: Boolean = false) (implicit conf: Configuration, fs: FileSystem) extends Sink { override def open(schema: StructType): SinkWriter = new CsvSinkWriter(schema, path, headers, format, ignoreLeadingWhitespaces, ignoreTrailingWhitespaces) def withOverwrite(overwrite: Boolean): CsvSink = copy(overwrite = overwrite) def withHeaders(headers: Header): CsvSink = copy(headers = headers) def withIgnoreLeadingWhitespaces(ignoreLeadingWhitespaces: Boolean): CsvSink = copy(ignoreLeadingWhitespaces = ignoreLeadingWhitespaces) def withIgnoreTrailingWhitespaces(ignoreTrailingWhitespaces: Boolean): CsvSink = copy(ignoreTrailingWhitespaces = ignoreTrailingWhitespaces) def withFormat(format: CsvFormat): CsvSink = copy(format = format) class CsvSinkWriter(schema: StructType, path: Path, headers: Header, format: CsvFormat, ignoreLeadingWhitespaces: Boolean = false, ignoreTrailingWhitespaces: Boolean = false) extends SinkWriter { private val lock = new AnyRef {} if (overwrite && fs.exists(path)) fs.delete(path, false) import scala.collection.JavaConverters._ private lazy val writer: CsvWriter = { val output = fs.create(path) val writer = CsvSupport.createWriter(output, format, ignoreLeadingWhitespaces, ignoreTrailingWhitespaces) headers match { case Header.FirstComment => writer.commentRow(schema.fieldNames().mkString(format.delimiter.toString())) case Header.FirstRow => writer.writeHeaders(schema.fieldNames().asJava) case _ => } writer } override def close(): Unit = writer.close() override def write(row: Row): Unit = { lock.synchronized { // nulls should be written as empty strings val array = row.values.map { case null => "" case other => other.toString } writer.writeRow(array: _*) } } } } object CsvSink { def apply(path: java.nio.file.Path) (implicit conf: Configuration, fs: FileSystem): CsvSink = CsvSink(new Path(path.toString)) }
Example 89
Source File: ReadParquetEEL.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.sql.Timestamp import io.eels.component.parquet.{ParquetSink, ParquetSource} import io.eels.datastream.DataStream import io.eels.schema.{ArrayType, DecimalType, Field, IntType, Precision, Scale, StringType, StructType, TimestampMillisType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} object ReadParquetEEL extends App { def readParquet(path: Path): Unit = { implicit val hadoopConfiguration = new Configuration() implicit val hadoopFileSystem = FileSystem.get(hadoopConfiguration) val rows = ParquetSource(parquetFilePath).toDataStream().collect rows.foreach(row => println(row)) } val parquetFilePath = new Path("file:///home/sam/development/person2.parquet") implicit val hadoopConfiguration = new Configuration() implicit val hadoopFileSystem = FileSystem.get(hadoopConfiguration) val friendStruct = Field.createStructField("FRIEND", Seq( Field("NAME", StringType), Field("AGE", IntType.Signed) ) ) val personDetailsStruct = Field.createStructField("PERSON_DETAILS", Seq( Field("NAME", StringType), Field("AGE", IntType.Signed), Field("SALARY", DecimalType(Precision(38), Scale(5))), Field("CREATION_TIME", TimestampMillisType) ) ) val friendType = StructType(friendStruct) val schema = StructType(personDetailsStruct, Field("FRIENDS", ArrayType(friendType), nullable = false)) val friends = Vector( Vector(Vector("John", 25)), Vector(Vector("Adam", 26)), Vector(Vector("Steven", 27)) ) val rows = Vector( Vector(Vector("Fred", 50, BigDecimal("50000.99000"), new Timestamp(System.currentTimeMillis())), friends) ) try { DataStream.fromValues(schema, rows).to(ParquetSink(parquetFilePath).withOverwrite(true)) } catch { case e: Exception => e.printStackTrace() } try { readParquet(parquetFilePath) } catch { case e: Exception => e.printStackTrace() } }
Example 90
Source File: FilePatternTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.nio.file.Files import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class FilePatternTest extends WordSpec with Matchers { implicit val fs = FileSystem.get(new Configuration()) "FilePattern" should { "detect single hdfs path without name server" ignore { FilePattern("hdfs:///mypath").toPaths() shouldBe List(new Path("hdfs:///mypath")) } "detect single hdfs path with name server" ignore { FilePattern("hdfs://nameserver/mypath").toPaths() shouldBe List(new Path("hdfs://nameserver/mypath")) } "detect absolute local file" in { FilePattern("file:///absolute/file").toPaths() shouldBe List(new Path("file:///absolute/file")) } "detect relative local file" in { FilePattern("file:///local/file").toPaths() shouldBe List(new Path("file:///local/file")) } "detect relative local file expansion" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } val hdfsPaths = files.map { it => new Path(it.toUri) } files.foreach(file => Files.createFile(file)) FilePattern(dir.toUri.toString() + "/*").toPaths().toSet shouldBe hdfsPaths.toSet files.foreach(Files.deleteIfExists) Files.deleteIfExists(dir) } //not working on windows "detect relative local file expansion with schema" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } val hdfsPaths = files.map { it => new Path(it.toUri) } files.foreach(file => Files.createFile(file)) FilePattern(dir.toUri.toString() + "/*").toPaths().toSet shouldBe hdfsPaths.toSet files.foreach(Files.deleteIfExists) Files.deleteIfExists(dir) } "use filter if supplied" in { val dir = Files.createTempDirectory("filepatterntest") val files = List("a", "b", "c").map { it => dir.resolve(it) } files.foreach { it => Files.createFile(it) } val a = FilePattern(dir.toAbsolutePath().toString() + "/*") .withFilter(_.toString().endsWith("a")) .toPaths.toSet a shouldBe Set(new Path("file:///" + dir.resolve("a"))) files.foreach { it => Files.deleteIfExists(it) } Files.deleteIfExists(dir) } } }
Example 91
Source File: ListenerTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels import java.util.concurrent.{CountDownLatch, TimeUnit} import io.eels.component.csv.{CsvSink, CsvSource} import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} import scala.util.Random class ListenerTest extends WordSpec with Matchers { implicit val conf = new Configuration() implicit val fs = FileSystem.get(conf) val schema = StructType("a", "b", "c", "d", "e") val rows = List.fill(1000)(Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(10))) val ds = DataStream.fromRows(schema, rows) val path = new Path("listener_test.csv") "DataStream" should { "support user's listeners" in { val latch = new CountDownLatch(1000) fs.delete(path, false) ds.listener(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).to(CsvSink(path)) latch.await(20, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } "propagate errors in listeners" in { class TestSink extends Sink { override def open(schema: StructType): SinkWriter = new SinkWriter { override def close(): Unit = () override def write(row: Row): Unit = () } } try { ds.listener(new Listener { override def onNext(value: Row): Unit = sys.error("boom") override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).to(new TestSink) assert(false) } catch { case _: Throwable => } } } "Source.toDataStream" should { "call on next for each row" in { val latch = new CountDownLatch(1000) fs.delete(path, false) ds.to(CsvSink(path)) CsvSource(path).toDataStream(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = () }).collect latch.await(5, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } "call on complete once finished" in { val latch = new CountDownLatch(1001) fs.delete(path, false) ds.to(CsvSink(path)) CsvSource(path).toDataStream(new Listener { override def onNext(value: Row): Unit = latch.countDown() override def onError(e: Throwable): Unit = () override def onComplete(): Unit = latch.countDown() }).collect latch.await(5, TimeUnit.SECONDS) shouldBe true fs.delete(path, false) } } }
Example 92
Source File: AvroSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.avro import io.eels.Row import io.eels.datastream.DataStream import io.eels.schema.{ArrayType, Field, MapType, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class AvroSinkTest extends WordSpec with Matchers { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val ds = DataStream.fromValues( StructType("name", "job", "location"), Seq( List("clint eastwood", "actor", "carmel"), List("elton john", "musician", "pinner"), List("issac newton", "scientist", "heaven") ) ) "AvroSink" should { "write to avro" in { val path = new Path("avro.test") fs.delete(path, false) ds.to(AvroSink(path)) fs.delete(path, false) } "support overwrite option" in { val path = new Path("overwrite_test", ".avro") fs.delete(path, false) ds.to(AvroSink(path)) ds.to(AvroSink(path).withOverwrite(true)) fs.delete(path, false) } "write lists and maps" in { val ds = DataStream.fromValues( StructType( Field("name"), Field("movies", ArrayType(StringType)), Field("characters", MapType(StringType, StringType)) ), Seq( List( "clint eastwood", List("fistful of dollars", "high plains drifters"), Map("preacher" -> "high plains", "no name" -> "good bad ugly") ) ) ) val path = new Path("array_map_avro", ".avro") fs.delete(path, false) ds.to(AvroSink(path)) AvroSource(path).toDataStream().collect shouldBe Seq( Row( ds.schema, Seq( "clint eastwood", List("fistful of dollars", "high plains drifters"), Map("preacher" -> "high plains", "no name" -> "good bad ugly") ) ) ) fs.delete(path, true) } } }
Example 93
Source File: JsonSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.json import io.eels.datastream.DataStream import io.eels.schema.{Field, StructType} import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class JsonSinkTest extends WordSpec with Matchers { val path = new Path("test.json") implicit val fs: FileSystem = FileSystem.get(new Configuration()) "JsonSink" should { "write multiple json docs to a file" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("location")) val ds = DataStream.fromValues( schema, Seq( Vector("sam", "aylesbury"), Vector("jam", "aylesbury"), Vector("ham", "buckingham") ) ) ds.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input should include("""{"name":"sam","location":"aylesbury"}""") input should include("""{"name":"jam","location":"aylesbury"}""") input should include("""{"name":"ham","location":"buckingham"}""") fs.delete(path, false) } "support arrays" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("skills")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Array("karate", "kung fu"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","skills":["karate","kung fu"]}""" fs.delete(path, false) } "support maps" in { if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("locations")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Map("home" -> "boro", "work" -> "london"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","locations":{"home":"boro","work":"london"}}""" fs.delete(path, false) } "support structs" in { case class Foo(home: String, work: String) if (fs.exists(path)) fs.delete(path, false) val schema = StructType(Field("name"), Field("locations")) val frame = DataStream.fromValues( schema, Seq(Vector("sam", Foo("boro", "london"))) ) frame.to(JsonSink(path)) val input = IOUtils.toString(fs.open(path)) input.trim shouldBe """{"name":"sam","locations":{"home":"boro","work":"london"}}""" fs.delete(path, false) } } }
Example 94
Source File: SequenceSourceTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import io.eels.Row import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.{Matchers, WordSpec} class SequenceSourceTest extends WordSpec with Matchers { private implicit val conf = new Configuration() private val schema = StructType(Field("name"), Field("location")) private val ds = DataStream.fromValues( schema, Seq( Vector("name", "location"), Vector("sam", "aylesbury"), Vector("jam", "aylesbury"), Vector("ham", "buckingham") ) ) "SequenceSource" should { "read sequence files" in { val schema = StructType( Field("a", StringType), Field("b", StringType), Field("c", StringType), Field("d", StringType) ) val path = new Path(getClass.getResource("/test.seq").getFile) val rows = SequenceSource(path).toDataStream().toSet rows shouldBe Set( Row(schema, "1", "2", "3", "4"), Row(schema, "5", "6", "7", "8") ) } "read header as schema" in { val path = new Path(getClass.getResource("/test.seq").getFile) SequenceSource(path).schema shouldBe StructType( Field("a", StringType), Field("b", StringType), Field("c", StringType), Field("d", StringType) ) } } }
Example 95
Source File: SequenceSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.sequence import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{BytesWritable, IntWritable, SequenceFile} import org.scalatest.{Matchers, WordSpec} class SequenceSinkTest extends WordSpec with Matchers { private val ds = DataStream.fromValues( StructType("a", "b", "c", "d"), Seq( List("1", "2", "3", "4"), List("5", "6", "7", "8") ) ) "SequenceSink" should { "write sequence files" in { implicit val conf = new Configuration implicit val fs = FileSystem.get(conf) val path = new Path("seqsink.seq") if (fs.exists(path)) fs.delete(path, true) ds.to(SequenceSink(path)) val reader = new SequenceFile.Reader(new Configuration, SequenceFile.Reader.file(path)) val k = new IntWritable val v = new BytesWritable val set = for (_ <- 1 to 3) yield { reader.next(k, v) new String(v.copyBytes) } set.toSet shouldBe Set( "a,b,c,d", "1,2,3,4", "5,6,7,8" ) reader.close() fs.delete(path, true) } } }
Example 96
Source File: ParquetProjectionTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.{File, FilenameFilter} import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{FlatSpec, Matchers} class ParquetProjectionTest extends FlatSpec with Matchers { cleanUpResidualParquetTestFiles private val schema = StructType( Field("name", StringType, nullable = false), Field("job", StringType, nullable = false), Field("location", StringType, nullable = false) ) private val ds = DataStream.fromValues( schema, Seq( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) ) private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val file = new File(s"test_${System.currentTimeMillis()}.pq") file.deleteOnExit() private val path = new Path(file.toURI) if (fs.exists(path)) fs.delete(path, false) ds.to(ParquetSink(path).withOverwrite(true)) "ParquetSource" should "support projections" in { val rows = ParquetSource(path).withProjection("name").toDataStream().collect rows.map(_.values) shouldBe Vector(Vector("clint eastwood"), Vector("elton john")) } it should "return all data when no projection is set" in { val rows = ParquetSource(path).toDataStream().collect rows.map(_.values) shouldBe Vector(Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner")) } private def cleanUpResidualParquetTestFiles = { new File(".").listFiles(new FilenameFilter { override def accept(dir: File, name: String): Boolean = { (name.startsWith("test_") && name.endsWith(".pq")) || (name.startsWith(".test_") && name.endsWith(".pq.crc")) } }).foreach(_.delete()) } }
Example 97
Source File: AvroAndParquetCrossCompatibilityTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{FlatSpec, Matchers} // tests that avro source/sink and avro parquet source/sink can write/read each others files class AvroAndParquetCrossCompatibilityTest extends FlatSpec with Matchers { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) "AvroParquetSource and ParquetSource" should "be compatible" in { val path = new Path("cross.pq") if (fs.exists(path)) fs.delete(path, false) val structType = StructType( Field("name", StringType, nullable = false), Field("location", StringType, nullable = false) ) val ds = DataStream.fromValues( structType, Seq( Vector("clint eastwood", "carmel"), Vector("elton john", "pinner") ) ) ds.to(ParquetSink(path)) AvroParquetSource(path).toDataStream().collect shouldBe ds.collect fs.delete(path, false) ds.to(AvroParquetSink(path)) ParquetSource(path).toDataStream().collect shouldBe ds.collect fs.delete(path, false) } }
Example 98
Source File: ParquetSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.File import com.sksamuel.exts.metrics.Timed import io.eels.Row import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Random object ParquetSpeedTest extends App with Timed { ParquetLogMute() val size = 2000000 val schema = StructType("a", "b", "c", "d", "e") val createRow = Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(4)) val ds = DataStream.fromIterator(schema, Iterator.continually(createRow).take(size)) implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val path = new Path("parquet_speed.pq") fs.delete(path, false) new File(path.toString).deleteOnExit() timed("Insertion") { ds.to(AvroParquetSink(path).withOverwrite(true)) } while (true) { timed("Reading with ParquetSource") { val actual = ParquetSource(path).toDataStream().size assert(actual == size) } println("") println("---------") println("") Thread.sleep(2000) timed("Reading with AvroParquetSource") { val actual = AvroParquetSource(path).toDataStream().size assert(actual == size) } } }
Example 99
Source File: ParquetMultipleFileSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.io.File import com.sksamuel.exts.metrics.Timed import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.StructType import io.eels.{FilePattern, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.util.Random object ParquetMultipleFileSpeedTest extends App with Timed { ParquetLogMute() val size = 5000000 val count = 20 val schema = StructType("a", "b", "c", "d", "e") def createRow = Row(schema, Random.nextBoolean(), Random.nextFloat(), Random.nextGaussian(), Random.nextLong(), Random.nextString(4)) implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val dir = new Path("parquet-speed-test") new File(dir.toString).mkdirs() new File(dir.toString).listFiles().foreach(_.delete) timed("Insertion") { val ds = DataStream.fromRowIterator(schema, Iterator.continually(createRow).take(size)) ds.to(ParquetSink(new Path("parquet-speed-test/parquet_speed.pq")), count) } for (_ <- 1 to 25) { assert(count == FilePattern("parquet-speed-test/*").toPaths().size) timed("Reading with ParquetSource") { val actual = ParquetSource("parquet-speed-test/*").toDataStream().map { row => row }.filter(_ => true).size assert(actual == size, s"Expected $size but was $actual") } println("") println("---------") println("") } }
Example 100
Source File: AvroParquetSinkTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.Row import io.eels.component.parquet.avro.{AvroParquetSink, AvroParquetSource} import io.eels.component.parquet.util.ParquetLogMute import io.eels.datastream.DataStream import io.eels.schema.{Field, StringType, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.scalatest.{Matchers, WordSpec} class AvroParquetSinkTest extends WordSpec with Matchers { ParquetLogMute() private val schema = StructType( Field("name", StringType, nullable = false), Field("job", StringType, nullable = false), Field("location", StringType, nullable = false) ) private val ds = DataStream.fromValues( schema, Seq( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) ) private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val path = new Path("test.pq") "ParquetSink" should { "write schema" in { if (fs.exists(path)) fs.delete(path, false) ds.to(AvroParquetSink(path)) val people = ParquetSource(path) people.schema shouldBe StructType( Field("name", StringType, false), Field("job", StringType, false), Field("location", StringType, false) ) fs.delete(path, false) } "write data" in { if (fs.exists(path)) fs.delete(path, false) ds.to(AvroParquetSink(path)) AvroParquetSource(path).toDataStream().toSet.map(_.values) shouldBe Set( Vector("clint eastwood", "actor", "carmel"), Vector("elton john", "musician", "pinner") ) fs.delete(path, false) } "support overwrite" in { val path = new Path("overwrite_test.pq") fs.delete(path, false) val schema = StructType(Field("a", StringType)) val ds = DataStream.fromRows(schema, Row(schema, Vector("x")), Row(schema, Vector("y")) ) ds.to(AvroParquetSink(path)) ds.to(AvroParquetSink(path).withOverwrite(true)) fs.delete(path, false) } } }
Example 101
Source File: AvroParquetReaderFnTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import java.util.UUID import io.eels.component.avro.AvroSchemaFns import io.eels.component.parquet.avro.AvroParquetReaderFn import io.eels.schema.{DoubleType, Field, LongType, StructType} import org.apache.avro.SchemaBuilder import org.apache.avro.generic.{GenericData, GenericRecord} import org.apache.avro.util.Utf8 import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.parquet.avro.AvroParquetWriter import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpec} class AvroParquetReaderFnTest extends WordSpec with Matchers with BeforeAndAfterAll { private implicit val conf = new Configuration() private implicit val fs = FileSystem.get(new Configuration()) private val path = new Path(UUID.randomUUID().toString()) override def afterAll(): Unit = { val fs = FileSystem.get(new Configuration()) fs.delete(path, false) } private val avroSchema = SchemaBuilder.record("com.chuckle").fields() .requiredString("str").requiredLong("looong").requiredDouble("dooble").endRecord() private val writer = AvroParquetWriter.builder[GenericRecord](path) .withSchema(avroSchema) .build() private val record = new GenericData.Record(avroSchema) record.put("str", "wibble") record.put("looong", 999L) record.put("dooble", 12.34) writer.write(record) writer.close() val schema = StructType(Field("str"), Field("looong", LongType(true), true), Field("dooble", DoubleType, true)) "AvroParquetReaderFn" should { "support projections on doubles" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("looong")))) val record = reader.read() reader.close() record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" record.get("dooble") shouldBe 12.34 } "support projections on longs" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema.removeField("str")))) val record = reader.read() reader.close() record.get("looong") shouldBe 999L } "support full projections" in { val reader = AvroParquetReaderFn(path, None, Option(AvroSchemaFns.toAvroSchema(schema))) val record = reader.read() reader.close() record.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" record.get("looong") shouldBe 999L record.get("dooble") shouldBe 12.34 } "support non projections" in { val reader = AvroParquetReaderFn(path, None, None) val group = reader.read() reader.close() group.get("str").asInstanceOf[Utf8].toString shouldBe "wibble" group.get("looong") shouldBe 999L group.get("dooble") shouldBe 12.34 } } }
Example 102
Source File: DecimalWriterTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.parquet import io.eels.Row import io.eels.schema.{DecimalType, Field, StructType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.FunSuite import scala.math.BigDecimal.RoundingMode class DecimalWriterTest extends FunSuite { test("negativeDecimalTest") { implicit val configuration = new Configuration val expectedBigDecimals = Seq(BigDecimal(-5025176.39), BigDecimal(-5), BigDecimal(-999.56434), BigDecimal(-10000.9890)) assertBigDecimals("bigd_negative.parquet", expectedBigDecimals) } test("positiveDecimalTest") { implicit val configuration = new Configuration val expectedBigDecimals = Seq(BigDecimal(5025176.39), BigDecimal(5), BigDecimal(999.56434), BigDecimal(-10000.9890)) assertBigDecimals("bigd_positive.parquet", expectedBigDecimals) } private def assertBigDecimals(filename: String, expectedBigDecimals: Seq[BigDecimal])(implicit configuration: Configuration): Unit = { val schema = StructType(Field(name = "bd", dataType = DecimalType(38, 10))) val path = new Path(filename) val fileSystem = path.getFileSystem(configuration) if (fileSystem.exists(path)) fileSystem.delete(path, false) // Write out the decimal values val parquetWriter = RowParquetWriterFn(path = path, schema = schema, metadata = Map.empty, dictionary = false, roundingMode = RoundingMode.UP, fileSystem.getConf) expectedBigDecimals.foreach { expectedBigDecimal => println(s"Writing row with value $expectedBigDecimal") parquetWriter.write(Row.fromMap(schema, Map("bd" -> expectedBigDecimal))) } parquetWriter.close() // Read back all the writes and assert their values val parquetProjectionSchema = ParquetSchemaFns.toParquetMessageType(schema) val parquetReader = RowParquetReaderFn(path, None, Option(parquetProjectionSchema), dictionaryFiltering = true) for (i <- 0 until expectedBigDecimals.length) { val readRow = parquetReader.read println(s"read row: $readRow") assert(readRow.values.head == expectedBigDecimals(i)) } parquetReader.close() } }
Example 103
Source File: HiveFileScanner.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import com.typesafe.config.ConfigFactory import io.eels.util.HdfsIterator import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path} // given a hadoop path, will look for files inside that path that match the // configured settings for hidden files // does not return directories object HiveFileScanner extends Logging { private val config = ConfigFactory.load() private val ignoreHiddenFiles = config.getBoolean("eel.hive.source.ignoreHiddenFiles") private val hiddenFilePattern = config.getString("eel.hive.source.hiddenFilePattern") // returns true if the given file should be skipped, based on the config settings (zero-length files, and hidden files when configured to ignore them) private def skip(file: LocatedFileStatus): Boolean = { file.getLen == 0L || ignoreHiddenFiles && file.getPath.getName.matches(hiddenFilePattern) } def apply(path: Path, recursive: Boolean)(implicit fs: FileSystem): Seq[LocatedFileStatus] = { logger.debug(s"Scanning $path, filtering=$ignoreHiddenFiles, pattern=$hiddenFilePattern") val files: List[LocatedFileStatus] = if (fs.exists(path)) { val files = fs.listFiles(path, recursive) HdfsIterator.remote(files) .filter(_.isFile) .filterNot(skip) .toList } else { Nil } logger.debug(s"Scanner found ${files.size} files") files } }
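A minimal sketch of calling the scanner directly, using the apply(path, recursive)(implicit fs) signature above. The warehouse path and object name are illustrative, and the sketch assumes the eel config keys referenced above are on the classpath.

import io.eels.component.hive.HiveFileScanner
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object HiveFileScannerSketch extends App {
  implicit val fs = FileSystem.get(new Configuration())

  // list the non-empty, non-hidden data files under a (hypothetical) table location
  val files = HiveFileScanner(new Path("/user/hive/warehouse/mydb.db/mytable"), recursive = false)
  files.foreach(status => println(s"${status.getPath} ${status.getLen} bytes"))
}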
Example 104
Source File: HiveTableFilesFn.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import io.eels.component.hive.partition.PartitionMetaData import io.eels.schema.{Partition, PartitionConstraint} import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient object HiveTableFilesFn extends Logging { def apply(dbName: String, tableName: String, tableLocation: Path, partitionConstraints: Seq[PartitionConstraint]) (implicit fs: FileSystem, client: IMetaStoreClient): Map[Partition, Seq[LocatedFileStatus]] = { val ops = new HiveOps(client) // when we have no partitions, this will scan just the table folder directly for files def rootScan(): Map[Partition, Seq[LocatedFileStatus]] = { Map(Partition.empty -> HiveFileScanner(tableLocation, false)) } def partitionsScan(partitions: Seq[PartitionMetaData]): Map[Partition, Seq[LocatedFileStatus]] = { new HivePartitionScanner().scan(partitions, partitionConstraints) .map { case (key, value) => key.partition -> value } } // the table may or may not have partitions. // // 1. If we do have partitions then we need to scan the path of each partition // (and each partition may be located anywhere outside of the table root) // // 2. If we do not have partitions then we can simply scan the table root. // we go to the metastore as we need the locations of the partitions not the values val partitions = ops.partitionsMetaData(dbName, tableName) if (partitions.isEmpty && partitionConstraints.nonEmpty) { sys.error("Constraints were used on a table that was not partitioned") } else if (partitions.isEmpty) { logger.debug(s"No partitions for $tableName; performing root table scan") rootScan } else partitionsScan(partitions) } }
Example 105
Source File: DynamicPartitionStrategy.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.partition import io.eels.component.hive.HiveOps import io.eels.schema.Partition import io.eels.util.HdfsMkdir import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient class DynamicPartitionStrategy extends PartitionStrategy { private val cache = scala.collection.mutable.Map.empty[Partition, Path] def ensurePartition(partition: Partition, dbName: String, tableName: String, inheritPermissions: Boolean, client: IMetaStoreClient)(implicit fs: FileSystem): Path = { def createPartition: Path = this.synchronized { val ops = new HiveOps(client) ops.partitionMetaData(dbName, tableName, partition) match { case Some(meta) => meta.location case _ => val tableLocation = ops.tablePath(dbName, tableName) val partitionPath = new Path(tableLocation, partition.unquoted) ops.createPartitionIfNotExists(dbName, tableName, partition, partitionPath) HdfsMkdir(partitionPath, inheritPermissions) partitionPath } } cache.getOrElseUpdate(partition, createPartition) } }
Example 106
Source File: PartitionMetaData.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.partition import io.eels.schema.{Partition, PartitionEntry} import org.apache.hadoop.fs.Path case class PartitionMetaData(location: Path, // just the part of the path unique to the partition // usually this will be the same as the entries flattened name: String, inputFormat: String, outputFormat: String, createTime: Long, lastAccessTime: Long, partition: Partition) { // from key1=value1/key2=value2 will return Seq(value1,value2) def values(): Seq[String] = partition.entries.map(_.value) // returns the PartitionEntry for the given key def get(key: String): Option[PartitionEntry] = partition.entries.find(_.key == key) def value(key: String): Option[String] = get(key).map(_.value) }
Example 107
Source File: StaticPartitionStrategy.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.partition import io.eels.component.hive.HiveOps import io.eels.schema.Partition import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import com.sksamuel.exts.OptionImplicits._ object StaticPartitionStrategy extends PartitionStrategy { private val cache = scala.collection.mutable.Map.empty[Partition, Path] def ensurePartition(partition: Partition, dbName: String, tableName: String, inheritPermissions: Boolean, client: IMetaStoreClient)(implicit fs: FileSystem): Path = { cache.getOrElseUpdate(partition, { val ops = new HiveOps(client) val meta = ops.partitionMetaData(dbName, tableName, partition).getOrError(s"Unknown partition $partition") meta.location }) } }
Example 108
Source File: HiveStats.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import io.eels.schema.PartitionConstraint import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.parquet.format.converter.ParquetMetadataConverter import org.apache.parquet.hadoop.ParquetFileReader import scala.collection.JavaConverters._ trait HiveStats { // total number of records def count: Long = count(Nil) // total number of records in the partitions that match the constraints def count(constraints: Seq[PartitionConstraint]): Long // returns the minimum value of this field def min(field: String): Any = min(field, Nil) // returns the maximum value of this field def max(field: String): Any = max(field, Nil) // returns the minimum value of this field for the partitions that match the constraints def min(field: String, constraints: Seq[PartitionConstraint]): Any // returns the maximum value of this field for the partitions that match the constraints def max(field: String, constraints: Seq[PartitionConstraint]): Any } class ParquetHiveStats(dbName: String, tableName: String, table: HiveTable) (implicit fs: FileSystem, conf: Configuration, client: IMetaStoreClient) extends HiveStats with Logging { private val ops = new HiveOps(client) private def count(path: Path) = { val blocks = ParquetFileReader.readFooter(fs.getConf, path, ParquetMetadataConverter.NO_FILTER).getBlocks.asScala blocks.map(_.getRowCount).sum } override def count(constraints: Seq[PartitionConstraint]): Long = { val counts = HiveTableFilesFn(dbName, tableName, table.location, constraints) .flatMap(_._2) .map(_.getPath).map(count) if (counts.isEmpty) 0 else counts.sum } private def minmax(field: String, constraints: Seq[PartitionConstraint]): (Any, Any) = { def stats[T]: (Any, Any) = { def min(ts: Seq[Comparable[T]]): Any = ts.reduceLeft { (a, b) => if (a.compareTo(b.asInstanceOf[T]) <= 0) a else b } def max(ts: Seq[Comparable[T]]): Any = ts.reduceLeft { (a, b) => if (a.compareTo(b.asInstanceOf[T]) >= 0) a else b } val location = new Path(ops.location(dbName, tableName)) val (mins, maxes) = HiveTableFilesFn(dbName, tableName, location, constraints).toSeq.flatMap { case (_, files) => logger.debug(s"Calculating min,max in file $files") files.flatMap { file => val footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER) footer.getBlocks.asScala.map { block => val column = block.getColumns.asScala.find(_.getPath.toDotString == field).getOrError(s"Unknown column $field") val min = column.getStatistics.genericGetMin.asInstanceOf[Comparable[T]] val max = column.getStatistics.genericGetMax.asInstanceOf[Comparable[T]] (min, max) } } }.unzip (min(mins), max(maxes)) } stats[Any] } override def min(field: String, constraints: Seq[PartitionConstraint]): Any = minmax(field, constraints)._1 override def max(field: String, constraints: Seq[PartitionConstraint]): Any = minmax(field, constraints)._2 }
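Because building a ParquetHiveStats needs a metastore client and a table handle, the sketch below only exercises the HiveStats trait itself; it assumes a HiveStats instance obtained elsewhere and is not part of the original source.

import io.eels.component.hive.HiveStats
import io.eels.schema.PartitionConstraint

object HiveStatsSketch {
  // print row count plus min/max of one field, optionally restricted by partition constraints
  def summarise(stats: HiveStats, field: String, constraints: Seq[PartitionConstraint] = Nil): Unit = {
    println(s"rows = ${stats.count(constraints)}")
    println(s"min($field) = ${stats.min(field, constraints)}")
    println(s"max($field) = ${stats.max(field, constraints)}")
  }
}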
Example 109
Source File: ParquetHiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.dialect import java.util.concurrent.atomic.AtomicInteger import com.sksamuel.exts.Logging import com.sksamuel.exts.OptionImplicits._ import com.sksamuel.exts.io.Using import io.eels.component.hive.{HiveDialect, HiveOps, HiveOutputStream} import io.eels.component.parquet._ import io.eels.component.parquet.util.{ParquetIterator, ParquetLogMute} import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription} import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.HiveMetaStoreClient import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe import org.apache.hadoop.hive.ql.io.parquet.{MapredParquetInputFormat, MapredParquetOutputFormat} import scala.math.BigDecimal.RoundingMode.RoundingMode case class ParquetHiveDialect(options: ParquetWriteOptions = ParquetWriteOptions()) extends HiveDialect with Logging with Using { override val serde: String = classOf[ParquetHiveSerDe].getCanonicalName override val inputFormat: String = classOf[MapredParquetInputFormat].getCanonicalName override val outputFormat: String = classOf[MapredParquetOutputFormat].getCanonicalName override def input(path: Path, ignore: StructType, projectionSchema: StructType, predicate: Option[Predicate]) (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]] = new Publisher[Seq[Row]] { val client = new HiveMetaStoreClient(new HiveConf) val ops = new HiveOps(client) override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { // convert the eel projection schema into a parquet schema which will be used by the native parquet reader try { val parquetProjectionSchema = ParquetSchemaFns.toParquetMessageType(projectionSchema) using(RowParquetReaderFn(path, predicate, parquetProjectionSchema.some, true)) { reader => val subscription = new Subscription { override def cancel(): Unit = reader.close() } subscriber.subscribed(subscription) ParquetIterator(reader).grouped(DataStream.DefaultBatchSize).foreach(subscriber.next) subscriber.completed() } } catch { case t: Throwable => subscriber.error(t) } } } override def output(schema: StructType, path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String]) (implicit fs: FileSystem, conf: Configuration): HiveOutputStream = { val path_x = path new HiveOutputStream { ParquetLogMute() private val _records = new AtomicInteger(0) logger.debug(s"Creating parquet writer at $path") private val writer = RowParquetWriterFn(path, schema, metadata, true, roundingMode, fs.getConf) override def write(row: Row) { require(row.values.nonEmpty, "Attempting to write an empty row") writer.write(row) _records.incrementAndGet() } override def close(): Unit = { logger.debug(s"Closing hive parquet writer $path") writer.close() // after the files are closed, we should set permissions if we've been asked to, this allows // all the files we create to stay consistent permission.foreach(fs.setPermission(path, _)) } override def records: Int = _records.get() override def path: Path = path_x } } }
Example 110
Source File: OrcHiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive.dialect import com.sksamuel.exts.Logging import io.eels.component.hive.{HiveDialect, HiveOutputStream} import io.eels.component.orc.{OrcPublisher, OrcWriteOptions, OrcWriter} import io.eels.datastream.{Publisher, Subscriber} import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.ql.io.orc.{OrcInputFormat, OrcOutputFormat, OrcSerde} import scala.math.BigDecimal.RoundingMode.RoundingMode case class OrcHiveDialect(options: OrcWriteOptions = OrcWriteOptions()) extends HiveDialect with Logging { override val serde: String = classOf[OrcSerde].getCanonicalName override val inputFormat: String = classOf[OrcInputFormat].getCanonicalName override val outputFormat: String = classOf[OrcOutputFormat].getCanonicalName override def input(path: Path, metastoreSchema: StructType, projectionSchema: StructType, predicate: Option[Predicate]) (implicit fs: FileSystem, conf: Configuration): Publisher[Seq[Row]] = new Publisher[Seq[Row]] { override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = { new OrcPublisher(path, projectionSchema.fieldNames(), predicate).subscribe(subscriber) } } override def output(schema: StructType, path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String])(implicit fs: FileSystem, conf: Configuration): HiveOutputStream = { val path_x = path val writer = new OrcWriter(path, schema, options) new HiveOutputStream { override def write(row: Row): Unit = { require(row.values.nonEmpty, "Attempting to write an empty row") writer.write(row) } override def close(): Unit = { writer.close() permission.foreach(fs.setPermission(path, _)) } override def records: Int = writer.records override def path: Path = path_x } } }
Example 111
Source File: HiveDialect.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import io.eels.component.hive.dialect.{OrcHiveDialect, ParquetHiveDialect} import io.eels.datastream.Publisher import io.eels.schema.StructType import io.eels.{Predicate, Row} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.permission.FsPermission import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.api.Table import scala.math.BigDecimal.RoundingMode.RoundingMode trait HiveDialect extends Logging { def serde: String def inputFormat: String def outputFormat: String def output(schema: StructType, // schema without partition information path: Path, permission: Option[FsPermission], roundingMode: RoundingMode, metadata: Map[String, String]) (implicit fs: FileSystem, conf: Configuration): HiveOutputStream def stats(getPath: Path)(implicit fs: FileSystem): Long = throw new UnsupportedOperationException } object HiveDialect extends Logging { def apply(format: String): HiveDialect = format match { case input if input.contains("ParquetInputFormat") => ParquetHiveDialect() case input if input.contains("OrcInputFormat") => OrcHiveDialect() //case input if input.contains("AvroHiveDialect") || input.contains("AvroContainerInputFormat") => AvroHiveDialect // "org.apache.hadoop.mapred.TextInputFormat" -> TextHiveDialect case _ => throw new UnsupportedOperationException(s"Unknown hive input format $format") } def apply(table: Table): HiveDialect = { val format = table.getSd.getInputFormat logger.debug(s"Table format is $format") val dialect = HiveDialect(format) logger.debug(s"HiveDialect is $dialect") dialect } }
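A short sketch of the factory dispatch shown in HiveDialect.apply: the dialect is chosen by matching on the table's input format class name. The object name is illustrative; the two format class names come from the imports in the dialect examples above.

import io.eels.component.hive.HiveDialect

object HiveDialectSketch extends App {
  val parquetDialect = HiveDialect("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat")
  val orcDialect = HiveDialect("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat")

  println(parquetDialect.serde) // parquet serde class name
  println(orcDialect.serde)     // orc serde class name
}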
Example 112
Source File: HivePartitionPublisher.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import com.sksamuel.exts.Logging import com.typesafe.config.ConfigFactory import io.eels.Row import io.eels.datastream.{Publisher, Subscriber, Subscription} import io.eels.schema.StructType import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import scala.util.control.NonFatal class HivePartitionPublisher(dbName: String, tableName: String, projectionSchema: StructType, partitionKeys: List[String], // partition keys for this table, used to map the partition values back to a map dialect: HiveDialect // used to open up the files to check they exist if checkDataForPartitionOnlySources is true ) (implicit fs: FileSystem, client: IMetaStoreClient) extends Publisher[Seq[Row]] with Logging { private val config = ConfigFactory.load() // if this is true, then we will still check that some files exist for each partition, to avoid // a situation where the partitions have been created in the hive metastore, but no actual // data has been written using those yet. private val partitionPartFileCheck = config.getBoolean("eel.hive.source.checkDataForPartitionOnlySources") logger.info(s"eel.hive.source.checkDataForPartitionOnlySources=$partitionPartFileCheck") // returns true if the partition exists on disk private def isPartitionPhysical(part: org.apache.hadoop.hive.metastore.api.Partition): Boolean = { val location = new Path(part.getSd.getLocation) logger.debug(s"Checking that partition $location has been created on disk...") try { val exists = fs.exists(location) if (exists) { logger.debug("...exists") } else { logger.debug("...not found") } exists } catch { case NonFatal(e) => logger.warn(s"Error reading $location", e) false } } override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = client.synchronized { try { import scala.collection.JavaConverters._ // each row will contain just the values from the metastore val rows = client.listPartitions(dbName, tableName, Short.MaxValue).asScala.filter { part => !partitionPartFileCheck || isPartitionPhysical(part) }.map { part => // the partition values are assumed to be the same order as the supplied partition keys // first we build a map of the keys to values, then use that map to return a Row with // values in the order set by the fieldNames parameter val map = partitionKeys.zip(part.getValues.asScala).toMap Row(projectionSchema, projectionSchema.fieldNames.map(map(_)).toVector) } logger.debug(s"After scanning partitions and files we have ${rows.size} rows") subscriber.subscribed(Subscription.empty) rows.iterator.grouped(10).foreach(subscriber.next) subscriber.completed() } catch { case t: Throwable => subscriber.error(t) } } }
Example 113
Source File: ParquetVsOrcSpeedTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import java.io.File import java.math.MathContext import com.sksamuel.exts.metrics.Timed import io.eels.Row import io.eels.component.orc.{OrcSink, OrcSource} import io.eels.component.parquet.{ParquetSink, ParquetSource} import io.eels.datastream.DataStream import io.eels.schema._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import scala.math.BigDecimal.RoundingMode import scala.util.Random object ParquetVsOrcSpeedTest extends App with Timed { implicit val conf = new Configuration() implicit val fs = FileSystem.getLocal(new Configuration()) val size = 5000000 val structType = StructType( Field("name", StringType), Field("age", IntType.Signed), Field("height", DoubleType), Field("amazing", BooleanType), Field("fans", LongType.Signed), Field("rating", DecimalType(4, 2)) ) def iter: Iterator[Vector[Any]] = Iterator.continually(Vector( Random.nextString(10), Random.nextInt(), Random.nextDouble(), Random.nextBoolean(), Random.nextLong(), BigDecimal(Random.nextDouble(), new MathContext(4)).setScale(2, RoundingMode.UP) )) def ds: DataStream = DataStream.fromIterator(structType, iter.take(size).map(Row(structType, _))) val ppath = new Path("parquet_speed.pq") fs.delete(ppath, false) val opath = new Path("orc_speed.orc") fs.delete(opath, false) new File(ppath.toString).deleteOnExit() new File(opath.toString).deleteOnExit() timed("Orc Insertion") { ds.to(OrcSink(opath)) } timed("Parquet Insertion") { ds.to(ParquetSink(ppath)) } while (true) { timed("Reading with OrcSource") { val actual = OrcSource(opath).toDataStream().size assert(actual == size, s"$actual != $size") } timed("Reading with ParquetSource") { val actual = ParquetSource(ppath).toDataStream().size assert(actual == size, s"$actual != $size") } } }
Example 114
Source File: HiveTableFilesFnTest.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import java.nio.file.Paths import com.sksamuel.exts.Logging import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hdfs.MiniDFSCluster import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.hadoop.hive.metastore.api.Table import org.scalatest.mockito.MockitoSugar import org.scalatest.{FlatSpec, Matchers} class HiveTableFilesFnTest extends FlatSpec with Matchers with Logging with MockitoSugar { System.clearProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA) val clusterPath = Paths.get("miniclusters", "cluster") val conf = new Configuration() conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, clusterPath.toAbsolutePath.toString) val cluster = new MiniDFSCluster.Builder(conf).build() implicit val fs = cluster.getFileSystem "HiveTableFilesFn" should "detect all files in root when no partitions" in { implicit val client = mock[IMetaStoreClient] org.mockito.Mockito.when(client.getTable("default", "mytable")).thenReturn(new Table) val root = new Path("tab1") fs.mkdirs(root) // table scanner will skip 0 length files val a = fs.create(new Path(root, "a")) a.write(1) a.close() val b = fs.create(new Path(root, "b")) b.write(1) b.close() HiveTableFilesFn("default", "mytable", fs.resolvePath(root), Nil).values.flatten.map(_.getPath.getName).toSet shouldBe Set("a", "b") } it should "ignore hidden files in root when no partitions" in { implicit val client = mock[IMetaStoreClient] org.mockito.Mockito.when(client.getTable("default", "mytable")).thenReturn(new Table) val root = new Path("tab2") fs.mkdirs(root) // table scanner will skip 0 length files val a = fs.create(new Path(root, "a")) a.write(1) a.close() val b = fs.create(new Path(root, "_b")) b.write(1) b.close() HiveTableFilesFn("default", "mytable", fs.resolvePath(root), Nil).values.flatten.map(_.getPath.getName).toSet shouldBe Set("a") } }
Example 115
Source File: HiveBenchmarkApp.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.hive import java.util.UUID import com.sksamuel.exts.metrics.Timed import io.eels.datastream.DataStream import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.HiveMetaStoreClient import scala.util.Random object HiveBenchmarkApp extends App with Timed { val states = List( "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming").map(_.replace(' ', '_').toLowerCase) import HiveConfig._ val schema = StructType("id", "state") val rows = List.fill(1000000)(List(UUID.randomUUID.toString, states(Random.nextInt(50)))) logger.info(s"Generated ${rows.size} rows") new HiveOps(client).createTable( "sam", "people", schema, List("state"), overwrite = true ) logger.info("Table created") val sink = HiveSink("sam", "people") DataStream.fromValues(schema, rows).to(sink) logger.info("Write complete") while (true) { timed("datastream took") { val result = HiveSource("sam", "people").toDataStream().collect println(result.size) } } }
Example 116
Source File: OrcWriter.scala From eel-sdk with Apache License 2.0 | 5 votes |
package io.eels.component.orc import java.util.concurrent.atomic.AtomicInteger import java.util.function.IntUnaryOperator import com.sksamuel.exts.Logging import com.typesafe.config.ConfigFactory import io.eels.Row import io.eels.schema.StructType import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.exec.vector.ColumnVector import org.apache.orc.{OrcConf, OrcFile, TypeDescription} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer // performs the actual write out of orc data, to be used by an orc sink class OrcWriter(path: Path, structType: StructType, options: OrcWriteOptions)(implicit conf: Configuration) extends Logging { private val schema: TypeDescription = OrcSchemaFns.toOrcSchema(structType) logger.trace(s"Creating orc writer for schema $schema") private val batchSize = { val size = ConfigFactory.load().getInt("eel.orc.sink.batchSize") Math.max(Math.min(1024, size), 1) } logger.debug(s"Orc writer will use batchsize=$batchSize") private val buffer = new ArrayBuffer[Row](batchSize) private val serializers = schema.getChildren.asScala.map(OrcSerializer.forType).toArray private val batch = schema.createRowBatch(batchSize) OrcConf.COMPRESSION_STRATEGY.setString(conf, options.compressionStrategy.name) OrcConf.COMPRESS.setString(conf, options.compressionKind.name) options.encodingStrategy.map(_.name).foreach(OrcConf.ENCODING_STRATEGY.setString(conf, _)) options.compressionBufferSize.foreach(OrcConf.BUFFER_SIZE.setLong(conf, _)) private val woptions = OrcFile.writerOptions(conf).setSchema(schema) options.rowIndexStride.foreach { size => woptions.rowIndexStride(size) logger.debug(s"Using stride size = $size") } if (options.bloomFilterColumns.nonEmpty) { woptions.bloomFilterColumns(options.bloomFilterColumns.mkString(",")) logger.debug(s"Using bloomFilterColumns = ${options.bloomFilterColumns}") } private lazy val writer = OrcFile.createWriter(path, woptions) private val counter = new AtomicInteger(0) def write(row: Row): Unit = { buffer.append(row) if (buffer.size == batchSize) flush() } def records: Int = counter.get() def flush(): Unit = { def writecol[T <: ColumnVector](rowIndex: Int, colIndex: Int, row: Row): Unit = { val value = row.values(colIndex) val vector = batch.cols(colIndex).asInstanceOf[T] val serializer = serializers(colIndex).asInstanceOf[OrcSerializer[T]] serializer.writeToVector(rowIndex, vector, value) } // don't use foreach here, using old school for loops for perf for (rowIndex <- buffer.indices) { val row = buffer(rowIndex) for (colIndex <- batch.cols.indices) { writecol(rowIndex, colIndex, row) } } batch.size = buffer.size writer.addRowBatch(batch) counter.updateAndGet(new IntUnaryOperator { override def applyAsInt(operand: Int): Int = operand + batch.size }) buffer.clear() batch.reset() } def close(): Long = { if (buffer.nonEmpty) flush() writer.close() val count = writer.getNumberOfRows logger.info(s"Orc writer wrote $count rows") count } }
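A minimal usage sketch for the writer above: rows are buffered and only flushed to the file when a batch fills or on close(), which also returns the total row count. The object name, file name and schema are illustrative, not part of the original source.

import io.eels.Row
import io.eels.component.orc.{OrcWriteOptions, OrcWriter}
import io.eels.schema.{Field, StringType, StructType}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object OrcWriterSketch extends App {
  implicit val conf = new Configuration()

  val schema = StructType(Field("name", StringType), Field("city", StringType))
  val writer = new OrcWriter(new Path("people.orc"), schema, OrcWriteOptions())

  writer.write(Row(schema, Vector("sam", "aylesbury")))
  writer.write(Row(schema, Vector("ham", "buckingham")))

  val rows = writer.close() // flushes the remaining buffer and returns the row count
  println(s"wrote $rows rows")
}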
Example 117
Source File: ClientHiveTableFetcherTest.scala From flamy with Apache License 2.0 | 5 votes |
package com.flaminem.exec.hive import com.flaminem.flamy.Launcher import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions} import com.flaminem.flamy.exec.hive.{ClientHivePartitionFetcher, HivePartitionFetcher} import com.flaminem.flamy.exec.utils.ReturnStatus import com.flaminem.flamy.utils.CliUtils import org.apache.hadoop.fs.Path import org.scalatest._ class ClientHiveTableFetcherTest extends FreeSpec with Matchers with BeforeAndAfterAll { def launch(line: String): ReturnStatus = { val args: Array[String] = CliUtils.split(line).filter{_.nonEmpty}.toArray Launcher.launch(args) } val context = new FlamyContext( new FlamyGlobalOptions( conf = Map( "flamy.model.dir.paths" -> "src/it/resources/ClientHivePartitionFetcher" ) ), env = Some(Environment("test")) ) override def beforeAll(): Unit = { launch("drop tables --on test --all") launch("drop schemas --on test --all") launch("push schemas --on test") launch("push tables --on test") launch("repair tables --on test") } "ClientHivePartitionFetcher" - { "listTableNames" in { val fetcher = HivePartitionFetcher(context) assert(fetcher.isInstanceOf[ClientHivePartitionFetcher]) val tables = fetcher.listTableNames assert(tables.size == 6) } } }
Example 118
Source File: Util.scala From spark-flow with Apache License 2.0 | 5 votes |
package com.bloomberg.sparkflow.dc import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.sql._ import scala.reflect.{ClassTag, classTag} object Util { private[dc] def saveCheckpoint[T: ClassTag](checkpointPath: String, dataset: Dataset[T]) = { assert(dataset != null) dataset.write.mode(SaveMode.Overwrite).parquet(checkpointPath) } private[dc] def loadCheckpoint[T: ClassTag](checkpointPath: String, spark: SparkSession)(implicit tEncoder: Encoder[T]): Option[Dataset[T]] = { if (pathExists(checkpointPath, spark.sparkContext)) { val dataFrame = spark.read.parquet(checkpointPath) val dataset = if (tEncoder.clsTag.equals(classTag[Row])) { dataFrame.asInstanceOf[Dataset[T]] } else { dataFrame.as[T] } dataset.count() Some(dataset) } else { None } } def pathExists(dir: String, sc: SparkContext) = { val path = new Path(dir) val fs = path.getFileSystem(sc.hadoopConfiguration) fs.exists(path) } def deletePath(dir: String, sc: SparkContext) = { val path = new Path(dir) val fs = path.getFileSystem(sc.hadoopConfiguration) fs.delete(path, true) } }
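The checkpoint save/load helpers above are package-private, but pathExists and deletePath are public; the sketch below shows the typical clear-before-recompute pattern they support. The checkpoint directory is an illustrative path, and the object name is an assumption.

import com.bloomberg.sparkflow.dc.Util
import org.apache.spark.sql.SparkSession

object CheckpointPathSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("util-sketch").getOrCreate()
  val sc = spark.sparkContext

  val checkpointDir = "/tmp/sparkflow/checkpoints/step1" // illustrative location

  // clear a stale checkpoint before recomputing and re-saving it
  if (Util.pathExists(checkpointDir, sc)) Util.deletePath(checkpointDir, sc)

  spark.stop()
}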
Example 119
Source File: WriteTransformer.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperations import java.io.{File, IOException} import scala.reflect.runtime.{universe => ru} import io.deepsense.commons.utils.Version import io.deepsense.commons.utils.FileOperations.deleteRecursivelyIfExists import io.deepsense.deeplang.DOperation.Id import io.deepsense.deeplang.documentation.OperationDocumentation import io.deepsense.deeplang.doperables.Transformer import io.deepsense.deeplang.doperations.exceptions.DeepSenseIOException import io.deepsense.deeplang.params.{BooleanParam, Params, StringParam} import io.deepsense.deeplang.{DOperation1To0, ExecutionContext} import java.net.URI import org.apache.hadoop.fs.{FileSystem, Path} case class WriteTransformer() extends DOperation1To0[Transformer] with Params with OperationDocumentation { override val id: Id = "58368deb-68d0-4657-ae3f-145160cb1e2b" override val name: String = "Write Transformer" override val description: String = "Writes a Transformer to a directory" override val since: Version = Version(1, 1, 0) val shouldOverwrite = BooleanParam( name = "overwrite", description = Some("Should an existing transformer with the same name be overwritten?") ) setDefault(shouldOverwrite, true) def getShouldOverwrite: Boolean = $(shouldOverwrite) def setShouldOverwrite(value: Boolean): this.type = set(shouldOverwrite, value) val outputPath = StringParam( name = "output path", description = Some("The output path for writing the Transformer.")) def getOutputPath: String = $(outputPath) def setOutputPath(value: String): this.type = set(outputPath, value) val params: Array[io.deepsense.deeplang.params.Param[_]] = Array(outputPath, shouldOverwrite) override protected def execute(transformer: Transformer)(context: ExecutionContext): Unit = { val outputDictPath = getOutputPath try { if (getShouldOverwrite) { removeDirectory(context, outputDictPath) } transformer.save(context, outputDictPath) } catch { case e: IOException => logger.error(s"WriteTransformer error. Could not write transformer to the directory", e) throw DeepSenseIOException(e) } } private def removeDirectory(context: ExecutionContext, path: String): Unit = { if (path.startsWith("hdfs://")) { val configuration = context.sparkContext.hadoopConfiguration val hdfs = FileSystem.get(new URI(extractHdfsAddress(path)), configuration) hdfs.delete(new Path(path), true) } else { deleteRecursivelyIfExists(new File(path)) } } private def extractHdfsAddress(path: String): String = { // first group: "hdfs://ip.addr.of.hdfs", second group: "/some/path/on/hdfs" val regex = "(hdfs:\\/\\/[^\\/]*)(.*)".r val regex(hdfsAddress, _) = path hdfsAddress } @transient override lazy val tTagTI_0: ru.TypeTag[Transformer] = ru.typeTag[Transformer] } object WriteTransformer { def apply(outputPath: String): WriteTransformer = { new WriteTransformer().setOutputPath(outputPath) } }
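A small sketch of configuring the operation above through its companion apply and setters; actually executing it additionally requires a Transformer and an ExecutionContext, which are omitted here. The HDFS path is illustrative.

import io.deepsense.deeplang.doperations.WriteTransformer

object WriteTransformerSketch extends App {
  val write = WriteTransformer("hdfs://namenode:8020/models/my-transformer") // illustrative output path
    .setShouldOverwrite(false)

  println(write.getOutputPath)
}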
Example 120
Source File: FileDownloader.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperations.readwritedataframe.filestorage import java.io.{BufferedWriter, FileOutputStream, IOException, OutputStreamWriter} import java.nio.file.{Files, Paths} import java.util.UUID import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import io.deepsense.deeplang.ExecutionContext import io.deepsense.deeplang.doperations.exceptions.DeepSenseIOException import io.deepsense.deeplang.doperations.readwritedataframe.FilePath private[filestorage] object FileDownloader { def downloadFile(url: String)(implicit context: ExecutionContext): FilePath = { if (context.tempPath.startsWith("hdfs://")) { downloadFileToHdfs(url) } else { downloadFileToDriver(url) } } private def downloadFileToHdfs(url: String)(implicit context: ExecutionContext) = { val content = scala.io.Source.fromURL(url).getLines() val hdfsPath = s"${context.tempPath}/${UUID.randomUUID()}" val configuration = new Configuration() val hdfs = FileSystem.get(configuration) val file = new Path(hdfsPath) val hdfsStream = hdfs.create(file) val writer = new BufferedWriter(new OutputStreamWriter(hdfsStream)) try { content.foreach {s => writer.write(s) writer.newLine() } } finally { safeClose(writer) hdfs.close() } FilePath(hdfsPath) } private def downloadFileToDriver(url: String) (implicit context: ExecutionContext) = { val outputDirPath = Paths.get(context.tempPath) // We're checking if the output is a directory following symlinks. // The default behaviour of createDirectories is NOT to follow symlinks if (!Files.isDirectory(outputDirPath)) { Files.createDirectories(outputDirPath) } val outFilePath = Files.createTempFile(outputDirPath, "download", ".csv") // content is a stream. Do not invoke stuff like .toList() on it. val content = scala.io.Source.fromURL(url).getLines() val writer: BufferedWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFilePath.toFile))) try { content.foreach {s => writer.write(s) writer.newLine() } } finally { safeClose(writer) } FilePath(s"file:///$outFilePath") } private def safeClose(bufferedWriter: BufferedWriter): Unit = { try { bufferedWriter.flush() bufferedWriter.close() } catch { case e: IOException => throw new DeepSenseIOException(e) } } }
Example 121
Source File: DefaultMLWriter.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.serialization import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.ml.param.{ParamPair, Params} import org.apache.spark.ml.util.MLWriter import org.json4s.JsonDSL._ import org.json4s._ import org.json4s.jackson.JsonMethods._ import io.deepsense.deeplang.doperables.Transformer import io.deepsense.sparkutils.ML.MLWriterWithSparkContext class DefaultMLWriter[T <: Params](instance: T) extends MLWriter with MLWriterWithSparkContext { def saveImpl(path: String): Unit = { val modelPath = Transformer.modelFilePath(path) saveMetadata(instance, path, sc) CustomPersistence.save(sparkContext, instance, modelPath) } // Copied from org.apache.spark.ml.util.DefaultParamsWriter. // We need to stay consistent with Spark's format, but that method is private. private def saveMetadata( instance: Params, path: String, sc: SparkContext, extraMetadata: Option[JObject] = None, paramMap: Option[JValue] = None): Unit = { val uid = instance.uid val cls = instance.getClass.getName val params = instance.extractParamMap().toSeq.asInstanceOf[Seq[ParamPair[Any]]] val jsonParams = paramMap.getOrElse(render(params.map { case ParamPair(p, v) => p.name -> parse(p.jsonEncode(v)) }.toList)) val basicMetadata = ("class" -> cls) ~ ("timestamp" -> System.currentTimeMillis()) ~ ("sparkVersion" -> sc.version) ~ ("uid" -> uid) ~ ("paramMap" -> jsonParams) val metadata = extraMetadata match { case Some(jObject) => basicMetadata ~ jObject case None => basicMetadata } val metadataPath = new Path(path, "metadata").toString val metadataJson = compact(render(metadata)) sc.parallelize(Seq(metadataJson), 1).saveAsTextFile(metadataPath) } }
Example 122
Source File: Util.scala From csb with GNU General Public License v3.0 | 5 votes |
package edu.msstate.dasi.csb.util import java.io.File import edu.msstate.dasi.csb.model.{EdgeData, VertexData} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import scala.collection.mutable import scala.reflect.ClassTag object Util { def time[R](taskName: String, task: => R): R = { println(s"[TIME] $taskName started...") val start = System.nanoTime val ret = task // call-by-name val end = System.nanoTime println(s"[TIME] $taskName completed in ${(end - start) / 1e9} s") ret } def convertLabelsToStandardForm[VD: ClassTag, ED: ClassTag](G: Graph[VD, ED]): Graph[VertexData, EdgeData] = { val nodeList = G.vertices val edgeList = G.edges val hash = new mutable.HashMap[Long, Long] val nodes = nodeList.map(record => record._1).collect() var counter = 0 for(entry <- nodes) { hash.put(entry, counter) counter += 1 } val newNodes = nodeList.map(record => hash.get(record._1).head).sortBy(record => record, ascending = true) val newEdges = edgeList.map(record => (hash.get(record.srcId).head, hash.get(record.dstId).head)) val newEdgesRDD: RDD[Edge[EdgeData]] = newEdges.map(record => Edge(record._1, record._2)) // val newEdges = edgeList.flatMap(record => Array((hash.get(record._1).head, hash.get(record._2).head), (hash.get(record._2).head, hash.get(record._1).head))) return Graph.fromEdges(newEdgesRDD, VertexData()) } def stripMultiEdges[VD: ClassTag, ED: ClassTag](G: Graph[VD, ED]): Graph[VD, ED] = { G.groupEdges(mergeEdges[ED]) // val stripedEdges = G.edges.groupBy(record => (record.srcId, record.dstId)).map(record => record._2.head) // return Graph.fromEdges(EdgeRDD.fromEdges(stripedEdges), VertexData()) } def mergeEdges[ED: ClassTag](e1: ED, e2: ED): ED = { null.asInstanceOf[ED] } }
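A tiny usage sketch for the timing helper above: the by-name task parameter means the block is only evaluated inside time, which prints the start and elapsed-time markers and returns the block's result. The object name and workload are illustrative.

import edu.msstate.dasi.csb.util.Util

object TimingSketch extends App {
  val total = Util.time("sum to a million", (1L to 1000000L).sum)
  println(total)
}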
Example 123
Source File: JsonHadoopFsRelationSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import java.math.BigDecimal import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.Row import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" import sqlContext._ // JSON does not write data of NullType and does not play well with BinaryType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: BinaryType => false case _: CalendarIntervalType => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("SPARK-9894: save complex types to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("array", ArrayType(LongType)) .add("map", MapType(StringType, new StructType().add("innerField", LongType))) val data = Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) :: Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil val df = createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } test("SPARK-10196: save decimal type to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("decimal", DecimalType(7, 2)) val data = Row(new BigDecimal("10.02")) :: Row(new BigDecimal("20000.99")) :: Row(new BigDecimal("10000")) :: Nil val df = createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } }
Example 124
Source File: CommitFailureTestRelationSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SparkFunSuite with SQLTestUtils { override def _sqlContext: SQLContext = TestHive private val sqlContext = _sqlContext // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = sqlContext.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 125
Source File: SimpleTextHadoopFsRelationSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.types._ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName import sqlContext._ // We have a very limited number of supported types at here since it is just for a // test relation and we do very basic testing at here. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: BinaryType => false // We are using random data generator and the generated strings are not really valid string. case _: StringType => false case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442 case _: CalendarIntervalType => false case _: DateType => false case _: TimestampType => false case _: ArrayType => false case _: MapType => false case _: StructType => false case _: UserDefinedType[_] => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } }
Example 126
Source File: DirectParquetOutputCommitter.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.parquet.Log import org.apache.parquet.hadoop.util.ContextUtil import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat} private[datasources] class DirectParquetOutputCommitter( outputPath: Path, context: TaskAttemptContext) extends ParquetOutputCommitter(outputPath, context) { val LOG = Log.getLog(classOf[ParquetOutputCommitter]) override def getWorkPath: Path = outputPath override def abortTask(taskContext: TaskAttemptContext): Unit = {} override def commitTask(taskContext: TaskAttemptContext): Unit = {} override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true override def setupJob(jobContext: JobContext): Unit = {} override def setupTask(taskContext: TaskAttemptContext): Unit = {} override def commitJob(jobContext: JobContext) { val configuration = ContextUtil.getConfiguration(jobContext) val fileSystem = outputPath.getFileSystem(configuration) if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) { try { val outputStatus = fileSystem.getFileStatus(outputPath) val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus) try { ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers) } catch { case e: Exception => LOG.warn("could not write summary file for " + outputPath, e) val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE) if (fileSystem.exists(metadataPath)) { fileSystem.delete(metadataPath, true) } } } catch { case e: Exception => LOG.warn("could not write summary file for " + outputPath, e) } } if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) { try { val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME) fileSystem.create(successPath).close() } catch { case e: Exception => LOG.warn("could not write success file for " + outputPath, e) } } } }
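In the Spark 1.x line a committer like this was typically wired in through the Parquet output-committer setting. A hedged sketch only: the config key below is the one that generation of SQLConf used, direct committers are only safe with speculation disabled, and you should verify both points against your Spark version before relying on them.

// Sketch: disable speculation and point Parquet writes at the direct committer.
val sparkConf = new org.apache.spark.SparkConf()
  .set("spark.speculation", "false")
  .set("spark.sql.parquet.output.committer.class",
    "org.apache.spark.sql.execution.datasources.parquet.DirectParquetOutputCommitter")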
Example 127
Source File: DirectParquetWriter.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import scala.collection.JavaConverters._ import org.apache.hadoop.conf import org.apache.hadoop.fs.Path import org.apache.parquet.hadoop.ParquetWriter import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.api.WriteSupport.WriteContext import org.apache.parquet.io.api.RecordConsumer import org.apache.parquet.schema.{MessageType, MessageTypeParser} private class DirectWriteSupport(schema: MessageType, metadata: Map[String, String]) extends WriteSupport[RecordBuilder] { private var recordConsumer: RecordConsumer = _ // Initialize the write context override def init(configuration: conf.Configuration): WriteContext = { new WriteContext(schema, metadata.asJava) } // Write a single record override def write(buildRecord: RecordBuilder): Unit = { recordConsumer.startMessage() buildRecord(recordConsumer) recordConsumer.endMessage() } // Prepare for writing override def prepareForWrite(recordConsumer: RecordConsumer): Unit = { this.recordConsumer = recordConsumer } } // Write directly to a Parquet file def writeDirect (path: String, schema: String, metadata: Map[String, String] = Map.empty) (f: ParquetWriter[RecordBuilder] => Unit): Unit = { val messageType = MessageTypeParser.parseMessageType(schema) val writeSupport = new DirectWriteSupport(messageType, metadata) val parquetWriter = new ParquetWriter[RecordBuilder](new Path(path), writeSupport) try f(parquetWriter) finally parquetWriter.close() } // Emit a message (one record) def message(writer: ParquetWriter[RecordBuilder])(builder: RecordBuilder): Unit = { writer.write(builder) } // Emit a group def group(consumer: RecordConsumer)(f: => Unit): Unit = { consumer.startGroup() f consumer.endGroup() } // Emit a field def field(consumer: RecordConsumer, name: String, index: Int = 0)(f: => Unit): Unit = { consumer.startField(name, index) f consumer.endField(name, index) } }
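In the original file these helpers sit inside a test-only object (the wrapper is dropped in the listing above) and RecordBuilder is presumably an alias for RecordConsumer => Unit. Under those assumptions, writing two columns directly would look roughly like this sketch (the output path is made up):

import org.apache.parquet.io.api.Binary

val schema =
  """message root {
    |  required int32 id;
    |  required binary name (UTF8);
    |}""".stripMargin

// writeDirect/message/field are the helpers defined above.
writeDirect("/tmp/direct-example.parquet", schema) { writer =>
  message(writer) { consumer =>
    field(consumer, "id") { consumer.addInteger(1) }
    field(consumer, "name", index = 1) { consumer.addBinary(Binary.fromString("alice")) }
  }
}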
Example 128
Source File: ParquetCompatibilityTest.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import scala.collection.JavaConversions._ import org.apache.hadoop.fs.{Path, PathFilter} import org.apache.parquet.hadoop.ParquetFileReader import org.apache.parquet.schema.MessageType import org.apache.spark.sql.QueryTest private[sql] abstract class ParquetCompatibilityTest extends QueryTest with ParquetTest { protected def readParquetSchema(path: String): MessageType = { readParquetSchema(path, { path => !path.getName.startsWith("_") }) } // Read the Parquet schema, applying the given path filter protected def readParquetSchema(path: String, pathFilter: Path => Boolean): MessageType = { val fsPath = new Path(path) val fs = fsPath.getFileSystem(configuration) val parquetFiles = fs.listStatus(fsPath, new PathFilter { override def accept(path: Path): Boolean = pathFilter(path) }).toSeq val footers = ParquetFileReader.readAllFootersInParallel(configuration, parquetFiles, true) footers.head.getParquetMetadata.getFileMetaData.getSchema } protected def logParquetSchema(path: String): Unit = { logInfo( s"""Schema of the Parquet file written by parquet-avro: |${readParquetSchema(path)} """.stripMargin) } } // Helpers for Parquet compatibility tests object ParquetCompatibilityTest { def makeNullable[T <: AnyRef](i: Int)(f: => T): T = { if (i % 3 == 0) null.asInstanceOf[T] else f } }
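The makeNullable helper above nulls out every value whose index is divisible by 3; a quick illustration in plain Scala:

val values = (0 until 6).map(i => ParquetCompatibilityTest.makeNullable(i)(s"v$i"))
// Vector(null, "v1", "v2", null, "v4", "v5")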
Example 129
Source File: ExecutorDelegationTokenUpdater.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.util.concurrent.{Executors, TimeUnit} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.{ThreadUtils, Utils} import scala.util.control.NonFatal private[spark] class ExecutorDelegationTokenUpdater( sparkConf: SparkConf, hadoopConf: Configuration) extends Logging { @volatile private var lastCredentialsFileSuffix = 0 private val credentialsFile = sparkConf.get("spark.yarn.credentials.file") private val freshHadoopConf = SparkHadoopUtil.get.getConfBypassingFSCache( hadoopConf, new Path(credentialsFile).toUri.getScheme) private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor( ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread")) // On the executor, this thread wakes up and picks up new tokens from HDFS, if any. private val executorUpdaterRunnable = new Runnable { override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired()) } def updateCredentialsIfRequired(): Unit = { try { val credentialsFilePath = new Path(credentialsFile) val remoteFs = FileSystem.get(freshHadoopConf) SparkHadoopUtil.get.listFilesSorted( remoteFs, credentialsFilePath.getParent, credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION) .lastOption.foreach { credentialsStatus => val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath) if (suffix > lastCredentialsFileSuffix) { logInfo("Reading new delegation tokens from " + credentialsStatus.getPath) val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath) lastCredentialsFileSuffix = suffix UserGroupInformation.getCurrentUser.addCredentials(newCredentials) logInfo("Tokens updated from credentials file.") } else { // Check every hour to see if new credentials arrived. logInfo("Updated delegation tokens were expected, but the driver has not updated the " + "tokens yet, will check again in an hour.") delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) return } } val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal( sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials) if (timeFromNowToRenewal <= 0) { executorUpdaterRunnable.run() } else { logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.") delegationTokenRenewer.schedule( executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS) } } catch { // Since the file may get deleted while we are reading it, catch the Exception and come // back in an hour to try again case NonFatal(e) => logWarning("Error while trying to update credentials, will try again in 1 hour", e) delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) } } private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = { val stream = remoteFs.open(tokenPath) try { val newCredentials = new Credentials() newCredentials.readTokenStorageStream(stream) newCredentials } finally { stream.close() } } def stop(): Unit = { delegationTokenRenewer.shutdown() } }
Example 130
Source File: SimrSchedulerBackend.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName, // Hostname or IP address where the driver runs RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt), CoarseGrainedSchedulerBackend.ENDPOINT_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) fs.delete(new Path(driverFilePath), false) super.stop() } }
Example 131
Source File: WholeTextFileInputFormat.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.input import scala.collection.JavaConversions._ import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.InputSplit import org.apache.hadoop.mapreduce.JobContext import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat import org.apache.hadoop.mapreduce.RecordReader import org.apache.hadoop.mapreduce.TaskAttemptContext def setMinPartitions(context: JobContext, minPartitions: Int) { val files = listStatus(context) val totalLen = files.map { file => if (file.isDir) 0L else file.getLen }.sum val maxSplitSize = Math.ceil(totalLen * 1.0 / (if (minPartitions == 0) 1 else minPartitions)).toLong super.setMaxSplitSize(maxSplitSize) } }
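setMinPartitions caps the combined split size at totalLen / minPartitions (rounded up), so the input ends up in at least minPartitions splits. The arithmetic in isolation, with made-up numbers:

// Plain Scala, mirroring the computation above:
val totalLen = 10L * 1024 * 1024          // assume 10 MB of input across all files
val minPartitions = 4
val maxSplitSize =
  Math.ceil(totalLen * 1.0 / (if (minPartitions == 0) 1 else minPartitions)).toLong
println(maxSplitSize)                      // 2621440 bytes, i.e. 2.5 MB per combined split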
Example 132
Source File: Util.scala From Heracles with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util import java.io._ import java.util.concurrent.atomic.AtomicInteger import java.util.zip.{DeflaterOutputStream, InflaterInputStream} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hbase.HBaseConfiguration object Util { val iteration = new AtomicInteger(0) def dropTempFilePath(conf: Configuration, path: String): Boolean = { val fileSystem = FileSystem.get(conf) val filePath = new Path(path) if (fileSystem.exists(filePath)) { fileSystem.delete(filePath, true) } else { false } } def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = { val bos = new ByteArrayOutputStream val deflaterOutputStream = new DeflaterOutputStream(bos) val dos = new DataOutputStream(deflaterOutputStream) configuration.write(dos) dos.close() bos.toByteArray } def deserializeHBaseConfiguration(arr: Array[Byte]) = { val conf = HBaseConfiguration.create conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr)))) conf } }
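A small round-trip sketch for the (de)serialization helpers above; the ZooKeeper quorum value is made up, and only hbase-common needs to be on the classpath:

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.spark.sql.hbase.util.Util

val conf = HBaseConfiguration.create()
conf.set("hbase.zookeeper.quorum", "zk1.example.com")   // hypothetical quorum
val bytes = Util.serializeHBaseConfiguration(conf)      // deflate-compressed byte array
val restored = Util.deserializeHBaseConfiguration(bytes)
assert(restored.get("hbase.zookeeper.quorum") == "zk1.example.com")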
Example 133
Source File: Util.scala From Spark-SQL-on-HBase with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.util import java.io._ import java.util.concurrent.atomic.AtomicInteger import java.util.zip.{DeflaterOutputStream, InflaterInputStream} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hbase.HBaseConfiguration object Util { val iteration = new AtomicInteger(0) def getTempFilePath(conf: Configuration, prefix: String): String = { val fileSystem = FileSystem.get(conf) val path = new Path(s"$prefix-${System.currentTimeMillis()}-${iteration.getAndIncrement}") if (fileSystem.exists(path)) { fileSystem.delete(path, true) } path.getName } def serializeHBaseConfiguration(configuration: Configuration): Array[Byte] = { val bos = new ByteArrayOutputStream val deflaterOutputStream = new DeflaterOutputStream(bos) val dos = new DataOutputStream(deflaterOutputStream) configuration.write(dos) dos.close() bos.toByteArray } def deserializeHBaseConfiguration(arr: Array[Byte]) = { val conf = HBaseConfiguration.create conf.readFields(new DataInputStream(new InflaterInputStream(new ByteArrayInputStream(arr)))) conf } }
Example 134
Source File: RWrappers.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkException import org.apache.spark.ml.util.MLReader private[r] object RWrappers extends MLReader[Object] { override def load(path: String): Object = { implicit val format = DefaultFormats val rMetadataPath = new Path(path, "rMetadata").toString val rMetadataStr = sc.textFile(rMetadataPath, 1).first() val rMetadata = parse(rMetadataStr) val className = (rMetadata \ "class").extract[String] className match { case "org.apache.spark.ml.r.NaiveBayesWrapper" => NaiveBayesWrapper.load(path) case "org.apache.spark.ml.r.AFTSurvivalRegressionWrapper" => AFTSurvivalRegressionWrapper.load(path) case "org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper" => GeneralizedLinearRegressionWrapper.load(path) case "org.apache.spark.ml.r.KMeansWrapper" => KMeansWrapper.load(path) case "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper" => MultilayerPerceptronClassifierWrapper.load(path) case "org.apache.spark.ml.r.LDAWrapper" => LDAWrapper.load(path) case "org.apache.spark.ml.r.IsotonicRegressionWrapper" => IsotonicRegressionWrapper.load(path) case "org.apache.spark.ml.r.GaussianMixtureWrapper" => GaussianMixtureWrapper.load(path) case "org.apache.spark.ml.r.ALSWrapper" => ALSWrapper.load(path) case "org.apache.spark.ml.r.LogisticRegressionWrapper" => LogisticRegressionWrapper.load(path) case "org.apache.spark.ml.r.RandomForestRegressorWrapper" => RandomForestRegressorWrapper.load(path) case "org.apache.spark.ml.r.RandomForestClassifierWrapper" => RandomForestClassifierWrapper.load(path) case "org.apache.spark.ml.r.DecisionTreeRegressorWrapper" => DecisionTreeRegressorWrapper.load(path) case "org.apache.spark.ml.r.DecisionTreeClassifierWrapper" => DecisionTreeClassifierWrapper.load(path) case "org.apache.spark.ml.r.GBTRegressorWrapper" => GBTRegressorWrapper.load(path) case "org.apache.spark.ml.r.GBTClassifierWrapper" => GBTClassifierWrapper.load(path) case "org.apache.spark.ml.r.BisectingKMeansWrapper" => BisectingKMeansWrapper.load(path) case "org.apache.spark.ml.r.LinearSVCWrapper" => LinearSVCWrapper.load(path) case "org.apache.spark.ml.r.FPGrowthWrapper" => FPGrowthWrapper.load(path) case _ => throw new SparkException(s"SparkR read.ml does not support load $className") } } }
Example 135
Source File: MultilayerPerceptronClassifierWrapper.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s._ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier} import org.apache.spark.ml.feature.{IndexToString, RFormula} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.r.RWrapperUtils._ import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} private[r] class MultilayerPerceptronClassifierWrapper private ( val pipeline: PipelineModel ) extends MLWritable { import MultilayerPerceptronClassifierWrapper._ private val mlpModel: MultilayerPerceptronClassificationModel = pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel] lazy val weights: Array[Double] = mlpModel.weights.toArray lazy val layers: Array[Int] = mlpModel.layers def transform(dataset: Dataset[_]): DataFrame = { pipeline.transform(dataset) .drop(mlpModel.getFeaturesCol) .drop(mlpModel.getLabelCol) .drop(PREDICTED_LABEL_INDEX_COL) } override def read: MLReader[MultilayerPerceptronClassifierWrapper] = new MultilayerPerceptronClassifierWrapperReader override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path) class MultilayerPerceptronClassifierWrapperReader extends MLReader[MultilayerPerceptronClassifierWrapper]{ override def load(path: String): MultilayerPerceptronClassifierWrapper = { implicit val format = DefaultFormats val pipelinePath = new Path(path, "pipeline").toString val pipeline = PipelineModel.load(pipelinePath) new MultilayerPerceptronClassifierWrapper(pipeline) } } class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper) extends MLWriter { override protected def saveImpl(path: String): Unit = { val rMetadataPath = new Path(path, "rMetadata").toString val pipelinePath = new Path(path, "pipeline").toString val rMetadata = "class" -> instance.getClass.getName val rMetadataJson: String = compact(render(rMetadata)) sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath) instance.pipeline.save(pipelinePath) } } }
Example 136
Source File: FPGrowthWrapper.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.fpm.{FPGrowth, FPGrowthModel} import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} private[r] class FPGrowthWrapper private (val fpGrowthModel: FPGrowthModel) extends MLWritable { def freqItemsets: DataFrame = fpGrowthModel.freqItemsets def associationRules: DataFrame = fpGrowthModel.associationRules def transform(dataset: Dataset[_]): DataFrame = { fpGrowthModel.transform(dataset) } override def write: MLWriter = new FPGrowthWrapper.FPGrowthWrapperWriter(this) } private[r] object FPGrowthWrapper extends MLReadable[FPGrowthWrapper] { def fit( data: DataFrame, minSupport: Double, minConfidence: Double, itemsCol: String, numPartitions: Integer): FPGrowthWrapper = { val fpGrowth = new FPGrowth() .setMinSupport(minSupport) .setMinConfidence(minConfidence) .setItemsCol(itemsCol) if (numPartitions != null && numPartitions > 0) { fpGrowth.setNumPartitions(numPartitions) } val fpGrowthModel = fpGrowth.fit(data) new FPGrowthWrapper(fpGrowthModel) } override def read: MLReader[FPGrowthWrapper] = new FPGrowthWrapperReader class FPGrowthWrapperReader extends MLReader[FPGrowthWrapper] { override def load(path: String): FPGrowthWrapper = { val modelPath = new Path(path, "model").toString val fPGrowthModel = FPGrowthModel.load(modelPath) new FPGrowthWrapper(fPGrowthModel) } } class FPGrowthWrapperWriter(instance: FPGrowthWrapper) extends MLWriter { override protected def saveImpl(path: String): Unit = { val modelPath = new Path(path, "model").toString val rMetadataPath = new Path(path, "rMetadata").toString val rMetadataJson: String = compact(render( "class" -> instance.getClass.getName )) sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath) instance.fpGrowthModel.save(modelPath) } } }
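FPGrowthWrapper is private[r] and normally driven from SparkR, but a sketch of what fit expects (a DataFrame with an array-of-items column) helps when reading the code. This would only compile from inside the org.apache.spark.ml.r package, and the support/confidence thresholds are arbitrary:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("fpgrowth-sketch").getOrCreate()
import spark.implicits._

val transactions = Seq(
  Seq("bread", "milk"),
  Seq("bread", "butter"),
  Seq("bread", "milk", "butter")
).toDF("items")

// numPartitions = null keeps FPGrowth's default partitioning.
val wrapper = FPGrowthWrapper.fit(transactions, 0.5, 0.6, "items", null)
wrapper.freqItemsets.show(false)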
Example 137
Source File: HadoopUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.image import scala.language.existentials import scala.util.Random import org.apache.commons.io.FilenameUtils import org.apache.hadoop.conf.{Configuration, Configured} import org.apache.hadoop.fs.{Path, PathFilter} import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.spark.sql.SparkSession private object RecursiveFlag { def withPathFilter[T]( sampleRatio: Double, spark: SparkSession, seed: Long)(f: => T): T = { val sampleImages = sampleRatio < 1 if (sampleImages) { val flagName = FileInputFormat.PATHFILTER_CLASS val hadoopConf = spark.sparkContext.hadoopConfiguration val old = Option(hadoopConf.getClass(flagName, null)) hadoopConf.setDouble(SamplePathFilter.ratioParam, sampleRatio) hadoopConf.setLong(SamplePathFilter.seedParam, seed) hadoopConf.setClass(flagName, classOf[SamplePathFilter], classOf[PathFilter]) try f finally { hadoopConf.unset(SamplePathFilter.ratioParam) hadoopConf.unset(SamplePathFilter.seedParam) old match { case Some(v) => hadoopConf.setClass(flagName, v, classOf[PathFilter]) case None => hadoopConf.unset(flagName) } } } else { f } } }
Example 138
Source File: InsertIntoHiveDirCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.language.existentials import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.common.FileUtils import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.mapred._ import org.apache.spark.SparkException import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.hive.client.HiveClientImpl case class InsertIntoHiveDirCommand( isLocal: Boolean, storage: CatalogStorageFormat, query: LogicalPlan, overwrite: Boolean, outputColumns: Seq[Attribute]) extends SaveAsHiveFile { override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { assert(storage.locationUri.nonEmpty) val hiveTable = HiveClientImpl.toHiveTable(CatalogTable( identifier = TableIdentifier(storage.locationUri.get.toString, Some("default")), tableType = org.apache.spark.sql.catalyst.catalog.CatalogTableType.VIEW, storage = storage, schema = query.schema )) hiveTable.getMetadata.put(serdeConstants.SERIALIZATION_LIB, storage.serde.getOrElse(classOf[LazySimpleSerDe].getName)) val tableDesc = new TableDesc( hiveTable.getInputFormatClass, hiveTable.getOutputFormatClass, hiveTable.getMetadata ) val hadoopConf = sparkSession.sessionState.newHadoopConf() val jobConf = new JobConf(hadoopConf) val targetPath = new Path(storage.locationUri.get) val writeToPath = if (isLocal) { val localFileSystem = FileSystem.getLocal(jobConf) localFileSystem.makeQualified(targetPath) } else { val qualifiedPath = FileUtils.makeQualified(targetPath, hadoopConf) val dfs = qualifiedPath.getFileSystem(jobConf) if (!dfs.exists(qualifiedPath)) { dfs.mkdirs(qualifiedPath.getParent) } qualifiedPath } val tmpPath = getExternalTmpPath(sparkSession, hadoopConf, writeToPath) val fileSinkConf = new org.apache.spark.sql.hive.HiveShim.ShimFileSinkDesc( tmpPath.toString, tableDesc, false) try { saveAsHiveFile( sparkSession = sparkSession, plan = child, hadoopConf = hadoopConf, fileSinkConf = fileSinkConf, outputLocation = tmpPath.toString, allColumns = outputColumns) val fs = writeToPath.getFileSystem(hadoopConf) if (overwrite && fs.exists(writeToPath)) { fs.listStatus(writeToPath).foreach { existFile => if (Option(existFile.getPath) != createdTempDir) fs.delete(existFile.getPath, true) } } fs.listStatus(tmpPath).foreach { tmpFile => fs.rename(tmpFile.getPath, writeToPath) } } catch { case e: Throwable => throw new SparkException( "Failed inserting overwrite directory " + storage.locationUri.get, e) } finally { deleteExternalTmpPath(hadoopConf) } Seq.empty[Row] } }
Example 139
Source File: JsonHadoopFsRelationSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import java.math.BigDecimal import org.apache.hadoop.fs.Path import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.catalog.CatalogUtils import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" // JSON does not write data of NullType and does not play well with BinaryType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: BinaryType => false case _: CalendarIntervalType => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path( CatalogUtils.URIToString(makeQualifiedPath(file.getCanonicalPath)), s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("SPARK-9894: save complex types to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("array", ArrayType(LongType)) .add("map", MapType(StringType, new StructType().add("innerField", LongType))) val data = Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) :: Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil val df = spark.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } test("SPARK-10196: save decimal type to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("decimal", DecimalType(7, 2)) val data = Row(new BigDecimal("10.02")) :: Row(new BigDecimal("20000.99")) :: Row(new BigDecimal("10000")) :: Nil val df = spark.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } }
Example 140
Source File: CommitFailureTestRelationSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 141
Source File: SimpleTextHadoopFsRelationSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.sql.catalyst.catalog.CatalogUtils import org.apache.spark.sql.catalyst.expressions.PredicateHelper import org.apache.spark.sql.types._ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with PredicateHelper { override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName // We have a very limited number of supported types at here since it is just for a // test relation and we do very basic testing at here. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: BinaryType => false // We are using random data generator and the generated strings are not really valid string. case _: StringType => false case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442 case _: CalendarIntervalType => false case _: DateType => false case _: TimestampType => false case _: ArrayType => false case _: MapType => false case _: StructType => false case _: UserDefinedType[_] => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path( CatalogUtils.URIToString(makeQualifiedPath(file.getCanonicalPath)), s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("test hadoop conf option propagation") { withTempPath { file => // Test write side val df = spark.range(10).selectExpr("cast(id as string)") df.write .option("some-random-write-option", "hahah-WRITE") .option("some-null-value-option", null) // test null robustness .option("dataSchema", df.schema.json) .format(dataSourceName).save(file.getAbsolutePath) assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-write-option") == "hahah-WRITE") // Test read side val df1 = spark.read .option("some-random-read-option", "hahah-READ") .option("some-null-value-option", null) // test null robustness .option("dataSchema", df.schema.json) .format(dataSourceName) .load(file.getAbsolutePath) df1.count() assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-read-option") == "hahah-READ") } } }
Example 142
Source File: OrcOutputWriter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.orc import org.apache.hadoop.fs.Path import org.apache.hadoop.io.NullWritable import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.orc.mapred.OrcStruct import org.apache.orc.mapreduce.OrcOutputFormat import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.OutputWriter import org.apache.spark.sql.types._ private[orc] class OrcOutputWriter( path: String, dataSchema: StructType, context: TaskAttemptContext) extends OutputWriter { private[this] val serializer = new OrcSerializer(dataSchema) private val recordWriter = { new OrcOutputFormat[OrcStruct]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { new Path(path) } }.getRecordWriter(context) } override def write(row: InternalRow): Unit = { recordWriter.write(NullWritable.get(), serializer.serialize(row)) } override def close(): Unit = { recordWriter.close(context) } }
Example 143
Source File: CodecStreams.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.{InputStream, OutputStream, OutputStreamWriter} import java.nio.charset.{Charset, StandardCharsets} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.compress._ import org.apache.hadoop.mapreduce.JobContext import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.hadoop.util.ReflectionUtils import org.apache.spark.TaskContext object CodecStreams { private def getDecompressionCodec(config: Configuration, file: Path): Option[CompressionCodec] = { val compressionCodecs = new CompressionCodecFactory(config) Option(compressionCodecs.getCodec(file)) } def createInputStream(config: Configuration, file: Path): InputStream = { val fs = file.getFileSystem(config) val inputStream: InputStream = fs.open(file) getDecompressionCodec(config, file) .map(codec => codec.createInputStream(inputStream)) .getOrElse(inputStream) } def getCompressionExtension(context: JobContext): String = { getCompressionCodec(context) .map(_.getDefaultExtension) .getOrElse("") } }
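createInputStream picks a decompression codec from the file extension and falls back to the raw stream, so reading a gzipped text file is a one-liner. A sketch only: the path is hypothetical and CodecStreams itself is package-private to the datasources package:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import scala.io.Source

val in = CodecStreams.createInputStream(new Configuration(), new Path("/tmp/sample.json.gz"))
try Source.fromInputStream(in).getLines().take(5).foreach(println)
finally in.close()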
Example 144
Source File: HadoopFileLinesReader.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.Closeable import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.{FileSplit, LineRecordReader} import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl class HadoopFileLinesReader( file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable { private val iterator = { val fileSplit = new FileSplit( new Path(new URI(file.filePath)), file.start, file.length, // TODO: Implement Locality Array.empty) val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId) val reader = new LineRecordReader() reader.initialize(fileSplit, hadoopAttemptContext) new RecordReaderIterator(reader) } override def hasNext: Boolean = iterator.hasNext override def next(): Text = iterator.next() override def close(): Unit = iterator.close() }
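A usage sketch, assuming it runs inside the same datasources package (these types are not public) and that PartitionedFile takes (partitionValues, filePath, start, length) in this Spark line; verify the constructor against your version, and the file path is made up:

import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.catalyst.InternalRow

// Read the first kilobyte of a local text file line by line.
val file = PartitionedFile(InternalRow.empty, "file:///tmp/data.txt", 0, 1024)
val reader = new HadoopFileLinesReader(file, new Configuration())
try reader.foreach(text => println(text.toString))
finally reader.close()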
Example 145
Source File: SQLHadoopMapReduceCommitProtocol.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.spark.internal.Logging import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol import org.apache.spark.sql.internal.SQLConf class SQLHadoopMapReduceCommitProtocol( jobId: String, path: String, dynamicPartitionOverwrite: Boolean = false) extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) with Serializable with Logging { override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { var committer = super.setupCommitter(context) val configuration = context.getConfiguration val clazz = configuration.getClass(SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter]) if (clazz != null) { logInfo(s"Using user defined output committer class ${clazz.getCanonicalName}") // Every output format based on org.apache.hadoop.mapreduce.lib.output.OutputFormat // has an associated output committer. To override this output committer, // we will first try to use the output committer set in SQLConf.OUTPUT_COMMITTER_CLASS. // If a data source needs to override the output committer, it needs to set the // output committer in prepareForWrite method. if (classOf[FileOutputCommitter].isAssignableFrom(clazz)) { // The specified output committer is a FileOutputCommitter. // So, we will use the FileOutputCommitter-specified constructor. val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext]) committer = ctor.newInstance(new Path(path), context) } else { // The specified output committer is just an OutputCommitter. // So, we will use the no-argument constructor. val ctor = clazz.getDeclaredConstructor() committer = ctor.newInstance() } } logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}") committer } }
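The SQLConf.OUTPUT_COMMITTER_CLASS hook above lets a job swap in a custom committer. A hedged sketch, assuming an active SparkSession named spark: the string key is what that constant resolves to in this Spark line, and the committer class name is hypothetical.

// Point file-based data sources at a custom FileOutputCommitter subclass.
spark.conf.set("spark.sql.sources.outputCommitterClass",
  "com.example.MyFileOutputCommitter")   // hypothetical class on the classpath
spark.range(10).write.mode("overwrite").parquet("/tmp/committer-demo")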
Example 146
Source File: ParquetOutputWriter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce._ import org.apache.parquet.hadoop.ParquetOutputFormat import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.OutputWriter // NOTE: This class is instantiated and used on executor side only, no need to be serializable. private[parquet] class ParquetOutputWriter(path: String, context: TaskAttemptContext) extends OutputWriter { private val recordWriter: RecordWriter[Void, InternalRow] = { new ParquetOutputFormat[InternalRow]() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { new Path(path) } }.getRecordWriter(context) } override def write(row: InternalRow): Unit = recordWriter.write(null, row) override def close(): Unit = recordWriter.close(context) }
Example 147
Source File: HadoopFileWholeTextReader.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.Closeable import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.Text import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark.input.WholeTextFileRecordReader class HadoopFileWholeTextReader(file: PartitionedFile, conf: Configuration) extends Iterator[Text] with Closeable { private val iterator = { val fileSplit = new CombineFileSplit( Array(new Path(new URI(file.filePath))), Array(file.start), Array(file.length), // TODO: Implement Locality Array.empty[String]) val attemptId = new TaskAttemptID(new TaskID(new JobID(), TaskType.MAP, 0), 0) val hadoopAttemptContext = new TaskAttemptContextImpl(conf, attemptId) val reader = new WholeTextFileRecordReader(fileSplit, hadoopAttemptContext, 0) reader.initialize(fileSplit, hadoopAttemptContext) new RecordReaderIterator(reader) } override def hasNext: Boolean = iterator.hasNext override def next(): Text = iterator.next() override def close(): Unit = iterator.close() }
Example 148
Source File: resources.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import java.io.File import java.net.URI import org.apache.hadoop.fs.Path import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand { override val output: Seq[Attribute] = { AttributeReference("Results", StringType, nullable = false)() :: Nil } override def run(sparkSession: SparkSession): Seq[Row] = { val jarList = sparkSession.sparkContext.listJars() if (jars.nonEmpty) { for { jarName <- jars.map(f => new Path(f).getName) jarPath <- jarList if jarPath.contains(jarName) } yield Row(jarPath) } else { jarList.map(Row(_)) } } }
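ListJarsCommand backs the LIST JARS SQL statement; an easy way to exercise it from an active SparkSession named spark (the jar path is made up):

spark.sql("ADD JAR /tmp/my-udfs.jar")          // hypothetical jar
spark.sql("LIST JARS").show(truncate = false)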
Example 149
Source File: MetadataLogFileIndex.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import scala.collection.mutable import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.types.StructType class MetadataLogFileIndex( sparkSession: SparkSession, path: Path, userPartitionSchema: Option[StructType]) extends PartitioningAwareFileIndex(sparkSession, Map.empty, userPartitionSchema) { private val metadataDirectory = new Path(path, FileStreamSink.metadataDir) logInfo(s"Reading streaming file log from $metadataDirectory") private val metadataLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, metadataDirectory.toUri.toString) private val allFilesFromLog = metadataLog.allFiles().map(_.toFileStatus).filterNot(_.isDirectory) private var cachedPartitionSpec: PartitionSpec = _ override protected val leafFiles: mutable.LinkedHashMap[Path, FileStatus] = { new mutable.LinkedHashMap ++= allFilesFromLog.map(f => f.getPath -> f) } override protected val leafDirToChildrenFiles: Map[Path, Array[FileStatus]] = { allFilesFromLog.toArray.groupBy(_.getPath.getParent) } override def rootPaths: Seq[Path] = path :: Nil override def refresh(): Unit = { } override def partitionSpec(): PartitionSpec = { if (cachedPartitionSpec == null) { cachedPartitionSpec = inferPartitioning() } cachedPartitionSpec } }
Example 150
Source File: FileStreamSinkLog.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.net.URI import org.apache.hadoop.fs.{FileStatus, Path} import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.sql.SparkSession import org.apache.spark.sql.internal.SQLConf class FileStreamSinkLog( metadataLogVersion: Int, sparkSession: SparkSession, path: String) extends CompactibleFileStreamLog[SinkFileStatus](metadataLogVersion, sparkSession, path) { private implicit val formats = Serialization.formats(NoTypeHints) protected override val fileCleanupDelayMs = sparkSession.sessionState.conf.fileSinkLogCleanupDelay protected override val isDeletingExpiredLog = sparkSession.sessionState.conf.fileSinkLogDeletion protected override val defaultCompactInterval = sparkSession.sessionState.conf.fileSinkLogCompactInterval require(defaultCompactInterval > 0, s"Please set ${SQLConf.FILE_SINK_LOG_COMPACT_INTERVAL.key} (was $defaultCompactInterval) " + "to a positive value.") override def compactLogs(logs: Seq[SinkFileStatus]): Seq[SinkFileStatus] = { val deletedFiles = logs.filter(_.action == FileStreamSinkLog.DELETE_ACTION).map(_.path).toSet if (deletedFiles.isEmpty) { logs } else { logs.filter(f => !deletedFiles.contains(f.path)) } } } object FileStreamSinkLog { val VERSION = 1 val DELETE_ACTION = "delete" val ADD_ACTION = "add" }
Example 151
Source File: StreamMetadata.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.{InputStreamReader, OutputStreamWriter} import java.nio.charset.StandardCharsets import scala.util.control.NonFatal import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path} import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.streaming.StreamingQuery def write( metadata: StreamMetadata, metadataFile: Path, hadoopConf: Configuration): Unit = { var output: FSDataOutputStream = null try { val fs = metadataFile.getFileSystem(hadoopConf) output = fs.create(metadataFile) val writer = new OutputStreamWriter(output) Serialization.write(metadata, writer) writer.close() } catch { case NonFatal(e) => logError(s"Error writing stream metadata $metadata to $metadataFile", e) throw e } finally { IOUtils.closeQuietly(output) } } }
Example 152
Source File: ManifestFileCommitProtocol.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.util.UUID import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.spark.internal.Logging import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage def setupManifestOptions(fileLog: FileStreamSinkLog, batchId: Long): Unit = { this.fileLog = fileLog this.batchId = batchId } override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") // Do nothing } override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") val fileStatuses = taskCommits.flatMap(_.obj.asInstanceOf[Seq[SinkFileStatus]]).toArray if (fileLog.add(batchId, fileStatuses)) { logInfo(s"Committed batch $batchId") } else { throw new IllegalStateException(s"Race while writing batch $batchId") } } override def abortJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") // Do nothing } override def setupTask(taskContext: TaskAttemptContext): Unit = { addedFiles = new ArrayBuffer[String] } override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet // Note that %05d does not truncate the split number, so if we have more than 100000 tasks, // the file name is fine and won't overflow. val split = taskContext.getTaskAttemptID.getTaskID.getId val uuid = UUID.randomUUID.toString val filename = f"part-$split%05d-$uuid$ext" val file = dir.map { d => new Path(new Path(path, d), filename).toString }.getOrElse { new Path(path, filename).toString } addedFiles += file file } override def newTaskTempFileAbsPath( taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = { throw new UnsupportedOperationException( s"$this does not support adding files with an absolute path") } override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { if (addedFiles.nonEmpty) { val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration) val statuses: Seq[SinkFileStatus] = addedFiles.map(f => SinkFileStatus(fs.getFileStatus(new Path(f)))) new TaskCommitMessage(statuses) } else { new TaskCommitMessage(Seq.empty[SinkFileStatus]) } } override def abortTask(taskContext: TaskAttemptContext): Unit = { // Do nothing // TODO: we can also try delete the addedFiles as a best-effort cleanup. } }
Example 153
Source File: ParquetFileFormatSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkException import org.apache.spark.sql.QueryTest import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class ParquetFileFormatSuite extends QueryTest with ParquetTest with SharedSQLContext { test("read parquet footers in parallel") { def testReadFooters(ignoreCorruptFiles: Boolean): Unit = { withTempDir { dir => val fs = FileSystem.get(sparkContext.hadoopConfiguration) val basePath = dir.getCanonicalPath val path1 = new Path(basePath, "first") val path2 = new Path(basePath, "second") val path3 = new Path(basePath, "third") spark.range(1).toDF("a").coalesce(1).write.parquet(path1.toString) spark.range(1, 2).toDF("a").coalesce(1).write.parquet(path2.toString) spark.range(2, 3).toDF("a").coalesce(1).write.json(path3.toString) val fileStatuses = Seq(fs.listStatus(path1), fs.listStatus(path2), fs.listStatus(path3)).flatten val footers = ParquetFileFormat.readParquetFootersInParallel( sparkContext.hadoopConfiguration, fileStatuses, ignoreCorruptFiles) assert(footers.size == 2) } } testReadFooters(true) val exception = intercept[java.io.IOException] { testReadFooters(false) } assert(exception.getMessage().contains("Could not read footer for file")) } }
Example 154
Source File: DataSourceScanExecRedactionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.hadoop.fs.Path import org.apache.spark.SparkConf import org.apache.spark.sql.QueryTest import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class DataSourceScanExecRedactionSuite extends QueryTest with SharedSQLContext { override protected def sparkConf: SparkConf = super.sparkConf .set("spark.redaction.string.regex", "file:/[\\w_]+") test("treeString is redacted") { withTempDir { dir => val basePath = dir.getCanonicalPath spark.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) val df = spark.read.parquet(basePath) val rootPath = df.queryExecution.sparkPlan.find(_.isInstanceOf[FileSourceScanExec]).get .asInstanceOf[FileSourceScanExec].relation.location.rootPaths.head assert(rootPath.toString.contains(dir.toURI.getPath.stripSuffix("/"))) assert(!df.queryExecution.sparkPlan.treeString(verbose = true).contains(rootPath.getName)) assert(!df.queryExecution.executedPlan.treeString(verbose = true).contains(rootPath.getName)) assert(!df.queryExecution.toString.contains(rootPath.getName)) assert(!df.queryExecution.simpleString.contains(rootPath.getName)) val replacement = "*********" assert(df.queryExecution.sparkPlan.treeString(verbose = true).contains(replacement)) assert(df.queryExecution.executedPlan.treeString(verbose = true).contains(replacement)) assert(df.queryExecution.toString.contains(replacement)) assert(df.queryExecution.simpleString.contains(replacement)) } } private def isIncluded(queryExecution: QueryExecution, msg: String): Boolean = { queryExecution.toString.contains(msg) || queryExecution.simpleString.contains(msg) || queryExecution.stringWithStats.contains(msg) } test("explain is redacted using SQLConf") { withTempDir { dir => val basePath = dir.getCanonicalPath spark.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString) val df = spark.read.parquet(basePath) val replacement = "*********" // Respect SparkConf and replace file:/ assert(isIncluded(df.queryExecution, replacement)) assert(isIncluded(df.queryExecution, "FileScan")) assert(!isIncluded(df.queryExecution, "file:/")) withSQLConf(SQLConf.SQL_STRING_REDACTION_PATTERN.key -> "(?i)FileScan") { // Respect SQLConf and replace FileScan assert(isIncluded(df.queryExecution, replacement)) assert(!isIncluded(df.queryExecution, "FileScan")) assert(isIncluded(df.queryExecution, "file:/")) } } } }
Example 155
Source File: StreamMetadataSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.File import java.util.UUID import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.sql.streaming.StreamTest class StreamMetadataSuite extends StreamTest { test("writing and reading") { withTempDir { dir => val id = UUID.randomUUID.toString val metadata = StreamMetadata(id) val file = new Path(new File(dir, "test").toString) StreamMetadata.write(metadata, file, hadoopConf) val readMetadata = StreamMetadata.read(file, hadoopConf) assert(readMetadata.nonEmpty) assert(readMetadata.get.id === id) } } test("read Spark 2.1.0 format") { // query-metadata-logs-version-2.1.0.txt has the execution metadata generated by Spark 2.1.0 assert( readForResource("query-metadata-logs-version-2.1.0.txt") === StreamMetadata("d366a8bf-db79-42ca-b5a4-d9ca0a11d63e")) } private def readForResource(fileName: String): StreamMetadata = { val input = getClass.getResource(s"/structured-streaming/$fileName") StreamMetadata.read(new Path(input.toString), hadoopConf).get } private val hadoopConf = new Configuration() }
Example 156
Source File: SageMakerProtobufWriter.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.protobuf import java.io.ByteArrayOutputStream import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, NullWritable} import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext} import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.execution.datasources.OutputWriter import org.apache.spark.sql.types.StructType def write(row: Row): Unit = { val labelColumnName = options.getOrElse("labelColumnName", "label") val featuresColumnName = options.getOrElse("featuresColumnName", "features") val record = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Some(labelColumnName)) record.writeTo(byteArrayOutputStream) recordWriter.write(NullWritable.get(), new BytesWritable(byteArrayOutputStream.toByteArray)) byteArrayOutputStream.reset() } override def close(): Unit = { recordWriter.close(context) } }
Example 157
Source File: RecordIOOutputFormatTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.protobuf import java.io.ByteArrayOutputStream import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path} import org.apache.hadoop.io.{BytesWritable, NullWritable} import org.apache.hadoop.mapreduce.TaskAttemptContext import org.mockito.Matchers.any import org.mockito.Mockito.{verify, when} import org.scalatest.{BeforeAndAfter, FlatSpec} import org.scalatest.mock.MockitoSugar import com.amazonaws.services.sagemaker.sparksdk.protobuf.RecordIOOutputFormat.SageMakerProtobufRecordWriter class RecordIOOutputFormatTests extends FlatSpec with MockitoSugar with BeforeAndAfter { var sagemakerProtobufRecordWriter: SageMakerProtobufRecordWriter = _ var mockOutputStream : FSDataOutputStream = _ var byteArrayOutputStream: ByteArrayOutputStream = _ var mockTaskAttemptContext: TaskAttemptContext = _ var mockPath: Path = _ var mockFileSystem: FileSystem = _ before { byteArrayOutputStream = new ByteArrayOutputStream() mockOutputStream = mock[FSDataOutputStream] sagemakerProtobufRecordWriter = new SageMakerProtobufRecordWriter(mockOutputStream) mockTaskAttemptContext = mock[TaskAttemptContext] mockPath = mock[Path] mockFileSystem = mock[FileSystem] } it should "write an empty array of bytes" in { val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray) val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes) sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable) verify(mockOutputStream).write(bytes, 0, bytes.length) } it should "write an array of bytes" in { val byteArray = Array[Byte](0, 0, 0, 0) byteArrayOutputStream.write(byteArray) val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray) val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes) sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable) verify(mockOutputStream).write(bytes, 0, bytes.length) } it should "write an array of bytes, padding as necessary" in { byteArrayOutputStream.write(5) val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray) val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes) sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable) verify(mockOutputStream).write(bytes, 0, bytes.length) } it should "write an array of bytes, padding only as much as necessary" in { byteArrayOutputStream.write(Array[Byte](0, 0, 0, 0, 0)) val bytesWritable = new BytesWritable(byteArrayOutputStream.toByteArray) val bytes = ProtobufConverter.byteArrayToRecordIOEncodedByteArray(bytesWritable.getBytes) sagemakerProtobufRecordWriter.write(NullWritable.get(), bytesWritable) verify(mockOutputStream).write(bytes, 0, bytes.length) } it should "create a record writer from a FSDataOutputStream created by the filesystem" in { val mockTaskAttemptContext = mock[TaskAttemptContext] val mockPath = mock[Path] val mockFileSystem = mock[FileSystem] when(mockPath.getFileSystem(any[Configuration])).thenReturn(mockFileSystem) new RecordIOOutputFormat() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { mockPath } }.getRecordWriter(mockTaskAttemptContext) verify(mockFileSystem).create(mockPath, true) } }
Example 158
Source File: IOReader.scala From spark-benchmarks with Apache License 2.0 | 5 votes |
package com.bbva.spark.benchmarks.dfsio import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} class IOReader(hadoopConf: Configuration, dataDir: String) extends IOTestBase(hadoopConf, dataDir) { def doIO(fileName: String, fileSize: BytesSize)(implicit conf: Configuration, fs: FileSystem): BytesSize = { val bufferSize = conf.getInt("test.io.file.buffer.size", DefaultBufferSize) // TODO GET RID OF DEFAULT val buffer: Array[Byte] = new Array[Byte](bufferSize) val filePath = new Path(dataDir, fileName.toString) logger.info("Reading file {} with size {}", filePath.toString, fileSize.toString) val in = fs.open(filePath) var actualSize: Long = 0 // TODO improve this try { Stream.continually(in.read(buffer, 0, bufferSize)) .takeWhile(_ > 0 && actualSize < fileSize) .foreach { currentSize => actualSize += currentSize logger.debug(s"Reading chunk of size $currentSize. Currently: $actualSize / $fileSize") } } finally { in.close() } logger.info("File {} with size {} read successfully", fileName, actualSize.toString) actualSize } }
Example 159
Source File: ControlFilesCreator.scala From spark-benchmarks with Apache License 2.0 | 5 votes |
package com.bbva.spark.benchmarks.dfsio import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat import org.apache.spark.SparkContext import org.apache.spark.rdd.{PairRDDFunctions, RDD} object ControlFilesCreator { val BaseFileName = "test_io_" def createFiles(controlDirPath: String, numFiles: Int, fileSize: Long)(implicit sc: SparkContext): Unit = { sc.parallelize(0 until numFiles, numFiles).map(getFileName).map { fileName => val controlFilePath = new Path(controlDirPath, s"in_file_$fileName") (controlFilePath.toString, new LongWritable(fileSize)) }.saveAsSequenceFileByKey(controlDirPath) } implicit class RichRDD[T](val self: RDD[T]) extends AnyVal { def saveAsSequenceFileByKey[K, V](path: String)(implicit ev: RDD[T] => PairRDDFunctions[K, V]): Unit = self.saveAsHadoopFile(path, classOf[Text], classOf[LongWritable], classOf[RDDMultipleSequenceFileOutputFormat]) } private def getFileName(fileIndex: Int): String = BaseFileName + fileIndex class RDDMultipleSequenceFileOutputFormat extends MultipleSequenceFileOutputFormat[Any, Any] { override def generateActualKey(key: Any, value: Any): Any = new Text(key.toString.split("/").last) override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = new Path(key.toString).toString } }
Example 160
Source File: TextFileOverwrite.scala From spark_helper with Apache License 2.0 | 5 votes |
package org.apache.spark import org.apache.spark.rdd.{RDD, HadoopRDD} import org.apache.spark.util.SerializableConfiguration import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat} import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.fs.Path object TextFileOverwrite { def textFile( paths: Seq[String], minPartitions: Int, sc: SparkContext ): RDD[String] = { val confBroadcast = sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration)) val setInputPathsFunc = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, paths.map(p => new Path(p)): _*) new HadoopRDD( sc, confBroadcast, Some(setInputPathsFunc), classOf[TextInputFormat], classOf[LongWritable], classOf[Text], minPartitions ).map(pair => pair._2.toString) } }
Example 161
Source File: OrcFileOperator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.Logging import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.HiveMetastoreTypes import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(path: String, conf: Option[Configuration]): StructType = { val reader = getFileReader(path, conf).getOrElse { throw new AnalysisException( s"Failed to discover schema from ORC files stored in $path. " + "Probably there are either no ORC files or only empty ORC files.") } val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $path, got Hive schema string: $schema") HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType] } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val path = origPath.makeQualified(fs.getUri, fs.getWorkingDirectory) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDir) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) if (paths == null || paths.isEmpty) { throw new IllegalArgumentException( s"orcFileOperator: path $path does not have valid orc files matching the pattern") } paths } }
Example 162
Source File: OrcHadoopFsRelationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.{Row, SQLConf} import org.apache.spark.sql.sources.HadoopFsRelationTest import org.apache.spark.sql.types._ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest { import testImplicits._ override val dataSourceName: String = classOf[DefaultSource].getCanonicalName // ORC does not play well with NullType and UDT. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: CalendarIntervalType => false case _: UserDefinedType[_] => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield (i, s"val_$i", p1)) .toDF("a", "b", "p1") .write .orc(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( hiveContext.read.options(Map( "path" -> file.getCanonicalPath, "dataSchema" -> dataSchemaWithPartition.json)).format(dataSourceName).load()) } } test("SPARK-12218: 'Not' is included in ORC filter pushdown") { import testImplicits._ withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") { withTempPath { dir => val path = s"${dir.getCanonicalPath}/table1" (1 to 5).map(i => (i, (i % 2).toString)).toDF("a", "b").write.orc(path) checkAnswer( sqlContext.read.orc(path).where("not (a = 2) or not(b in ('1'))"), (1 to 5).map(i => Row(i, (i % 2).toString))) checkAnswer( sqlContext.read.orc(path).where("not (a = 2 and b in ('1'))"), (1 to 5).map(i => Row(i, (i % 2).toString))) } } } }
Example 163
Source File: JsonHadoopFsRelationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import java.math.BigDecimal import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.Row import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" // JSON does not write data of NullType and does not play well with BinaryType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: BinaryType => false case _: CalendarIntervalType => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( hiveContext.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("SPARK-9894: save complex types to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("array", ArrayType(LongType)) .add("map", MapType(StringType, new StructType().add("innerField", LongType))) val data = Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) :: Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil val df = hiveContext.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( hiveContext.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } test("SPARK-10196: save decimal type to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("decimal", DecimalType(7, 2)) val data = Row(new BigDecimal("10.02")) :: Row(new BigDecimal("20000.99")) :: Row(new BigDecimal("10000")) :: Nil val df = hiveContext.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( hiveContext.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } }
Example 164
Source File: CommitFailureTestRelationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = sqlContext.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 165
Source File: DirectParquetOutputCommitter.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.parquet.Log import org.apache.parquet.hadoop.util.ContextUtil import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetOutputCommitter, ParquetOutputFormat} private[datasources] class DirectParquetOutputCommitter( outputPath: Path, context: TaskAttemptContext) extends ParquetOutputCommitter(outputPath, context) { val LOG = Log.getLog(classOf[ParquetOutputCommitter]) override def getWorkPath: Path = outputPath override def abortTask(taskContext: TaskAttemptContext): Unit = {} override def commitTask(taskContext: TaskAttemptContext): Unit = {} override def needsTaskCommit(taskContext: TaskAttemptContext): Boolean = true override def setupJob(jobContext: JobContext): Unit = {} override def setupTask(taskContext: TaskAttemptContext): Unit = {} override def commitJob(jobContext: JobContext) { val configuration = { // scalastyle:off jobcontext ContextUtil.getConfiguration(jobContext) // scalastyle:on jobcontext } val fileSystem = outputPath.getFileSystem(configuration) if (configuration.getBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, true)) { try { val outputStatus = fileSystem.getFileStatus(outputPath) val footers = ParquetFileReader.readAllFootersInParallel(configuration, outputStatus) try { ParquetFileWriter.writeMetadataFile(configuration, outputPath, footers) } catch { case e: Exception => LOG.warn("could not write summary file for " + outputPath, e) val metadataPath = new Path(outputPath, ParquetFileWriter.PARQUET_METADATA_FILE) if (fileSystem.exists(metadataPath)) { fileSystem.delete(metadataPath, true) } } } catch { case e: Exception => LOG.warn("could not write summary file for " + outputPath, e) } } if (configuration.getBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", true)) { try { val successPath = new Path(outputPath, FileOutputCommitter.SUCCEEDED_FILE_NAME) fileSystem.create(successPath).close() } catch { case e: Exception => LOG.warn("could not write success file for " + outputPath, e) } } } }
Example 166
Source File: ParquetCompatibilityTest.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsJavaMapConverter, seqAsJavaListConverter} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{Path, PathFilter} import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.api.WriteSupport.WriteContext import org.apache.parquet.hadoop.{ParquetFileReader, ParquetWriter} import org.apache.parquet.io.api.RecordConsumer import org.apache.parquet.schema.{MessageType, MessageTypeParser} import org.apache.spark.sql.QueryTest def writeDirect( path: String, schema: String, metadata: Map[String, String], recordWriters: (RecordConsumer => Unit)*): Unit = { val messageType = MessageTypeParser.parseMessageType(schema) val writeSupport = new DirectWriteSupport(messageType, metadata) val parquetWriter = new ParquetWriter[RecordConsumer => Unit](new Path(path), writeSupport) try recordWriters.foreach(parquetWriter.write) finally parquetWriter.close() } }
Example 167
Source File: ExecutorDelegationTokenUpdater.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.util.concurrent.{Executors, TimeUnit} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.{ThreadUtils, Utils} import scala.util.control.NonFatal private[spark] class ExecutorDelegationTokenUpdater( sparkConf: SparkConf, hadoopConf: Configuration) extends Logging { @volatile private var lastCredentialsFileSuffix = 0 private val credentialsFile = sparkConf.get("spark.yarn.credentials.file") private val freshHadoopConf = SparkHadoopUtil.get.getConfBypassingFSCache( hadoopConf, new Path(credentialsFile).toUri.getScheme) private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor( ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread")) // On the executor, this thread wakes up and picks up new tokens from HDFS, if any. private val executorUpdaterRunnable = new Runnable { override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired()) } def updateCredentialsIfRequired(): Unit = { try { val credentialsFilePath = new Path(credentialsFile) val remoteFs = FileSystem.get(freshHadoopConf) SparkHadoopUtil.get.listFilesSorted( remoteFs, credentialsFilePath.getParent, credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION) .lastOption.foreach { credentialsStatus => val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath) if (suffix > lastCredentialsFileSuffix) { logInfo("Reading new delegation tokens from " + credentialsStatus.getPath) val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath) lastCredentialsFileSuffix = suffix UserGroupInformation.getCurrentUser.addCredentials(newCredentials) logInfo("Tokens updated from credentials file.") } else { // Check every hour to see if new credentials arrived. logInfo("Updated delegation tokens were expected, but the driver has not updated the " + "tokens yet, will check again in an hour.") delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) return } } val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal( sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials) if (timeFromNowToRenewal <= 0) { // We just checked for new credentials but none were there, wait a minute and retry. // This handles the shutdown case where the staging directory may have been removed(see // SPARK-12316 for more details). delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.MINUTES) } else { logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.") delegationTokenRenewer.schedule( executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS) } } catch { // Since the file may get deleted while we are reading it, catch the Exception and come // back in an hour to try again case NonFatal(e) => logWarning("Error while trying to update credentials, will try again in 1 hour", e) delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) } } private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = { val stream = remoteFs.open(tokenPath) try { val newCredentials = new Credentials() newCredentials.readTokenStorageStream(stream) newCredentials } finally { stream.close() } } def stop(): Unit = { delegationTokenRenewer.shutdown() } }
Example 168
Source File: SimrSchedulerBackend.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName, RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt), CoarseGrainedSchedulerBackend.ENDPOINT_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) if (!fs.delete(new Path(driverFilePath), false)) { logWarning(s"error deleting ${driverFilePath}") } super.stop() } }
Example 169
Source File: ReliableRDDCheckpointData.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.hadoop.fs.Path import org.apache.spark._ import org.apache.spark.util.SerializableConfiguration def cleanCheckpoint(sc: SparkContext, rddId: Int): Unit = { checkpointPath(sc, rddId).foreach { path => val fs = path.getFileSystem(sc.hadoopConfiguration) if (fs.exists(path)) { if (!fs.delete(path, true)) { logWarning(s"Error deleting ${path.toString()}") } } } } }
Example 170
Source File: SerializableFileStatus.scala From parquet-index with Apache License 2.0 | 5 votes |
package com.github.lightcopy.util import org.apache.hadoop.fs.{BlockLocation, FileStatus, LocatedFileStatus, Path} object SerializableFileStatus { def fromFileStatus(status: FileStatus): SerializableFileStatus = { val blockLocations = status match { case f: LocatedFileStatus => f.getBlockLocations.map { loc => SerializableBlockLocation(loc.getNames, loc.getHosts, loc.getOffset, loc.getLength) } case _ => Array.empty[SerializableBlockLocation] } SerializableFileStatus( status.getPath.toString, status.getLen, status.isDirectory, status.getReplication, status.getBlockSize, status.getModificationTime, status.getAccessTime, blockLocations) } def toFileStatus(status: SerializableFileStatus): FileStatus = { val blockLocations = status.blockLocations.map { loc => new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length) } new LocatedFileStatus( new FileStatus( status.length, status.isDir, status.blockReplication, status.blockSize, status.modificationTime, new Path(status.path)), blockLocations) } }
Example 171
Source File: MetastoreIndexSuite.scala From parquet-index with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.Path import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType import com.github.lightcopy.testutil.UnitTestSuite import com.github.lightcopy.testutil.implicits._ // Test catalog to check internal methods private[datasources] class TestIndex extends MetastoreIndex { private var internalIndexFilters: Seq[Filter] = Nil override def tablePath(): Path = ??? override def partitionSchema: StructType = ??? override def indexSchema: StructType = ??? override def dataSchema: StructType = ??? override def setIndexFilters(filters: Seq[Filter]) = { internalIndexFilters = filters } override def indexFilters: Seq[Filter] = internalIndexFilters override def listFilesWithIndexSupport( partitionFilters: Seq[Expression], dataFilters: Seq[Expression], indexFilters: Seq[Filter]): Seq[PartitionDirectory] = ??? override def inputFiles: Array[String] = ??? override def sizeInBytes: Long = ??? } class MetastoreIndexSuite extends UnitTestSuite { test("provide sequence of path based on table path") { val catalog = new TestIndex() { override def tablePath(): Path = new Path("test") } catalog.rootPaths should be (Seq(new Path("test"))) } test("when using listFiles directly supply empty index filter") { var indexSeq: Seq[Filter] = null var filterSeq: Seq[Expression] = null val catalog = new TestIndex() { override def listFilesWithIndexSupport( partitionFilters: Seq[Expression], dataFilters: Seq[Expression], indexFilters: Seq[Filter]): Seq[PartitionDirectory] = { indexSeq = indexFilters filterSeq = partitionFilters Seq.empty } } catalog.listFiles(Seq.empty, Seq.empty) indexSeq should be (Nil) filterSeq should be (Nil) } test("refresh should be no-op by default") { val catalog = new TestIndex() catalog.refresh() } }
Example 172
Source File: AzureStreamingExample.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.examples import com.cloudera.spark.cloud.ObjectStoreExample import org.apache.hadoop.fs.Path import org.apache.spark.SparkConf import org.apache.spark.streaming.{Seconds, StreamingContext} override def action( sparkConf: SparkConf, args: Array[String]): Int = { if (args.length != 3) { return usage() } sparkConf.setAppName("CloudStreaming") applyObjectStoreConfigurationOptions(sparkConf, false) val dest = args(0) val delay = Integer.valueOf(args(1)) val interval = Integer.valueOf(args(2)) // Create the context val streaming = new StreamingContext(sparkConf, Seconds(10)) try { // Create the FileInputDStream on the directory regexp and use the // stream to look for a new file renamed into it val destPath = new Path(dest) val sc = streaming.sparkContext val hc = sc.hadoopConfiguration val fs = destPath.getFileSystem(hc) rm(fs, destPath) fs.mkdirs(destPath) val sightings = sc.longAccumulator("sightings") print("===================================") print(s"Looking for text files under ${destPath}") print("===================================") val lines = streaming.textFileStream(dest) val matches = lines.map(line => { sightings.add(1) print(s"[${sightings.value}]: $line") line }) // materialize the operation matches.print() // start the streaming streaming.start() // sleep a bit to get streaming up and running Thread.sleep(delay * 1000) print("===================================") print(s"Seen ${sightings.value} lines") 0 } finally { streaming.stop(true) } } } object AzureStreamingExample { def main(args: Array[String]) { new AzureStreamingExample().run(args) } }
Example 173
Source File: LineCount.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.operations import java.net.URI import com.cloudera.spark.cloud.ObjectStoreExample import com.cloudera.spark.cloud.s3.SequentialIOPolicy import com.cloudera.spark.cloud.common.CloudTestKeys._ import com.cloudera.spark.cloud.s3.SequentialIOPolicy import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} destFsInfo = Some(s"\nFile System $destPath=\n$destFS\n") } } srcFsInfo = Some(s"\nSource File System = $sourceFs\n") } finally { logInfo("Stopping Spark Context") sc.stop() srcFsInfo.foreach(logInfo(_)) destFsInfo.foreach(logInfo(_)) } 0 } def defaultSource: Option[String] = { Some(S3A_CSV_PATH_DEFAULT) } def maybeEnableAnonymousAccess( sparkConf: SparkConf, dest: Option[String]): Unit = { if (dest.isEmpty) { hconf(sparkConf, AWS_CREDENTIALS_PROVIDER, ANONYMOUS_CREDENTIALS) } } }
Example 174
Source File: S3ADataFrames.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.s3 import com.cloudera.spark.cloud.common.CloudTestKeys import com.cloudera.spark.cloud.operations.CloudDataFrames import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.SparkSession object S3ADataFrames extends CloudDataFrames with S3AExampleSetup { override def extraValidation( session: SparkSession, conf: Configuration, fs: FileSystem, results: Seq[(String, Path, Long, Long)]): Unit = { val operations = new S3AOperations(fs) if (conf.getBoolean(CloudTestKeys.S3A_COMMITTER_TEST_ENABLED, false)) { results.foreach((tuple: (String, Path, Long, Long)) => { operations.verifyS3Committer(tuple._2, None, None, "") }) } } }
Example 175
Source File: CopyCsvFileTrait.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.common import java.io.{EOFException, FileNotFoundException} import org.apache.hadoop.fs.Path override def prepareTestCSVFile(): Unit = { require(hasCSVTestFile(), "No CSV file") require(isFilesystemDefined, "Test FS is not defined; call initFS() first") // here the CSV file is copied over val source = sourceCSVFilePath.get if (source.toUri.getScheme == "wasb") { // source is already in Azure testCSVFile = sourceCSVFilePath deleteTestCSVFile = false } else { val srcStatus = source.getFileSystem(getConf).getFileStatus(source) if (srcStatus.getLen == 0) { throw new EOFException(s"File $source is an empty file") } // need to copy over val destFile = path(source.getName) testCSVFile = Some(destFile) var toCopy = false try { val status = filesystem.getFileStatus(destFile) if (status.getLen != srcStatus.getLen) { logInfo(s"Dest file exists, but length of $status != source data $srcStatus") } else { logInfo(s"Datafile exists; no copy needed: $status") toCopy = false } } catch { case _ : FileNotFoundException => toCopy = true } if (toCopy) { copyFile(sourceCSVFilePath.get, destFile, getConf, true) } } } }
Example 176
Source File: CloudPartitionTest.scala From cloud-integration with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.sql._ import org.apache.spark.sql.types.{IntegerType, StructField, StructType} abstract class CloudPartitionTest extends AbstractCloudRelationTest { import testImplicits._ ctest( "save-findClass-partitioned-part-columns-in-data", "Save sets of files in explicitly set up partition tree; read") { withTempPathDir("part-columns", None) { path => for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(path, s"p1=$p1/p2=$p2") val df = sparkContext .parallelize(for (i <- 1 to 3) yield (i, s"val_$i", p1)) .toDF("a", "b", "p1") df.write .format(dataSourceName) .mode(SaveMode.ErrorIfExists) .save(partitionDir.toString) // each of these directories as its own success file; there is // none at the root resolveSuccessFile(partitionDir, true) } val dataSchemaWithPartition = StructType( dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.options(Map( "path" -> path.toString, "dataSchema" -> dataSchemaWithPartition.json)).format(dataSourceName) .load()) } } }
Example 177
Source File: S3ANumbersSuiteV2APISuite.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.s3 import com.cloudera.spark.cloud.common.NumbersRddTests import org.apache.hadoop.fs.Path import org.apache.spark.rdd.RDD class S3ANumbersSuiteV2APISuite extends NumbersRddTests with S3ATestSetup { init() def init(): Unit = { // propagate S3 credentials if (enabled) { initFS() } } override protected def pathname = { "numbers_rdd_tests_v2api" } override protected def saveRDD( numbers: RDD[Int], dest: Path): Unit = { saveRDDviaMRv2(numbers, dest) } }
Example 178
Source File: S3ALineCountWritebackSuite.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.s3 import scala.concurrent.duration._ import scala.language.postfixOps import com.cloudera.spark.cloud.common.CloudSuiteWithCSVDatasource import org.apache.hadoop.fs.{FileStatus, Path} class S3ALineCountWritebackSuite extends CloudSuiteWithCSVDatasource with S3ATestSetup { init() def init(): Unit = { // propagate S3 credentials if (enabled) { initFS() } } override def enabled: Boolean = super.enabled && hasCSVTestFile override def cleanFSInTeardownEnabled: Boolean = true after { cleanFilesystemInTeardown() } ctest("LineCountWriteback", "Execute the LineCount example with the results written back to the test filesystem.") { val sourceFile = getTestCSVPath() val sourceFS = sourceFile.getFileSystem(getConf) val sourceInfo = sourceFS.getFileStatus(sourceFile) val sparkConf = newSparkConf() sparkConf.setAppName("LineCount") val destDir = testPath(filesystem, "LineCountWriteback") assert(0 === S3ALineCount.action(sparkConf, Array(sourceFile.toString, destDir.toString))) val status = filesystem.getFileStatus(destDir) assert(status.isDirectory, s"Not a directory: $status") // only a small fraction of the source data is needed val expectedLen = sourceInfo.getLen / 1024 def validateChildSize(qualifier: String, files: Seq[FileStatus]) = { val (filenames, size) = enumFileSize(destDir, files) logInfo(s"total size of $qualifier = $size bytes from ${files.length} files: $filenames") assert(size >= expectedLen, s"$qualifier size $size in files $filenames" + s" smaller than exoected length $expectedLen") } val stdInterval = interval(100 milliseconds) val appId = eventually(timeout(20 seconds), stdInterval) { validateChildSize("descendants", listFiles(filesystem, destDir, true) .filter(f => f.getPath.getName != "_SUCCESS")) validateChildSize("children", filesystem.listStatus(destDir, pathFilter(p => p.getName != "_SUCCESS")).toSeq) } } private def enumFileSize(destDir: Path, files: Seq[FileStatus]): (String, Long) = { assert(files.nonEmpty, s"No files in destination directory $destDir") var size = 0L val filenames = new StringBuffer() files.foreach { f => size += f.getLen filenames.append(" ").append(f.getPath) } (filenames.toString, size) } }
Example 179
Source File: S3AFileGeneratorSuite.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.s3 import com.cloudera.spark.cloud.common.FileGeneratorTests import org.apache.hadoop.fs.Path import org.apache.spark.SparkConf class S3AFileGeneratorSuite extends FileGeneratorTests with S3ATestSetup { init() def init(): Unit = { // propagate S3 credentials if (enabled) { initFS() } } after { cleanFilesystemInTeardown() } ctest("FileGeneratorUsage", "Execute the S3FileGenerator example with a bad argument; expect a failure") { val conf = newSparkConf() conf.setAppName("FileGenerator") assert(-2 === S3AFileGenerator.action(conf, Seq())) } override def generate( conf: SparkConf, destDir: Path, monthCount: Int, fileCount: Int, rowCount: Int): Int = { val result = S3AFileGenerator.action(conf, Seq(destDir, monthCount, fileCount, rowCount)) result } }
Example 180
Source File: FileGeneratorTests.scala From cloud-integration with Apache License 2.0 | 5 votes |
package com.cloudera.spark.cloud.common import com.cloudera.spark.cloud.operations.CloudFileGenerator import org.apache.hadoop.fs.Path import org.apache.spark.SparkConf def generate( conf: SparkConf, destDir: Path, monthCount: Int, fileCount: Int, rowCount: Int): Int = { val result = new CloudFileGenerator().action( conf, Seq(destDir, monthCount, fileCount, rowCount)) result } }
Example 181
Source File: FileHandler.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.common.security import java.nio.file._ import java.security.AccessControlException import scala.collection.JavaConverters._ import org.apache.hadoop.fs.{FileSystem, Path} import com.paypal.gimel.common.conf.GimelConstants import com.paypal.gimel.logger.Logger object FileHandler { val logger = Logger(this.getClass) def checkIfFileAccessibleByOthers(filePath: String, source: String, fail: Boolean): Unit = { source.toLowerCase() match { case GimelConstants.HADDOP_FILE_SYSTEM => val conf = new org.apache.hadoop.conf.Configuration() val fs = FileSystem.get(conf) val hdfsPath = new Path(filePath) if (fs.exists(hdfsPath)) { val permission = fs.getFileStatus(hdfsPath).getPermission.toString if (permission.substring(3, permission.length) != "------") { val message = s"FILE IS NOT PROTECTED. PLEASE PROTECT THE FILE WITH PROPER PERMISSIONS (700) : ${filePath}" if (fail) { throw new AccessControlException(message) } } } case GimelConstants.LOCAL_FILE_SYSTEM => val path = Paths.get(filePath) if (Files.exists(path)) { val p = Files.getPosixFilePermissions(path) if (p.asScala.exists(x => x.toString.startsWith("OTHER") || x.toString.startsWith("GROUP"))) { val message = s"FILE IS NOT PROTECTED. PLEASE PROTECT THE FILE WITH PROPER PERMISSIONS (700) : ${filePath}" if (fail) { throw new AccessControlException(message) } } } } } }
Example 182
Source File: ArtifactS3Saver.scala From marvin-engine-executor with Apache License 2.0 | 5 votes |
package org.marvin.artifact.manager import java.io.File import akka.Done import akka.actor.{Actor, ActorLogging} import com.amazonaws.services.s3.model.GetObjectRequest import com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder} import org.apache.hadoop.fs.Path import org.marvin.artifact.manager.ArtifactSaver.{SaveToLocal, SaveToRemote} import org.marvin.model.EngineMetadata class ArtifactS3Saver(metadata: EngineMetadata) extends Actor with ActorLogging { var s3Client: AmazonS3 = _ override def preStart() = { log.info(s"${this.getClass().getCanonicalName} actor initialized...") //Create S3 Client with default credential informations(Environment Variable) s3Client = AmazonS3ClientBuilder.standard.withRegion(System.getenv("AWS_DEFAULT_REGION")).build log.info("Amazon S3 client initialized...") } def generatePaths(artifactName: String, protocol: String): Map[String, Path] = { var artifactsRemotePath: String = null if(metadata.artifactsRemotePath.startsWith("/")){ artifactsRemotePath = metadata.artifactsRemotePath.substring(1) } Map( "localPath" -> new Path(s"${metadata.artifactsLocalPath}/${metadata.name}/$artifactName"), "remotePath" -> new Path(s"${artifactsRemotePath}/${metadata.name}/${metadata.version}/$artifactName/$protocol") ) } def validatePath(path: Path, isRemote: Boolean): Boolean = { if (isRemote) { s3Client.doesObjectExist(metadata.s3BucketName, path.toString) } else { new java.io.File(path.toString).exists } } override def receive: Receive = { case SaveToLocal(artifactName, protocol) => log.info("Receive message and starting to working...") val uris = generatePaths(artifactName, protocol) val localToSave = new File(uris("localPath").toString) // Validate if the protocol is correct if (validatePath(uris("remotePath"), true)) { log.info(s"Copying files from ${metadata.s3BucketName}: ${uris("remotePath")} to ${uris("localPath")}") //Get artifact named "uris("remotePath")" from S3 Bucket and save it to local s3Client.getObject(new GetObjectRequest(metadata.s3BucketName, uris("remotePath").toString), localToSave) log.info(s"File ${uris("localPath")} saved!") } else { log.error(s"Invalid protocol: ${protocol}, save process canceled!") } sender ! Done case SaveToRemote(artifactName, protocol) => log.info("Receive message and starting to working...") val uris = generatePaths(artifactName, protocol) val fileToUpload = new File(uris("localPath").toString) // Validate if the protocol is correct if (validatePath(uris("localPath"), false)) { log.info(s"Copying files from ${uris("localPath")} to ${metadata.s3BucketName}: ${uris("remotePath")}") //Get local artifact and save to S3 Bucket with name "uris("remotePath")" s3Client.putObject(metadata.s3BucketName, uris("remotePath").toString, fileToUpload) log.info(s"File ${uris("localPath")} saved!") } else { log.error(s"Invalid protocol: ${protocol}, save process canceled!") } sender ! Done case _ => log.warning("Received a bad format message...") } }
Example 183
Source File: ArtifactHdfsSaver.scala From marvin-engine-executor with Apache License 2.0 | 5 votes |
package org.marvin.artifact.manager import java.io.{File, FileInputStream} import akka.Done import akka.actor.{Actor, ActorLogging} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.marvin.artifact.manager.ArtifactSaver.{SaveToLocal, SaveToRemote} import org.marvin.model.EngineMetadata class ArtifactHdfsSaver(metadata: EngineMetadata) extends Actor with ActorLogging { var conf: Configuration = _ override def preStart() = { log.info(s"${this.getClass().getCanonicalName} actor initialized...") conf = new Configuration() if (sys.env.get("HADOOP_CONF_DIR") != None){ val confFiles:List[File] = getListOfFiles(sys.env.get("HADOOP_CONF_DIR").mkString) for(file <- confFiles){ log.info(s"Loading ${file.getAbsolutePath} file to hdfs client configuration ..") conf.addResource(new FileInputStream(file)) } } conf.set("fs.defaultFS", metadata.hdfsHost) } def generatePaths(artifactName: String, protocol: String): Map[String, Path] = { Map( "localPath" -> new Path(s"${metadata.artifactsLocalPath}/${metadata.name}/$artifactName"), "remotePath" -> new Path(s"${metadata.artifactsRemotePath}/${metadata.name}/${metadata.version}/$artifactName/$protocol") ) } def getListOfFiles(path: String): List[File] = { val dir = new File(path) val extensions = List("xml") dir.listFiles.filter(_.isFile).toList.filter { file => extensions.exists(file.getName.endsWith(_)) } } def validatePath(path: Path, isRemote: Boolean, fs: FileSystem): Boolean = { if (isRemote) { fs.exists(path) } else { new java.io.File(path.toString).exists } } override def receive: Receive = { case SaveToLocal(artifactName, protocol) => log.info("Receive message and starting to working...") val fs = FileSystem.get(conf) val uris = generatePaths(artifactName, protocol) if (validatePath(uris("remotePath"), true, fs)) { log.info(s"Copying files from ${uris("remotePath")} to ${uris("localPath")}") fs.copyToLocalFile(false, uris("remotePath"), uris("localPath"), false) fs.close() log.info(s"File ${uris("localPath")} saved!") } else { log.error(s"Invalid protocol: ${protocol}, save process canceled!") } sender ! Done case SaveToRemote(artifactName, protocol) => log.info("Receive message and starting to working...") val fs = FileSystem.get(conf) val uris = generatePaths(artifactName, protocol) if (validatePath(uris("localPath"), false, fs)) { log.info(s"Copying files from ${uris("localPath")} to ${uris("remotePath")}") fs.copyFromLocalFile(uris("localPath"), uris("remotePath")) fs.close() log.info(s"File ${uris("localPath")} saved!") } else { log.error(s"Invalid protocol: ${protocol}, save process canceled!") } sender ! Done case _ => log.warning("Received a bad format message...") } }
Example 184
Source File: ArtifactS3SaverTest.scala From marvin-engine-executor with Apache License 2.0 | 5 votes |
package org.marvin.artifact.manager import java.io.File import akka.Done import akka.actor.{ActorSystem, Props} import akka.testkit.{ImplicitSender, TestKit} import com.amazonaws.services.s3.AmazonS3 import com.amazonaws.services.s3.model.GetObjectRequest import com.typesafe.config.ConfigFactory import org.apache.hadoop.fs.Path import org.marvin.artifact.manager.ArtifactSaver.{SaveToLocal, SaveToRemote} import org.marvin.fixtures.MetadataMock import org.marvin.model.EngineMetadata import org.scalamock.scalatest.MockFactory import org.scalatest.{BeforeAndAfterAll, Matchers, WordSpecLike} class ArtifactS3SaverTest extends TestKit( ActorSystem("ArtifactS3SaverTest", ConfigFactory.parseString("""akka.loggers = ["akka.testkit.TestEventListener"]"""))) with ImplicitSender with WordSpecLike with Matchers with BeforeAndAfterAll with MockFactory { override def afterAll { TestKit.shutdownActorSystem(system) } "s3 saver" should { "receive SaveToLocal message" in { val metadata = MetadataMock.simpleMockedMetadata() val _s3Client = mock[AmazonS3] val actor = system.actorOf(Props(new ArtifactS3SaverMock(metadata, _s3Client, true))) val protocol = "protocol" val artifactName = "model" (_s3Client.getObject(_ : GetObjectRequest, _ : File)).expects(*, *).once() actor ! SaveToLocal(artifactName, protocol) expectMsg(Done) } "receive SaveToRemote message" in { val metadata = MetadataMock.simpleMockedMetadata() val _s3Client = mock[AmazonS3] val actor = system.actorOf(Props(new ArtifactS3SaverMock(metadata, _s3Client, true))) val protocol = "protocol" val artifactName = "model" (_s3Client.putObject(_ : String, _: String, _ : File)).expects(metadata.s3BucketName, *, *).once() actor ! SaveToRemote(artifactName, protocol) expectMsg(Done) } } "call preStart method wth success" in { val metadata = MetadataMock.simpleMockedMetadata() try{ system.actorOf(Props(new ArtifactS3Saver(metadata))) assert(true) }catch { case _: Throwable => assert(false) } } class ArtifactS3SaverMock(metadata: EngineMetadata, _s3Client: AmazonS3, _isRemote: Boolean) extends ArtifactS3Saver(metadata) { def _preStart(): Unit = super.preStart() override def preStart(): Unit = { s3Client = _s3Client } override def validatePath(path: Path, isRemote: Boolean): Boolean = { if (_isRemote) true else false } } }
Example 185
Source File: OrcAcidUtil.scala From spark-acid with Apache License 2.0 | 5 votes |
package com.qubole.shaded.hadoop.hive.ql.io.orc import java.util.regex.Pattern import com.qubole.shaded.hadoop.hive.ql.io.AcidUtils import org.apache.hadoop.fs.Path object OrcAcidUtil { val BUCKET_PATTERN = Pattern.compile("bucket_[0-9]{5}$") def getDeleteDeltaPaths(orcSplit: OrcSplit): Array[Path] = { assert(BUCKET_PATTERN.matcher(orcSplit.getPath.getName).matches()) val bucket = AcidUtils.parseBucketId(orcSplit.getPath) assert(bucket != -1) val deleteDeltaDirPaths = VectorizedOrcAcidRowBatchReader.getDeleteDeltaDirsFromSplit(orcSplit); deleteDeltaDirPaths.map(deleteDir => AcidUtils.createBucketFile(deleteDir, bucket)) } }
Example 186
Source File: HiveAcidWriterOptions.scala From spark-acid with Apache License 2.0 | 5 votes |
package com.qubole.spark.hiveacid.writer.hive import com.qubole.shaded.hadoop.hive.ql.plan.FileSinkDesc import com.qubole.spark.hiveacid.HiveAcidOperation import com.qubole.spark.hiveacid.hive.HiveAcidMetadata import com.qubole.spark.hiveacid.writer.WriterOptions import org.apache.hadoop.fs.Path private[writer] class HiveAcidWriterOptions(val rootPath: String, fileSinkDesc: FileSinkDesc) extends Serializable { lazy val getFileSinkDesc: FileSinkDesc = { fileSinkDesc.setDirName(new Path(rootPath)) fileSinkDesc } } private[writer] object HiveAcidWriterOptions { def get(hiveAcidMetadata: HiveAcidMetadata, options: WriterOptions): HiveAcidWriterOptions = { lazy val fileSinkDescriptor: FileSinkDesc = { val fileSinkDesc: FileSinkDesc = new FileSinkDesc() fileSinkDesc.setTableInfo(hiveAcidMetadata.tableDesc) fileSinkDesc.setTableWriteId(options.currentWriteId) if (options.operationType == HiveAcidOperation.INSERT_OVERWRITE) { fileSinkDesc.setInsertOverwrite(true) } if (options.statementId.isDefined) { fileSinkDesc.setStatementId(options.statementId.get) } fileSinkDesc } new HiveAcidWriterOptions(rootPath = hiveAcidMetadata.rootPath.toUri.toString, fileSinkDesc = fileSinkDescriptor) } }
Example 187
Source File: HiveAcidSink.scala From spark-acid with Apache License 2.0 | 5 votes |
package com.qubole.spark.hiveacid.streaming import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable} import com.qubole.spark.hiveacid.hive.HiveAcidMetadata import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.execution.streaming.Sink class HiveAcidSink(sparkSession: SparkSession, parameters: Map[String, String]) extends Sink with Logging { import HiveAcidSink._ private val acidSinkOptions = new HiveAcidSinkOptions(parameters) private val fullyQualifiedTableName = acidSinkOptions.tableName private val hiveAcidTable: HiveAcidTable = HiveAcidTable.fromSparkSession( sparkSession, fullyQualifiedTableName, parameters) assertNonBucketedTable() private val logPath = getMetaDataPath() private val fileLog = new HiveAcidSinkLog( HiveAcidSinkLog.VERSION, sparkSession, logPath.toUri.toString, acidSinkOptions) private def assertNonBucketedTable(): Unit = { if(hiveAcidTable.isBucketed) { throw HiveAcidErrors.unsupportedOperationTypeBucketedTable("Streaming Write", fullyQualifiedTableName) } } private def getMetaDataPath(): Path = { acidSinkOptions.metadataDir match { case Some(dir) => new Path(dir) case None => logInfo(s"Metadata dir not specified. Using " + s"$metadataDirPrefix/_query_default as metadata dir") logWarning(s"Please make sure that multiple streaming writes to " + s"$fullyQualifiedTableName are not running") val tableLocation = HiveAcidMetadata.fromSparkSession( sparkSession, fullyQualifiedTableName).rootPath new Path(tableLocation, s"$metadataDirPrefix/_query_default") } } override def addBatch(batchId: Long, df: DataFrame): Unit = { if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) { logInfo(s"Skipping already committed batch $batchId") } else { val commitProtocol = new HiveAcidStreamingCommitProtocol(fileLog) val txnId = hiveAcidTable.addBatch(df) commitProtocol.commitJob(batchId, txnId) } } override def toString: String = s"HiveAcidSinkV1[$fullyQualifiedTableName]" } object HiveAcidSink { val metadataDirPrefix = "_acid_streaming" }
Example 188
Source File: IndexBuilder.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.recommender import com.datastax.spark.connector._ import com.typesafe.config.Config import io.gzet.recommender.Config._ import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.SparkContext import spark.jobserver._ object IndexBuilder extends SparkJob { override def runJob(sc: SparkContext, conf: Config): Any = { val inputDir = conf.getString("input.dir") val sampleSizeB = sc.broadcast(SAMPLE_SIZE) val audioSongRDD = AudioLibrary.read(inputDir, sc, MIN_TIME, MAX_TIME) val songRDD = audioSongRDD.keys.sortBy(song => song).zipWithIndex().mapValues(l => l + 1) val songIdsB = sc.broadcast(songRDD.collectAsMap()) val audioRDD = audioSongRDD mapPartitions { audios => val songIds = songIdsB.value audios map { case (song, audio) => (songIds.get(song).get, audio) } } val sampleRDD = audioRDD flatMap { case (songId, audio) => audio.sampleByTime(sampleSizeB.value) map { sample => (songId, sample) } } val recordRDD = songRDD map { case (name, id) => Record(id, name) } val hashRDD = sampleRDD.map({case (songId, sample) => ((sample.hash, songId), Array(sample.id)) }).reduceByKey(_ ++ _).mapValues(a => a.mkString(",")).map({case ((hash, songId), sampleIds) => (hash, songId) }).groupByKey().mapValues(it => it.toList).map({case (id, songs) => Hash(id, songs) }) hashRDD.saveAsCassandraTable(KEYSPACE, TABLE_HASH) recordRDD.saveAsCassandraTable(KEYSPACE, TABLE_RECORD) } def containsWav(hdfs: FileSystem, path: Path) = { val it = hdfs.listFiles(path, false) var i = 0 while(it.hasNext){ if(it.next().getPath.getName.endsWith(".wav")){ i += 1 } } i > 0 } override def validate(sc: SparkContext, config: Config): SparkJobValidation = { if(!config.hasPath("input.dir")) { SparkJobInvalid("Missing parameter [input.dir]") } else { val hdfs = FileSystem.get(sc.hadoopConfiguration) val path = new Path(config.getString("input.dir")) val isDir = hdfs.isDirectory(path) val isValid = containsWav(hdfs, path) hdfs.close() if(isDir && isValid) { SparkJobValid } else { SparkJobInvalid("Input directory does not contains .wav files") } } } }
Example 189
Source File: GDBIndex.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb import java.io.{DataInput, File} import java.nio.{ByteBuffer, ByteOrder} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FSDataInputStream, Path} import org.apache.spark.Logging object GDBIndex { def apply(path: String, name: String, conf: Configuration = new Configuration()) = { val filename = StringBuilder.newBuilder.append(path).append(File.separator).append(name).append(".gdbtablx").toString() val hdfsPath = new Path(filename) val dataInput = hdfsPath.getFileSystem(conf).open(hdfsPath) val bytes = new Array[Byte](16) dataInput.readFully(bytes) val byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN) val signature = byteBuffer.getInt val n1024Blocks = byteBuffer.getInt val numRows = byteBuffer.getInt val indexSize = byteBuffer.getInt new GDBIndex(dataInput, numRows, indexSize) } } private[gdb] class GDBIndex(dataInput: FSDataInputStream, val numRows: Int, indexSize: Int ) extends Logging with AutoCloseable with Serializable { def readSeekForRowNum(rowNum: Int) = { val bytes = new Array[Byte](indexSize) dataInput.seek(16 + rowNum * indexSize) dataInput.readFully(bytes) ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).getInt } def iterator(startAtRow: Int = 0, numRowsToRead: Int = -1) = { dataInput.seek(16 + startAtRow * indexSize) val maxRows = if (numRowsToRead == -1) numRows else numRowsToRead // log.info(s"iterator::startAtRow=$startAtRow maxRows=$maxRows") new GDBIndexIterator(dataInput, startAtRow, maxRows, indexSize).withFilter(_.isSeekable) } def close() { dataInput.close() } } private[gdb] class GDBIndexIterator(dataInput: DataInput, startID: Int, maxRows: Int, indexSize: Int ) extends Iterator[IndexInfo] with Logging with Serializable { private val indexInfo = IndexInfo(0, 0) private val bytes = new Array[Byte](indexSize) private val byteBuffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN) private var objectID = startID private var nextRow = 0 def hasNext() = nextRow < maxRows def next() = { // log.info(s"next::nextRow=$nextRow maxRows=$maxRows") nextRow += 1 objectID += 1 indexInfo.objectID = objectID byteBuffer.clear dataInput.readFully(bytes) indexInfo.seek = byteBuffer.getInt indexInfo } }
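A short sketch of reading a few entries with the index shown above; the geodatabase directory and table name are placeholders, and <path>/<name>.gdbtablx is expected to exist on the default Hadoop filesystem:

// Open /data/usa.gdb/counties.gdbtablx and print the first ten seek offsets.
val index = GDBIndex("/data/usa.gdb", "counties")
try {
  index.iterator(startAtRow = 0, numRowsToRead = 10)
    .foreach(info => println(s"objectID=${info.objectID} seek=${info.seek}"))
} finally {
  index.close()
}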
Example 190
Source File: ReadingWritingData.scala From Spark-RSVD with Apache License 2.0 | 5 votes |
package com.criteo.rsvd import java.nio.ByteBuffer import com.esotericsoftware.kryo.Kryo import com.typesafe.scalalogging.slf4j.StrictLogging import de.javakaffee.kryoserializers.UnmodifiableCollectionsSerializer import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.{BytesWritable, NullWritable} import org.apache.spark.mllib.linalg.distributed.MatrixEntry import org.apache.spark.rdd.RDD import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer} import org.apache.spark.{SparkConf, SparkContext} import scala.reflect.ClassTag object ReadingWritingData extends StrictLogging { def getInputDataSizeMB(inputPathPattern: String, sc: SparkContext): Int = { val fs = FileSystem.get(sc.hadoopConfiguration) val path = new Path(inputPathPattern) (fs.globStatus(path).map(f => f.getLen).sum / 1024 / 1024).toInt } def loadMatrixEntries(inputPath: String, singlePartitionSizeMB: Int, sc: SparkContext): RDD[MatrixEntry] = { logger.info(s"Input matrix path: $inputPath") val inputDataSizeMB = getInputDataSizeMB(inputPath + " def makeRddFromKryoFile[T: ClassTag]( sc: SparkContext, path: String, minPartitionsOpt: Option[Int] = None): RDD[T] = { val minPartitions = minPartitionsOpt.getOrElse(sc.defaultMinPartitions) val serializer = new KryoSerializer(sc.getConf) sc.sequenceFile(path, classOf[NullWritable], classOf[BytesWritable], minPartitions) .mapPartitions { it => val instance = serializer.newInstance() it.flatMap { case (_, v) => instance.deserialize[Array[T]](ByteBuffer.wrap(v.getBytes)) } } } object RandomizedSVDKryoRegistrator extends KryoRegistrator { def registerClasses(kryo: Kryo): Unit = { UnmodifiableCollectionsSerializer.registerSerializers(kryo) kryo.register(classOf[MatrixEntry]) kryo.register(classOf[Array[MatrixEntry]]) } } def appendBasicRegistratorToSparkConf(sparkConf: SparkConf): SparkConf = appendRegistratorToSparkConf(sparkConf, RandomizedSVDKryoRegistrator.getClass.getName) def appendRegistratorToSparkConf(sparkConf: SparkConf, registratorName: String): SparkConf = { val oldValue = sparkConf.get("spark.kryo.registrator", "") if (oldValue == "") { sparkConf.set("spark.kryo.registrator", registratorName) } else { sparkConf.set("spark.kryo.registrator", oldValue + "," + registratorName) } } }
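Note that the listing above truncates the body of loadMatrixEntries after the getInputDataSizeMB call. A plausible composition of the two helpers that are shown, offered as an illustration rather than the project's exact code and reusing the imports at the top of the listing, would size the input, derive a partition count, and then deserialize the Kryo sequence files:

// Hypothetical completion using only the helpers defined above.
def loadEntriesSketch(inputPath: String, singlePartitionSizeMB: Int, sc: SparkContext): RDD[MatrixEntry] = {
  val sizeMB = ReadingWritingData.getInputDataSizeMB(inputPath + "/*", sc)
  val partitions = math.max(1, sizeMB / singlePartitionSizeMB)
  ReadingWritingData.makeRddFromKryoFile[MatrixEntry](sc, inputPath, Some(partitions))
}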
Example 191
Source File: PlyOutputWriter.scala From spark-iqmulus with Apache License 2.0 | 5 votes |
package fr.ign.spark.iqmulus.ply import org.apache.spark.sql.types._ import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext, JobContext } import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import java.io.DataOutputStream import org.apache.spark.sql.sources.OutputWriter import org.apache.hadoop.io.{ NullWritable, BytesWritable } import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.hadoop.fs.Path import java.text.NumberFormat import org.apache.spark.sql.{ Row, SQLContext, sources } import fr.ign.spark.iqmulus.RowOutputStream class PlyOutputWriter( name: String, context: TaskAttemptContext, dataSchema: StructType, element: String, littleEndian: Boolean ) extends OutputWriter { private val file = { val path = getDefaultWorkFile(s".ply.$element") val fs = path.getFileSystem(context.getConfiguration) fs.create(path) } private var count = 0L // strip out ids private val schema = StructType(dataSchema.filterNot { Seq("fid", "pid") contains _.name }) private val recordWriter = new RowOutputStream(new DataOutputStream(file), littleEndian, schema, dataSchema) def getDefaultWorkFile(extension: String): Path = { val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID") val taskAttemptId: TaskAttemptID = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId new Path(name, f"$split%05d-$uniqueWriteJobId$extension") } override def write(row: Row): Unit = { recordWriter.write(row) count += 1 } override def close(): Unit = { recordWriter.close // write header val path = getDefaultWorkFile(".ply.header") val fs = path.getFileSystem(context.getConfiguration) val dos = new java.io.DataOutputStream(fs.create(path)) val header = new PlyHeader(path.toString, littleEndian, Map(element -> ((count, schema)))) header.write(dos) dos.close } }
Example 192
Source File: LasOutputWriter.scala From spark-iqmulus with Apache License 2.0 | 5 votes |
package fr.ign.spark.iqmulus.las import org.apache.spark.sql.types._ import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext } import java.io.DataOutputStream import org.apache.spark.sql.sources.OutputWriter import org.apache.spark.deploy.SparkHadoopUtil import org.apache.hadoop.io.{ NullWritable, BytesWritable } import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.hadoop.fs.Path import java.text.NumberFormat import org.apache.spark.sql.{ Row, SQLContext, sources } import fr.ign.spark.iqmulus.RowOutputStream class LasOutputWriter( name: String, context: TaskAttemptContext, dataSchema: StructType, formatOpt: Option[Byte] = None, version: Version = Version(), offset: Array[Double] = Array(0F, 0F, 0F), scale: Array[Double] = Array(0.01F, 0.01F, 0.01F) ) extends OutputWriter { private val file = { val path = getDefaultWorkFile("/1.pdr") val fs = path.getFileSystem(context.getConfiguration) fs.create(path) } private val pmin = Array.fill[Double](3)(Double.PositiveInfinity) private val pmax = Array.fill[Double](3)(Double.NegativeInfinity) private val countByReturn = Array.fill[Long](15)(0) private def count = countByReturn.sum private val format = formatOpt.getOrElse(LasHeader.formatFromSchema(dataSchema)) // todo, extra bytes private val schema = LasHeader.schema(format) private def header = new LasHeader(name, format, count, pmin, pmax, scale, offset, countByReturn) private val recordWriter = new RowOutputStream(new DataOutputStream(file), littleEndian = true, schema, dataSchema) def getDefaultWorkFile(extension: String): Path = { val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID") val taskAttemptId: TaskAttemptID = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId new Path(name, f"$split%05d-$uniqueWriteJobId$extension") } override def write(row: Row): Unit = { recordWriter.write(row) // gather statistics for the header val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble val ret = row.getAs[Byte]("flags") & 0x3 countByReturn(ret) += 1 pmin(0) = Math.min(pmin(0), x) pmin(1) = Math.min(pmin(1), y) pmin(2) = Math.min(pmin(2), z) pmax(0) = Math.max(pmax(0), x) pmax(1) = Math.max(pmax(1), y) pmax(2) = Math.max(pmax(2), z) } override def close(): Unit = { recordWriter.close // write header val path = getDefaultWorkFile("/0.header") val fs = path.getFileSystem(context.getConfiguration) val dos = new java.io.DataOutputStream(fs.create(path)) header.write(dos) dos.close // copy header and pdf to a final las file (1 per split) org.apache.hadoop.fs.FileUtil.copyMerge( fs, getDefaultWorkFile("/"), fs, getDefaultWorkFile(".las"), true, context.getConfiguration, "" ) } }
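One detail worth noting: LAS point records store x, y and z as scaled integers, and write() above recovers real-world coordinates with offset + scale * raw before updating the bounding box. A tiny worked example under the default scale of 0.01 and offset 0.0:

// Decoding a stored LAS coordinate (same arithmetic as in write() above).
val scaleX = 0.01
val offsetX = 0.0
val storedX = 12345                      // raw Int as kept in the point record
val realX = offsetX + scaleX * storedX   // 123.45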
Example 193
Source File: LasRelation.scala From spark-iqmulus with Apache License 2.0 | 5 votes |
package fr.ign.spark.iqmulus.las import fr.ign.spark.iqmulus.{ BinarySectionRelation, BinarySection } import org.apache.hadoop.fs.{ FileSystem, Path } import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.OutputWriterFactory import org.apache.hadoop.mapreduce.Job import org.apache.spark.sql.types._ import scala.util.{ Try, Success, Failure } class LasRelation( override val paths: Array[String], override val maybeDataSchema: Option[StructType], override val userDefinedPartitionColumns: Option[StructType], parameters: Map[String, String] )(@transient val sqlContext: SQLContext) extends BinarySectionRelation(parameters) { def format = parameters.get("lasformat").map(_.toByte) def minor = parameters.get("minor").map(_.toByte).getOrElse(Version.minorDefault) def major = parameters.get("major").map(_.toByte).getOrElse(Version.majorDefault) def version = parameters.get("version").map(Version.fromString) .getOrElse(Version(major, minor)) lazy val headers: Array[LasHeader] = paths flatMap { location => Try { val path = new Path(location) val fs = FileSystem.get(path.toUri, sqlContext.sparkContext.hadoopConfiguration) val dis = fs.open(path) try LasHeader.read(location, dis) finally { dis.close fs.close } } match { case Success(h) => Some(h) case Failure(e) => logWarning(s"Skipping $location : ${e.getMessage}"); None } } override def sections: Array[BinarySection] = headers.map(_.toBinarySection(paths)) override def prepareJobForWrite(job: Job): OutputWriterFactory = { new LasOutputWriterFactory(format, version) } }
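A hedged sketch of loading LAS files through this relation, assuming the package ships a DefaultSource so it can be addressed by its package name (the listing does not show a registered short format name, so both the format string and the path are assumptions):

// Illustrative only.
val df = sqlContext.read
  .format("fr.ign.spark.iqmulus.las")
  .load("hdfs:///lidar/tiles")
df.printSchema()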
Example 194
Source File: AddJar.scala From incubator-toree with Apache License 2.0 | 5 votes |
package org.apache.toree.magic.builtin import java.io.{File, PrintStream} import java.net.{URL, URI} import java.nio.file.{Files, Paths} import java.util.zip.ZipFile import org.apache.toree.magic._ import org.apache.toree.magic.builtin.AddJar._ import org.apache.toree.magic.dependencies._ import org.apache.toree.utils.{ArgumentParsingSupport, DownloadSupport, LogLike, FileUtils} import com.typesafe.config.Config import org.apache.hadoop.fs.Path import org.apache.toree.plugins.annotations.Event object AddJar { val HADOOP_FS_SCHEMES = Set("hdfs", "s3", "s3n", "file") private var jarDir:Option[String] = None def getJarDir(config: Config): String = { jarDir.getOrElse({ jarDir = Some( if(config.hasPath("jar_dir") && Files.exists(Paths.get(config.getString("jar_dir")))) { config.getString("jar_dir") } else { FileUtils.createManagedTempDirectory("toree_add_jars").getAbsolutePath } ) jarDir.get }) } } class AddJar extends LineMagic with IncludeInterpreter with IncludeOutputStream with DownloadSupport with ArgumentParsingSupport with IncludeKernel with IncludePluginManager with IncludeConfig with LogLike { // Option to mark re-downloading of jars private val _force = parser.accepts("f", "forces re-download of specified jar") // Option to mark re-downloading of jars private val _magic = parser.accepts("magic", "loads jar as a magic extension") // Lazy because the outputStream is not provided at construction private def printStream = new PrintStream(outputStream) ) } else { downloadFile( new URL(jarRemoteLocation), new File(downloadLocation).toURI.toURL ) } // Report download finished printStream.println(s"Finished download of $jarName") } else { printStream.println(s"Using cached version of $jarName") } // validate jar file if(! isValidJar(fileDownloadLocation)) { throw new IllegalArgumentException(s"Jar '$jarName' is not valid.") } if (_magic) { val plugins = pluginManager.loadPlugins(fileDownloadLocation) pluginManager.initializePlugins(plugins) } else { kernel.addJars(fileDownloadLocation.toURI) } } }
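The listing above truncates the body of the magic's execute method, but the declared options are enough to show the intended notebook usage. Illustrative Toree cells, with placeholder URLs:

// %AddJar https://example.com/libs/my-udfs.jar          -> download (or reuse the cached copy) and add the jar to the kernel
// %AddJar https://example.com/libs/my-udfs.jar -f       -> force a re-download even if a cached copy exists
// %AddJar https://example.com/libs/my-magic.jar --magic -> load the jar as a magic extension through the plugin manager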
Example 195
Source File: ConfigurationBuilder.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.datamountaineer.streamreactor.connect.hbase.config import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hbase.HBaseConfiguration import org.apache.kafka.common.config.ConfigException object ConfigurationBuilder { def buildHBaseConfig(hBaseSettings: HBaseSettings): Configuration = { val configuration = HBaseConfiguration.create() def appendFile(file:String): Unit = { val hbaseFile = new File(file) if (!hbaseFile.exists) { throw new ConfigException(s"$file does not exist in provided HBase configuration directory $hbaseFile.") } else { configuration.addResource(new Path(hbaseFile.toString)) } } hBaseSettings.hbaseConfigDir.foreach { dir => appendFile(dir + s"/hbase-site.xml") } configuration } }
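A small usage sketch, assuming an HBaseSettings value built elsewhere from the connector properties, with hbaseConfigDir pointing at a directory that contains hbase-site.xml:

// Hypothetical wiring: build the Hadoop Configuration, then open an HBase connection with it.
val settings: HBaseSettings = ???   // normally derived from the connector's configuration
val hbaseConf = ConfigurationBuilder.buildHBaseConfig(settings)
val connection = org.apache.hadoop.hbase.client.ConnectionFactory.createConnection(hbaseConf)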
Example 196
Source File: ParquetHiveFormat.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.formats import com.landoop.streamreactor.connect.hive.Serde import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.fs.permission.FsPermission import org.apache.kafka.connect.data.{Schema, Struct} import org.apache.parquet.hadoop.ParquetWriter import scala.util.Try object ParquetHiveFormat extends HiveFormat { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) override def serde = Serde( "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", Map("serialization.format" -> "1") ) override def writer(path: Path, schema: Schema) (implicit fs: FileSystem): HiveWriter = new HiveWriter { logger.debug(s"Creating parquet writer at $path") val writer: ParquetWriter[Struct] = com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true)) Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx"))) val createdTime: Long = System.currentTimeMillis() var lastKnownFileSize: Long = fs.getFileStatus(path).getLen var readFileSize = false var count = 0 override def write(struct: Struct): Long = { writer.write(struct) count = count + 1 readFileSize = true count } override def close(): Unit = { logger.debug(s"Closing writer at path $path") writer.close() } override def currentCount: Long = count override def file: Path = path override def fileSize: Long = { if (readFileSize) { lastKnownFileSize = fs.getFileStatus(path).getLen readFileSize = false } lastKnownFileSize } } override def reader(path: Path, startAt: Int, schema: Schema) (implicit fs: FileSystem): HiveReader = new HiveReader { logger.debug(s"Creating parquet reader for $path with offset $startAt") val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path) var offset = startAt override def iterator: Iterator[Record] = Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct => val record = Record(struct, path, offset) offset = offset + 1 record } override def close(): Unit = reader.close() } }
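A minimal write-then-read sketch against the local filesystem, assuming a Kafka Connect Schema (schema) and a matching Struct (record) built elsewhere; the target path is a placeholder:

implicit val fs: FileSystem = FileSystem.get(new org.apache.hadoop.conf.Configuration())
val target = new Path("/tmp/hive/events/part-00000.parquet")

val writer = ParquetHiveFormat.writer(target, schema)   // schema: Kafka Connect Schema (assumed)
writer.write(record)                                    // record: Kafka Connect Struct (assumed)
writer.close()

val reader = ParquetHiveFormat.reader(target, 0, schema)
reader.iterator.foreach(r => println(r.struct))
reader.close()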
Example 197
Source File: domain.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive import cats.Show import cats.data.NonEmptyList import org.apache.hadoop.fs.Path import org.apache.kafka.common.{TopicPartition => KafkaTopicPartition} import org.apache.kafka.connect.data.Schema case class Topic(value: String) { require(value != null && value.trim.nonEmpty) } case class Offset(value: Long) { require(value >= 0) } case class TopicPartition(topic: Topic, partition: Int) { def withOffset(offset: Offset): TopicPartitionOffset = TopicPartitionOffset(topic, partition, offset) def toKafka = new KafkaTopicPartition(topic.value, partition) } case class TopicPartitionOffset(topic: Topic, partition: Int, offset: Offset) { def toTopicPartition = TopicPartition(topic, partition) } case class DatabaseName(value: String) { require(value != null && value.trim.nonEmpty) } case class TableName(value: String) { require(value != null && value.trim.nonEmpty) } // contains all the partition keys for a particular table case class PartitionPlan(tableName: TableName, keys: NonEmptyList[PartitionKey]) // contains a partition key, which you can think of as like a partition column name case class PartitionKey(value: String) // defines a partition key field case class PartitionField(name: String, schema: Schema = Schema.STRING_SCHEMA, comment: Option[String] = None) { require(name != null && name.trim.nonEmpty) } // contains a single partition in a table, that is one set of unique values, one per partition key case class Partition(entries: NonEmptyList[(PartitionKey, String)], location: Option[Path]) case class Serde(serializationLib: String, inputFormat: String, outputFormat: String, params: Map[String, String]) // generates the default hive metastore location string for a partition object DefaultPartitionLocation extends Show[Partition] { override def show(t: Partition): String = { t.entries.map { case (key, value) => key.value + "=" + value }.toList.mkString("/") } }
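These are mostly plain value classes; the Show instance is the only behaviour. A quick example of the partition-location string it produces:

import cats.data.NonEmptyList

val partition = Partition(
  NonEmptyList.of(PartitionKey("year") -> "2023", PartitionKey("month") -> "06"),
  location = None)

DefaultPartitionLocation.show(partition)   // "year=2023/month=06"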
Example 198
Source File: HiveSinkState.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink import com.landoop.streamreactor.connect.hive import com.landoop.streamreactor.connect.hive._ import com.landoop.streamreactor.connect.hive.sink.config.TableOptions import com.landoop.streamreactor.connect.hive.sink.mapper.{DropPartitionValuesMapper, MetastoreSchemaAlignMapper, ProjectionMapper} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import org.apache.hadoop.hive.metastore.api.Table import org.apache.kafka.connect.data.{Schema, Struct} case class HiveSinkState(offsets: Map[TopicPartition, Offset], committedOffsets: Map[TopicPartition, Offset], table: Table, tableLocation: Path, plan: Option[PartitionPlan], metastoreSchema: Schema, mapper: Struct => Struct, lastSchema: Schema) { def withTopicPartitionOffset(tpo: TopicPartitionOffset): HiveSinkState = { copy(offsets = offsets + (tpo.toTopicPartition -> tpo.offset)) } def withTopicPartitionOffset(tp: TopicPartition, offset: Offset): HiveSinkState = { copy(offsets = offsets + (tp -> offset)) } def withCommittedOffset(offsets: Map[TopicPartition, Offset]): HiveSinkState = { copy(committedOffsets = committedOffsets ++ offsets) } def withCommittedOffset(tp: TopicPartition, offset: Offset): HiveSinkState = { copy(committedOffsets = committedOffsets + (tp -> offset)) } def withLastSchema(schema: Schema): HiveSinkState = copy(lastSchema = schema) } object HiveSinkState { private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName) def from(schema: Schema, table: TableOptions, dbName: DatabaseName)(implicit client: IMetaStoreClient, fs: FileSystem) = { logger.info(s"Init sink for schema $schema") val hiveTable = getOrCreateTable(table, dbName, schema) val tableLocation = new Path(hiveTable.getSd.getLocation) val plan = hive.partitionPlan(hiveTable) val metastoreSchema = table.evolutionPolicy .evolve(dbName, table.tableName, HiveSchemas.toKafka(hiveTable), schema) .getOrElse(sys.error(s"Unable to retrieve or evolve schema for $schema")) val mapperFns: Seq[Struct => Struct] = Seq( table.projection.map(new ProjectionMapper(_)), Some(new MetastoreSchemaAlignMapper(metastoreSchema)), plan.map(new DropPartitionValuesMapper(_)) ).flatten.map(mapper => mapper.map _) val mapper = Function.chain(mapperFns) HiveSinkState(Map.empty, Map.empty, hiveTable, tableLocation, plan, metastoreSchema, mapper, schema) } def getOrCreateTable(table: TableOptions, dbName: DatabaseName, schema: Schema) (implicit client: IMetaStoreClient, fs: FileSystem): Table = { def create: Table = { val partstring = if (table.partitions.isEmpty) "<no-partitions>" else table.partitions.mkString(",") logger.info(s"Creating table in hive [${dbName.value}.${table.tableName.value}, partitions=$partstring]") hive.createTable(dbName, table.tableName, schema, table.partitions, table.location, table.format) } logger.debug(s"Fetching or creating table ${dbName.value}.${table.tableName.value}") client.tableExists(dbName.value, table.tableName.value) match { case true if table.overwriteTable => hive.dropTable(dbName, table.tableName, true) create case true => client.getTable(dbName.value, table.tableName.value) case false if table.createTable => create case false => throw new RuntimeException(s"Table ${dbName.value}.${table.tableName.value} does not exist") } } }
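A sketch of bootstrapping the sink state, assuming an implicit metastore client and filesystem are in scope and that connectSchema, tableOptions and incomingStruct come from the running sink task (all three names are assumptions):

// Hypothetical setup.
implicit val client: IMetaStoreClient = ???   // e.g. a HiveMetaStoreClient created from the Hive conf
implicit val fs: FileSystem = FileSystem.get(new org.apache.hadoop.conf.Configuration())

val state = HiveSinkState.from(connectSchema, tableOptions, DatabaseName("warehouse"))
val aligned = state.mapper(incomingStruct)    // project and align an incoming Struct before it is written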
Example 199
Source File: StrictPartitionHandler.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.partitioning import com.landoop.streamreactor.connect.hive.{DatabaseName, Partition, TableName} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import scala.collection.JavaConverters._ import scala.util.control.NonFatal import scala.util.{Failure, Success, Try} object StrictPartitionHandler extends PartitionHandler { override def path(partition: Partition, db: DatabaseName, tableName: TableName) (client: IMetaStoreClient, fs: FileSystem): Try[Path] = { try { val part = client.getPartition(db.value, tableName.value, partition.entries.map(_._2).toList.asJava) Success(new Path(part.getSd.getLocation)) } catch { case NonFatal(e) => Failure(new RuntimeException(s"Partition '${partition.entries.map(_._2).toList.mkString(",")}' does not exist and strict policy requires upfront creation", e)) } } }
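Usage is a single call: the partition must already exist in the metastore, otherwise the Failure built above is returned. A short sketch with placeholder names, assuming a metastore client and filesystem from the sink context:

import scala.util.{Failure, Success, Try}

val resolved: Try[Path] =
  StrictPartitionHandler.path(partition, DatabaseName("warehouse"), TableName("events"))(client, fs)

resolved match {
  case Success(location) => println(s"writing to $location")
  case Failure(err)      => println(s"refusing to write: ${err.getMessage}")
}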
Example 200
Source File: CachedPartitionHandler.scala From stream-reactor with Apache License 2.0 | 5 votes |
package com.landoop.streamreactor.connect.hive.sink.partitioning import com.landoop.streamreactor.connect.hive.{DatabaseName, Partition, TableName} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.metastore.IMetaStoreClient import scala.util.{Success, Try} class CachedPartitionHandler(partitioner: PartitionHandler) extends PartitionHandler { val cache = scala.collection.mutable.Map.empty[Partition, Path] override def path(partition: Partition, db: DatabaseName, tableName: TableName) (client: IMetaStoreClient, fs: FileSystem): Try[Path] = { cache.get(partition) match { case Some(path) => Success(path) case _ => val created = partitioner.path(partition, db, tableName)(client, fs) created.foreach(cache.put(partition, _)) created } } }
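Wrapping the strict handler from the previous example gives a memoized lookup, so repeated writes to the same partition hit the metastore only once (only successful resolutions are cached). A brief sketch reusing the same placeholder partition, client and fs:

val handler = new CachedPartitionHandler(StrictPartitionHandler)

// The first call resolves through the metastore; the second is served from the in-memory cache.
val first  = handler.path(partition, DatabaseName("warehouse"), TableName("events"))(client, fs)
val second = handler.path(partition, DatabaseName("warehouse"), TableName("events"))(client, fs)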